blob: 2e7984d429be9253a7b500f817aface80d3e317e [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
/*
 * Size limits used by the HTML parser.
 * NOTE(review): their uses are outside this part of the file; the names
 * suggest a cap on parsed name length and two working-buffer sizes --
 * confirm against the parsing routines before changing them.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Debugging switches, disabled by default. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1.
 * NOTE(review): presumably controls whether omitted html/head/body
 * elements are generated implicitly -- confirm where it is read.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations for routines defined later in this file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillardf403d292003-10-05 13:51:35 +0000112 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000113 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000114 XML_ERR_ERROR, NULL, 0,
115 (const char *) str1, (const char *) str2,
116 NULL, 0, 0,
117 msg, str1, str2);
118 ctxt->wellFormed = 0;
119}
120
121/**
122 * htmlParseErrInt:
123 * @ctxt: an HTML parser context
124 * @error: the error number
125 * @msg: the error message
126 * @val: integer info
127 *
128 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
129 */
130static void
131htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
132 const char *msg, int val)
133{
Daniel Veillard157fee02003-10-31 10:36:03 +0000134 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
135 (ctxt->instate == XML_PARSER_EOF))
136 return;
Daniel Veillardf403d292003-10-05 13:51:35 +0000137 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000138 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000139 XML_ERR_ERROR, NULL, 0, NULL, NULL,
140 NULL, val, 0, msg, val);
141 ctxt->wellFormed = 0;
142}
143
144/************************************************************************
145 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000146 * Parser stacks related functions and macros *
147 * *
148 ************************************************************************/
149
Daniel Veillard1c732d22002-11-30 11:22:59 +0000150/**
151 * htmlnamePush:
152 * @ctxt: an HTML parser context
153 * @value: the element name
154 *
155 * Pushes a new element name on top of the name stack
156 *
157 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000158 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000159static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000160htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000161{
162 if (ctxt->nameNr >= ctxt->nameMax) {
163 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000165 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166 ctxt->nameMax *
167 sizeof(ctxt->nameTab[0]));
168 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000169 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 return (0);
171 }
172 }
173 ctxt->nameTab[ctxt->nameNr] = value;
174 ctxt->name = value;
175 return (ctxt->nameNr++);
176}
177/**
178 * htmlnamePop:
179 * @ctxt: an HTML parser context
180 *
181 * Pops the top element name from the name stack
182 *
183 * Returns the name just removed
184 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000185static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000186htmlnamePop(htmlParserCtxtPtr ctxt)
187{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000188 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000189
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190 if (ctxt->nameNr <= 0)
191 return (0);
192 ctxt->nameNr--;
193 if (ctxt->nameNr < 0)
194 return (0);
195 if (ctxt->nameNr > 0)
196 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
197 else
198 ctxt->name = NULL;
199 ret = ctxt->nameTab[ctxt->nameNr];
200 ctxt->nameTab[ctxt->nameNr] = 0;
201 return (ret);
202}
Owen Taylor3473f882001-02-23 17:55:21 +0000203
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Every macro below expands to code that refers to a variable named
 * `ctxt`, so they can only be used where such a parser context is in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value (the byte at the input
 *           cursor, as an int). This should be used internally by the
 *           parser only to compare to ASCII values, otherwise it would
 *           break when running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser (it advances the
 *           column counter but never the line counter).
 *
 * Other macros
 *
 *   CURRENT As defined below this is the same as CUR: the single byte at
 *           the input cursor, as an int. No UTF-8 decoding is done by the
 *           macro itself; use CUR_CHAR(l) for a decoded character.
 *   NEXT    Skip to the next character by calling xmlNextChar(ctxt).
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *           NOTE(review): no COPY macro is defined in this section --
 *           confirm whether it still exists elsewhere.
 */

/* Uppercased byte at the input cursor. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the consumed-char count, the cursor and the column by val. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Byte located val positions after the input cursor. */
#define NXT(val) ctxt->input->cur[(val)]

/* Uppercased byte located val positions after the input cursor. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The input cursor itself. */
#define CUR_PTR ctxt->input->cur

/*
 * Call xmlParserInputShrink() once more than 2 * INPUT_CHUNK bytes lie
 * between the buffer base and the cursor while fewer than
 * 2 * INPUT_CHUNK bytes remain before the end.
 * Expands to a bare `if` statement: do not follow it with an `else`.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

/*
 * Call xmlParserInputGrow() when the context is not progressive and
 * fewer than INPUT_CHUNK bytes remain before the end of the input.
 * Expands to a bare `if` statement: do not follow it with an `else`.
 */
#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Byte at the input cursor, as an int (no multi-byte decoding). */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blank characters at the cursor; yields the number skipped. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* Former definition of CUR which took ctxt->token into account, disabled: */
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while ctxt->token is non-zero, otherwise the byte at the cursor. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* NXT and CUR_PTR are defined a second time here, with identical text. */
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur


/*
 * Step over one character occupying l xmlChars: bump the line counter
 * and reset the column on '\n', otherwise bump the column; clear
 * ctxt->token, move the cursor by l and count one consumed character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

/************
    Disabled continuation lines, kept for reference:
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Current character via htmlCurrentChar(); its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Same for a string s, via xmlStringCurrentChar(). */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v of length l to buffer b at index i, advancing i:
 * a direct store when l is 1, xmlCopyChar() otherwise.
 * Expands to a bare if/else statement, so it must not be used as the
 * unbraced body of another `if`.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
285
286/**
287 * htmlCurrentChar:
288 * @ctxt: the HTML parser context
289 * @len: pointer to the length of the char read
290 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000291 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000292 * bytes in the input buffer. Implement the end of line normalization:
293 * 2.11 End-of-Line Handling
294 * If the encoding is unspecified, in the case we find an ISO-Latin-1
295 * char, then the encoding converter is plugged in automatically.
296 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000297 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000298 */
299
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000300static int
Owen Taylor3473f882001-02-23 17:55:21 +0000301htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
302 if (ctxt->instate == XML_PARSER_EOF)
303 return(0);
304
305 if (ctxt->token != 0) {
306 *len = 0;
307 return(ctxt->token);
308 }
309 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
310 /*
311 * We are supposed to handle UTF8, check it's valid
312 * From rfc2044: encoding of the Unicode values on UTF-8:
313 *
314 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
315 * 0000 0000-0000 007F 0xxxxxxx
316 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
317 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
318 *
319 * Check for the 0x110000 limit too
320 */
321 const unsigned char *cur = ctxt->input->cur;
322 unsigned char c;
323 unsigned int val;
324
325 c = *cur;
326 if (c & 0x80) {
327 if (cur[1] == 0)
328 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
329 if ((cur[1] & 0xc0) != 0x80)
330 goto encoding_error;
331 if ((c & 0xe0) == 0xe0) {
332
333 if (cur[2] == 0)
334 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
335 if ((cur[2] & 0xc0) != 0x80)
336 goto encoding_error;
337 if ((c & 0xf0) == 0xf0) {
338 if (cur[3] == 0)
339 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
340 if (((c & 0xf8) != 0xf0) ||
341 ((cur[3] & 0xc0) != 0x80))
342 goto encoding_error;
343 /* 4-byte code */
344 *len = 4;
345 val = (cur[0] & 0x7) << 18;
346 val |= (cur[1] & 0x3f) << 12;
347 val |= (cur[2] & 0x3f) << 6;
348 val |= cur[3] & 0x3f;
349 } else {
350 /* 3-byte code */
351 *len = 3;
352 val = (cur[0] & 0xf) << 12;
353 val |= (cur[1] & 0x3f) << 6;
354 val |= cur[2] & 0x3f;
355 }
356 } else {
357 /* 2-byte code */
358 *len = 2;
359 val = (cur[0] & 0x1f) << 6;
360 val |= cur[1] & 0x3f;
361 }
362 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000363 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
364 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000365 }
366 return(val);
367 } else {
368 /* 1-byte code */
369 *len = 1;
370 return((int) *ctxt->input->cur);
371 }
372 }
373 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000374 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000375 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000376 * XML constructs only use < 128 chars
377 */
378 *len = 1;
379 if ((int) *ctxt->input->cur < 0x80)
380 return((int) *ctxt->input->cur);
381
382 /*
383 * Humm this is bad, do an automatic flow conversion
384 */
385 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
386 ctxt->charset = XML_CHAR_ENCODING_UTF8;
387 return(xmlCurrentChar(ctxt, len));
388
389encoding_error:
390 /*
391 * If we detect an UTF8 error that probably mean that the
392 * input encoding didn't get properly advertized in the
393 * declaration header. Report the error and switch the encoding
394 * to ISO-Latin-1 (if you don't like this policy, just declare the
395 * encoding !)
396 */
Daniel Veillardf403d292003-10-05 13:51:35 +0000397 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
398 "Input is not proper UTF-8, indicate encoding !\n",
399 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000400 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000401 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
402 ctxt->input->cur[0], ctxt->input->cur[1],
403 ctxt->input->cur[2], ctxt->input->cur[3]);
404 }
405
406 ctxt->charset = XML_CHAR_ENCODING_8859_1;
407 *len = 1;
408 return((int) *ctxt->input->cur);
409}
410
411/**
Owen Taylor3473f882001-02-23 17:55:21 +0000412 * htmlSkipBlankChars:
413 * @ctxt: the HTML parser context
414 *
415 * skip all blanks character found at that point in the input streams.
416 *
417 * Returns the number of space chars skipped
418 */
419
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000420static int
Owen Taylor3473f882001-02-23 17:55:21 +0000421htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
422 int res = 0;
423
William M. Brack76e95df2003-10-18 16:20:14 +0000424 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000425 if ((*ctxt->input->cur == 0) &&
426 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
427 xmlPopInput(ctxt);
428 } else {
429 if (*(ctxt->input->cur) == '\n') {
430 ctxt->input->line++; ctxt->input->col = 1;
431 } else ctxt->input->col++;
432 ctxt->input->cur++;
433 ctxt->nbChars++;
434 if (*ctxt->input->cur == 0)
435 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
436 }
437 res++;
438 }
439 return(res);
440}
441
442
443
444/************************************************************************
445 * *
446 * The list of HTML elements and their properties *
447 * *
448 ************************************************************************/
449
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each list macro expands to a comma-separated sequence of string
 * literals WITHOUT a trailing comma. When combining them, a comma must
 * be written between every pair: adjacent string literals would
 * otherwise be concatenated by the compiler into a single bogus name.
 * Every array built from them is terminated by a NULL entry.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
/* PCDATA contributes no element name, so it is not part of the list. */
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL


static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
/* The NULL terminator was missing here; list walkers rely on it. */
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* option_elt[] = { "option", NULL } ;
static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* width_attr[] = { "width", NULL } ;
static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* language_attr[] = { "language", NULL } ;
static const char* select_content[] = { "optgroup", "option", NULL } ;
static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
/* A comma is required after ATTRS, else "onkeyup" and "summary" merge. */
static const char* table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* table_depr[] = { "align", "bgcolor", NULL } ;
static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* tr_elt[] = { "tr", NULL } ;
static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* tr_contents[] = { "th", "td", NULL } ;
static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* li_elt[] = { "li", NULL } ;
static const char* ul_depr[] = { "type", "compact", NULL} ;
static const char* dir_attr[] = { "dir", NULL} ;

/* Cast applied to the lists above when they are stored in the element table. */
#define DECL (const char**)
595
Daniel Veillard22090732001-07-16 00:06:07 +0000596static const htmlElemDesc
597html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000598{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
599 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
600},
601{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
602 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
603},
604{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
605 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
606},
607{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
608 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
609},
610{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
611 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
612},
613{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
614 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
615},
616{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
617 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
618},
619{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
620 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
621},
622{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
623 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
624},
625{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
626 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
627},
628{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
629 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
630},
631{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
632 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
633},
634{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
635 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
636},
637{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
638 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
639},
640{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
641 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
642},
643{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
644 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
645},
646{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
647 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
648},
649{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
650 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
651},
652{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
653 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
654},
655{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
656 EMPTY , NULL , DECL col_attrs , NULL, NULL
657},
658{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
659 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
660},
661{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
662 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
663},
664{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
665 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
666},
667{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
668 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
669},
670{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
671 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
672},
673{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
674 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
675},
676{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
677 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
678},
679{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
680 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
681},
682{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
683 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
684},
685{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
686 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
687},
688{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
689 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
690},
691{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
692 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
693},
694{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
695 EMPTY, NULL, NULL, DECL frame_attrs, NULL
696},
697{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
698 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
699},
700{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
701 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
702},
703{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
704 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
705},
706{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
707 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
708},
709{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
710 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
711},
712{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
713 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
714},
715{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
716 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
717},
718{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
719 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
720},
721{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
722 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
723},
724{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
725 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
726},
727{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
728 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
729},
730{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
731 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
732},
733{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
734 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
735},
736{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
737 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
738},
739{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
740 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
741},
742{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
743 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
744},
745{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
746 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
747},
748{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
749 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
750},
751{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
752 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
753},
754{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
755 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
756},
757{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
758 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
759},
760{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
761 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
762},
763{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
764 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
765},
766{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
767 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
768},
769{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
770 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
771},
772{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
773 DECL html_flow, "div", DECL html_attrs, NULL, NULL
774},
775{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
776 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
777},
778{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
779 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
780},
781{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
782 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
783},
784{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
785 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
786},
787{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
788 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
789},
790{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
791 EMPTY, NULL, DECL param_attrs, NULL, name_attr
792},
793{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
794 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
795},
796{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
797 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
798},
799{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
800 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
801},
802{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
803 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
804},
805{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
806 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
807},
808{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
809 DECL select_content, NULL, DECL select_attrs, NULL, NULL
810},
811{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
812 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
813},
814{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
815 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
816},
817{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
818 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
819},
820{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
821 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
822},
823{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
824 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
825},
826{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
827 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
828},
829{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
830 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
831},
832{ "table", 0, 0, 0, 0, 0, 0, 0, "",
833 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
834},
835{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
836 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
837},
838{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
839 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
840},
841{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
842 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
843},
844{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
845 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
846},
847{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
848 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
849},
850{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
851 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
852},
853{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
854 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
855},
856{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
857 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
858},
859{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
860 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
861},
862{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
863 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
864},
865{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
866 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
867},
868{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
869 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
870}
Owen Taylor3473f882001-02-23 17:55:21 +0000871};
872
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a flat sequence of NULL-terminated groups. The first
 * name of a group is a start tag, the names following it (up to the
 * NULL) are the open elements which that start tag closes. An extra
 * NULL after the last group terminates the whole table.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl",
        "ul", "ol", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL
};
933
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly; a p element is implied when character
 * data shows up inside one of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
947
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The table is not NULL-terminated: it is walked using its sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
973
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 *
 * The last entry has a NULL name; its priority is the one used for
 * every element not listed explicitly.
 */

typedef struct {
    const char *name;		/* element name, NULL for the default entry */
    int priority;		/* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }		/* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001001
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * name of one group of that table. It is filled by htmlInitAutoClose(),
 * which also raises the flag below once the index is ready.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized;
1004
1005/************************************************************************
1006 * *
1007 * functions to handle HTML specific data *
1008 * *
1009 ************************************************************************/
1010
1011/**
1012 * htmlInitAutoClose:
1013 *
1014 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1015 * This is not reentrant. Call xmlInitParser() once before processing in
1016 * case of use in multithreaded programs.
1017 */
1018void
1019htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001020 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001021
1022 if (htmlStartCloseIndexinitialized) return;
1023
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001024 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1025 indx = 0;
1026 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1027 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001028 while (htmlStartClose[i] != NULL) i++;
1029 i++;
1030 }
1031 htmlStartCloseIndexinitialized = 1;
1032}
1033
1034/**
1035 * htmlTagLookup:
1036 * @tag: The tag name in lowercase
1037 *
1038 * Lookup the HTML tag in the ElementTable
1039 *
1040 * Returns the related htmlElemDescPtr or NULL if not found.
1041 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001042const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001043htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001044 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001045
1046 for (i = 0; i < (sizeof(html40ElementTable) /
1047 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001048 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001049 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001050 }
1051 return(NULL);
1052}
1053
1054/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001055 * htmlGetEndPriority:
1056 * @name: The name of the element to look up the priority for.
1057 *
1058 * Return value: The "endtag" priority.
1059 **/
1060static int
1061htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001062 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001063
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001064 while ((htmlEndPriority[i].name != NULL) &&
1065 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1066 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001067
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001068 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001069}
1070
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001071
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001072/**
Owen Taylor3473f882001-02-23 17:55:21 +00001073 * htmlCheckAutoClose:
1074 * @newtag: The new tag name
1075 * @oldtag: The old tag name
1076 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001077 * Checks whether the new tag is one of the registered valid tags for
1078 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001079 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1080 *
1081 * Returns 0 if no, 1 if yes.
1082 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001083static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001084htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1085{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001086 int i, indx;
1087 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001088
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001089 if (htmlStartCloseIndexinitialized == 0)
1090 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001091
1092 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001093 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001094 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001095 if (closed == NULL)
1096 return (0);
1097 if (xmlStrEqual(BAD_CAST * closed, newtag))
1098 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001099 }
1100
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001101 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001102 i++;
1103 while (htmlStartClose[i] != NULL) {
1104 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001105 return (1);
1106 }
1107 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001108 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001109 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001110}
1111
1112/**
1113 * htmlAutoCloseOnClose:
1114 * @ctxt: an HTML parser context
1115 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001116 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001117 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001118 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001119 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001120static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001121htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1122{
1123 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001124 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001125
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001126 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001127
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001128 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001129
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001130 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1131 break;
1132 /*
1133 * A missplaced endtag can only close elements with lower
1134 * or equal priority, so if we find an element with higher
1135 * priority before we find an element with
1136 * matching name, we just ignore this endtag
1137 */
1138 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1139 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001140 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001141 if (i < 0)
1142 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001143
1144 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001145 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001146 if ((info != NULL) && (info->endTag == 3)) {
1147 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1148 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001149 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001150 }
1151 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1152 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001153 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001154 }
1155}
1156
1157/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001158 * htmlAutoCloseOnEnd:
1159 * @ctxt: an HTML parser context
1160 *
1161 * Close all remaining tags at the end of the stream
1162 */
1163static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001164htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1165{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001166 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001167
William M. Brack899e64a2003-09-26 18:03:42 +00001168 if (ctxt->nameNr == 0)
1169 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001170 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001171 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1172 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001173 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001174 }
1175}
1176
1177/**
Owen Taylor3473f882001-02-23 17:55:21 +00001178 * htmlAutoClose:
1179 * @ctxt: an HTML parser context
1180 * @newtag: The new tag name or NULL
1181 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001182 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001183 * The list is kept in htmlStartClose array. This function is
1184 * called when a new tag has been detected and generates the
1185 * appropriates closes if possible/needed.
1186 * If newtag is NULL this mean we are at the end of the resource
1187 * and we should check
1188 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001189static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001190htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1191{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001192 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001193 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001196 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001197 }
1198 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001199 htmlAutoCloseOnEnd(ctxt);
1200 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001201 }
1202 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001203 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1204 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1205 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001206 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1207 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001208 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001209 }
Owen Taylor3473f882001-02-23 17:55:21 +00001210}
1211
1212/**
1213 * htmlAutoCloseTag:
1214 * @doc: the HTML document
1215 * @name: The tag name
1216 * @elem: the HTML element
1217 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001218 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001219 * The list is kept in htmlStartClose array. This function checks
1220 * if the element or one of it's children would autoclose the
1221 * given tag.
1222 *
1223 * Returns 1 if autoclose, 0 otherwise
1224 */
1225int
1226htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1227 htmlNodePtr child;
1228
1229 if (elem == NULL) return(1);
1230 if (xmlStrEqual(name, elem->name)) return(0);
1231 if (htmlCheckAutoClose(elem->name, name)) return(1);
1232 child = elem->children;
1233 while (child != NULL) {
1234 if (htmlAutoCloseTag(doc, name, child)) return(1);
1235 child = child->next;
1236 }
1237 return(0);
1238}
1239
1240/**
1241 * htmlIsAutoClosed:
1242 * @doc: the HTML document
1243 * @elem: the HTML element
1244 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001245 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001246 * The list is kept in htmlStartClose array. This function checks
1247 * if a tag is autoclosed by one of it's child
1248 *
1249 * Returns 1 if autoclosed, 0 otherwise
1250 */
1251int
1252htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1253 htmlNodePtr child;
1254
1255 if (elem == NULL) return(1);
1256 child = elem->children;
1257 while (child != NULL) {
1258 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1259 child = child->next;
1260 }
1261 return(0);
1262}
1263
1264/**
1265 * htmlCheckImplied:
1266 * @ctxt: an HTML parser context
1267 * @newtag: The new tag name
1268 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001269 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001270 * called when a new tag has been detected and generates the
1271 * appropriates implicit tags if missing
1272 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001273static void
Owen Taylor3473f882001-02-23 17:55:21 +00001274htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1275 if (!htmlOmittedDefaultValue)
1276 return;
1277 if (xmlStrEqual(newtag, BAD_CAST"html"))
1278 return;
1279 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001280 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001281 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1282 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1283 }
1284 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1285 return;
1286 if ((ctxt->nameNr <= 1) &&
1287 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1288 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1289 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1290 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1291 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1292 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1293 /*
1294 * dropped OBJECT ... i you put it first BODY will be
1295 * assumed !
1296 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001297 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001298 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1299 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1300 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1301 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1302 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1303 int i;
1304 for (i = 0;i < ctxt->nameNr;i++) {
1305 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1306 return;
1307 }
1308 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1309 return;
1310 }
1311 }
1312
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001313 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001314 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1315 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1316 }
1317}
1318
1319/**
1320 * htmlCheckParagraph
1321 * @ctxt: an HTML parser context
1322 *
1323 * Check whether a p element need to be implied before inserting
1324 * characters in the current element.
1325 *
1326 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1327 * in case of error.
1328 */
1329
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001330static int
Owen Taylor3473f882001-02-23 17:55:21 +00001331htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1332 const xmlChar *tag;
1333 int i;
1334
1335 if (ctxt == NULL)
1336 return(-1);
1337 tag = ctxt->name;
1338 if (tag == NULL) {
1339 htmlAutoClose(ctxt, BAD_CAST"p");
1340 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001341 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001342 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1343 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1344 return(1);
1345 }
1346 if (!htmlOmittedDefaultValue)
1347 return(0);
1348 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1349 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001350 htmlAutoClose(ctxt, BAD_CAST"p");
1351 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001352 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001353 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1354 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1355 return(1);
1356 }
1357 }
1358 return(0);
1359}
1360
1361/**
1362 * htmlIsScriptAttribute:
1363 * @name: an attribute name
1364 *
1365 * Check if an attribute is of content type Script
1366 *
1367 * Returns 1 is the attribute is a script 0 otherwise
1368 */
1369int
1370htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001371 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001372
1373 if (name == NULL)
1374 return(0);
1375 /*
1376 * all script attributes start with 'on'
1377 */
1378 if ((name[0] != 'o') || (name[1] != 'n'))
1379 return(0);
1380 for (i = 0;
1381 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1382 i++) {
1383 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1384 return(1);
1385 }
1386 return(0);
1387}
1388
1389/************************************************************************
1390 * *
1391 * The list of HTML predefined entities *
1392 * *
1393 ************************************************************************/
1394
1395
Daniel Veillard22090732001-07-16 00:06:07 +00001396static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001397/*
1398 * the 4 absolute ones, plus apostrophe.
1399 */
1400{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1401{ 38, "amp", "ampersand, U+0026 ISOnum" },
1402{ 39, "apos", "single quote" },
1403{ 60, "lt", "less-than sign, U+003C ISOnum" },
1404{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1405
1406/*
1407 * A bunch still in the 128-255 range
1408 * Replacing them depend really on the charset used.
1409 */
1410{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1411{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1412{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1413{ 163, "pound","pound sign, U+00A3 ISOnum" },
1414{ 164, "curren","currency sign, U+00A4 ISOnum" },
1415{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1416{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1417{ 167, "sect", "section sign, U+00A7 ISOnum" },
1418{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1419{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1420{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1421{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1422{ 172, "not", "not sign, U+00AC ISOnum" },
1423{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1424{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1425{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1426{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1427{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1428{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1429{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1430{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1431{ 181, "micro","micro sign, U+00B5 ISOnum" },
1432{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1433{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1434{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1435{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1436{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1437{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1438{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1439{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1440{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1441{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1442{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1443{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1444{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1445{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1446{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1447{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1448{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1449{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1450{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1451{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1452{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1453{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1454{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1455{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1456{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1457{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1458{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1459{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1460{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1461{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1462{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1463{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1464{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1465{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1466{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1467{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1468{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1469{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1470{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1471{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1472{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1473{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1474{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1475{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1476{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1477{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1478{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1479{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1480{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1481{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1482{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1483{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1484{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1485{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1486{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1487{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1488{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1489{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1490{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1491{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1492{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1493{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1494{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1495{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1496{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1497{ 247, "divide","division sign, U+00F7 ISOnum" },
1498{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1499{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1500{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1501{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1502{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1503{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1504{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1505{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1506
1507{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1508{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1509{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1510{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1511{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1512
1513/*
1514 * Anything below should really be kept as entities references
1515 */
1516{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1517
1518{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1519{ 732, "tilde","small tilde, U+02DC ISOdia" },
1520
1521{ 913, "Alpha","greek capital letter alpha, U+0391" },
1522{ 914, "Beta", "greek capital letter beta, U+0392" },
1523{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1524{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1525{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1526{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1527{ 919, "Eta", "greek capital letter eta, U+0397" },
1528{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1529{ 921, "Iota", "greek capital letter iota, U+0399" },
1530{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001531{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001532{ 924, "Mu", "greek capital letter mu, U+039C" },
1533{ 925, "Nu", "greek capital letter nu, U+039D" },
1534{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1535{ 927, "Omicron","greek capital letter omicron, U+039F" },
1536{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1537{ 929, "Rho", "greek capital letter rho, U+03A1" },
1538{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1539{ 932, "Tau", "greek capital letter tau, U+03A4" },
1540{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1541{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1542{ 935, "Chi", "greek capital letter chi, U+03A7" },
1543{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1544{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1545
1546{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1547{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1548{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1549{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1550{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1551{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1552{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1553{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1554{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1555{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1556{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1557{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1558{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1559{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1560{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1561{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1562{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1563{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1564{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1565{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1566{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1567{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1568{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1569{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1570{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1571{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1572{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1573{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1574
1575{ 8194, "ensp", "en space, U+2002 ISOpub" },
1576{ 8195, "emsp", "em space, U+2003 ISOpub" },
1577{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1578{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1579{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1580{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1581{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1582{ 8211, "ndash","en dash, U+2013 ISOpub" },
1583{ 8212, "mdash","em dash, U+2014 ISOpub" },
1584{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1585{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1586{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1587{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1588{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1589{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1590{ 8224, "dagger","dagger, U+2020 ISOpub" },
1591{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1592
1593{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1594{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1595
1596{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1597
1598{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1599{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1600
1601{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1602{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1603
1604{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1605{ 8260, "frasl","fraction slash, U+2044 NEW" },
1606
1607{ 8364, "euro", "euro sign, U+20AC NEW" },
1608
1609{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1610{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1611{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1612{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1613{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1614{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1615{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1616{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1617{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1618{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1619{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1620{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1621{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1622{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1623{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1624{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1625
1626{ 8704, "forall","for all, U+2200 ISOtech" },
1627{ 8706, "part", "partial differential, U+2202 ISOtech" },
1628{ 8707, "exist","there exists, U+2203 ISOtech" },
1629{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1630{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1631{ 8712, "isin", "element of, U+2208 ISOtech" },
1632{ 8713, "notin","not an element of, U+2209 ISOtech" },
1633{ 8715, "ni", "contains as member, U+220B ISOtech" },
1634{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001635{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001636{ 8722, "minus","minus sign, U+2212 ISOtech" },
1637{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1638{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1639{ 8733, "prop", "proportional to, U+221D ISOtech" },
1640{ 8734, "infin","infinity, U+221E ISOtech" },
1641{ 8736, "ang", "angle, U+2220 ISOamso" },
1642{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1643{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1644{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1645{ 8746, "cup", "union = cup, U+222A ISOtech" },
1646{ 8747, "int", "integral, U+222B ISOtech" },
1647{ 8756, "there4","therefore, U+2234 ISOtech" },
1648{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1649{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1650{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1651{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1652{ 8801, "equiv","identical to, U+2261 ISOtech" },
1653{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1654{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1655{ 8834, "sub", "subset of, U+2282 ISOtech" },
1656{ 8835, "sup", "superset of, U+2283 ISOtech" },
1657{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1658{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1659{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1660{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1661{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1662{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1663{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1664{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1665{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1666{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1667{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1668{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1669{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1670{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1671
1672{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1673{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1674{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1675{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1676
1677};
1678
1679/************************************************************************
1680 * *
1681 * Commodity functions to handle entities *
1682 * *
1683 ************************************************************************/
1684
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size.
 * It expects a variable named ctxt to be in scope (used for error
 * reporting) and can only be expanded inside a function returning a
 * pointer: when the reallocation fails the error is reported through
 * htmlErrMemory(), the previous allocation is released so that it is not
 * leaked, and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                             buffer##_size * sizeof(xmlChar));		\
    if (buffer##_grown == NULL) {					\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        /* realloc failure leaves the old block allocated */		\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1696
1697/**
1698 * htmlEntityLookup:
1699 * @name: the entity name
1700 *
1701 * Lookup the given entity in EntitiesTable
1702 *
1703 * TODO: the linear scan is really ugly, an hash table is really needed.
1704 *
1705 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1706 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001707const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001708htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001709 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001710
1711 for (i = 0;i < (sizeof(html40EntitiesTable)/
1712 sizeof(html40EntitiesTable[0]));i++) {
1713 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001714 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001715 }
1716 }
1717 return(NULL);
1718}
1719
1720/**
1721 * htmlEntityValueLookup:
1722 * @value: the entity's unicode value
1723 *
1724 * Lookup the given entity in EntitiesTable
1725 *
1726 * TODO: the linear scan is really ugly, an hash table is really needed.
1727 *
1728 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1729 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001730const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001731htmlEntityValueLookup(unsigned int value) {
1732 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001733
1734 for (i = 0;i < (sizeof(html40EntitiesTable)/
1735 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001736 if (html40EntitiesTable[i].value >= value) {
1737 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001738 break;
William M. Brack78637da2003-07-31 14:47:38 +00001739 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001740 }
Owen Taylor3473f882001-02-23 17:55:21 +00001741 }
1742 return(NULL);
1743}
1744
1745/**
1746 * UTF8ToHtml:
1747 * @out: a pointer to an array of bytes to store the result
1748 * @outlen: the length of @out
1749 * @in: a pointer to an array of UTF-8 chars
1750 * @inlen: the length of @in
1751 *
1752 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1753 * plus HTML entities block of chars out.
1754 *
1755 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1756 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001757 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001758 * The value of @outlen after return is the number of octets consumed.
1759 */
1760int
1761UTF8ToHtml(unsigned char* out, int *outlen,
1762 const unsigned char* in, int *inlen) {
1763 const unsigned char* processed = in;
1764 const unsigned char* outend;
1765 const unsigned char* outstart = out;
1766 const unsigned char* instart = in;
1767 const unsigned char* inend;
1768 unsigned int c, d;
1769 int trailing;
1770
1771 if (in == NULL) {
1772 /*
1773 * initialization nothing to do
1774 */
1775 *outlen = 0;
1776 *inlen = 0;
1777 return(0);
1778 }
1779 inend = in + (*inlen);
1780 outend = out + (*outlen);
1781 while (in < inend) {
1782 d = *in++;
1783 if (d < 0x80) { c= d; trailing= 0; }
1784 else if (d < 0xC0) {
1785 /* trailing byte in leading position */
1786 *outlen = out - outstart;
1787 *inlen = processed - instart;
1788 return(-2);
1789 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1790 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1791 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1792 else {
1793 /* no chance for this in Ascii */
1794 *outlen = out - outstart;
1795 *inlen = processed - instart;
1796 return(-2);
1797 }
1798
1799 if (inend - in < trailing) {
1800 break;
1801 }
1802
1803 for ( ; trailing; trailing--) {
1804 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1805 break;
1806 c <<= 6;
1807 c |= d & 0x3F;
1808 }
1809
1810 /* assertion: c is a single UTF-4 value */
1811 if (c < 0x80) {
1812 if (out + 1 >= outend)
1813 break;
1814 *out++ = c;
1815 } else {
1816 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001817 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001818
1819 /*
1820 * Try to lookup a predefined HTML entity for it
1821 */
1822
1823 ent = htmlEntityValueLookup(c);
1824 if (ent == NULL) {
1825 /* no chance for this in Ascii */
1826 *outlen = out - outstart;
1827 *inlen = processed - instart;
1828 return(-2);
1829 }
1830 len = strlen(ent->name);
1831 if (out + 2 + len >= outend)
1832 break;
1833 *out++ = '&';
1834 memcpy(out, ent->name, len);
1835 out += len;
1836 *out++ = ';';
1837 }
1838 processed = in;
1839 }
1840 *outlen = out - outstart;
1841 *inlen = processed - instart;
1842 return(0);
1843}
1844
1845/**
1846 * htmlEncodeEntities:
1847 * @out: a pointer to an array of bytes to store the result
1848 * @outlen: the length of @out
1849 * @in: a pointer to an array of UTF-8 chars
1850 * @inlen: the length of @in
1851 * @quoteChar: the quote character to escape (' or ") or zero.
1852 *
1853 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1854 * plus HTML entities block of chars out.
1855 *
1856 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1857 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001858 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001859 * The value of @outlen after return is the number of octets consumed.
1860 */
1861int
1862htmlEncodeEntities(unsigned char* out, int *outlen,
1863 const unsigned char* in, int *inlen, int quoteChar) {
1864 const unsigned char* processed = in;
1865 const unsigned char* outend = out + (*outlen);
1866 const unsigned char* outstart = out;
1867 const unsigned char* instart = in;
1868 const unsigned char* inend = in + (*inlen);
1869 unsigned int c, d;
1870 int trailing;
1871
1872 while (in < inend) {
1873 d = *in++;
1874 if (d < 0x80) { c= d; trailing= 0; }
1875 else if (d < 0xC0) {
1876 /* trailing byte in leading position */
1877 *outlen = out - outstart;
1878 *inlen = processed - instart;
1879 return(-2);
1880 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1881 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1882 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1883 else {
1884 /* no chance for this in Ascii */
1885 *outlen = out - outstart;
1886 *inlen = processed - instart;
1887 return(-2);
1888 }
1889
1890 if (inend - in < trailing)
1891 break;
1892
1893 while (trailing--) {
1894 if (((d= *in++) & 0xC0) != 0x80) {
1895 *outlen = out - outstart;
1896 *inlen = processed - instart;
1897 return(-2);
1898 }
1899 c <<= 6;
1900 c |= d & 0x3F;
1901 }
1902
1903 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001904 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1905 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001906 if (out >= outend)
1907 break;
1908 *out++ = c;
1909 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001910 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001911 const char *cp;
1912 char nbuf[16];
1913 int len;
1914
1915 /*
1916 * Try to lookup a predefined HTML entity for it
1917 */
1918 ent = htmlEntityValueLookup(c);
1919 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001920 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001921 cp = nbuf;
1922 }
1923 else
1924 cp = ent->name;
1925 len = strlen(cp);
1926 if (out + 2 + len > outend)
1927 break;
1928 *out++ = '&';
1929 memcpy(out, cp, len);
1930 out += len;
1931 *out++ = ';';
1932 }
1933 processed = in;
1934 }
1935 *outlen = out - outstart;
1936 *inlen = processed - instart;
1937 return(0);
1938}
1939
Owen Taylor3473f882001-02-23 17:55:21 +00001940/************************************************************************
1941 * *
1942 * Commodity functions to handle streams *
1943 * *
1944 ************************************************************************/
1945
1946/**
Owen Taylor3473f882001-02-23 17:55:21 +00001947 * htmlNewInputStream:
1948 * @ctxt: an HTML parser context
1949 *
1950 * Create a new input stream structure
1951 * Returns the new input stream or NULL
1952 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001953static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001954htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1955 htmlParserInputPtr input;
1956
1957 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1958 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001959 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001960 return(NULL);
1961 }
1962 memset(input, 0, sizeof(htmlParserInput));
1963 input->filename = NULL;
1964 input->directory = NULL;
1965 input->base = NULL;
1966 input->cur = NULL;
1967 input->buf = NULL;
1968 input->line = 1;
1969 input->col = 1;
1970 input->buf = NULL;
1971 input->free = NULL;
1972 input->version = NULL;
1973 input->consumed = 0;
1974 input->length = 0;
1975 return(input);
1976}
1977
1978
1979/************************************************************************
1980 * *
1981 * Commodity functions, cleanup needed ? *
1982 * *
1983 ************************************************************************/
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD,
 * grouped below by first letter.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001998
1999/**
2000 * areBlanks:
2001 * @ctxt: an HTML parser context
2002 * @str: a xmlChar *
2003 * @len: the size of @str
2004 *
2005 * Is this a sequence of blank chars that one can ignore ?
2006 *
2007 * Returns 1 if ignorable 0 otherwise.
2008 */
2009
2010static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002011 unsigned int i;
2012 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002013 xmlNodePtr lastChild;
2014
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002015 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002016 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002017
2018 if (CUR == 0) return(1);
2019 if (CUR != '<') return(0);
2020 if (ctxt->name == NULL)
2021 return(1);
2022 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2023 return(1);
2024 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2025 return(1);
2026 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2027 return(1);
2028 if (ctxt->node == NULL) return(0);
2029 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002030 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2031 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002032 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002033 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2034 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002035 /* keep ws in constructs like ...<b> </b>...
2036 for all tags "b" allowing PCDATA */
2037 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2038 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2039 return(0);
2040 }
2041 }
Owen Taylor3473f882001-02-23 17:55:21 +00002042 } else if (xmlNodeIsText(lastChild)) {
2043 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002044 } else {
2045 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2046 for all tags "p" allowing PCDATA */
2047 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2048 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2049 return(0);
2050 }
2051 }
Owen Taylor3473f882001-02-23 17:55:21 +00002052 }
2053 return(1);
2054}
2055
2056/**
Owen Taylor3473f882001-02-23 17:55:21 +00002057 * htmlNewDocNoDtD:
2058 * @URI: URI for the dtd, or NULL
2059 * @ExternalID: the external ID of the DTD, or NULL
2060 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002061 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2062 * are NULL
2063 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002064 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002065 */
2066htmlDocPtr
2067htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2068 xmlDocPtr cur;
2069
2070 /*
2071 * Allocate a new document and fill the fields.
2072 */
2073 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2074 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002075 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002076 return(NULL);
2077 }
2078 memset(cur, 0, sizeof(xmlDoc));
2079
2080 cur->type = XML_HTML_DOCUMENT_NODE;
2081 cur->version = NULL;
2082 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002083 cur->doc = cur;
2084 cur->name = NULL;
2085 cur->children = NULL;
2086 cur->extSubset = NULL;
2087 cur->oldNs = NULL;
2088 cur->encoding = NULL;
2089 cur->standalone = 1;
2090 cur->compression = 0;
2091 cur->ids = NULL;
2092 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002093 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002094 if ((ExternalID != NULL) ||
2095 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002096 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002097 return(cur);
2098}
2099
2100/**
2101 * htmlNewDoc:
2102 * @URI: URI for the dtd, or NULL
2103 * @ExternalID: the external ID of the DTD, or NULL
2104 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002105 * Creates a new HTML document
2106 *
Owen Taylor3473f882001-02-23 17:55:21 +00002107 * Returns a new document
2108 */
2109htmlDocPtr
2110htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2111 if ((URI == NULL) && (ExternalID == NULL))
2112 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002113 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2114 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002115
2116 return(htmlNewDocNoDtD(URI, ExternalID));
2117}
2118
2119
2120/************************************************************************
2121 * *
2122 * The parser itself *
2123 * Relates to http://www.w3.org/TR/html40 *
2124 * *
2125 ************************************************************************/
2126
2127/************************************************************************
2128 * *
2129 * The parser itself *
2130 * *
2131 ************************************************************************/
2132
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002133static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002134
Owen Taylor3473f882001-02-23 17:55:21 +00002135/**
2136 * htmlParseHTMLName:
2137 * @ctxt: an HTML parser context
2138 *
2139 * parse an HTML tag or attribute name, note that we convert it to lowercase
2140 * since HTML names are not case-sensitive.
2141 *
2142 * Returns the Tag Name parsed or NULL
2143 */
2144
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002145static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002146htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002147 int i = 0;
2148 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2149
William M. Brack76e95df2003-10-18 16:20:14 +00002150 if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002151 (CUR != ':')) return(NULL);
2152
2153 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brack76e95df2003-10-18 16:20:14 +00002154 ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002155 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2156 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2157 else loc[i] = CUR;
2158 i++;
2159
2160 NEXT;
2161 }
2162
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002163 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002164}
2165
2166/**
2167 * htmlParseName:
2168 * @ctxt: an HTML parser context
2169 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002170 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002171 *
2172 * Returns the Name parsed or NULL
2173 */
2174
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002175static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002176htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002177 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002178 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002179 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002180
2181 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002182
2183 /*
2184 * Accelerator for simple ASCII names
2185 */
2186 in = ctxt->input->cur;
2187 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2188 ((*in >= 0x41) && (*in <= 0x5A)) ||
2189 (*in == '_') || (*in == ':')) {
2190 in++;
2191 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2192 ((*in >= 0x41) && (*in <= 0x5A)) ||
2193 ((*in >= 0x30) && (*in <= 0x39)) ||
2194 (*in == '_') || (*in == '-') ||
2195 (*in == ':') || (*in == '.'))
2196 in++;
2197 if ((*in > 0) && (*in < 0x80)) {
2198 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002199 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002200 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002201 ctxt->nbChars += count;
2202 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002203 return(ret);
2204 }
2205 }
2206 return(htmlParseNameComplex(ctxt));
2207}
2208
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002209static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002210htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002211 int len = 0, l;
2212 int c;
2213 int count = 0;
2214
2215 /*
2216 * Handler for more complex cases
2217 */
2218 GROW;
2219 c = CUR_CHAR(l);
2220 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2221 (!IS_LETTER(c) && (c != '_') &&
2222 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002223 return(NULL);
2224 }
2225
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002226 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2227 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2228 (c == '.') || (c == '-') ||
2229 (c == '_') || (c == ':') ||
2230 (IS_COMBINING(c)) ||
2231 (IS_EXTENDER(c)))) {
2232 if (count++ > 100) {
2233 count = 0;
2234 GROW;
2235 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002236 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002237 NEXTL(l);
2238 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002239 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002240 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002241}
2242
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002243
Owen Taylor3473f882001-02-23 17:55:21 +00002244/**
2245 * htmlParseHTMLAttribute:
2246 * @ctxt: an HTML parser context
2247 * @stop: a char stop value
2248 *
2249 * parse an HTML attribute value till the stop (quote), if
2250 * stop is 0 then it stops at the first space
2251 *
2252 * Returns the attribute parsed or NULL
2253 */
2254
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002255static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002256htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2257 xmlChar *buffer = NULL;
2258 int buffer_size = 0;
2259 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002260 const xmlChar *name = NULL;
2261 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002262 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002263
2264 /*
2265 * allocate a translation buffer.
2266 */
2267 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002268 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002269 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002270 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002271 return(NULL);
2272 }
2273 out = buffer;
2274
2275 /*
2276 * Ok loop until we reach one of the ending chars
2277 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002278 while ((CUR != 0) && (CUR != stop)) {
2279 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002280 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002281 if (CUR == '&') {
2282 if (NXT(1) == '#') {
2283 unsigned int c;
2284 int bits;
2285
2286 c = htmlParseCharRef(ctxt);
2287 if (c < 0x80)
2288 { *out++ = c; bits= -6; }
2289 else if (c < 0x800)
2290 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2291 else if (c < 0x10000)
2292 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2293 else
2294 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2295
2296 for ( ; bits >= 0; bits-= 6) {
2297 *out++ = ((c >> bits) & 0x3F) | 0x80;
2298 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002299
2300 if (out - buffer > buffer_size - 100) {
2301 int indx = out - buffer;
2302
2303 growBuffer(buffer);
2304 out = &buffer[indx];
2305 }
Owen Taylor3473f882001-02-23 17:55:21 +00002306 } else {
2307 ent = htmlParseEntityRef(ctxt, &name);
2308 if (name == NULL) {
2309 *out++ = '&';
2310 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002311 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002312
2313 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002314 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002315 }
2316 } else if (ent == NULL) {
2317 *out++ = '&';
2318 cur = name;
2319 while (*cur != 0) {
2320 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002321 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002322
2323 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002324 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002325 }
2326 *out++ = *cur++;
2327 }
Owen Taylor3473f882001-02-23 17:55:21 +00002328 } else {
2329 unsigned int c;
2330 int bits;
2331
2332 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002333 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002334
2335 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002336 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002337 }
2338 c = (xmlChar)ent->value;
2339 if (c < 0x80)
2340 { *out++ = c; bits= -6; }
2341 else if (c < 0x800)
2342 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2343 else if (c < 0x10000)
2344 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2345 else
2346 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2347
2348 for ( ; bits >= 0; bits-= 6) {
2349 *out++ = ((c >> bits) & 0x3F) | 0x80;
2350 }
Owen Taylor3473f882001-02-23 17:55:21 +00002351 }
2352 }
2353 } else {
2354 unsigned int c;
2355 int bits, l;
2356
2357 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002358 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002359
2360 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002361 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002362 }
2363 c = CUR_CHAR(l);
2364 if (c < 0x80)
2365 { *out++ = c; bits= -6; }
2366 else if (c < 0x800)
2367 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2368 else if (c < 0x10000)
2369 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2370 else
2371 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2372
2373 for ( ; bits >= 0; bits-= 6) {
2374 *out++ = ((c >> bits) & 0x3F) | 0x80;
2375 }
2376 NEXT;
2377 }
2378 }
2379 *out++ = 0;
2380 return(buffer);
2381}
2382
2383/**
Owen Taylor3473f882001-02-23 17:55:21 +00002384 * htmlParseEntityRef:
2385 * @ctxt: an HTML parser context
2386 * @str: location to store the entity name
2387 *
2388 * parse an HTML ENTITY references
2389 *
2390 * [68] EntityRef ::= '&' Name ';'
2391 *
2392 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2393 * if non-NULL *str will have to be freed by the caller.
2394 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002395const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002396htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2397 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002398 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002399 *str = NULL;
2400
2401 if (CUR == '&') {
2402 NEXT;
2403 name = htmlParseName(ctxt);
2404 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002405 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2406 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002407 } else {
2408 GROW;
2409 if (CUR == ';') {
2410 *str = name;
2411
2412 /*
2413 * Lookup the entity in the table.
2414 */
2415 ent = htmlEntityLookup(name);
2416 if (ent != NULL) /* OK that's ugly !!! */
2417 NEXT;
2418 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002419 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2420 "htmlParseEntityRef: expecting ';'\n",
2421 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002422 *str = name;
2423 }
2424 }
2425 }
2426 return(ent);
2427}
2428
2429/**
2430 * htmlParseAttValue:
2431 * @ctxt: an HTML parser context
2432 *
2433 * parse a value for an attribute
2434 * Note: the parser won't do substitution of entities here, this
2435 * will be handled later in xmlStringGetNodeList, unless it was
2436 * asked for ctxt->replaceEntities != 0
2437 *
2438 * Returns the AttValue parsed or NULL.
2439 */
2440
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002441static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002442htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2443 xmlChar *ret = NULL;
2444
2445 if (CUR == '"') {
2446 NEXT;
2447 ret = htmlParseHTMLAttribute(ctxt, '"');
2448 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002449 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2450 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002451 } else
2452 NEXT;
2453 } else if (CUR == '\'') {
2454 NEXT;
2455 ret = htmlParseHTMLAttribute(ctxt, '\'');
2456 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002457 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2458 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002459 } else
2460 NEXT;
2461 } else {
2462 /*
2463 * That's an HTMLism, the attribute value may not be quoted
2464 */
2465 ret = htmlParseHTMLAttribute(ctxt, 0);
2466 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002467 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2468 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002469 }
2470 }
2471 return(ret);
2472}
2473
2474/**
2475 * htmlParseSystemLiteral:
2476 * @ctxt: an HTML parser context
2477 *
2478 * parse an HTML Literal
2479 *
2480 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2481 *
2482 * Returns the SystemLiteral parsed or NULL
2483 */
2484
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002485static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002486htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2487 const xmlChar *q;
2488 xmlChar *ret = NULL;
2489
2490 if (CUR == '"') {
2491 NEXT;
2492 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002493 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002494 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002495 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002496 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2497 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002498 } else {
2499 ret = xmlStrndup(q, CUR_PTR - q);
2500 NEXT;
2501 }
2502 } else if (CUR == '\'') {
2503 NEXT;
2504 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002505 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002506 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002507 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002508 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2509 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002510 } else {
2511 ret = xmlStrndup(q, CUR_PTR - q);
2512 NEXT;
2513 }
2514 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002515 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2516 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002517 }
2518
2519 return(ret);
2520}
2521
2522/**
2523 * htmlParsePubidLiteral:
2524 * @ctxt: an HTML parser context
2525 *
2526 * parse an HTML public literal
2527 *
2528 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2529 *
2530 * Returns the PubidLiteral parsed or NULL.
2531 */
2532
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002533static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002534htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2535 const xmlChar *q;
2536 xmlChar *ret = NULL;
2537 /*
2538 * Name ::= (Letter | '_') (NameChar)*
2539 */
2540 if (CUR == '"') {
2541 NEXT;
2542 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002543 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002544 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002545 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2546 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002547 } else {
2548 ret = xmlStrndup(q, CUR_PTR - q);
2549 NEXT;
2550 }
2551 } else if (CUR == '\'') {
2552 NEXT;
2553 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002554 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002555 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002556 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002557 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2558 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002559 } else {
2560 ret = xmlStrndup(q, CUR_PTR - q);
2561 NEXT;
2562 }
2563 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002564 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2565 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002566 }
2567
2568 return(ret);
2569}
2570
2571/**
2572 * htmlParseScript:
2573 * @ctxt: an HTML parser context
2574 *
2575 * parse the content of an HTML SCRIPT or STYLE element
2576 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2577 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2578 * http://www.w3.org/TR/html4/types.html#type-script
2579 * http://www.w3.org/TR/html4/types.html#h-6.15
2580 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2581 *
2582 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2583 * element and the value of intrinsic event attributes. User agents must
2584 * not evaluate script data as HTML markup but instead must pass it on as
2585 * data to a script engine.
2586 * NOTES:
2587 * - The content is passed like CDATA
2588 * - the attributes for style and scripting "onXXX" are also described
2589 * as CDATA but SGML allows entities references in attributes so their
2590 * processing is identical as other attributes
2591 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002592static void
Owen Taylor3473f882001-02-23 17:55:21 +00002593htmlParseScript(htmlParserCtxtPtr ctxt) {
2594 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2595 int nbchar = 0;
2596 xmlChar cur;
2597
2598 SHRINK;
2599 cur = CUR;
William M. Brack76e95df2003-10-18 16:20:14 +00002600 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002601 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2602 (NXT(3) == '-')) {
2603 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2604 if (ctxt->sax->cdataBlock!= NULL) {
2605 /*
2606 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2607 */
2608 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002609 } else if (ctxt->sax->characters != NULL) {
2610 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002611 }
2612 }
2613 nbchar = 0;
2614 htmlParseComment(ctxt);
2615 cur = CUR;
2616 continue;
2617 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002618 /*
2619 * One should break here, the specification is clear:
2620 * Authors should therefore escape "</" within the content.
2621 * Escape mechanisms are specific to each scripting or
2622 * style sheet language.
2623 */
2624 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2625 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2626 break; /* while */
2627 }
2628 buf[nbchar++] = cur;
2629 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2630 if (ctxt->sax->cdataBlock!= NULL) {
2631 /*
2632 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2633 */
2634 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002635 } else if (ctxt->sax->characters != NULL) {
2636 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002637 }
2638 nbchar = 0;
2639 }
2640 NEXT;
2641 cur = CUR;
2642 }
William M. Brack76e95df2003-10-18 16:20:14 +00002643 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002644 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2645 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002646 NEXT;
2647 }
2648
2649 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2650 if (ctxt->sax->cdataBlock!= NULL) {
2651 /*
2652 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2653 */
2654 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002655 } else if (ctxt->sax->characters != NULL) {
2656 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002657 }
2658 }
2659}
2660
2661
2662/**
2663 * htmlParseCharData:
2664 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002665 *
2666 * parse a CharData section.
2667 * if we are within a CDATA section ']]>' marks an end of section.
2668 *
2669 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2670 */
2671
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002672static void
2673htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002674 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2675 int nbchar = 0;
2676 int cur, l;
2677
2678 SHRINK;
2679 cur = CUR_CHAR(l);
2680 while (((cur != '<') || (ctxt->token == '<')) &&
2681 ((cur != '&') || (ctxt->token == '&')) &&
2682 (IS_CHAR(cur))) {
2683 COPY_BUF(l,buf,nbchar,cur);
2684 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2685 /*
2686 * Ok the segment is to be consumed as chars.
2687 */
2688 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2689 if (areBlanks(ctxt, buf, nbchar)) {
2690 if (ctxt->sax->ignorableWhitespace != NULL)
2691 ctxt->sax->ignorableWhitespace(ctxt->userData,
2692 buf, nbchar);
2693 } else {
2694 htmlCheckParagraph(ctxt);
2695 if (ctxt->sax->characters != NULL)
2696 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2697 }
2698 }
2699 nbchar = 0;
2700 }
2701 NEXTL(l);
2702 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002703 if (cur == 0) {
2704 SHRINK;
2705 GROW;
2706 cur = CUR_CHAR(l);
2707 }
Owen Taylor3473f882001-02-23 17:55:21 +00002708 }
2709 if (nbchar != 0) {
2710 /*
2711 * Ok the segment is to be consumed as chars.
2712 */
2713 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2714 if (areBlanks(ctxt, buf, nbchar)) {
2715 if (ctxt->sax->ignorableWhitespace != NULL)
2716 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2717 } else {
2718 htmlCheckParagraph(ctxt);
2719 if (ctxt->sax->characters != NULL)
2720 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2721 }
2722 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002723 } else {
2724 /*
2725 * Loop detection
2726 */
2727 if (cur == 0)
2728 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002729 }
2730}
2731
2732/**
2733 * htmlParseExternalID:
2734 * @ctxt: an HTML parser context
2735 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002736 *
2737 * Parse an External ID or a Public ID
2738 *
Owen Taylor3473f882001-02-23 17:55:21 +00002739 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2740 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2741 *
2742 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2743 *
2744 * Returns the function returns SystemLiteral and in the second
2745 * case publicID receives PubidLiteral, is strict is off
2746 * it is possible to return NULL and have publicID set.
2747 */
2748
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002749static xmlChar *
2750htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002751 xmlChar *URI = NULL;
2752
2753 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2754 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2755 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2756 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002757 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002758 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2759 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002760 }
2761 SKIP_BLANKS;
2762 URI = htmlParseSystemLiteral(ctxt);
2763 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002764 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2765 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002766 }
2767 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2768 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2769 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2770 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002771 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002772 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2773 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002774 }
2775 SKIP_BLANKS;
2776 *publicID = htmlParsePubidLiteral(ctxt);
2777 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002778 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2779 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2780 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002781 }
2782 SKIP_BLANKS;
2783 if ((CUR == '"') || (CUR == '\'')) {
2784 URI = htmlParseSystemLiteral(ctxt);
2785 }
2786 }
2787 return(URI);
2788}
2789
2790/**
2791 * htmlParseComment:
2792 * @ctxt: an HTML parser context
2793 *
2794 * Parse an XML (SGML) comment <!-- .... -->
2795 *
2796 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2797 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002798static void
Owen Taylor3473f882001-02-23 17:55:21 +00002799htmlParseComment(htmlParserCtxtPtr ctxt) {
2800 xmlChar *buf = NULL;
2801 int len;
2802 int size = HTML_PARSER_BUFFER_SIZE;
2803 int q, ql;
2804 int r, rl;
2805 int cur, l;
2806 xmlParserInputState state;
2807
2808 /*
2809 * Check that there is a comment right here.
2810 */
2811 if ((RAW != '<') || (NXT(1) != '!') ||
2812 (NXT(2) != '-') || (NXT(3) != '-')) return;
2813
2814 state = ctxt->instate;
2815 ctxt->instate = XML_PARSER_COMMENT;
2816 SHRINK;
2817 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002818 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002819 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002820 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002821 ctxt->instate = state;
2822 return;
2823 }
2824 q = CUR_CHAR(ql);
2825 NEXTL(ql);
2826 r = CUR_CHAR(rl);
2827 NEXTL(rl);
2828 cur = CUR_CHAR(l);
2829 len = 0;
2830 while (IS_CHAR(cur) &&
2831 ((cur != '>') ||
2832 (r != '-') || (q != '-'))) {
2833 if (len + 5 >= size) {
2834 size *= 2;
2835 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2836 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002837 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002838 ctxt->instate = state;
2839 return;
2840 }
2841 }
2842 COPY_BUF(ql,buf,len,q);
2843 q = r;
2844 ql = rl;
2845 r = cur;
2846 rl = l;
2847 NEXTL(l);
2848 cur = CUR_CHAR(l);
2849 if (cur == 0) {
2850 SHRINK;
2851 GROW;
2852 cur = CUR_CHAR(l);
2853 }
2854 }
2855 buf[len] = 0;
2856 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002857 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
2858 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002859 xmlFree(buf);
2860 } else {
2861 NEXT;
2862 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2863 (!ctxt->disableSAX))
2864 ctxt->sax->comment(ctxt->userData, buf);
2865 xmlFree(buf);
2866 }
2867 ctxt->instate = state;
2868}
2869
2870/**
2871 * htmlParseCharRef:
2872 * @ctxt: an HTML parser context
2873 *
2874 * parse Reference declarations
2875 *
2876 * [66] CharRef ::= '&#' [0-9]+ ';' |
2877 * '&#x' [0-9a-fA-F]+ ';'
2878 *
2879 * Returns the value parsed (as an int)
2880 */
2881int
2882htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2883 int val = 0;
2884
2885 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00002886 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002887 SKIP(3);
2888 while (CUR != ';') {
2889 if ((CUR >= '0') && (CUR <= '9'))
2890 val = val * 16 + (CUR - '0');
2891 else if ((CUR >= 'a') && (CUR <= 'f'))
2892 val = val * 16 + (CUR - 'a') + 10;
2893 else if ((CUR >= 'A') && (CUR <= 'F'))
2894 val = val * 16 + (CUR - 'A') + 10;
2895 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002896 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
2897 "htmlParseCharRef: invalid hexadecimal value\n",
2898 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002899 return(0);
2900 }
2901 NEXT;
2902 }
2903 if (CUR == ';')
2904 NEXT;
2905 } else if ((CUR == '&') && (NXT(1) == '#')) {
2906 SKIP(2);
2907 while (CUR != ';') {
2908 if ((CUR >= '0') && (CUR <= '9'))
2909 val = val * 10 + (CUR - '0');
2910 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002911 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
2912 "htmlParseCharRef: invalid decimal value\n",
2913 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002914 return(0);
2915 }
2916 NEXT;
2917 }
2918 if (CUR == ';')
2919 NEXT;
2920 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002921 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
2922 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002923 }
2924 /*
2925 * Check the value IS_CHAR ...
2926 */
2927 if (IS_CHAR(val)) {
2928 return(val);
2929 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002930 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2931 "htmlParseCharRef: invalid xmlChar value %d\n",
2932 val);
Owen Taylor3473f882001-02-23 17:55:21 +00002933 }
2934 return(0);
2935}
2936
2937
2938/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002939 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002940 * @ctxt: an HTML parser context
2941 *
2942 * parse a DOCTYPE declaration
2943 *
2944 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2945 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2946 */
2947
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002948static void
Owen Taylor3473f882001-02-23 17:55:21 +00002949htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002950 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00002951 xmlChar *ExternalID = NULL;
2952 xmlChar *URI = NULL;
2953
2954 /*
2955 * We know that '<!DOCTYPE' has been detected.
2956 */
2957 SKIP(9);
2958
2959 SKIP_BLANKS;
2960
2961 /*
2962 * Parse the DOCTYPE name.
2963 */
2964 name = htmlParseName(ctxt);
2965 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002966 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2967 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
2968 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002969 }
2970 /*
2971 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2972 */
2973
2974 SKIP_BLANKS;
2975
2976 /*
2977 * Check for SystemID and ExternalID
2978 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002979 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002980 SKIP_BLANKS;
2981
2982 /*
2983 * We should be at the end of the DOCTYPE declaration.
2984 */
2985 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002986 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
2987 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002988 /* We shouldn't try to resynchronize ... */
2989 }
2990 NEXT;
2991
2992 /*
2993 * Create or update the document accordingly to the DOCTYPE
2994 */
2995 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2996 (!ctxt->disableSAX))
2997 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2998
2999 /*
3000 * Cleanup, since we don't use all those identifiers
3001 */
3002 if (URI != NULL) xmlFree(URI);
3003 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003004}
3005
3006/**
3007 * htmlParseAttribute:
3008 * @ctxt: an HTML parser context
3009 * @value: a xmlChar ** used to store the value of the attribute
3010 *
3011 * parse an attribute
3012 *
3013 * [41] Attribute ::= Name Eq AttValue
3014 *
3015 * [25] Eq ::= S? '=' S?
3016 *
3017 * With namespace:
3018 *
3019 * [NS 11] Attribute ::= QName Eq AttValue
3020 *
3021 * Also the case QName == xmlns:??? is handled independently as a namespace
3022 * definition.
3023 *
3024 * Returns the attribute name, and the value in *value.
3025 */
3026
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003027static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003028htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003029 const xmlChar *name;
3030 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003031
3032 *value = NULL;
3033 name = htmlParseHTMLName(ctxt);
3034 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003035 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3036 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003037 return(NULL);
3038 }
3039
3040 /*
3041 * read the value
3042 */
3043 SKIP_BLANKS;
3044 if (CUR == '=') {
3045 NEXT;
3046 SKIP_BLANKS;
3047 val = htmlParseAttValue(ctxt);
3048 /******
3049 } else {
3050 * TODO : some attribute must have values, some may not
3051 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3052 ctxt->sax->warning(ctxt->userData,
3053 "No value for attribute %s\n", name); */
3054 }
3055
3056 *value = val;
3057 return(name);
3058}
3059
3060/**
3061 * htmlCheckEncoding:
3062 * @ctxt: an HTML parser context
3063 * @attvalue: the attribute value
3064 *
3065 * Checks an http-equiv attribute from a Meta tag to detect
3066 * the encoding
3067 * If a new encoding is detected the parser is switched to decode
3068 * it and pass UTF8
3069 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003070static void
Owen Taylor3473f882001-02-23 17:55:21 +00003071htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3072 const xmlChar *encoding;
3073
3074 if ((ctxt == NULL) || (attvalue == NULL))
3075 return;
3076
3077 /* do not change encoding */
3078 if (ctxt->input->encoding != NULL)
3079 return;
3080
3081 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3082 if (encoding != NULL) {
3083 encoding += 8;
3084 } else {
3085 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3086 if (encoding != NULL)
3087 encoding += 9;
3088 }
3089 if (encoding != NULL) {
3090 xmlCharEncoding enc;
3091 xmlCharEncodingHandlerPtr handler;
3092
3093 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3094
3095 if (ctxt->input->encoding != NULL)
3096 xmlFree((xmlChar *) ctxt->input->encoding);
3097 ctxt->input->encoding = xmlStrdup(encoding);
3098
3099 enc = xmlParseCharEncoding((const char *) encoding);
3100 /*
3101 * registered set of known encodings
3102 */
3103 if (enc != XML_CHAR_ENCODING_ERROR) {
3104 xmlSwitchEncoding(ctxt, enc);
3105 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3106 } else {
3107 /*
3108 * fallback for unknown encodings
3109 */
3110 handler = xmlFindCharEncodingHandler((const char *) encoding);
3111 if (handler != NULL) {
3112 xmlSwitchToEncoding(ctxt, handler);
3113 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3114 } else {
3115 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3116 }
3117 }
3118
3119 if ((ctxt->input->buf != NULL) &&
3120 (ctxt->input->buf->encoder != NULL) &&
3121 (ctxt->input->buf->raw != NULL) &&
3122 (ctxt->input->buf->buffer != NULL)) {
3123 int nbchars;
3124 int processed;
3125
3126 /*
3127 * convert as much as possible to the parser reading buffer.
3128 */
3129 processed = ctxt->input->cur - ctxt->input->base;
3130 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3131 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3132 ctxt->input->buf->buffer,
3133 ctxt->input->buf->raw);
3134 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003135 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3136 "htmlCheckEncoding: encoder error\n",
3137 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003138 }
3139 ctxt->input->base =
3140 ctxt->input->cur = ctxt->input->buf->buffer->content;
3141 }
3142 }
3143}
3144
3145/**
3146 * htmlCheckMeta:
3147 * @ctxt: an HTML parser context
3148 * @atts: the attributes values
3149 *
3150 * Checks an attributes from a Meta tag
3151 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003152static void
Owen Taylor3473f882001-02-23 17:55:21 +00003153htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3154 int i;
3155 const xmlChar *att, *value;
3156 int http = 0;
3157 const xmlChar *content = NULL;
3158
3159 if ((ctxt == NULL) || (atts == NULL))
3160 return;
3161
3162 i = 0;
3163 att = atts[i++];
3164 while (att != NULL) {
3165 value = atts[i++];
3166 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3167 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3168 http = 1;
3169 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3170 content = value;
3171 att = atts[i++];
3172 }
3173 if ((http) && (content != NULL))
3174 htmlCheckEncoding(ctxt, content);
3175
3176}
3177
3178/**
3179 * htmlParseStartTag:
3180 * @ctxt: an HTML parser context
3181 *
3182 * parse a start of tag either for rule element or
3183 * EmptyElement. In both case we don't parse the tag closing chars.
3184 *
3185 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3186 *
3187 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3188 *
3189 * With namespace:
3190 *
3191 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3192 *
3193 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3194 *
3195 */
3196
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003197static void
Owen Taylor3473f882001-02-23 17:55:21 +00003198htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003199 const xmlChar *name;
3200 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003201 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003202 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003203 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003204 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003205 int meta = 0;
3206 int i;
3207
3208 if (CUR != '<') return;
3209 NEXT;
3210
3211 GROW;
3212 name = htmlParseHTMLName(ctxt);
3213 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003214 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3215 "htmlParseStartTag: invalid element name\n",
3216 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003217 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003218 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003219 NEXT;
3220 return;
3221 }
3222 if (xmlStrEqual(name, BAD_CAST"meta"))
3223 meta = 1;
3224
3225 /*
3226 * Check for auto-closure of HTML elements.
3227 */
3228 htmlAutoClose(ctxt, name);
3229
3230 /*
3231 * Check for implied HTML elements.
3232 */
3233 htmlCheckImplied(ctxt, name);
3234
3235 /*
3236 * Avoid html at any level > 0, head at any level != 1
3237 * or any attempt to recurse body
3238 */
3239 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003240 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3241 "htmlParseStartTag: misplaced <html> tag\n",
3242 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003243 return;
3244 }
3245 if ((ctxt->nameNr != 1) &&
3246 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003247 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3248 "htmlParseStartTag: misplaced <head> tag\n",
3249 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003250 return;
3251 }
3252 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003253 int indx;
3254 for (indx = 0;indx < ctxt->nameNr;indx++) {
3255 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003256 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3257 "htmlParseStartTag: misplaced <body> tag\n",
3258 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003259 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3260 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003261 return;
3262 }
3263 }
3264 }
3265
3266 /*
3267 * Now parse the attributes, it ends up with the ending
3268 *
3269 * (S Attribute)* S?
3270 */
3271 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003272 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003273 (CUR != '>') &&
3274 ((CUR != '/') || (NXT(1) != '>'))) {
3275 long cons = ctxt->nbChars;
3276
3277 GROW;
3278 attname = htmlParseAttribute(ctxt, &attvalue);
3279 if (attname != NULL) {
3280
3281 /*
3282 * Well formedness requires at most one declaration of an attribute
3283 */
3284 for (i = 0; i < nbatts;i += 2) {
3285 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003286 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3287 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003288 if (attvalue != NULL)
3289 xmlFree(attvalue);
3290 goto failed;
3291 }
3292 }
3293
3294 /*
3295 * Add the pair to atts
3296 */
3297 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003298 maxatts = 22; /* allow for 10 attrs by default */
3299 atts = (const xmlChar **)
3300 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003301 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003302 htmlErrMemory(ctxt, NULL);
3303 if (attvalue != NULL)
3304 xmlFree(attvalue);
3305 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003306 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003307 ctxt->atts = atts;
3308 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003309 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003310 const xmlChar **n;
3311
Owen Taylor3473f882001-02-23 17:55:21 +00003312 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003313 n = (const xmlChar **) xmlRealloc((void *) atts,
3314 maxatts * sizeof(const xmlChar *));
3315 if (n == NULL) {
3316 htmlErrMemory(ctxt, NULL);
3317 if (attvalue != NULL)
3318 xmlFree(attvalue);
3319 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003320 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003321 atts = n;
3322 ctxt->atts = atts;
3323 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003324 }
3325 atts[nbatts++] = attname;
3326 atts[nbatts++] = attvalue;
3327 atts[nbatts] = NULL;
3328 atts[nbatts + 1] = NULL;
3329 }
3330 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003331 if (attvalue != NULL)
3332 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003333 /* Dump the bogus attribute string up to the next blank or
3334 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003335 while ((IS_CHAR_CH(CUR)) &&
3336 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003337 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003338 NEXT;
3339 }
3340
3341failed:
3342 SKIP_BLANKS;
3343 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003344 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3345 "htmlParseStartTag: problem parsing attributes\n",
3346 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003347 break;
3348 }
3349 }
3350
3351 /*
3352 * Handle specific association to the META tag
3353 */
3354 if (meta)
3355 htmlCheckMeta(ctxt, atts);
3356
3357 /*
3358 * SAX: Start of Element !
3359 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003360 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003361 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3362 if (nbatts != 0)
3363 ctxt->sax->startElement(ctxt->userData, name, atts);
3364 else
3365 ctxt->sax->startElement(ctxt->userData, name, NULL);
3366 }
Owen Taylor3473f882001-02-23 17:55:21 +00003367
3368 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003369 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003370 if (atts[i] != NULL)
3371 xmlFree((xmlChar *) atts[i]);
3372 }
Owen Taylor3473f882001-02-23 17:55:21 +00003373 }
Owen Taylor3473f882001-02-23 17:55:21 +00003374}
3375
3376/**
3377 * htmlParseEndTag:
3378 * @ctxt: an HTML parser context
3379 *
3380 * parse an end of tag
3381 *
3382 * [42] ETag ::= '</' Name S? '>'
3383 *
3384 * With namespace
3385 *
3386 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003387 *
3388 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003389 */
3390
Daniel Veillardf420ac52001-07-04 16:04:09 +00003391static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003392htmlParseEndTag(htmlParserCtxtPtr ctxt)
3393{
3394 const xmlChar *name;
3395 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003396 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003397
3398 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003399 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3400 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003401 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003402 }
3403 SKIP(2);
3404
3405 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003406 if (name == NULL)
3407 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003408
3409 /*
3410 * We should definitely be at the ending "S? '>'" part
3411 */
3412 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003413 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003414 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3415 "End tag : expected '>'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003416 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003417 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003418
3419 /*
3420 * If the name read is not one of the element in the parsing stack
3421 * then return, it's just an error.
3422 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003423 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3424 if (xmlStrEqual(name, ctxt->nameTab[i]))
3425 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003426 }
3427 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003428 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3429 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003430 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003431 }
3432
3433
3434 /*
3435 * Check for auto-closure of HTML elements.
3436 */
3437
3438 htmlAutoCloseOnClose(ctxt, name);
3439
3440 /*
3441 * Well formedness constraints, opening and closing must match.
3442 * With the exception that the autoclose may have popped stuff out
3443 * of the stack.
3444 */
3445 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003446 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003447 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3448 "Opening and ending tag mismatch: %s and %s\n",
3449 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003450 }
3451 }
3452
3453 /*
3454 * SAX: End of Tag
3455 */
3456 oldname = ctxt->name;
3457 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003458 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3459 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003460 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003461 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003462 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003463 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003464 }
3465
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003466 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003467}
3468
3469
3470/**
3471 * htmlParseReference:
3472 * @ctxt: an HTML parser context
3473 *
3474 * parse and handle entity references in content,
3475 * this will end-up in a call to character() since this is either a
3476 * CharRef, or a predefined entity.
3477 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003478static void
Owen Taylor3473f882001-02-23 17:55:21 +00003479htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003480 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003481 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003482 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003483 if (CUR != '&') return;
3484
3485 if (NXT(1) == '#') {
3486 unsigned int c;
3487 int bits, i = 0;
3488
3489 c = htmlParseCharRef(ctxt);
3490 if (c == 0)
3491 return;
3492
3493 if (c < 0x80) { out[i++]= c; bits= -6; }
3494 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3495 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3496 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3497
3498 for ( ; bits >= 0; bits-= 6) {
3499 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3500 }
3501 out[i] = 0;
3502
3503 htmlCheckParagraph(ctxt);
3504 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3505 ctxt->sax->characters(ctxt->userData, out, i);
3506 } else {
3507 ent = htmlParseEntityRef(ctxt, &name);
3508 if (name == NULL) {
3509 htmlCheckParagraph(ctxt);
3510 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3511 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3512 return;
3513 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003514 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003515 htmlCheckParagraph(ctxt);
3516 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3517 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3518 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3519 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3520 }
3521 } else {
3522 unsigned int c;
3523 int bits, i = 0;
3524
3525 c = ent->value;
3526 if (c < 0x80)
3527 { out[i++]= c; bits= -6; }
3528 else if (c < 0x800)
3529 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3530 else if (c < 0x10000)
3531 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3532 else
3533 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3534
3535 for ( ; bits >= 0; bits-= 6) {
3536 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3537 }
3538 out[i] = 0;
3539
3540 htmlCheckParagraph(ctxt);
3541 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3542 ctxt->sax->characters(ctxt->userData, out, i);
3543 }
Owen Taylor3473f882001-02-23 17:55:21 +00003544 }
3545}
3546
3547/**
3548 * htmlParseContent:
3549 * @ctxt: an HTML parser context
3550 * @name: the node name
3551 *
3552 * Parse a content: comment, sub-element, reference or text.
3553 *
3554 */
3555
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003556static void
Owen Taylor3473f882001-02-23 17:55:21 +00003557htmlParseContent(htmlParserCtxtPtr ctxt) {
3558 xmlChar *currentNode;
3559 int depth;
3560
3561 currentNode = xmlStrdup(ctxt->name);
3562 depth = ctxt->nameNr;
3563 while (1) {
3564 long cons = ctxt->nbChars;
3565
3566 GROW;
3567 /*
3568 * Our tag or one of it's parent or children is ending.
3569 */
3570 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003571 if (htmlParseEndTag(ctxt) &&
3572 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3573 if (currentNode != NULL)
3574 xmlFree(currentNode);
3575 return;
3576 }
3577 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003578 }
3579
3580 /*
3581 * Has this node been popped out during parsing of
3582 * the next element
3583 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003584 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3585 (!xmlStrEqual(currentNode, ctxt->name)))
3586 {
Owen Taylor3473f882001-02-23 17:55:21 +00003587 if (currentNode != NULL) xmlFree(currentNode);
3588 return;
3589 }
3590
Daniel Veillardf9533d12001-03-03 10:04:57 +00003591 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3592 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003593 /*
3594 * Handle SCRIPT/STYLE separately
3595 */
3596 htmlParseScript(ctxt);
3597 } else {
3598 /*
3599 * Sometimes DOCTYPE arrives in the middle of the document
3600 */
3601 if ((CUR == '<') && (NXT(1) == '!') &&
3602 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3603 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3604 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3605 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003606 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3607 "Misplaced DOCTYPE declaration\n",
3608 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003609 htmlParseDocTypeDecl(ctxt);
3610 }
3611
3612 /*
3613 * First case : a comment
3614 */
3615 if ((CUR == '<') && (NXT(1) == '!') &&
3616 (NXT(2) == '-') && (NXT(3) == '-')) {
3617 htmlParseComment(ctxt);
3618 }
3619
3620 /*
3621 * Second case : a sub-element.
3622 */
3623 else if (CUR == '<') {
3624 htmlParseElement(ctxt);
3625 }
3626
3627 /*
3628 * Third case : a reference. If if has not been resolved,
3629 * parsing returns it's Name, create the node
3630 */
3631 else if (CUR == '&') {
3632 htmlParseReference(ctxt);
3633 }
3634
3635 /*
3636 * Fourth : end of the resource
3637 */
3638 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003639 htmlAutoCloseOnEnd(ctxt);
3640 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003641 }
3642
3643 /*
3644 * Last case, text. Note that References are handled directly.
3645 */
3646 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003647 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003648 }
3649
3650 if (cons == ctxt->nbChars) {
3651 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003652 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3653 "detected an error in element content\n",
3654 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003655 }
3656 break;
3657 }
3658 }
3659 GROW;
3660 }
3661 if (currentNode != NULL) xmlFree(currentNode);
3662}
3663
3664/**
3665 * htmlParseElement:
3666 * @ctxt: an HTML parser context
3667 *
3668 * parse an HTML element, this is highly recursive
3669 *
3670 * [39] element ::= EmptyElemTag | STag content ETag
3671 *
3672 * [41] Attribute ::= Name Eq AttValue
3673 */
3674
3675void
3676htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003677 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003678 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003679 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003680 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003681 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003682 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003683 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003684
3685 /* Capture start position */
3686 if (ctxt->record_info) {
3687 node_info.begin_pos = ctxt->input->consumed +
3688 (CUR_PTR - ctxt->input->base);
3689 node_info.begin_line = ctxt->input->line;
3690 }
3691
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003692 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003693 htmlParseStartTag(ctxt);
3694 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003695 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3696 (name == NULL)) {
3697 if (CUR == '>')
3698 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003699 return;
3700 }
Owen Taylor3473f882001-02-23 17:55:21 +00003701
3702 /*
3703 * Lookup the info for that element.
3704 */
3705 info = htmlTagLookup(name);
3706 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003707 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3708 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003709 }
3710
3711 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003712 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003713 */
3714 if ((CUR == '/') && (NXT(1) == '>')) {
3715 SKIP(2);
3716 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3717 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003718 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003719 return;
3720 }
3721
3722 if (CUR == '>') {
3723 NEXT;
3724 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003725 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3726 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003727
3728 /*
3729 * end of parsing of this node.
3730 */
3731 if (xmlStrEqual(name, ctxt->name)) {
3732 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003733 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003734 }
3735
3736 /*
3737 * Capture end position and add node
3738 */
3739 if ( currentNode != NULL && ctxt->record_info ) {
3740 node_info.end_pos = ctxt->input->consumed +
3741 (CUR_PTR - ctxt->input->base);
3742 node_info.end_line = ctxt->input->line;
3743 node_info.node = ctxt->node;
3744 xmlParserAddNodeInfo(ctxt, &node_info);
3745 }
3746 return;
3747 }
3748
3749 /*
3750 * Check for an Empty Element from DTD definition
3751 */
3752 if ((info != NULL) && (info->empty)) {
3753 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3754 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003755 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003756 return;
3757 }
3758
3759 /*
3760 * Parse the content of the element:
3761 */
3762 currentNode = xmlStrdup(ctxt->name);
3763 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003764 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003765 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003766 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003767 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003768 if (ctxt->nameNr < depth) break;
3769 }
3770
Owen Taylor3473f882001-02-23 17:55:21 +00003771 /*
3772 * Capture end position and add node
3773 */
3774 if ( currentNode != NULL && ctxt->record_info ) {
3775 node_info.end_pos = ctxt->input->consumed +
3776 (CUR_PTR - ctxt->input->base);
3777 node_info.end_line = ctxt->input->line;
3778 node_info.node = ctxt->node;
3779 xmlParserAddNodeInfo(ctxt, &node_info);
3780 }
William M. Brack76e95df2003-10-18 16:20:14 +00003781 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003782 htmlAutoCloseOnEnd(ctxt);
3783 }
3784
Owen Taylor3473f882001-02-23 17:55:21 +00003785 if (currentNode != NULL)
3786 xmlFree(currentNode);
3787}
3788
3789/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003790 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003791 * @ctxt: an HTML parser context
3792 *
3793 * parse an HTML document (and build a tree if using the standard SAX
3794 * interface).
3795 *
3796 * Returns 0, -1 in case of error. the parser context is augmented
3797 * as a result of the parsing.
3798 */
3799
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003800int
Owen Taylor3473f882001-02-23 17:55:21 +00003801htmlParseDocument(htmlParserCtxtPtr ctxt) {
3802 xmlDtdPtr dtd;
3803
Daniel Veillardd0463562001-10-13 09:15:48 +00003804 xmlInitParser();
3805
Owen Taylor3473f882001-02-23 17:55:21 +00003806 htmlDefaultSAXHandlerInit();
3807 ctxt->html = 1;
3808
3809 GROW;
3810 /*
3811 * SAX: beginning of the document processing.
3812 */
3813 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3814 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3815
3816 /*
3817 * Wipe out everything which is before the first '<'
3818 */
3819 SKIP_BLANKS;
3820 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003821 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3822 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003823 }
3824
3825 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3826 ctxt->sax->startDocument(ctxt->userData);
3827
3828
3829 /*
3830 * Parse possible comments before any content
3831 */
3832 while ((CUR == '<') && (NXT(1) == '!') &&
3833 (NXT(2) == '-') && (NXT(3) == '-')) {
3834 htmlParseComment(ctxt);
3835 SKIP_BLANKS;
3836 }
3837
3838
3839 /*
3840 * Then possibly doc type declaration(s) and more Misc
3841 * (doctypedecl Misc*)?
3842 */
3843 if ((CUR == '<') && (NXT(1) == '!') &&
3844 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3845 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3846 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3847 (UPP(8) == 'E')) {
3848 htmlParseDocTypeDecl(ctxt);
3849 }
3850 SKIP_BLANKS;
3851
3852 /*
3853 * Parse possible comments before any content
3854 */
3855 while ((CUR == '<') && (NXT(1) == '!') &&
3856 (NXT(2) == '-') && (NXT(3) == '-')) {
3857 htmlParseComment(ctxt);
3858 SKIP_BLANKS;
3859 }
3860
3861 /*
3862 * Time to start parsing the tree itself
3863 */
3864 htmlParseContent(ctxt);
3865
3866 /*
3867 * autoclose
3868 */
3869 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003870 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003871
3872
3873 /*
3874 * SAX: end of the document processing.
3875 */
3876 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3877 ctxt->sax->endDocument(ctxt->userData);
3878
3879 if (ctxt->myDoc != NULL) {
3880 dtd = xmlGetIntSubset(ctxt->myDoc);
3881 if (dtd == NULL)
3882 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00003883 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00003884 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3885 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3886 }
3887 if (! ctxt->wellFormed) return(-1);
3888 return(0);
3889}
3890
3891
3892/************************************************************************
3893 * *
3894 * Parser contexts handling *
3895 * *
3896 ************************************************************************/
3897
3898/**
William M. Brackedb65a72004-02-06 07:36:04 +00003899 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00003900 * @ctxt: an HTML parser context
3901 *
3902 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00003903 *
3904 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00003905 */
3906
Daniel Veillardf403d292003-10-05 13:51:35 +00003907static int
Owen Taylor3473f882001-02-23 17:55:21 +00003908htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3909{
3910 htmlSAXHandler *sax;
3911
Daniel Veillardf403d292003-10-05 13:51:35 +00003912 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003913 memset(ctxt, 0, sizeof(htmlParserCtxt));
3914
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003915 ctxt->dict = xmlDictCreate();
3916 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003917 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3918 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003919 }
Owen Taylor3473f882001-02-23 17:55:21 +00003920 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3921 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003922 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3923 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003924 }
3925 else
3926 memset(sax, 0, sizeof(htmlSAXHandler));
3927
3928 /* Allocate the Input stack */
3929 ctxt->inputTab = (htmlParserInputPtr *)
3930 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3931 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003932 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003933 ctxt->inputNr = 0;
3934 ctxt->inputMax = 0;
3935 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003936 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003937 }
3938 ctxt->inputNr = 0;
3939 ctxt->inputMax = 5;
3940 ctxt->input = NULL;
3941 ctxt->version = NULL;
3942 ctxt->encoding = NULL;
3943 ctxt->standalone = -1;
3944 ctxt->instate = XML_PARSER_START;
3945
3946 /* Allocate the Node stack */
3947 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3948 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003949 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003950 ctxt->nodeNr = 0;
3951 ctxt->nodeMax = 0;
3952 ctxt->node = NULL;
3953 ctxt->inputNr = 0;
3954 ctxt->inputMax = 0;
3955 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003956 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003957 }
3958 ctxt->nodeNr = 0;
3959 ctxt->nodeMax = 10;
3960 ctxt->node = NULL;
3961
3962 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003963 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003964 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003965 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003966 ctxt->nameNr = 0;
3967 ctxt->nameMax = 10;
3968 ctxt->name = NULL;
3969 ctxt->nodeNr = 0;
3970 ctxt->nodeMax = 0;
3971 ctxt->node = NULL;
3972 ctxt->inputNr = 0;
3973 ctxt->inputMax = 0;
3974 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003975 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003976 }
3977 ctxt->nameNr = 0;
3978 ctxt->nameMax = 10;
3979 ctxt->name = NULL;
3980
Daniel Veillard092643b2003-09-25 14:29:29 +00003981 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00003982 else {
3983 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00003984 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00003985 }
3986 ctxt->userData = ctxt;
3987 ctxt->myDoc = NULL;
3988 ctxt->wellFormed = 1;
3989 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003990 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003991 ctxt->html = 1;
William M. Brackedb65a72004-02-06 07:36:04 +00003992 ctxt->vctxt.userData = ctxt;
3993 ctxt->vctxt.error = xmlParserValidityError;
3994 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00003995 ctxt->record_info = 0;
3996 ctxt->validate = 0;
3997 ctxt->nbChars = 0;
3998 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003999 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004000 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004001 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004002}
4003
4004/**
4005 * htmlFreeParserCtxt:
4006 * @ctxt: an HTML parser context
4007 *
4008 * Free all the memory used by a parser context. However the parsed
4009 * document in ctxt->myDoc is not freed.
4010 */
4011
4012void
4013htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4014{
4015 xmlFreeParserCtxt(ctxt);
4016}
4017
4018/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004019 * htmlNewParserCtxt:
4020 *
4021 * Allocate and initialize a new parser context.
4022 *
4023 * Returns the xmlParserCtxtPtr or NULL
4024 */
4025
4026static htmlParserCtxtPtr
4027htmlNewParserCtxt(void)
4028{
4029 xmlParserCtxtPtr ctxt;
4030
4031 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4032 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004033 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004034 return(NULL);
4035 }
4036 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004037 if (htmlInitParserCtxt(ctxt) < 0) {
4038 htmlFreeParserCtxt(ctxt);
4039 return(NULL);
4040 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004041 return(ctxt);
4042}
4043
4044/**
4045 * htmlCreateMemoryParserCtxt:
4046 * @buffer: a pointer to a char array
4047 * @size: the size of the array
4048 *
4049 * Create a parser context for an HTML in-memory document.
4050 *
4051 * Returns the new parser context or NULL
4052 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004053htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004054htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4055 xmlParserCtxtPtr ctxt;
4056 xmlParserInputPtr input;
4057 xmlParserInputBufferPtr buf;
4058
4059 if (buffer == NULL)
4060 return(NULL);
4061 if (size <= 0)
4062 return(NULL);
4063
4064 ctxt = htmlNewParserCtxt();
4065 if (ctxt == NULL)
4066 return(NULL);
4067
4068 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4069 if (buf == NULL) return(NULL);
4070
4071 input = xmlNewInputStream(ctxt);
4072 if (input == NULL) {
4073 xmlFreeParserCtxt(ctxt);
4074 return(NULL);
4075 }
4076
4077 input->filename = NULL;
4078 input->buf = buf;
4079 input->base = input->buf->buffer->content;
4080 input->cur = input->buf->buffer->content;
4081 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4082
4083 inputPush(ctxt, input);
4084 return(ctxt);
4085}
4086
4087/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004088 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004089 * @cur: a pointer to an array of xmlChar
4090 * @encoding: a free form C string describing the HTML document encoding, or NULL
4091 *
4092 * Create a parser context for an HTML document.
4093 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004094 * TODO: check the need to add encoding handling there
4095 *
Owen Taylor3473f882001-02-23 17:55:21 +00004096 * Returns the new parser context or NULL
4097 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004098static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004099htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004100 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004101 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004102
Daniel Veillard1d995272002-07-22 16:43:32 +00004103 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004104 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004105 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004106 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4107
4108 if (encoding != NULL) {
4109 xmlCharEncoding enc;
4110 xmlCharEncodingHandlerPtr handler;
4111
4112 if (ctxt->input->encoding != NULL)
4113 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004114 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004115
4116 enc = xmlParseCharEncoding(encoding);
4117 /*
4118 * registered set of known encodings
4119 */
4120 if (enc != XML_CHAR_ENCODING_ERROR) {
4121 xmlSwitchEncoding(ctxt, enc);
4122 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004123 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4124 "Unsupported encoding %s\n",
4125 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004126 }
4127 } else {
4128 /*
4129 * fallback for unknown encodings
4130 */
4131 handler = xmlFindCharEncodingHandler((const char *) encoding);
4132 if (handler != NULL) {
4133 xmlSwitchToEncoding(ctxt, handler);
4134 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004135 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4136 "Unsupported encoding %s\n",
4137 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004138 }
4139 }
4140 }
4141 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004142}
4143
Daniel Veillard73b013f2003-09-30 12:36:01 +00004144#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004145/************************************************************************
4146 * *
4147 * Progressive parsing interfaces *
4148 * *
4149 ************************************************************************/
4150
4151/**
4152 * htmlParseLookupSequence:
4153 * @ctxt: an HTML parser context
4154 * @first: the first char to lookup
4155 * @next: the next char to lookup or zero
4156 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004157 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004158 *
4159 * Try to find if a sequence (first, next, third) or just (first next) or
4160 * (first) is available in the input stream.
4161 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4162 * to avoid rescanning sequences of bytes, it DOES change the state of the
4163 * parser, do not use liberally.
4164 * This is basically similar to xmlParseLookupSequence()
4165 *
4166 * Returns the index to the current parsing point if the full sequence
4167 * is available, -1 otherwise.
4168 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004169static int
Owen Taylor3473f882001-02-23 17:55:21 +00004170htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004171 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004172 int base, len;
4173 htmlParserInputPtr in;
4174 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004175 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004176
4177 in = ctxt->input;
4178 if (in == NULL) return(-1);
4179 base = in->cur - in->base;
4180 if (base < 0) return(-1);
4181 if (ctxt->checkIndex > base)
4182 base = ctxt->checkIndex;
4183 if (in->buf == NULL) {
4184 buf = in->base;
4185 len = in->length;
4186 } else {
4187 buf = in->buf->buffer->content;
4188 len = in->buf->buffer->use;
4189 }
4190 /* take into account the sequence length */
4191 if (third) len -= 2;
4192 else if (next) len --;
4193 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004194 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004195 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4196 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4197 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004198 /* do not increment past <! - some people use <!--> */
4199 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004200 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004201 }
4202 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004203 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004204 return(-1);
4205 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4206 (buf[base + 2] == '>')) {
4207 incomment = 0;
4208 base += 2;
4209 }
4210 continue;
4211 }
Owen Taylor3473f882001-02-23 17:55:21 +00004212 if (buf[base] == first) {
4213 if (third != 0) {
4214 if ((buf[base + 1] != next) ||
4215 (buf[base + 2] != third)) continue;
4216 } else if (next != 0) {
4217 if (buf[base + 1] != next) continue;
4218 }
4219 ctxt->checkIndex = 0;
4220#ifdef DEBUG_PUSH
4221 if (next == 0)
4222 xmlGenericError(xmlGenericErrorContext,
4223 "HPP: lookup '%c' found at %d\n",
4224 first, base);
4225 else if (third == 0)
4226 xmlGenericError(xmlGenericErrorContext,
4227 "HPP: lookup '%c%c' found at %d\n",
4228 first, next, base);
4229 else
4230 xmlGenericError(xmlGenericErrorContext,
4231 "HPP: lookup '%c%c%c' found at %d\n",
4232 first, next, third, base);
4233#endif
4234 return(base - (in->cur - in->base));
4235 }
4236 }
4237 ctxt->checkIndex = base;
4238#ifdef DEBUG_PUSH
4239 if (next == 0)
4240 xmlGenericError(xmlGenericErrorContext,
4241 "HPP: lookup '%c' failed\n", first);
4242 else if (third == 0)
4243 xmlGenericError(xmlGenericErrorContext,
4244 "HPP: lookup '%c%c' failed\n", first, next);
4245 else
4246 xmlGenericError(xmlGenericErrorContext,
4247 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4248#endif
4249 return(-1);
4250}
4251
4252/**
4253 * htmlParseTryOrFinish:
4254 * @ctxt: an HTML parser context
4255 * @terminate: last chunk indicator
4256 *
4257 * Try to progress on parsing
4258 *
4259 * Returns zero if no parsing was possible
4260 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004261static int
Owen Taylor3473f882001-02-23 17:55:21 +00004262htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4263 int ret = 0;
4264 htmlParserInputPtr in;
4265 int avail = 0;
4266 xmlChar cur, next;
4267
4268#ifdef DEBUG_PUSH
4269 switch (ctxt->instate) {
4270 case XML_PARSER_EOF:
4271 xmlGenericError(xmlGenericErrorContext,
4272 "HPP: try EOF\n"); break;
4273 case XML_PARSER_START:
4274 xmlGenericError(xmlGenericErrorContext,
4275 "HPP: try START\n"); break;
4276 case XML_PARSER_MISC:
4277 xmlGenericError(xmlGenericErrorContext,
4278 "HPP: try MISC\n");break;
4279 case XML_PARSER_COMMENT:
4280 xmlGenericError(xmlGenericErrorContext,
4281 "HPP: try COMMENT\n");break;
4282 case XML_PARSER_PROLOG:
4283 xmlGenericError(xmlGenericErrorContext,
4284 "HPP: try PROLOG\n");break;
4285 case XML_PARSER_START_TAG:
4286 xmlGenericError(xmlGenericErrorContext,
4287 "HPP: try START_TAG\n");break;
4288 case XML_PARSER_CONTENT:
4289 xmlGenericError(xmlGenericErrorContext,
4290 "HPP: try CONTENT\n");break;
4291 case XML_PARSER_CDATA_SECTION:
4292 xmlGenericError(xmlGenericErrorContext,
4293 "HPP: try CDATA_SECTION\n");break;
4294 case XML_PARSER_END_TAG:
4295 xmlGenericError(xmlGenericErrorContext,
4296 "HPP: try END_TAG\n");break;
4297 case XML_PARSER_ENTITY_DECL:
4298 xmlGenericError(xmlGenericErrorContext,
4299 "HPP: try ENTITY_DECL\n");break;
4300 case XML_PARSER_ENTITY_VALUE:
4301 xmlGenericError(xmlGenericErrorContext,
4302 "HPP: try ENTITY_VALUE\n");break;
4303 case XML_PARSER_ATTRIBUTE_VALUE:
4304 xmlGenericError(xmlGenericErrorContext,
4305 "HPP: try ATTRIBUTE_VALUE\n");break;
4306 case XML_PARSER_DTD:
4307 xmlGenericError(xmlGenericErrorContext,
4308 "HPP: try DTD\n");break;
4309 case XML_PARSER_EPILOG:
4310 xmlGenericError(xmlGenericErrorContext,
4311 "HPP: try EPILOG\n");break;
4312 case XML_PARSER_PI:
4313 xmlGenericError(xmlGenericErrorContext,
4314 "HPP: try PI\n");break;
4315 case XML_PARSER_SYSTEM_LITERAL:
4316 xmlGenericError(xmlGenericErrorContext,
4317 "HPP: try SYSTEM_LITERAL\n");break;
4318 }
4319#endif
4320
4321 while (1) {
4322
4323 in = ctxt->input;
4324 if (in == NULL) break;
4325 if (in->buf == NULL)
4326 avail = in->length - (in->cur - in->base);
4327 else
4328 avail = in->buf->buffer->use - (in->cur - in->base);
4329 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004330 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004331 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4332 /*
4333 * SAX: end of the document processing.
4334 */
4335 ctxt->instate = XML_PARSER_EOF;
4336 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4337 ctxt->sax->endDocument(ctxt->userData);
4338 }
4339 }
4340 if (avail < 1)
4341 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004342 cur = in->cur[0];
4343 if (cur == 0) {
4344 SKIP(1);
4345 continue;
4346 }
4347
Owen Taylor3473f882001-02-23 17:55:21 +00004348 switch (ctxt->instate) {
4349 case XML_PARSER_EOF:
4350 /*
4351 * Document parsing is done !
4352 */
4353 goto done;
4354 case XML_PARSER_START:
4355 /*
4356 * Very first chars read from the document flow.
4357 */
4358 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004359 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004360 SKIP_BLANKS;
4361 if (in->buf == NULL)
4362 avail = in->length - (in->cur - in->base);
4363 else
4364 avail = in->buf->buffer->use - (in->cur - in->base);
4365 }
4366 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4367 ctxt->sax->setDocumentLocator(ctxt->userData,
4368 &xmlDefaultSAXLocator);
4369 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4370 (!ctxt->disableSAX))
4371 ctxt->sax->startDocument(ctxt->userData);
4372
4373 cur = in->cur[0];
4374 next = in->cur[1];
4375 if ((cur == '<') && (next == '!') &&
4376 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4377 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4378 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4379 (UPP(8) == 'E')) {
4380 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004381 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004382 goto done;
4383#ifdef DEBUG_PUSH
4384 xmlGenericError(xmlGenericErrorContext,
4385 "HPP: Parsing internal subset\n");
4386#endif
4387 htmlParseDocTypeDecl(ctxt);
4388 ctxt->instate = XML_PARSER_PROLOG;
4389#ifdef DEBUG_PUSH
4390 xmlGenericError(xmlGenericErrorContext,
4391 "HPP: entering PROLOG\n");
4392#endif
4393 } else {
4394 ctxt->instate = XML_PARSER_MISC;
4395 }
4396#ifdef DEBUG_PUSH
4397 xmlGenericError(xmlGenericErrorContext,
4398 "HPP: entering MISC\n");
4399#endif
4400 break;
4401 case XML_PARSER_MISC:
4402 SKIP_BLANKS;
4403 if (in->buf == NULL)
4404 avail = in->length - (in->cur - in->base);
4405 else
4406 avail = in->buf->buffer->use - (in->cur - in->base);
4407 if (avail < 2)
4408 goto done;
4409 cur = in->cur[0];
4410 next = in->cur[1];
4411 if ((cur == '<') && (next == '!') &&
4412 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4413 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004414 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004415 goto done;
4416#ifdef DEBUG_PUSH
4417 xmlGenericError(xmlGenericErrorContext,
4418 "HPP: Parsing Comment\n");
4419#endif
4420 htmlParseComment(ctxt);
4421 ctxt->instate = XML_PARSER_MISC;
4422 } else if ((cur == '<') && (next == '!') &&
4423 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4424 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4425 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4426 (UPP(8) == 'E')) {
4427 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004428 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004429 goto done;
4430#ifdef DEBUG_PUSH
4431 xmlGenericError(xmlGenericErrorContext,
4432 "HPP: Parsing internal subset\n");
4433#endif
4434 htmlParseDocTypeDecl(ctxt);
4435 ctxt->instate = XML_PARSER_PROLOG;
4436#ifdef DEBUG_PUSH
4437 xmlGenericError(xmlGenericErrorContext,
4438 "HPP: entering PROLOG\n");
4439#endif
4440 } else if ((cur == '<') && (next == '!') &&
4441 (avail < 9)) {
4442 goto done;
4443 } else {
4444 ctxt->instate = XML_PARSER_START_TAG;
4445#ifdef DEBUG_PUSH
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: entering START_TAG\n");
4448#endif
4449 }
4450 break;
4451 case XML_PARSER_PROLOG:
4452 SKIP_BLANKS;
4453 if (in->buf == NULL)
4454 avail = in->length - (in->cur - in->base);
4455 else
4456 avail = in->buf->buffer->use - (in->cur - in->base);
4457 if (avail < 2)
4458 goto done;
4459 cur = in->cur[0];
4460 next = in->cur[1];
4461 if ((cur == '<') && (next == '!') &&
4462 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4463 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004464 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004465 goto done;
4466#ifdef DEBUG_PUSH
4467 xmlGenericError(xmlGenericErrorContext,
4468 "HPP: Parsing Comment\n");
4469#endif
4470 htmlParseComment(ctxt);
4471 ctxt->instate = XML_PARSER_PROLOG;
4472 } else if ((cur == '<') && (next == '!') &&
4473 (avail < 4)) {
4474 goto done;
4475 } else {
4476 ctxt->instate = XML_PARSER_START_TAG;
4477#ifdef DEBUG_PUSH
4478 xmlGenericError(xmlGenericErrorContext,
4479 "HPP: entering START_TAG\n");
4480#endif
4481 }
4482 break;
4483 case XML_PARSER_EPILOG:
4484 if (in->buf == NULL)
4485 avail = in->length - (in->cur - in->base);
4486 else
4487 avail = in->buf->buffer->use - (in->cur - in->base);
4488 if (avail < 1)
4489 goto done;
4490 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004491 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004492 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004493 goto done;
4494 }
4495 if (avail < 2)
4496 goto done;
4497 next = in->cur[1];
4498 if ((cur == '<') && (next == '!') &&
4499 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4500 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004501 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004502 goto done;
4503#ifdef DEBUG_PUSH
4504 xmlGenericError(xmlGenericErrorContext,
4505 "HPP: Parsing Comment\n");
4506#endif
4507 htmlParseComment(ctxt);
4508 ctxt->instate = XML_PARSER_EPILOG;
4509 } else if ((cur == '<') && (next == '!') &&
4510 (avail < 4)) {
4511 goto done;
4512 } else {
4513 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004514 ctxt->wellFormed = 0;
4515 ctxt->instate = XML_PARSER_EOF;
4516#ifdef DEBUG_PUSH
4517 xmlGenericError(xmlGenericErrorContext,
4518 "HPP: entering EOF\n");
4519#endif
4520 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4521 ctxt->sax->endDocument(ctxt->userData);
4522 goto done;
4523 }
4524 break;
4525 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004526 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004527 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004528 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004529
4530 if (avail < 2)
4531 goto done;
4532 cur = in->cur[0];
4533 if (cur != '<') {
4534 ctxt->instate = XML_PARSER_CONTENT;
4535#ifdef DEBUG_PUSH
4536 xmlGenericError(xmlGenericErrorContext,
4537 "HPP: entering CONTENT\n");
4538#endif
4539 break;
4540 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004541 if (in->cur[1] == '/') {
4542 ctxt->instate = XML_PARSER_END_TAG;
4543 ctxt->checkIndex = 0;
4544#ifdef DEBUG_PUSH
4545 xmlGenericError(xmlGenericErrorContext,
4546 "HPP: entering END_TAG\n");
4547#endif
4548 break;
4549 }
Owen Taylor3473f882001-02-23 17:55:21 +00004550 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004551 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004552 goto done;
4553
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004554 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004555 htmlParseStartTag(ctxt);
4556 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004557 if (((depth == ctxt->nameNr) &&
4558 (xmlStrEqual(oldname, ctxt->name))) ||
4559 (name == NULL)) {
4560 if (CUR == '>')
4561 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004562 break;
4563 }
Owen Taylor3473f882001-02-23 17:55:21 +00004564
4565 /*
4566 * Lookup the info for that element.
4567 */
4568 info = htmlTagLookup(name);
4569 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004570 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4571 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004572 }
4573
4574 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004575 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004576 */
4577 if ((CUR == '/') && (NXT(1) == '>')) {
4578 SKIP(2);
4579 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4580 ctxt->sax->endElement(ctxt->userData, name);
4581 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004582 ctxt->instate = XML_PARSER_CONTENT;
4583#ifdef DEBUG_PUSH
4584 xmlGenericError(xmlGenericErrorContext,
4585 "HPP: entering CONTENT\n");
4586#endif
4587 break;
4588 }
4589
4590 if (CUR == '>') {
4591 NEXT;
4592 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004593 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4594 "Couldn't find end of Start Tag %s\n",
4595 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004596
4597 /*
4598 * end of parsing of this node.
4599 */
4600 if (xmlStrEqual(name, ctxt->name)) {
4601 nodePop(ctxt);
4602 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004603 }
4604
4605 ctxt->instate = XML_PARSER_CONTENT;
4606#ifdef DEBUG_PUSH
4607 xmlGenericError(xmlGenericErrorContext,
4608 "HPP: entering CONTENT\n");
4609#endif
4610 break;
4611 }
4612
4613 /*
4614 * Check for an Empty Element from DTD definition
4615 */
4616 if ((info != NULL) && (info->empty)) {
4617 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4618 ctxt->sax->endElement(ctxt->userData, name);
4619 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004620 }
4621 ctxt->instate = XML_PARSER_CONTENT;
4622#ifdef DEBUG_PUSH
4623 xmlGenericError(xmlGenericErrorContext,
4624 "HPP: entering CONTENT\n");
4625#endif
4626 break;
4627 }
4628 case XML_PARSER_CONTENT: {
4629 long cons;
4630 /*
4631 * Handle preparsed entities and charRef
4632 */
4633 if (ctxt->token != 0) {
4634 xmlChar chr[2] = { 0 , 0 } ;
4635
4636 chr[0] = (xmlChar) ctxt->token;
4637 htmlCheckParagraph(ctxt);
4638 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4639 ctxt->sax->characters(ctxt->userData, chr, 1);
4640 ctxt->token = 0;
4641 ctxt->checkIndex = 0;
4642 }
4643 if ((avail == 1) && (terminate)) {
4644 cur = in->cur[0];
4645 if ((cur != '<') && (cur != '&')) {
4646 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004647 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004648 if (ctxt->sax->ignorableWhitespace != NULL)
4649 ctxt->sax->ignorableWhitespace(
4650 ctxt->userData, &cur, 1);
4651 } else {
4652 htmlCheckParagraph(ctxt);
4653 if (ctxt->sax->characters != NULL)
4654 ctxt->sax->characters(
4655 ctxt->userData, &cur, 1);
4656 }
4657 }
4658 ctxt->token = 0;
4659 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004660 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004661 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004662 }
Owen Taylor3473f882001-02-23 17:55:21 +00004663 }
4664 if (avail < 2)
4665 goto done;
4666 cur = in->cur[0];
4667 next = in->cur[1];
4668 cons = ctxt->nbChars;
4669 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4670 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4671 /*
4672 * Handle SCRIPT/STYLE separately
4673 */
4674 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004675 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004676 goto done;
4677 htmlParseScript(ctxt);
4678 if ((cur == '<') && (next == '/')) {
4679 ctxt->instate = XML_PARSER_END_TAG;
4680 ctxt->checkIndex = 0;
4681#ifdef DEBUG_PUSH
4682 xmlGenericError(xmlGenericErrorContext,
4683 "HPP: entering END_TAG\n");
4684#endif
4685 break;
4686 }
4687 } else {
4688 /*
4689 * Sometimes DOCTYPE arrives in the middle of the document
4690 */
4691 if ((cur == '<') && (next == '!') &&
4692 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4693 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4694 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4695 (UPP(8) == 'E')) {
4696 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004697 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004698 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004699 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4700 "Misplaced DOCTYPE declaration\n",
4701 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004702 htmlParseDocTypeDecl(ctxt);
4703 } else if ((cur == '<') && (next == '!') &&
4704 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4705 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004706 (htmlParseLookupSequence(
4707 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004708 goto done;
4709#ifdef DEBUG_PUSH
4710 xmlGenericError(xmlGenericErrorContext,
4711 "HPP: Parsing Comment\n");
4712#endif
4713 htmlParseComment(ctxt);
4714 ctxt->instate = XML_PARSER_CONTENT;
4715 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4716 goto done;
4717 } else if ((cur == '<') && (next == '/')) {
4718 ctxt->instate = XML_PARSER_END_TAG;
4719 ctxt->checkIndex = 0;
4720#ifdef DEBUG_PUSH
4721 xmlGenericError(xmlGenericErrorContext,
4722 "HPP: entering END_TAG\n");
4723#endif
4724 break;
4725 } else if (cur == '<') {
4726 ctxt->instate = XML_PARSER_START_TAG;
4727 ctxt->checkIndex = 0;
4728#ifdef DEBUG_PUSH
4729 xmlGenericError(xmlGenericErrorContext,
4730 "HPP: entering START_TAG\n");
4731#endif
4732 break;
4733 } else if (cur == '&') {
4734 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004735 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004736 goto done;
4737#ifdef DEBUG_PUSH
4738 xmlGenericError(xmlGenericErrorContext,
4739 "HPP: Parsing Reference\n");
4740#endif
4741 /* TODO: check generation of subtrees if noent !!! */
4742 htmlParseReference(ctxt);
4743 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004744 /*
4745 * check that the text sequence is complete
4746 * before handing out the data to the parser
4747 * to avoid problems with erroneous end of
4748 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004749 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004750 if ((!terminate) &&
4751 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4752 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004753 ctxt->checkIndex = 0;
4754#ifdef DEBUG_PUSH
4755 xmlGenericError(xmlGenericErrorContext,
4756 "HPP: Parsing char data\n");
4757#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004758 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004759 }
4760 }
4761 if (cons == ctxt->nbChars) {
4762 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004763 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4764 "detected an error in element content\n",
4765 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004766 }
4767 NEXT;
4768 break;
4769 }
4770
4771 break;
4772 }
4773 case XML_PARSER_END_TAG:
4774 if (avail < 2)
4775 goto done;
4776 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004777 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004778 goto done;
4779 htmlParseEndTag(ctxt);
4780 if (ctxt->nameNr == 0) {
4781 ctxt->instate = XML_PARSER_EPILOG;
4782 } else {
4783 ctxt->instate = XML_PARSER_CONTENT;
4784 }
4785 ctxt->checkIndex = 0;
4786#ifdef DEBUG_PUSH
4787 xmlGenericError(xmlGenericErrorContext,
4788 "HPP: entering CONTENT\n");
4789#endif
4790 break;
4791 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00004792 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4793 "HPP: internal error, state == CDATA\n",
4794 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004795 ctxt->instate = XML_PARSER_CONTENT;
4796 ctxt->checkIndex = 0;
4797#ifdef DEBUG_PUSH
4798 xmlGenericError(xmlGenericErrorContext,
4799 "HPP: entering CONTENT\n");
4800#endif
4801 break;
4802 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00004803 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4804 "HPP: internal error, state == DTD\n",
4805 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004806 ctxt->instate = XML_PARSER_CONTENT;
4807 ctxt->checkIndex = 0;
4808#ifdef DEBUG_PUSH
4809 xmlGenericError(xmlGenericErrorContext,
4810 "HPP: entering CONTENT\n");
4811#endif
4812 break;
4813 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00004814 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4815 "HPP: internal error, state == COMMENT\n",
4816 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004817 ctxt->instate = XML_PARSER_CONTENT;
4818 ctxt->checkIndex = 0;
4819#ifdef DEBUG_PUSH
4820 xmlGenericError(xmlGenericErrorContext,
4821 "HPP: entering CONTENT\n");
4822#endif
4823 break;
4824 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00004825 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4826 "HPP: internal error, state == PI\n",
4827 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004828 ctxt->instate = XML_PARSER_CONTENT;
4829 ctxt->checkIndex = 0;
4830#ifdef DEBUG_PUSH
4831 xmlGenericError(xmlGenericErrorContext,
4832 "HPP: entering CONTENT\n");
4833#endif
4834 break;
4835 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004836 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4837 "HPP: internal error, state == ENTITY_DECL\n",
4838 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004839 ctxt->instate = XML_PARSER_CONTENT;
4840 ctxt->checkIndex = 0;
4841#ifdef DEBUG_PUSH
4842 xmlGenericError(xmlGenericErrorContext,
4843 "HPP: entering CONTENT\n");
4844#endif
4845 break;
4846 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004847 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4848 "HPP: internal error, state == ENTITY_VALUE\n",
4849 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004850 ctxt->instate = XML_PARSER_CONTENT;
4851 ctxt->checkIndex = 0;
4852#ifdef DEBUG_PUSH
4853 xmlGenericError(xmlGenericErrorContext,
4854 "HPP: entering DTD\n");
4855#endif
4856 break;
4857 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004858 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4859 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
4860 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004861 ctxt->instate = XML_PARSER_START_TAG;
4862 ctxt->checkIndex = 0;
4863#ifdef DEBUG_PUSH
4864 xmlGenericError(xmlGenericErrorContext,
4865 "HPP: entering START_TAG\n");
4866#endif
4867 break;
4868 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004869 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4870 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
4871 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004872 ctxt->instate = XML_PARSER_CONTENT;
4873 ctxt->checkIndex = 0;
4874#ifdef DEBUG_PUSH
4875 xmlGenericError(xmlGenericErrorContext,
4876 "HPP: entering CONTENT\n");
4877#endif
4878 break;
4879 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004880 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4881 "HPP: internal error, state == XML_PARSER_IGNORE\n",
4882 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004883 ctxt->instate = XML_PARSER_CONTENT;
4884 ctxt->checkIndex = 0;
4885#ifdef DEBUG_PUSH
4886 xmlGenericError(xmlGenericErrorContext,
4887 "HPP: entering CONTENT\n");
4888#endif
4889 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004890 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004891 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4892 "HPP: internal error, state == XML_PARSER_LITERAL\n",
4893 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004894 ctxt->instate = XML_PARSER_CONTENT;
4895 ctxt->checkIndex = 0;
4896#ifdef DEBUG_PUSH
4897 xmlGenericError(xmlGenericErrorContext,
4898 "HPP: entering CONTENT\n");
4899#endif
4900 break;
4901
Owen Taylor3473f882001-02-23 17:55:21 +00004902 }
4903 }
4904done:
4905 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004906 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004907 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4908 /*
4909 * SAX: end of the document processing.
4910 */
4911 ctxt->instate = XML_PARSER_EOF;
4912 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4913 ctxt->sax->endDocument(ctxt->userData);
4914 }
4915 }
4916 if ((ctxt->myDoc != NULL) &&
4917 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4918 (ctxt->instate == XML_PARSER_EPILOG))) {
4919 xmlDtdPtr dtd;
4920 dtd = xmlGetIntSubset(ctxt->myDoc);
4921 if (dtd == NULL)
4922 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004923 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004924 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4925 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4926 }
4927#ifdef DEBUG_PUSH
4928 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4929#endif
4930 return(ret);
4931}
4932
4933/**
Owen Taylor3473f882001-02-23 17:55:21 +00004934 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00004935 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004936 * @chunk: an char array
4937 * @size: the size in byte of the chunk
4938 * @terminate: last chunk indicator
4939 *
4940 * Parse a Chunk of memory
4941 *
4942 * Returns zero if no error, the xmlParserErrors otherwise.
4943 */
4944int
4945htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4946 int terminate) {
4947 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4948 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4949 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4950 int cur = ctxt->input->cur - ctxt->input->base;
4951
4952 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4953 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4954 ctxt->input->cur = ctxt->input->base + cur;
4955#ifdef DEBUG_PUSH
4956 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4957#endif
4958
Daniel Veillard14f752c2003-08-09 11:44:50 +00004959#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00004960 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4961 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004962#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004963 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004964 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
4965 xmlParserInputBufferPtr in = ctxt->input->buf;
4966 if ((in->encoder != NULL) && (in->buffer != NULL) &&
4967 (in->raw != NULL)) {
4968 int nbchars;
4969
4970 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
4971 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004972 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
4973 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004974 return(XML_ERR_INVALID_ENCODING);
4975 }
4976 }
4977 }
Owen Taylor3473f882001-02-23 17:55:21 +00004978 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00004979 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00004980 if (terminate) {
4981 if ((ctxt->instate != XML_PARSER_EOF) &&
4982 (ctxt->instate != XML_PARSER_EPILOG) &&
4983 (ctxt->instate != XML_PARSER_MISC)) {
4984 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004985 ctxt->wellFormed = 0;
4986 }
4987 if (ctxt->instate != XML_PARSER_EOF) {
4988 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4989 ctxt->sax->endDocument(ctxt->userData);
4990 }
4991 ctxt->instate = XML_PARSER_EOF;
4992 }
4993 return((xmlParserErrors) ctxt->errNo);
4994}
Daniel Veillard73b013f2003-09-30 12:36:01 +00004995#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00004996
4997/************************************************************************
4998 * *
4999 * User entry points *
5000 * *
5001 ************************************************************************/
5002
5003/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005004 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005005 * @sax: a SAX handler
5006 * @user_data: The user data returned on SAX callbacks
5007 * @chunk: a pointer to an array of chars
5008 * @size: number of chars in the array
5009 * @filename: an optional file name or URI
5010 * @enc: an optional encoding
5011 *
5012 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005013 * The value of @filename is used for fetching external entities
5014 * and error/warning reports.
5015 *
5016 * Returns the new parser context or NULL
5017 */
5018htmlParserCtxtPtr
5019htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5020 const char *chunk, int size, const char *filename,
5021 xmlCharEncoding enc) {
5022 htmlParserCtxtPtr ctxt;
5023 htmlParserInputPtr inputStream;
5024 xmlParserInputBufferPtr buf;
5025
Daniel Veillardd0463562001-10-13 09:15:48 +00005026 xmlInitParser();
5027
Owen Taylor3473f882001-02-23 17:55:21 +00005028 buf = xmlAllocParserInputBuffer(enc);
5029 if (buf == NULL) return(NULL);
5030
Daniel Veillardf403d292003-10-05 13:51:35 +00005031 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005032 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005033 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005034 return(NULL);
5035 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005036 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5037 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005038 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005039 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005040 xmlFree(ctxt->sax);
5041 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5042 if (ctxt->sax == NULL) {
5043 xmlFree(buf);
5044 xmlFree(ctxt);
5045 return(NULL);
5046 }
5047 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5048 if (user_data != NULL)
5049 ctxt->userData = user_data;
5050 }
5051 if (filename == NULL) {
5052 ctxt->directory = NULL;
5053 } else {
5054 ctxt->directory = xmlParserGetDirectory(filename);
5055 }
5056
5057 inputStream = htmlNewInputStream(ctxt);
5058 if (inputStream == NULL) {
5059 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005060 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005061 return(NULL);
5062 }
5063
5064 if (filename == NULL)
5065 inputStream->filename = NULL;
5066 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005067 inputStream->filename = (char *)
5068 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005069 inputStream->buf = buf;
5070 inputStream->base = inputStream->buf->buffer->content;
5071 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005072 inputStream->end =
5073 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005074
5075 inputPush(ctxt, inputStream);
5076
5077 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5078 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005079 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5080 int cur = ctxt->input->cur - ctxt->input->base;
5081
Owen Taylor3473f882001-02-23 17:55:21 +00005082 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005083
5084 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5085 ctxt->input->cur = ctxt->input->base + cur;
5086 ctxt->input->end =
5087 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005088#ifdef DEBUG_PUSH
5089 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5090#endif
5091 }
5092
5093 return(ctxt);
5094}
5095
5096/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005097 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005098 * @cur: a pointer to an array of xmlChar
5099 * @encoding: a free form C string describing the HTML document encoding, or NULL
5100 * @sax: the SAX handler block
5101 * @userData: if using SAX, this pointer will be provided on callbacks.
5102 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005103 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5104 * to handle parse events. If sax is NULL, fallback to the default DOM
5105 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005106 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005107 * Returns the resulting document tree unless SAX is NULL or the document is
5108 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005109 */
5110
5111htmlDocPtr
5112htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5113 htmlDocPtr ret;
5114 htmlParserCtxtPtr ctxt;
5115
Daniel Veillardd0463562001-10-13 09:15:48 +00005116 xmlInitParser();
5117
Owen Taylor3473f882001-02-23 17:55:21 +00005118 if (cur == NULL) return(NULL);
5119
5120
5121 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5122 if (ctxt == NULL) return(NULL);
5123 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005124 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005125 ctxt->sax = sax;
5126 ctxt->userData = userData;
5127 }
5128
5129 htmlParseDocument(ctxt);
5130 ret = ctxt->myDoc;
5131 if (sax != NULL) {
5132 ctxt->sax = NULL;
5133 ctxt->userData = NULL;
5134 }
5135 htmlFreeParserCtxt(ctxt);
5136
5137 return(ret);
5138}
5139
5140/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005141 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005142 * @cur: a pointer to an array of xmlChar
5143 * @encoding: a free form C string describing the HTML document encoding, or NULL
5144 *
5145 * parse an HTML in-memory document and build a tree.
5146 *
5147 * Returns the resulting document tree
5148 */
5149
5150htmlDocPtr
5151htmlParseDoc(xmlChar *cur, const char *encoding) {
5152 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5153}
5154
5155
5156/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005157 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005158 * @filename: the filename
5159 * @encoding: a free form C string describing the HTML document encoding, or NULL
5160 *
5161 * Create a parser context for a file content.
5162 * Automatic support for ZLIB/Compress compressed document is provided
5163 * by default if found at compile-time.
5164 *
5165 * Returns the new parser context or NULL
5166 */
5167htmlParserCtxtPtr
5168htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5169{
5170 htmlParserCtxtPtr ctxt;
5171 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005172 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005173 /* htmlCharEncoding enc; */
5174 xmlChar *content, *content_line = (xmlChar *) "charset=";
5175
Daniel Veillardf403d292003-10-05 13:51:35 +00005176 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005177 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005178 return(NULL);
5179 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005180 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5181 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005182#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005183 if (xmlDefaultSAXHandler.error != NULL) {
5184 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5185 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005186#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005187 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005188 return(NULL);
5189 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005190
5191 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5192 xmlFree(canonicFilename);
5193 if (inputStream == NULL) {
5194 xmlFreeParserCtxt(ctxt);
5195 return(NULL);
5196 }
Owen Taylor3473f882001-02-23 17:55:21 +00005197
5198 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005199
Owen Taylor3473f882001-02-23 17:55:21 +00005200 /* set encoding */
5201 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005202 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005203 if (content) {
5204 strcpy ((char *)content, (char *)content_line);
5205 strcat ((char *)content, (char *)encoding);
5206 htmlCheckEncoding (ctxt, content);
5207 xmlFree (content);
5208 }
5209 }
5210
5211 return(ctxt);
5212}
5213
5214/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005215 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005216 * @filename: the filename
5217 * @encoding: a free form C string describing the HTML document encoding, or NULL
5218 * @sax: the SAX handler block
5219 * @userData: if using SAX, this pointer will be provided on callbacks.
5220 *
5221 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5222 * compressed document is provided by default if found at compile-time.
5223 * It use the given SAX function block to handle the parsing callback.
5224 * If sax is NULL, fallback to the default DOM tree building routines.
5225 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005226 * Returns the resulting document tree unless SAX is NULL or the document is
5227 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005228 */
5229
5230htmlDocPtr
5231htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5232 void *userData) {
5233 htmlDocPtr ret;
5234 htmlParserCtxtPtr ctxt;
5235 htmlSAXHandlerPtr oldsax = NULL;
5236
Daniel Veillardd0463562001-10-13 09:15:48 +00005237 xmlInitParser();
5238
Owen Taylor3473f882001-02-23 17:55:21 +00005239 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5240 if (ctxt == NULL) return(NULL);
5241 if (sax != NULL) {
5242 oldsax = ctxt->sax;
5243 ctxt->sax = sax;
5244 ctxt->userData = userData;
5245 }
5246
5247 htmlParseDocument(ctxt);
5248
5249 ret = ctxt->myDoc;
5250 if (sax != NULL) {
5251 ctxt->sax = oldsax;
5252 ctxt->userData = NULL;
5253 }
5254 htmlFreeParserCtxt(ctxt);
5255
5256 return(ret);
5257}
5258
5259/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005260 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005261 * @filename: the filename
5262 * @encoding: a free form C string describing the HTML document encoding, or NULL
5263 *
5264 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5265 * compressed document is provided by default if found at compile-time.
5266 *
5267 * Returns the resulting document tree
5268 */
5269
5270htmlDocPtr
5271htmlParseFile(const char *filename, const char *encoding) {
5272 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5273}
5274
5275/**
5276 * htmlHandleOmittedElem:
5277 * @val: int 0 or 1
5278 *
5279 * Set and return the previous value for handling HTML omitted tags.
5280 *
5281 * Returns the last value for 0 for no handling, 1 for auto insertion.
5282 */
5283
5284int
5285htmlHandleOmittedElem(int val) {
5286 int old = htmlOmittedDefaultValue;
5287
5288 htmlOmittedDefaultValue = val;
5289 return(old);
5290}
5291
Daniel Veillard930dfb62003-02-05 10:17:38 +00005292/**
5293 * htmlElementAllowedHere:
5294 * @parent: HTML parent element
5295 * @elt: HTML element
5296 *
5297 * Checks whether an HTML element may be a direct child of a parent element.
5298 * Note - doesn't check for deprecated elements
5299 *
5300 * Returns 1 if allowed; 0 otherwise.
5301 */
5302int
5303htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5304 const char** p ;
5305
5306 if ( ! elt || ! parent || ! parent->subelts )
5307 return 0 ;
5308
5309 for ( p = parent->subelts; *p; ++p )
5310 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5311 return 1 ;
5312
5313 return 0 ;
5314}
5315/**
5316 * htmlElementStatusHere:
5317 * @parent: HTML parent element
5318 * @elt: HTML element
5319 *
5320 * Checks whether an HTML element may be a direct child of a parent element.
5321 * and if so whether it is valid or deprecated.
5322 *
5323 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5324 */
5325htmlStatus
5326htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5327 if ( ! parent || ! elt )
5328 return HTML_INVALID ;
5329 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5330 return HTML_INVALID ;
5331
5332 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5333}
5334/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005335 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005336 * @elt: HTML element
5337 * @attr: HTML attribute
5338 * @legacy: whether to allow deprecated attributes
5339 *
5340 * Checks whether an attribute is valid for an element
5341 * Has full knowledge of Required and Deprecated attributes
5342 *
5343 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5344 */
5345htmlStatus
5346htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5347 const char** p ;
5348
5349 if ( !elt || ! attr )
5350 return HTML_INVALID ;
5351
5352 if ( elt->attrs_req )
5353 for ( p = elt->attrs_req; *p; ++p)
5354 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5355 return HTML_REQUIRED ;
5356
5357 if ( elt->attrs_opt )
5358 for ( p = elt->attrs_opt; *p; ++p)
5359 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5360 return HTML_VALID ;
5361
5362 if ( legacy && elt->attrs_depr )
5363 for ( p = elt->attrs_depr; *p; ++p)
5364 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5365 return HTML_DEPRECATED ;
5366
5367 return HTML_INVALID ;
5368}
5369/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005370 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005371 * @node: an htmlNodePtr in a tree
5372 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005373 * for Element nodes)
5374 *
5375 * Checks whether the tree node is valid. Experimental (the author
5376 * only uses the HTML enhancements in a SAX parser)
5377 *
5378 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5379 * legacy allowed) or htmlElementStatusHere (otherwise).
5380 * for Attribute nodes, a return from htmlAttrAllowed
5381 * for other nodes, HTML_NA (no checks performed)
5382 */
5383htmlStatus
5384htmlNodeStatus(const htmlNodePtr node, int legacy) {
5385 if ( ! node )
5386 return HTML_INVALID ;
5387
5388 switch ( node->type ) {
5389 case XML_ELEMENT_NODE:
5390 return legacy
5391 ? ( htmlElementAllowedHere (
5392 htmlTagLookup(node->parent->name) , node->name
5393 ) ? HTML_VALID : HTML_INVALID )
5394 : htmlElementStatusHere(
5395 htmlTagLookup(node->parent->name) ,
5396 htmlTagLookup(node->name) )
5397 ;
5398 case XML_ATTRIBUTE_NODE:
5399 return htmlAttrAllowed(
5400 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5401 default: return HTML_NA ;
5402 }
5403}
Daniel Veillard9475a352003-09-26 12:47:50 +00005404/************************************************************************
5405 * *
5406 * New set (2.6.0) of simpler and more flexible APIs *
5407 * *
5408 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL dict means the string is always freed.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement (safe inside if/else); callers supply the ';'.
 * Note that @str is evaluated more than once.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
5420
5421/**
5422 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005423 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005424 *
5425 * Reset a parser context
5426 */
5427void
5428htmlCtxtReset(htmlParserCtxtPtr ctxt)
5429{
5430 xmlParserInputPtr input;
5431 xmlDictPtr dict = ctxt->dict;
5432
5433 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5434 xmlFreeInputStream(input);
5435 }
5436 ctxt->inputNr = 0;
5437 ctxt->input = NULL;
5438
5439 ctxt->spaceNr = 0;
5440 ctxt->spaceTab[0] = -1;
5441 ctxt->space = &ctxt->spaceTab[0];
5442
5443
5444 ctxt->nodeNr = 0;
5445 ctxt->node = NULL;
5446
5447 ctxt->nameNr = 0;
5448 ctxt->name = NULL;
5449
5450 DICT_FREE(ctxt->version);
5451 ctxt->version = NULL;
5452 DICT_FREE(ctxt->encoding);
5453 ctxt->encoding = NULL;
5454 DICT_FREE(ctxt->directory);
5455 ctxt->directory = NULL;
5456 DICT_FREE(ctxt->extSubURI);
5457 ctxt->extSubURI = NULL;
5458 DICT_FREE(ctxt->extSubSystem);
5459 ctxt->extSubSystem = NULL;
5460 if (ctxt->myDoc != NULL)
5461 xmlFreeDoc(ctxt->myDoc);
5462 ctxt->myDoc = NULL;
5463
5464 ctxt->standalone = -1;
5465 ctxt->hasExternalSubset = 0;
5466 ctxt->hasPErefs = 0;
5467 ctxt->html = 1;
5468 ctxt->external = 0;
5469 ctxt->instate = XML_PARSER_START;
5470 ctxt->token = 0;
5471
5472 ctxt->wellFormed = 1;
5473 ctxt->nsWellFormed = 1;
5474 ctxt->valid = 1;
5475 ctxt->vctxt.userData = ctxt;
5476 ctxt->vctxt.error = xmlParserValidityError;
5477 ctxt->vctxt.warning = xmlParserValidityWarning;
5478 ctxt->record_info = 0;
5479 ctxt->nbChars = 0;
5480 ctxt->checkIndex = 0;
5481 ctxt->inSubset = 0;
5482 ctxt->errNo = XML_ERR_OK;
5483 ctxt->depth = 0;
5484 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5485 ctxt->catalogs = NULL;
5486 xmlInitNodeInfoSeq(&ctxt->node_seq);
5487
5488 if (ctxt->attsDefault != NULL) {
5489 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5490 ctxt->attsDefault = NULL;
5491 }
5492 if (ctxt->attsSpecial != NULL) {
5493 xmlHashFree(ctxt->attsSpecial, NULL);
5494 ctxt->attsSpecial = NULL;
5495 }
5496}
5497
5498/**
5499 * htmlCtxtUseOptions:
5500 * @ctxt: an HTML parser context
5501 * @options: a combination of htmlParserOption(s)
5502 *
5503 * Applies the options to the parser context
5504 *
5505 * Returns 0 in case of success, the set of unknown or unimplemented options
5506 * in case of error.
5507 */
5508int
5509htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5510{
5511 if (options & HTML_PARSE_NOWARNING) {
5512 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005513 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005514 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005515 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00005516 }
5517 if (options & HTML_PARSE_NOERROR) {
5518 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005519 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00005520 ctxt->sax->fatalError = NULL;
5521 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005522 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00005523 }
5524 if (options & HTML_PARSE_PEDANTIC) {
5525 ctxt->pedantic = 1;
5526 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005527 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00005528 } else
5529 ctxt->pedantic = 0;
5530 if (options & XML_PARSE_NOBLANKS) {
5531 ctxt->keepBlanks = 0;
5532 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5533 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00005534 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00005535 } else
5536 ctxt->keepBlanks = 1;
5537 ctxt->dictNames = 0;
5538 return (options);
5539}
5540
5541/**
5542 * htmlDoRead:
5543 * @ctxt: an HTML parser context
5544 * @URL: the base URL to use for the document
5545 * @encoding: the document encoding, or NULL
5546 * @options: a combination of htmlParserOption(s)
5547 * @reuse: keep the context for reuse
5548 *
5549 * Common front-end for the htmlRead functions
5550 *
5551 * Returns the resulting document tree or NULL
5552 */
5553static htmlDocPtr
5554htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5555 int options, int reuse)
5556{
5557 htmlDocPtr ret;
5558
5559 htmlCtxtUseOptions(ctxt, options);
5560 ctxt->html = 1;
5561 if (encoding != NULL) {
5562 xmlCharEncodingHandlerPtr hdlr;
5563
5564 hdlr = xmlFindCharEncodingHandler(encoding);
5565 if (hdlr != NULL)
5566 xmlSwitchToEncoding(ctxt, hdlr);
5567 }
5568 if ((URL != NULL) && (ctxt->input != NULL) &&
5569 (ctxt->input->filename == NULL))
5570 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5571 htmlParseDocument(ctxt);
5572 ret = ctxt->myDoc;
5573 ctxt->myDoc = NULL;
5574 if (!reuse) {
5575 if ((ctxt->dictNames) &&
5576 (ret != NULL) &&
5577 (ret->dict == ctxt->dict))
5578 ctxt->dict = NULL;
5579 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00005580 }
5581 return (ret);
5582}
5583
5584/**
5585 * htmlReadDoc:
5586 * @cur: a pointer to a zero terminated string
5587 * @URL: the base URL to use for the document
5588 * @encoding: the document encoding, or NULL
5589 * @options: a combination of htmlParserOption(s)
5590 *
5591 * parse an XML in-memory document and build a tree.
5592 *
5593 * Returns the resulting document tree
5594 */
5595htmlDocPtr
5596htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5597{
5598 htmlParserCtxtPtr ctxt;
5599
5600 if (cur == NULL)
5601 return (NULL);
5602
5603 ctxt = xmlCreateDocParserCtxt(cur);
5604 if (ctxt == NULL)
5605 return (NULL);
5606 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5607}
5608
5609/**
5610 * htmlReadFile:
5611 * @filename: a file or URL
5612 * @encoding: the document encoding, or NULL
5613 * @options: a combination of htmlParserOption(s)
5614 *
5615 * parse an XML file from the filesystem or the network.
5616 *
5617 * Returns the resulting document tree
5618 */
5619htmlDocPtr
5620htmlReadFile(const char *filename, const char *encoding, int options)
5621{
5622 htmlParserCtxtPtr ctxt;
5623
5624 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5625 if (ctxt == NULL)
5626 return (NULL);
5627 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5628}
5629
5630/**
5631 * htmlReadMemory:
5632 * @buffer: a pointer to a char array
5633 * @size: the size of the array
5634 * @URL: the base URL to use for the document
5635 * @encoding: the document encoding, or NULL
5636 * @options: a combination of htmlParserOption(s)
5637 *
5638 * parse an XML in-memory document and build a tree.
5639 *
5640 * Returns the resulting document tree
5641 */
5642htmlDocPtr
5643htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5644{
5645 htmlParserCtxtPtr ctxt;
5646
5647 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5648 if (ctxt == NULL)
5649 return (NULL);
5650 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5651}
5652
5653/**
5654 * htmlReadFd:
5655 * @fd: an open file descriptor
5656 * @URL: the base URL to use for the document
5657 * @encoding: the document encoding, or NULL
5658 * @options: a combination of htmlParserOption(s)
5659 *
5660 * parse an XML from a file descriptor and build a tree.
5661 *
5662 * Returns the resulting document tree
5663 */
5664htmlDocPtr
5665htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5666{
5667 htmlParserCtxtPtr ctxt;
5668 xmlParserInputBufferPtr input;
5669 xmlParserInputPtr stream;
5670
5671 if (fd < 0)
5672 return (NULL);
5673
5674 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5675 if (input == NULL)
5676 return (NULL);
5677 ctxt = xmlNewParserCtxt();
5678 if (ctxt == NULL) {
5679 xmlFreeParserInputBuffer(input);
5680 return (NULL);
5681 }
5682 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5683 if (stream == NULL) {
5684 xmlFreeParserInputBuffer(input);
5685 xmlFreeParserCtxt(ctxt);
5686 return (NULL);
5687 }
5688 inputPush(ctxt, stream);
5689 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5690}
5691
5692/**
5693 * htmlReadIO:
5694 * @ioread: an I/O read function
5695 * @ioclose: an I/O close function
5696 * @ioctx: an I/O handler
5697 * @URL: the base URL to use for the document
5698 * @encoding: the document encoding, or NULL
5699 * @options: a combination of htmlParserOption(s)
5700 *
5701 * parse an HTML document from I/O functions and source and build a tree.
5702 *
5703 * Returns the resulting document tree
5704 */
5705htmlDocPtr
5706htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5707 void *ioctx, const char *URL, const char *encoding, int options)
5708{
5709 htmlParserCtxtPtr ctxt;
5710 xmlParserInputBufferPtr input;
5711 xmlParserInputPtr stream;
5712
5713 if (ioread == NULL)
5714 return (NULL);
5715
5716 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5717 XML_CHAR_ENCODING_NONE);
5718 if (input == NULL)
5719 return (NULL);
5720 ctxt = xmlNewParserCtxt();
5721 if (ctxt == NULL) {
5722 xmlFreeParserInputBuffer(input);
5723 return (NULL);
5724 }
5725 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5726 if (stream == NULL) {
5727 xmlFreeParserInputBuffer(input);
5728 xmlFreeParserCtxt(ctxt);
5729 return (NULL);
5730 }
5731 inputPush(ctxt, stream);
5732 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5733}
5734
5735/**
5736 * htmlCtxtReadDoc:
5737 * @ctxt: an HTML parser context
5738 * @cur: a pointer to a zero terminated string
5739 * @URL: the base URL to use for the document
5740 * @encoding: the document encoding, or NULL
5741 * @options: a combination of htmlParserOption(s)
5742 *
5743 * parse an XML in-memory document and build a tree.
5744 * This reuses the existing @ctxt parser context
5745 *
5746 * Returns the resulting document tree
5747 */
5748htmlDocPtr
5749htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5750 const char *URL, const char *encoding, int options)
5751{
5752 xmlParserInputPtr stream;
5753
5754 if (cur == NULL)
5755 return (NULL);
5756 if (ctxt == NULL)
5757 return (NULL);
5758
5759 htmlCtxtReset(ctxt);
5760
5761 stream = xmlNewStringInputStream(ctxt, cur);
5762 if (stream == NULL) {
5763 return (NULL);
5764 }
5765 inputPush(ctxt, stream);
5766 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5767}
5768
5769/**
5770 * htmlCtxtReadFile:
5771 * @ctxt: an HTML parser context
5772 * @filename: a file or URL
5773 * @encoding: the document encoding, or NULL
5774 * @options: a combination of htmlParserOption(s)
5775 *
5776 * parse an XML file from the filesystem or the network.
5777 * This reuses the existing @ctxt parser context
5778 *
5779 * Returns the resulting document tree
5780 */
5781htmlDocPtr
5782htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5783 const char *encoding, int options)
5784{
5785 xmlParserInputPtr stream;
5786
5787 if (filename == NULL)
5788 return (NULL);
5789 if (ctxt == NULL)
5790 return (NULL);
5791
5792 htmlCtxtReset(ctxt);
5793
5794 stream = xmlNewInputFromFile(ctxt, filename);
5795 if (stream == NULL) {
5796 return (NULL);
5797 }
5798 inputPush(ctxt, stream);
5799 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5800}
5801
5802/**
5803 * htmlCtxtReadMemory:
5804 * @ctxt: an HTML parser context
5805 * @buffer: a pointer to a char array
5806 * @size: the size of the array
5807 * @URL: the base URL to use for the document
5808 * @encoding: the document encoding, or NULL
5809 * @options: a combination of htmlParserOption(s)
5810 *
5811 * parse an XML in-memory document and build a tree.
5812 * This reuses the existing @ctxt parser context
5813 *
5814 * Returns the resulting document tree
5815 */
5816htmlDocPtr
5817htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5818 const char *URL, const char *encoding, int options)
5819{
5820 xmlParserInputBufferPtr input;
5821 xmlParserInputPtr stream;
5822
5823 if (ctxt == NULL)
5824 return (NULL);
5825 if (buffer == NULL)
5826 return (NULL);
5827
5828 htmlCtxtReset(ctxt);
5829
5830 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5831 if (input == NULL) {
5832 return(NULL);
5833 }
5834
5835 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5836 if (stream == NULL) {
5837 xmlFreeParserInputBuffer(input);
5838 return(NULL);
5839 }
5840
5841 inputPush(ctxt, stream);
5842 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5843}
5844
5845/**
5846 * htmlCtxtReadFd:
5847 * @ctxt: an HTML parser context
5848 * @fd: an open file descriptor
5849 * @URL: the base URL to use for the document
5850 * @encoding: the document encoding, or NULL
5851 * @options: a combination of htmlParserOption(s)
5852 *
5853 * parse an XML from a file descriptor and build a tree.
5854 * This reuses the existing @ctxt parser context
5855 *
5856 * Returns the resulting document tree
5857 */
5858htmlDocPtr
5859htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
5860 const char *URL, const char *encoding, int options)
5861{
5862 xmlParserInputBufferPtr input;
5863 xmlParserInputPtr stream;
5864
5865 if (fd < 0)
5866 return (NULL);
5867 if (ctxt == NULL)
5868 return (NULL);
5869
5870 htmlCtxtReset(ctxt);
5871
5872
5873 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5874 if (input == NULL)
5875 return (NULL);
5876 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5877 if (stream == NULL) {
5878 xmlFreeParserInputBuffer(input);
5879 return (NULL);
5880 }
5881 inputPush(ctxt, stream);
5882 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5883}
5884
5885/**
5886 * htmlCtxtReadIO:
5887 * @ctxt: an HTML parser context
5888 * @ioread: an I/O read function
5889 * @ioclose: an I/O close function
5890 * @ioctx: an I/O handler
5891 * @URL: the base URL to use for the document
5892 * @encoding: the document encoding, or NULL
5893 * @options: a combination of htmlParserOption(s)
5894 *
5895 * parse an HTML document from I/O functions and source and build a tree.
5896 * This reuses the existing @ctxt parser context
5897 *
5898 * Returns the resulting document tree
5899 */
5900htmlDocPtr
5901htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
5902 xmlInputCloseCallback ioclose, void *ioctx,
5903 const char *URL,
5904 const char *encoding, int options)
5905{
5906 xmlParserInputBufferPtr input;
5907 xmlParserInputPtr stream;
5908
5909 if (ioread == NULL)
5910 return (NULL);
5911 if (ctxt == NULL)
5912 return (NULL);
5913
5914 htmlCtxtReset(ctxt);
5915
5916 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5917 XML_CHAR_ENCODING_NONE);
5918 if (input == NULL)
5919 return (NULL);
5920 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5921 if (stream == NULL) {
5922 xmlFreeParserInputBuffer(input);
5923 return (NULL);
5924 }
5925 inputPush(ctxt, stream);
5926 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5927}
5928
Owen Taylor3473f882001-02-23 17:55:21 +00005929#endif /* LIBXML_HTML_ENABLED */