blob: db7a485ada80b78de1149bf76264c1faca278ea7 [file] [log] [blame]
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
Daniel Veillard22090732001-07-16 00:06:07 +000054static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000055
Daniel Veillard56a4cb82001-03-24 17:00:36 +000056xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000058static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059
60/************************************************************************
61 * *
Daniel Veillardf403d292003-10-05 13:51:35 +000062 * Some factorized error routines *
63 * *
64 ************************************************************************/
65
66/**
William M. Brackedb65a72004-02-06 07:36:04 +000067 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000068 * @ctxt: an HTML parser context
69 * @extra: extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
Daniel Veillard157fee02003-10-31 10:36:03 +000076 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77 (ctxt->instate == XML_PARSER_EOF))
78 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000079 if (ctxt != NULL) {
80 ctxt->errNo = XML_ERR_NO_MEMORY;
81 ctxt->instate = XML_PARSER_EOF;
82 ctxt->disableSAX = 1;
83 }
84 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000085 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000086 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87 NULL, NULL, 0, 0,
88 "Memory allocation failed : %s\n", extra);
89 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000090 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000091 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92 NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt: an HTML parser context
98 * @error: the error number
99 * @msg: the error message
100 * @str1: string infor
101 * @str2: string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107 const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
Daniel Veillard157fee02003-10-31 10:36:03 +0000109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110 (ctxt->instate == XML_PARSER_EOF))
111 return;
Daniel Veillardf403d292003-10-05 13:51:35 +0000112 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000113 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000114 XML_ERR_ERROR, NULL, 0,
115 (const char *) str1, (const char *) str2,
116 NULL, 0, 0,
117 msg, str1, str2);
118 ctxt->wellFormed = 0;
119}
120
121/**
122 * htmlParseErrInt:
123 * @ctxt: an HTML parser context
124 * @error: the error number
125 * @msg: the error message
126 * @val: integer info
127 *
128 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
129 */
130static void
131htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
132 const char *msg, int val)
133{
Daniel Veillard157fee02003-10-31 10:36:03 +0000134 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
135 (ctxt->instate == XML_PARSER_EOF))
136 return;
Daniel Veillardf403d292003-10-05 13:51:35 +0000137 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000138 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000139 XML_ERR_ERROR, NULL, 0, NULL, NULL,
140 NULL, val, 0, msg, val);
141 ctxt->wellFormed = 0;
142}
143
144/************************************************************************
145 * *
Owen Taylor3473f882001-02-23 17:55:21 +0000146 * Parser stacks related functions and macros *
147 * *
148 ************************************************************************/
149
Daniel Veillard1c732d22002-11-30 11:22:59 +0000150/**
151 * htmlnamePush:
152 * @ctxt: an HTML parser context
153 * @value: the element name
154 *
155 * Pushes a new element name on top of the name stack
156 *
157 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000158 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000159static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000160htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000161{
162 if (ctxt->nameNr >= ctxt->nameMax) {
163 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000164 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000165 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166 ctxt->nameMax *
167 sizeof(ctxt->nameTab[0]));
168 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000169 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000170 return (0);
171 }
172 }
173 ctxt->nameTab[ctxt->nameNr] = value;
174 ctxt->name = value;
175 return (ctxt->nameNr++);
176}
177/**
178 * htmlnamePop:
179 * @ctxt: an HTML parser context
180 *
181 * Pops the top element name from the name stack
182 *
183 * Returns the name just removed
184 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000185static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000186htmlnamePop(htmlParserCtxtPtr ctxt)
187{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000188 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000189
Daniel Veillard1c732d22002-11-30 11:22:59 +0000190 if (ctxt->nameNr <= 0)
191 return (0);
192 ctxt->nameNr--;
193 if (ctxt->nameNr < 0)
194 return (0);
195 if (ctxt->nameNr > 0)
196 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
197 else
198 ctxt->name = NULL;
199 ret = ctxt->nameTab[ctxt->nameNr];
200 ctxt->nameTab[ctxt->nameNr] = 0;
201 return (ret);
202}
Owen Taylor3473f882001-02-23 17:55:21 +0000203
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expand to code referring to a variable named `ctxt` that
 * must be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value. This should be used
 *           internally by the parser only to compare to ASCII values,
 *           otherwise it would break when running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substrings.
 *   UPPER   returns the current xmlChar converted to uppercase.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as
 *           CUR, it should be used only to compare on ASCII based substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current xmlChar value as an int.
 *   NEXT    Skip to the next character through xmlNextChar().
 *   NEXTL(l) Skip the current character of l xmlChars long, keeping the
 *           line and column counters up to date.
 *
 * The statement-like macros (SHRINK, GROW, NEXTL, COPY_BUF) are wrapped
 * in do { } while (0) so that they behave as a single statement, e.g.
 * in an unbraced if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

/* Shrink the input once much has been consumed and little is left. */
#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input);				\
  } while (0)

/* Outside of progressive mode, refill the input when it runs low. */
#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
	(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
 * Disabled legacy tail of NEXTL, kept for reference:
 *   if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);
 *   if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* Append the character v, encoded on l bytes, to buffer b at index i. */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
285
286/**
287 * htmlCurrentChar:
288 * @ctxt: the HTML parser context
289 * @len: pointer to the length of the char read
290 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000291 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000292 * bytes in the input buffer. Implement the end of line normalization:
293 * 2.11 End-of-Line Handling
294 * If the encoding is unspecified, in the case we find an ISO-Latin-1
295 * char, then the encoding converter is plugged in automatically.
296 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000297 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000298 */
299
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000300static int
Owen Taylor3473f882001-02-23 17:55:21 +0000301htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
302 if (ctxt->instate == XML_PARSER_EOF)
303 return(0);
304
305 if (ctxt->token != 0) {
306 *len = 0;
307 return(ctxt->token);
308 }
309 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
310 /*
311 * We are supposed to handle UTF8, check it's valid
312 * From rfc2044: encoding of the Unicode values on UTF-8:
313 *
314 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
315 * 0000 0000-0000 007F 0xxxxxxx
316 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
317 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
318 *
319 * Check for the 0x110000 limit too
320 */
321 const unsigned char *cur = ctxt->input->cur;
322 unsigned char c;
323 unsigned int val;
324
325 c = *cur;
326 if (c & 0x80) {
327 if (cur[1] == 0)
328 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
329 if ((cur[1] & 0xc0) != 0x80)
330 goto encoding_error;
331 if ((c & 0xe0) == 0xe0) {
332
333 if (cur[2] == 0)
334 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
335 if ((cur[2] & 0xc0) != 0x80)
336 goto encoding_error;
337 if ((c & 0xf0) == 0xf0) {
338 if (cur[3] == 0)
339 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
340 if (((c & 0xf8) != 0xf0) ||
341 ((cur[3] & 0xc0) != 0x80))
342 goto encoding_error;
343 /* 4-byte code */
344 *len = 4;
345 val = (cur[0] & 0x7) << 18;
346 val |= (cur[1] & 0x3f) << 12;
347 val |= (cur[2] & 0x3f) << 6;
348 val |= cur[3] & 0x3f;
349 } else {
350 /* 3-byte code */
351 *len = 3;
352 val = (cur[0] & 0xf) << 12;
353 val |= (cur[1] & 0x3f) << 6;
354 val |= cur[2] & 0x3f;
355 }
356 } else {
357 /* 2-byte code */
358 *len = 2;
359 val = (cur[0] & 0x1f) << 6;
360 val |= cur[1] & 0x3f;
361 }
362 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000363 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
364 "Char 0x%X out of allowed range\n", val);
Owen Taylor3473f882001-02-23 17:55:21 +0000365 }
366 return(val);
367 } else {
368 /* 1-byte code */
369 *len = 1;
370 return((int) *ctxt->input->cur);
371 }
372 }
373 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000374 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000375 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000376 * XML constructs only use < 128 chars
377 */
378 *len = 1;
379 if ((int) *ctxt->input->cur < 0x80)
380 return((int) *ctxt->input->cur);
381
382 /*
383 * Humm this is bad, do an automatic flow conversion
384 */
385 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
386 ctxt->charset = XML_CHAR_ENCODING_UTF8;
387 return(xmlCurrentChar(ctxt, len));
388
389encoding_error:
390 /*
391 * If we detect an UTF8 error that probably mean that the
392 * input encoding didn't get properly advertized in the
393 * declaration header. Report the error and switch the encoding
394 * to ISO-Latin-1 (if you don't like this policy, just declare the
395 * encoding !)
396 */
Daniel Veillardf403d292003-10-05 13:51:35 +0000397 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
398 "Input is not proper UTF-8, indicate encoding !\n",
399 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000400 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000401 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
402 ctxt->input->cur[0], ctxt->input->cur[1],
403 ctxt->input->cur[2], ctxt->input->cur[3]);
404 }
405
406 ctxt->charset = XML_CHAR_ENCODING_8859_1;
407 *len = 1;
408 return((int) *ctxt->input->cur);
409}
410
411/**
Owen Taylor3473f882001-02-23 17:55:21 +0000412 * htmlSkipBlankChars:
413 * @ctxt: the HTML parser context
414 *
415 * skip all blanks character found at that point in the input streams.
416 *
417 * Returns the number of space chars skipped
418 */
419
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000420static int
Owen Taylor3473f882001-02-23 17:55:21 +0000421htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
422 int res = 0;
423
William M. Brack76e95df2003-10-18 16:20:14 +0000424 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000425 if ((*ctxt->input->cur == 0) &&
426 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
427 xmlPopInput(ctxt);
428 } else {
429 if (*(ctxt->input->cur) == '\n') {
430 ctxt->input->line++; ctxt->input->col = 1;
431 } else ctxt->input->col++;
432 ctxt->input->cur++;
433 ctxt->nbChars++;
434 if (*ctxt->input->cur == 0)
435 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
436 }
437 res++;
438 }
439 return(res);
440}
441
442
443
444/************************************************************************
445 * *
446 * The list of HTML elements and their properties *
447 * *
448 ************************************************************************/
449
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000462
463/* Definitions and a couple of vars for HTML Elements */
464
465#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
466#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
467#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
468#define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL
469#define BLOCK HEADING LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
470#define FORMCTRL "input", "select", "textarea", "label", "button"
471#define PCDATA
472#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
473#define LIST "ul", "ol", "dir", "menu"
474#define MODIFIER
475#define FLOW BLOCK,INLINE
476#define EMPTY NULL
477
478
479static const char* html_flow[] = { FLOW, NULL } ;
480static const char* html_inline[] = { INLINE, NULL } ;
481
482/* placeholders: elts with content but no subelements */
483static const char* html_pcdata[] = { NULL } ;
484#define html_cdata html_pcdata
485
486
487/* ... and for HTML Attributes */
488
489#define COREATTRS "id", "class", "style", "title"
490#define I18N "lang", "dir"
491#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
492#define ATTRS COREATTRS,I18N,EVENTS
493#define CELLHALIGN "align", "char", "charoff"
494#define CELLVALIGN "valign"
495
496static const char* html_attrs[] = { ATTRS, NULL } ;
497static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
498static const char* core_attrs[] = { COREATTRS, NULL } ;
499static const char* i18n_attrs[] = { I18N, NULL } ;
500
501
502/* Other declarations that should go inline ... */
503static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
504 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
505 "tabindex", "onfocus", "onblur", NULL } ;
506static const char* target_attr[] = { "target", NULL } ;
507static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
508static const char* alt_attr[] = { "alt", NULL } ;
509static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
510static const char* href_attrs[] = { "href", NULL } ;
511static const char* clear_attrs[] = { "clear", NULL } ;
512static const char* inline_p[] = { INLINE, "p", NULL } ;
513static const char* flow_param[] = { FLOW, "param", NULL } ;
514static const char* applet_attrs[] = { COREATTRS , "codebase",
515 "archive", "alt", "name", "height", "width", "align",
516 "hspace", "vspace", NULL } ;
517static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
518 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
519static const char* basefont_attrs[] =
520 { "id", "size", "color", "face", NULL } ;
521static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
522static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
523static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
524static const char* body_depr[] = { "background", "bgcolor", "text",
525 "link", "vlink", "alink", NULL } ;
526static const char* button_attrs[] = { ATTRS, "name", "value", "type",
527 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
528
529
530static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
531static const char* col_elt[] = { "col", NULL } ;
532static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
533static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
534static const char* dl_contents[] = { "dt", "dd", NULL } ;
535static const char* compact_attr[] = { "compact", NULL } ;
536static const char* label_attr[] = { "label", NULL } ;
537static const char* fieldset_contents[] = { FLOW, "legend" } ;
538static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
539static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
540static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
541static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
542static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
543static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
544static const char* head_attrs[] = { I18N, "profile", NULL } ;
545static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
546static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
547static const char* version_attr[] = { "version", NULL } ;
548static const char* html_content[] = { "head", "body", "frameset", NULL } ;
549static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
550static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
551static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
552static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
553static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
554static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
555static const char* align_attr[] = { "align", NULL } ;
556static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
557static const char* map_contents[] = { BLOCK, "area", NULL } ;
558static const char* name_attr[] = { "name", NULL } ;
559static const char* action_attr[] = { "action", NULL } ;
560static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
561static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
562static const char* content_attr[] = { "content", NULL } ;
563static const char* type_attr[] = { "type", NULL } ;
564static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
565static const char* object_contents[] = { FLOW, "param", NULL } ;
566static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
567static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
568static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
569static const char* option_elt[] = { "option", NULL } ;
570static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
571static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
572static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
573static const char* width_attr[] = { "width", NULL } ;
574static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
575static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
576static const char* language_attr[] = { "language", NULL } ;
577static const char* select_content[] = { "optgroup", "option", NULL } ;
578static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
579static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
580static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
581static const char* table_depr[] = { "align", "bgcolor", NULL } ;
582static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
583static const char* tr_elt[] = { "tr", NULL } ;
584static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
585static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
586static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
587static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
588static const char* tr_contents[] = { "th", "td", NULL } ;
589static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
590static const char* li_elt[] = { "li", NULL } ;
591static const char* ul_depr[] = { "type", "compact", NULL} ;
592static const char* dir_attr[] = { "dir", NULL} ;
593
594#define DECL (const char**)
595
Daniel Veillard22090732001-07-16 00:06:07 +0000596static const htmlElemDesc
597html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000598{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
599 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
600},
601{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
602 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
603},
604{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
605 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
606},
607{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
608 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
609},
610{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
611 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
612},
613{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
614 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
615},
616{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
617 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
618},
619{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
620 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
621},
622{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
623 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
624},
625{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
626 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
627},
628{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
629 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
630},
631{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
632 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
633},
634{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
635 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
636},
637{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
638 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
639},
640{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
641 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
642},
643{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
644 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
645},
646{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
647 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
648},
649{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
650 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
651},
652{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
653 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
654},
655{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
656 EMPTY , NULL , DECL col_attrs , NULL, NULL
657},
658{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
659 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
660},
661{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
662 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
663},
664{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
665 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
666},
667{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
668 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
669},
670{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
671 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
672},
673{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
674 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
675},
676{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
677 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
678},
679{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
680 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
681},
682{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
683 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
684},
685{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
686 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
687},
688{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
689 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
690},
691{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
692 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
693},
694{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
695 EMPTY, NULL, NULL, DECL frame_attrs, NULL
696},
697{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
698 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
699},
700{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
701 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
702},
703{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
704 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
705},
706{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
707 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
708},
709{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
710 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
711},
712{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
713 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
714},
715{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
716 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
717},
718{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
719 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
720},
721{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
722 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
723},
724{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
725 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
726},
727{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
728 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
729},
730{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
731 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
732},
733{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
734 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
735},
736{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
737 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
738},
739{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
740 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
741},
742{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
743 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
744},
745{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
746 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
747},
748{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
749 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
750},
751{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
752 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
753},
754{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
755 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
756},
757{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
758 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
759},
760{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
761 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
762},
763{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
764 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
765},
766{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
767 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
768},
769{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
770 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
771},
772{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
773 DECL html_flow, "div", DECL html_attrs, NULL, NULL
774},
775{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
776 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
777},
778{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
779 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
780},
781{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
782 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
783},
784{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
785 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
786},
787{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
788 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
789},
790{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
791 EMPTY, NULL, DECL param_attrs, NULL, name_attr
792},
793{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
794 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
795},
796{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
797 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
798},
799{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
800 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
801},
802{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
803 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
804},
805{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
806 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
807},
808{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
809 DECL select_content, NULL, DECL select_attrs, NULL, NULL
810},
811{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
812 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
813},
814{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
815 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
816},
817{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
818 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
819},
820{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
821 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
822},
823{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
824 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
825},
826{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
827 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
828},
829{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
830 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
831},
832{ "table", 0, 0, 0, 0, 0, 0, 0, "",
833 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
834},
835{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
836 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
837},
838{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
839 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
840},
841{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
842 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
843},
844{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
845 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
846},
847{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
848 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
849},
850{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
851 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
852},
853{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
854 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
855},
856{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
857 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
858},
859{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
860 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
861},
862{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
863 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
864},
865{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
866 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
867},
868{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
869 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
870}
Owen Taylor3473f882001-02-23 17:55:21 +0000871};
872
/*
 * Auto-close table.
 *
 * The table is a sequence of NULL-terminated groups.  The first name of
 * a group is a start tag; the remaining names of the group are the open
 * elements which that start tag implicitly closes.  A lone NULL after
 * the last group marks the end of the table.
 */
static const char *htmlStartClose[] = {
    /* start tag    elements closed by that start tag */
    "form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                    "dl", "ul", "ol", "menu", "dir", "address", "pre",
                    "listing", "xmp", "head", NULL,
    "head",         "p", NULL,
    "title",        "p", NULL,
    "body",         "head", "style", "link", "title", "p", NULL,
    "li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                    "pre", "listing", "xmp", "head", "li", NULL,
    "hr",           "p", "head", NULL,
    "h1",           "p", "head", NULL,
    "h2",           "p", "head", NULL,
    "h3",           "p", "head", NULL,
    "h4",           "p", "head", NULL,
    "h5",           "p", "head", NULL,
    "h6",           "p", "head", NULL,
    "dir",          "p", "head", NULL,
    "address",      "p", "head", "ul", NULL,
    "pre",          "p", "head", "ul", NULL,
    "listing",      "p", "head", NULL,
    "xmp",          "p", "head", NULL,
    "blockquote",   "p", "head", NULL,
    "dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                    "xmp", "head", NULL,
    "dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                    "head", "dd", NULL,
    "dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                    "head", "dt", NULL,
    "ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                    "listing", "xmp", NULL,
    "ol",           "p", "head", "ul", NULL,
    "menu",         "p", "head", "ul", NULL,
    "p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",          "p", "head", NULL,
    "noscript",     "p", "head", NULL,
    "center",       "font", "b", "i", "p", "head", NULL,
    "a",            "a", NULL,
    "caption",      "p", NULL,
    "colgroup",     "caption", "colgroup", "col", "p", NULL,
    "col",          "caption", "col", "p", NULL,
    "table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                    "listing", "xmp", "a", NULL,
    "th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",        "caption", "col", "colgroup", NULL,
    "tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                    "tbody", "p", NULL,
    "tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                    "tfoot", "tbody", "p", NULL,
    "optgroup",     "option", NULL,
    "option",       "option", NULL,
    "fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                    "pre", "listing", "xmp", "a", NULL,
    /* end of table */
    NULL
};
932
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold character data directly; when text shows up inside one of them
 * a p element is implied (see htmlCheckParagraph()).
 *
 * TODO: extend this list by reading the HTML SGML DTD on implied
 *       paragraphs.
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL    /* terminator */
};
946
/*
 * The list of HTML attributes whose content is of type %Script;
 * (the HTML 4 intrinsic event handlers).
 *
 * The array has no terminator: users iterate over it with
 * sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]).
 *
 * NOTE: when adding entries, check htmlIsScriptAttribute() since it
 *       rejects any name which does not start with 'on' before it
 *       consults this list.
 */
static const char *htmlScriptAttributes[] = {
    /* mouse events */
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    /* keyboard events */
    "onkeypress",
    "onkeydown",
    "onkeyup",
    /* document and focus events */
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    /* form events */
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
972
/*
 * End-tag priorities, used by the HTML parser to cope with broken
 * pages.  Every element name maps to a priority; when an unexpected
 * end tag is met, it may only close open elements whose priority is
 * lower than or equal to its own (see htmlAutoCloseOnClose()).
 */

/* One (element name, priority) association. */
typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

/*
 * The table is scanned linearly by htmlGetEndPriority(); the final entry
 * has a NULL name and carries the priority used for every element which
 * is not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001000
/*
 * Index over htmlStartClose: each used slot points at the first name of
 * one group of that table, unused slots are NULL.  The index is filled
 * by htmlInitAutoClose(), which sets the flag below once done.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1003
1004/************************************************************************
1005 * *
1006 * functions to handle HTML specific data *
1007 * *
1008 ************************************************************************/
1009
1010/**
1011 * htmlInitAutoClose:
1012 *
1013 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1014 * This is not reentrant. Call xmlInitParser() once before processing in
1015 * case of use in multithreaded programs.
1016 */
1017void
1018htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001019 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001020
1021 if (htmlStartCloseIndexinitialized) return;
1022
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001023 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1024 indx = 0;
1025 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1026 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001027 while (htmlStartClose[i] != NULL) i++;
1028 i++;
1029 }
1030 htmlStartCloseIndexinitialized = 1;
1031}
1032
1033/**
1034 * htmlTagLookup:
1035 * @tag: The tag name in lowercase
1036 *
1037 * Lookup the HTML tag in the ElementTable
1038 *
1039 * Returns the related htmlElemDescPtr or NULL if not found.
1040 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001041const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001042htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001043 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001044
1045 for (i = 0; i < (sizeof(html40ElementTable) /
1046 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001047 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001048 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001049 }
1050 return(NULL);
1051}
1052
1053/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001054 * htmlGetEndPriority:
1055 * @name: The name of the element to look up the priority for.
1056 *
1057 * Return value: The "endtag" priority.
1058 **/
1059static int
1060htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001061 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001062
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001063 while ((htmlEndPriority[i].name != NULL) &&
1064 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1065 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001066
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001067 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001068}
1069
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001070
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001071/**
Owen Taylor3473f882001-02-23 17:55:21 +00001072 * htmlCheckAutoClose:
1073 * @newtag: The new tag name
1074 * @oldtag: The old tag name
1075 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001076 * Checks whether the new tag is one of the registered valid tags for
1077 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001078 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1079 *
1080 * Returns 0 if no, 1 if yes.
1081 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001082static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001083htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1084{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001085 int i, indx;
1086 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001087
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001088 if (htmlStartCloseIndexinitialized == 0)
1089 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001090
1091 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001092 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001093 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001094 if (closed == NULL)
1095 return (0);
1096 if (xmlStrEqual(BAD_CAST * closed, newtag))
1097 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001098 }
1099
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001100 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001101 i++;
1102 while (htmlStartClose[i] != NULL) {
1103 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001104 return (1);
1105 }
1106 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001107 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001108 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001109}
1110
1111/**
1112 * htmlAutoCloseOnClose:
1113 * @ctxt: an HTML parser context
1114 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001115 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001116 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001117 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001118 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001119static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001120htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1121{
1122 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001123 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001124
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001125 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001126
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001127 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001128
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001129 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1130 break;
1131 /*
1132 * A missplaced endtag can only close elements with lower
1133 * or equal priority, so if we find an element with higher
1134 * priority before we find an element with
1135 * matching name, we just ignore this endtag
1136 */
1137 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1138 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001139 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001140 if (i < 0)
1141 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001142
1143 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001144 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001145 if ((info != NULL) && (info->endTag == 3)) {
1146 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1147 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001148 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001149 }
1150 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1151 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001152 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001153 }
1154}
1155
1156/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001157 * htmlAutoCloseOnEnd:
1158 * @ctxt: an HTML parser context
1159 *
1160 * Close all remaining tags at the end of the stream
1161 */
1162static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001163htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1164{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001165 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001166
William M. Brack899e64a2003-09-26 18:03:42 +00001167 if (ctxt->nameNr == 0)
1168 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001169 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001170 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1171 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001172 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001173 }
1174}
1175
1176/**
Owen Taylor3473f882001-02-23 17:55:21 +00001177 * htmlAutoClose:
1178 * @ctxt: an HTML parser context
1179 * @newtag: The new tag name or NULL
1180 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001181 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001182 * The list is kept in htmlStartClose array. This function is
1183 * called when a new tag has been detected and generates the
1184 * appropriates closes if possible/needed.
1185 * If newtag is NULL this mean we are at the end of the resource
1186 * and we should check
1187 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001188static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001189htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1190{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001191 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001192 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001193 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1194 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001195 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001196 }
1197 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001198 htmlAutoCloseOnEnd(ctxt);
1199 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001200 }
1201 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001202 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1203 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1204 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001205 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1206 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001207 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001208 }
Owen Taylor3473f882001-02-23 17:55:21 +00001209}
1210
1211/**
1212 * htmlAutoCloseTag:
1213 * @doc: the HTML document
1214 * @name: The tag name
1215 * @elem: the HTML element
1216 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001217 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001218 * The list is kept in htmlStartClose array. This function checks
1219 * if the element or one of it's children would autoclose the
1220 * given tag.
1221 *
1222 * Returns 1 if autoclose, 0 otherwise
1223 */
1224int
1225htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1226 htmlNodePtr child;
1227
1228 if (elem == NULL) return(1);
1229 if (xmlStrEqual(name, elem->name)) return(0);
1230 if (htmlCheckAutoClose(elem->name, name)) return(1);
1231 child = elem->children;
1232 while (child != NULL) {
1233 if (htmlAutoCloseTag(doc, name, child)) return(1);
1234 child = child->next;
1235 }
1236 return(0);
1237}
1238
1239/**
1240 * htmlIsAutoClosed:
1241 * @doc: the HTML document
1242 * @elem: the HTML element
1243 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001244 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001245 * The list is kept in htmlStartClose array. This function checks
1246 * if a tag is autoclosed by one of it's child
1247 *
1248 * Returns 1 if autoclosed, 0 otherwise
1249 */
1250int
1251htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1252 htmlNodePtr child;
1253
1254 if (elem == NULL) return(1);
1255 child = elem->children;
1256 while (child != NULL) {
1257 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1258 child = child->next;
1259 }
1260 return(0);
1261}
1262
1263/**
1264 * htmlCheckImplied:
1265 * @ctxt: an HTML parser context
1266 * @newtag: The new tag name
1267 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001268 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001269 * called when a new tag has been detected and generates the
1270 * appropriates implicit tags if missing
1271 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001272static void
Owen Taylor3473f882001-02-23 17:55:21 +00001273htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1274 if (!htmlOmittedDefaultValue)
1275 return;
1276 if (xmlStrEqual(newtag, BAD_CAST"html"))
1277 return;
1278 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001279 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001280 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1281 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1282 }
1283 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1284 return;
1285 if ((ctxt->nameNr <= 1) &&
1286 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1287 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1288 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1289 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1290 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1291 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1292 /*
1293 * dropped OBJECT ... i you put it first BODY will be
1294 * assumed !
1295 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001296 htmlnamePush(ctxt, BAD_CAST"head");
Owen Taylor3473f882001-02-23 17:55:21 +00001297 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1298 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1299 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1300 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1301 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1302 int i;
1303 for (i = 0;i < ctxt->nameNr;i++) {
1304 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1305 return;
1306 }
1307 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1308 return;
1309 }
1310 }
1311
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001312 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001313 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1314 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1315 }
1316}
1317
1318/**
1319 * htmlCheckParagraph
1320 * @ctxt: an HTML parser context
1321 *
1322 * Check whether a p element need to be implied before inserting
1323 * characters in the current element.
1324 *
1325 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1326 * in case of error.
1327 */
1328
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001329static int
Owen Taylor3473f882001-02-23 17:55:21 +00001330htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1331 const xmlChar *tag;
1332 int i;
1333
1334 if (ctxt == NULL)
1335 return(-1);
1336 tag = ctxt->name;
1337 if (tag == NULL) {
1338 htmlAutoClose(ctxt, BAD_CAST"p");
1339 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001340 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001341 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1342 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1343 return(1);
1344 }
1345 if (!htmlOmittedDefaultValue)
1346 return(0);
1347 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1348 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001349 htmlAutoClose(ctxt, BAD_CAST"p");
1350 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001351 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001352 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1353 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1354 return(1);
1355 }
1356 }
1357 return(0);
1358}
1359
1360/**
1361 * htmlIsScriptAttribute:
1362 * @name: an attribute name
1363 *
1364 * Check if an attribute is of content type Script
1365 *
1366 * Returns 1 is the attribute is a script 0 otherwise
1367 */
1368int
1369htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001370 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001371
1372 if (name == NULL)
1373 return(0);
1374 /*
1375 * all script attributes start with 'on'
1376 */
1377 if ((name[0] != 'o') || (name[1] != 'n'))
1378 return(0);
1379 for (i = 0;
1380 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1381 i++) {
1382 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1383 return(1);
1384 }
1385 return(0);
1386}
1387
1388/************************************************************************
1389 * *
1390 * The list of HTML predefined entities *
1391 * *
1392 ************************************************************************/
1393
1394
Daniel Veillard22090732001-07-16 00:06:07 +00001395static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001396/*
1397 * the 4 absolute ones, plus apostrophe.
1398 */
1399{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1400{ 38, "amp", "ampersand, U+0026 ISOnum" },
1401{ 39, "apos", "single quote" },
1402{ 60, "lt", "less-than sign, U+003C ISOnum" },
1403{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1404
1405/*
1406 * A bunch still in the 128-255 range
1407 * Replacing them depend really on the charset used.
1408 */
1409{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1410{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1411{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1412{ 163, "pound","pound sign, U+00A3 ISOnum" },
1413{ 164, "curren","currency sign, U+00A4 ISOnum" },
1414{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1415{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1416{ 167, "sect", "section sign, U+00A7 ISOnum" },
1417{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1418{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1419{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1420{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1421{ 172, "not", "not sign, U+00AC ISOnum" },
1422{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1423{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1424{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1425{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1426{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1427{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1428{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1429{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1430{ 181, "micro","micro sign, U+00B5 ISOnum" },
1431{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1432{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1433{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1434{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1435{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1436{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1437{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1438{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1439{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1440{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1441{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1442{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1443{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1444{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1445{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1446{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1447{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1448{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1449{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1450{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1451{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1452{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1453{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1454{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1455{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1456{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1457{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1458{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1459{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1460{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1461{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1462{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1463{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1464{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1465{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1466{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1467{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1468{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1469{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1470{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1471{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1472{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1473{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1474{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1475{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1476{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1477{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1478{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1479{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1480{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1481{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1482{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1483{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1484{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1485{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1486{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1487{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1488{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1489{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1490{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1491{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1492{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1493{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1494{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1495{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1496{ 247, "divide","division sign, U+00F7 ISOnum" },
1497{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1498{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1499{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1500{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1501{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1502{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1503{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1504{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1505
1506{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1507{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1508{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1509{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1510{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1511
/*
 * Anything below should really be kept as entity references
 */
1515{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1516
1517{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1518{ 732, "tilde","small tilde, U+02DC ISOdia" },
1519
1520{ 913, "Alpha","greek capital letter alpha, U+0391" },
1521{ 914, "Beta", "greek capital letter beta, U+0392" },
1522{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1523{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1524{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1525{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1526{ 919, "Eta", "greek capital letter eta, U+0397" },
1527{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1528{ 921, "Iota", "greek capital letter iota, U+0399" },
1529{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001530{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001531{ 924, "Mu", "greek capital letter mu, U+039C" },
1532{ 925, "Nu", "greek capital letter nu, U+039D" },
1533{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1534{ 927, "Omicron","greek capital letter omicron, U+039F" },
1535{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1536{ 929, "Rho", "greek capital letter rho, U+03A1" },
1537{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1538{ 932, "Tau", "greek capital letter tau, U+03A4" },
1539{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1540{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1541{ 935, "Chi", "greek capital letter chi, U+03A7" },
1542{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1543{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1544
1545{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1546{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1547{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1548{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1549{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1550{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1551{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1552{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1553{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1554{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1555{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1556{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1557{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1558{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1559{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1560{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1561{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1562{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1563{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1564{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1565{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1566{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1567{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1568{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1569{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1570{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1571{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1572{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1573
1574{ 8194, "ensp", "en space, U+2002 ISOpub" },
1575{ 8195, "emsp", "em space, U+2003 ISOpub" },
1576{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1577{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1578{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1579{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1580{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1581{ 8211, "ndash","en dash, U+2013 ISOpub" },
1582{ 8212, "mdash","em dash, U+2014 ISOpub" },
1583{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1584{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1585{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1586{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1587{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1588{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1589{ 8224, "dagger","dagger, U+2020 ISOpub" },
1590{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1591
1592{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1593{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1594
1595{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1596
1597{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1598{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1599
1600{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1601{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1602
1603{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1604{ 8260, "frasl","fraction slash, U+2044 NEW" },
1605
1606{ 8364, "euro", "euro sign, U+20AC NEW" },
1607
1608{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1609{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1610{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1611{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1612{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1613{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1614{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1615{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1616{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1617{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1618{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1619{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1620{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1621{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1622{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1623{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1624
1625{ 8704, "forall","for all, U+2200 ISOtech" },
1626{ 8706, "part", "partial differential, U+2202 ISOtech" },
1627{ 8707, "exist","there exists, U+2203 ISOtech" },
1628{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1629{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1630{ 8712, "isin", "element of, U+2208 ISOtech" },
1631{ 8713, "notin","not an element of, U+2209 ISOtech" },
1632{ 8715, "ni", "contains as member, U+220B ISOtech" },
1633{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001634{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001635{ 8722, "minus","minus sign, U+2212 ISOtech" },
1636{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1637{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1638{ 8733, "prop", "proportional to, U+221D ISOtech" },
1639{ 8734, "infin","infinity, U+221E ISOtech" },
1640{ 8736, "ang", "angle, U+2220 ISOamso" },
1641{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1642{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1643{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1644{ 8746, "cup", "union = cup, U+222A ISOtech" },
1645{ 8747, "int", "integral, U+222B ISOtech" },
1646{ 8756, "there4","therefore, U+2234 ISOtech" },
1647{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1648{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1649{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1650{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1651{ 8801, "equiv","identical to, U+2261 ISOtech" },
1652{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1653{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1654{ 8834, "sub", "subset of, U+2282 ISOtech" },
1655{ 8835, "sup", "superset of, U+2283 ISOtech" },
1656{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1657{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1658{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1659{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1660{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1661{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1662{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1663{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1664{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1665{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1666{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1667{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1668{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1669{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1670
1671{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1672{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1673{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1674{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1675
1676};
1677
1678/************************************************************************
1679 * *
1680 * Commodity functions to handle entities *
1681 * *
1682 ************************************************************************/
1683
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size. A variable
 * named ctxt must be in scope at the expansion site since it is used for
 * error reporting.
 *
 * The result of xmlRealloc() is stored in a temporary first: assigning it
 * straight to @buffer would overwrite the only pointer to the old block and
 * leak it when the reallocation fails. On failure the error is reported,
 * the old block is released and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
                                  buffer##_size * sizeof(xmlChar));	\
    if (growBuffer_tmp == NULL) {					\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1695
1696/**
1697 * htmlEntityLookup:
1698 * @name: the entity name
1699 *
1700 * Lookup the given entity in EntitiesTable
1701 *
1702 * TODO: the linear scan is really ugly, an hash table is really needed.
1703 *
1704 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1705 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001706const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001707htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001708 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001709
1710 for (i = 0;i < (sizeof(html40EntitiesTable)/
1711 sizeof(html40EntitiesTable[0]));i++) {
1712 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001713 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001714 }
1715 }
1716 return(NULL);
1717}
1718
1719/**
1720 * htmlEntityValueLookup:
1721 * @value: the entity's unicode value
1722 *
1723 * Lookup the given entity in EntitiesTable
1724 *
1725 * TODO: the linear scan is really ugly, an hash table is really needed.
1726 *
1727 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1728 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001729const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001730htmlEntityValueLookup(unsigned int value) {
1731 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001732
1733 for (i = 0;i < (sizeof(html40EntitiesTable)/
1734 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001735 if (html40EntitiesTable[i].value >= value) {
1736 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001737 break;
William M. Brack78637da2003-07-31 14:47:38 +00001738 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001739 }
Owen Taylor3473f882001-02-23 17:55:21 +00001740 }
1741 return(NULL);
1742}
1743
1744/**
1745 * UTF8ToHtml:
1746 * @out: a pointer to an array of bytes to store the result
1747 * @outlen: the length of @out
1748 * @in: a pointer to an array of UTF-8 chars
1749 * @inlen: the length of @in
1750 *
1751 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1752 * plus HTML entities block of chars out.
1753 *
1754 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1755 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001756 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001757 * The value of @outlen after return is the number of octets consumed.
1758 */
1759int
1760UTF8ToHtml(unsigned char* out, int *outlen,
1761 const unsigned char* in, int *inlen) {
1762 const unsigned char* processed = in;
1763 const unsigned char* outend;
1764 const unsigned char* outstart = out;
1765 const unsigned char* instart = in;
1766 const unsigned char* inend;
1767 unsigned int c, d;
1768 int trailing;
1769
1770 if (in == NULL) {
1771 /*
1772 * initialization nothing to do
1773 */
1774 *outlen = 0;
1775 *inlen = 0;
1776 return(0);
1777 }
1778 inend = in + (*inlen);
1779 outend = out + (*outlen);
1780 while (in < inend) {
1781 d = *in++;
1782 if (d < 0x80) { c= d; trailing= 0; }
1783 else if (d < 0xC0) {
1784 /* trailing byte in leading position */
1785 *outlen = out - outstart;
1786 *inlen = processed - instart;
1787 return(-2);
1788 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1789 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1790 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1791 else {
1792 /* no chance for this in Ascii */
1793 *outlen = out - outstart;
1794 *inlen = processed - instart;
1795 return(-2);
1796 }
1797
1798 if (inend - in < trailing) {
1799 break;
1800 }
1801
1802 for ( ; trailing; trailing--) {
1803 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1804 break;
1805 c <<= 6;
1806 c |= d & 0x3F;
1807 }
1808
1809 /* assertion: c is a single UTF-4 value */
1810 if (c < 0x80) {
1811 if (out + 1 >= outend)
1812 break;
1813 *out++ = c;
1814 } else {
1815 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00001816 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001817
1818 /*
1819 * Try to lookup a predefined HTML entity for it
1820 */
1821
1822 ent = htmlEntityValueLookup(c);
1823 if (ent == NULL) {
1824 /* no chance for this in Ascii */
1825 *outlen = out - outstart;
1826 *inlen = processed - instart;
1827 return(-2);
1828 }
1829 len = strlen(ent->name);
1830 if (out + 2 + len >= outend)
1831 break;
1832 *out++ = '&';
1833 memcpy(out, ent->name, len);
1834 out += len;
1835 *out++ = ';';
1836 }
1837 processed = in;
1838 }
1839 *outlen = out - outstart;
1840 *inlen = processed - instart;
1841 return(0);
1842}
1843
1844/**
1845 * htmlEncodeEntities:
1846 * @out: a pointer to an array of bytes to store the result
1847 * @outlen: the length of @out
1848 * @in: a pointer to an array of UTF-8 chars
1849 * @inlen: the length of @in
1850 * @quoteChar: the quote character to escape (' or ") or zero.
1851 *
1852 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1853 * plus HTML entities block of chars out.
1854 *
1855 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1856 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001857 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001858 * The value of @outlen after return is the number of octets consumed.
1859 */
1860int
1861htmlEncodeEntities(unsigned char* out, int *outlen,
1862 const unsigned char* in, int *inlen, int quoteChar) {
1863 const unsigned char* processed = in;
1864 const unsigned char* outend = out + (*outlen);
1865 const unsigned char* outstart = out;
1866 const unsigned char* instart = in;
1867 const unsigned char* inend = in + (*inlen);
1868 unsigned int c, d;
1869 int trailing;
1870
1871 while (in < inend) {
1872 d = *in++;
1873 if (d < 0x80) { c= d; trailing= 0; }
1874 else if (d < 0xC0) {
1875 /* trailing byte in leading position */
1876 *outlen = out - outstart;
1877 *inlen = processed - instart;
1878 return(-2);
1879 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1880 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1881 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1882 else {
1883 /* no chance for this in Ascii */
1884 *outlen = out - outstart;
1885 *inlen = processed - instart;
1886 return(-2);
1887 }
1888
1889 if (inend - in < trailing)
1890 break;
1891
1892 while (trailing--) {
1893 if (((d= *in++) & 0xC0) != 0x80) {
1894 *outlen = out - outstart;
1895 *inlen = processed - instart;
1896 return(-2);
1897 }
1898 c <<= 6;
1899 c |= d & 0x3F;
1900 }
1901
1902 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001903 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1904 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00001905 if (out >= outend)
1906 break;
1907 *out++ = c;
1908 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00001909 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00001910 const char *cp;
1911 char nbuf[16];
1912 int len;
1913
1914 /*
1915 * Try to lookup a predefined HTML entity for it
1916 */
1917 ent = htmlEntityValueLookup(c);
1918 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00001919 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00001920 cp = nbuf;
1921 }
1922 else
1923 cp = ent->name;
1924 len = strlen(cp);
1925 if (out + 2 + len > outend)
1926 break;
1927 *out++ = '&';
1928 memcpy(out, cp, len);
1929 out += len;
1930 *out++ = ';';
1931 }
1932 processed = in;
1933 }
1934 *outlen = out - outstart;
1935 *inlen = processed - instart;
1936 return(0);
1937}
1938
Owen Taylor3473f882001-02-23 17:55:21 +00001939/************************************************************************
1940 * *
1941 * Commodity functions to handle streams *
1942 * *
1943 ************************************************************************/
1944
1945/**
Owen Taylor3473f882001-02-23 17:55:21 +00001946 * htmlNewInputStream:
1947 * @ctxt: an HTML parser context
1948 *
1949 * Create a new input stream structure
1950 * Returns the new input stream or NULL
1951 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001952static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001953htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1954 htmlParserInputPtr input;
1955
1956 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1957 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00001958 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00001959 return(NULL);
1960 }
1961 memset(input, 0, sizeof(htmlParserInput));
1962 input->filename = NULL;
1963 input->directory = NULL;
1964 input->base = NULL;
1965 input->cur = NULL;
1966 input->buf = NULL;
1967 input->line = 1;
1968 input->col = 1;
1969 input->buf = NULL;
1970 input->free = NULL;
1971 input->version = NULL;
1972 input->consumed = 0;
1973 input->length = 0;
1974 return(input);
1975}
1976
1977
1978/************************************************************************
1979 * *
1980 * Commodity functions, cleanup needed ? *
1981 * *
1982 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00001997
1998/**
1999 * areBlanks:
2000 * @ctxt: an HTML parser context
2001 * @str: a xmlChar *
2002 * @len: the size of @str
2003 *
2004 * Is this a sequence of blank chars that one can ignore ?
2005 *
2006 * Returns 1 if ignorable 0 otherwise.
2007 */
2008
2009static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002010 unsigned int i;
2011 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002012 xmlNodePtr lastChild;
2013
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002014 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002015 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002016
2017 if (CUR == 0) return(1);
2018 if (CUR != '<') return(0);
2019 if (ctxt->name == NULL)
2020 return(1);
2021 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2022 return(1);
2023 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2024 return(1);
2025 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
2026 return(1);
2027 if (ctxt->node == NULL) return(0);
2028 lastChild = xmlGetLastChild(ctxt->node);
2029 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002030 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2031 (ctxt->node->content != NULL)) return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002032 /* keep ws in constructs like ...<b> </b>...
2033 for all tags "b" allowing PCDATA */
2034 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2035 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2036 return(0);
2037 }
2038 }
Owen Taylor3473f882001-02-23 17:55:21 +00002039 } else if (xmlNodeIsText(lastChild)) {
2040 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002041 } else {
2042 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2043 for all tags "p" allowing PCDATA */
2044 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2045 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2046 return(0);
2047 }
2048 }
Owen Taylor3473f882001-02-23 17:55:21 +00002049 }
2050 return(1);
2051}
2052
2053/**
Owen Taylor3473f882001-02-23 17:55:21 +00002054 * htmlNewDocNoDtD:
2055 * @URI: URI for the dtd, or NULL
2056 * @ExternalID: the external ID of the DTD, or NULL
2057 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002058 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2059 * are NULL
2060 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002061 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002062 */
2063htmlDocPtr
2064htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2065 xmlDocPtr cur;
2066
2067 /*
2068 * Allocate a new document and fill the fields.
2069 */
2070 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2071 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002072 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002073 return(NULL);
2074 }
2075 memset(cur, 0, sizeof(xmlDoc));
2076
2077 cur->type = XML_HTML_DOCUMENT_NODE;
2078 cur->version = NULL;
2079 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002080 cur->doc = cur;
2081 cur->name = NULL;
2082 cur->children = NULL;
2083 cur->extSubset = NULL;
2084 cur->oldNs = NULL;
2085 cur->encoding = NULL;
2086 cur->standalone = 1;
2087 cur->compression = 0;
2088 cur->ids = NULL;
2089 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002090 cur->_private = NULL;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002091 if ((ExternalID != NULL) ||
2092 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002093 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002094 return(cur);
2095}
2096
2097/**
2098 * htmlNewDoc:
2099 * @URI: URI for the dtd, or NULL
2100 * @ExternalID: the external ID of the DTD, or NULL
2101 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002102 * Creates a new HTML document
2103 *
Owen Taylor3473f882001-02-23 17:55:21 +00002104 * Returns a new document
2105 */
2106htmlDocPtr
2107htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2108 if ((URI == NULL) && (ExternalID == NULL))
2109 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002110 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2111 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002112
2113 return(htmlNewDocNoDtD(URI, ExternalID));
2114}
2115
2116
2117/************************************************************************
2118 * *
2119 * The parser itself *
2120 * Relates to http://www.w3.org/TR/html40 *
2121 * *
2122 ************************************************************************/
2123
2124/************************************************************************
2125 * *
2126 * The parser itself *
2127 * *
2128 ************************************************************************/
2129
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002130static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002131
Owen Taylor3473f882001-02-23 17:55:21 +00002132/**
2133 * htmlParseHTMLName:
2134 * @ctxt: an HTML parser context
2135 *
2136 * parse an HTML tag or attribute name, note that we convert it to lowercase
2137 * since HTML names are not case-sensitive.
2138 *
2139 * Returns the Tag Name parsed or NULL
2140 */
2141
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002142static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002143htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002144 int i = 0;
2145 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2146
William M. Brack76e95df2003-10-18 16:20:14 +00002147 if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
Owen Taylor3473f882001-02-23 17:55:21 +00002148 (CUR != ':')) return(NULL);
2149
2150 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brack76e95df2003-10-18 16:20:14 +00002151 ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
Owen Taylor3473f882001-02-23 17:55:21 +00002152 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2153 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2154 else loc[i] = CUR;
2155 i++;
2156
2157 NEXT;
2158 }
2159
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002160 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002161}
2162
2163/**
2164 * htmlParseName:
2165 * @ctxt: an HTML parser context
2166 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002167 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002168 *
2169 * Returns the Name parsed or NULL
2170 */
2171
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002172static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002173htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002174 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002175 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002176 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002177
2178 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002179
2180 /*
2181 * Accelerator for simple ASCII names
2182 */
2183 in = ctxt->input->cur;
2184 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2185 ((*in >= 0x41) && (*in <= 0x5A)) ||
2186 (*in == '_') || (*in == ':')) {
2187 in++;
2188 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2189 ((*in >= 0x41) && (*in <= 0x5A)) ||
2190 ((*in >= 0x30) && (*in <= 0x39)) ||
2191 (*in == '_') || (*in == '-') ||
2192 (*in == ':') || (*in == '.'))
2193 in++;
2194 if ((*in > 0) && (*in < 0x80)) {
2195 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002196 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002197 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002198 ctxt->nbChars += count;
2199 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002200 return(ret);
2201 }
2202 }
2203 return(htmlParseNameComplex(ctxt));
2204}
2205
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002206static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002207htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002208 int len = 0, l;
2209 int c;
2210 int count = 0;
2211
2212 /*
2213 * Handler for more complex cases
2214 */
2215 GROW;
2216 c = CUR_CHAR(l);
2217 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2218 (!IS_LETTER(c) && (c != '_') &&
2219 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002220 return(NULL);
2221 }
2222
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002223 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2224 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2225 (c == '.') || (c == '-') ||
2226 (c == '_') || (c == ':') ||
2227 (IS_COMBINING(c)) ||
2228 (IS_EXTENDER(c)))) {
2229 if (count++ > 100) {
2230 count = 0;
2231 GROW;
2232 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002233 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002234 NEXTL(l);
2235 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002236 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002237 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002238}
2239
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002240
Owen Taylor3473f882001-02-23 17:55:21 +00002241/**
2242 * htmlParseHTMLAttribute:
2243 * @ctxt: an HTML parser context
2244 * @stop: a char stop value
2245 *
2246 * parse an HTML attribute value till the stop (quote), if
2247 * stop is 0 then it stops at the first space
2248 *
2249 * Returns the attribute parsed or NULL
2250 */
2251
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002252static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002253htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2254 xmlChar *buffer = NULL;
2255 int buffer_size = 0;
2256 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002257 const xmlChar *name = NULL;
2258 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002259 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002260
2261 /*
2262 * allocate a translation buffer.
2263 */
2264 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002265 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002266 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002267 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002268 return(NULL);
2269 }
2270 out = buffer;
2271
2272 /*
2273 * Ok loop until we reach one of the ending chars
2274 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002275 while ((CUR != 0) && (CUR != stop)) {
2276 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002277 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002278 if (CUR == '&') {
2279 if (NXT(1) == '#') {
2280 unsigned int c;
2281 int bits;
2282
2283 c = htmlParseCharRef(ctxt);
2284 if (c < 0x80)
2285 { *out++ = c; bits= -6; }
2286 else if (c < 0x800)
2287 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2288 else if (c < 0x10000)
2289 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2290 else
2291 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2292
2293 for ( ; bits >= 0; bits-= 6) {
2294 *out++ = ((c >> bits) & 0x3F) | 0x80;
2295 }
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002296
2297 if (out - buffer > buffer_size - 100) {
2298 int indx = out - buffer;
2299
2300 growBuffer(buffer);
2301 out = &buffer[indx];
2302 }
Owen Taylor3473f882001-02-23 17:55:21 +00002303 } else {
2304 ent = htmlParseEntityRef(ctxt, &name);
2305 if (name == NULL) {
2306 *out++ = '&';
2307 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002308 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002309
2310 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002311 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002312 }
2313 } else if (ent == NULL) {
2314 *out++ = '&';
2315 cur = name;
2316 while (*cur != 0) {
2317 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002318 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002319
2320 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002321 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002322 }
2323 *out++ = *cur++;
2324 }
Owen Taylor3473f882001-02-23 17:55:21 +00002325 } else {
2326 unsigned int c;
2327 int bits;
2328
2329 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002330 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002331
2332 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002333 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002334 }
2335 c = (xmlChar)ent->value;
2336 if (c < 0x80)
2337 { *out++ = c; bits= -6; }
2338 else if (c < 0x800)
2339 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2340 else if (c < 0x10000)
2341 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2342 else
2343 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2344
2345 for ( ; bits >= 0; bits-= 6) {
2346 *out++ = ((c >> bits) & 0x3F) | 0x80;
2347 }
Owen Taylor3473f882001-02-23 17:55:21 +00002348 }
2349 }
2350 } else {
2351 unsigned int c;
2352 int bits, l;
2353
2354 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002355 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002356
2357 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002358 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002359 }
2360 c = CUR_CHAR(l);
2361 if (c < 0x80)
2362 { *out++ = c; bits= -6; }
2363 else if (c < 0x800)
2364 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2365 else if (c < 0x10000)
2366 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2367 else
2368 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2369
2370 for ( ; bits >= 0; bits-= 6) {
2371 *out++ = ((c >> bits) & 0x3F) | 0x80;
2372 }
2373 NEXT;
2374 }
2375 }
2376 *out++ = 0;
2377 return(buffer);
2378}
2379
2380/**
Owen Taylor3473f882001-02-23 17:55:21 +00002381 * htmlParseEntityRef:
2382 * @ctxt: an HTML parser context
2383 * @str: location to store the entity name
2384 *
2385 * parse an HTML ENTITY references
2386 *
2387 * [68] EntityRef ::= '&' Name ';'
2388 *
2389 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2390 * if non-NULL *str will have to be freed by the caller.
2391 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002392const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002393htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2394 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002395 const htmlEntityDesc * ent = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002396 *str = NULL;
2397
2398 if (CUR == '&') {
2399 NEXT;
2400 name = htmlParseName(ctxt);
2401 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002402 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2403 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002404 } else {
2405 GROW;
2406 if (CUR == ';') {
2407 *str = name;
2408
2409 /*
2410 * Lookup the entity in the table.
2411 */
2412 ent = htmlEntityLookup(name);
2413 if (ent != NULL) /* OK that's ugly !!! */
2414 NEXT;
2415 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002416 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2417 "htmlParseEntityRef: expecting ';'\n",
2418 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002419 *str = name;
2420 }
2421 }
2422 }
2423 return(ent);
2424}
2425
2426/**
2427 * htmlParseAttValue:
2428 * @ctxt: an HTML parser context
2429 *
2430 * parse a value for an attribute
2431 * Note: the parser won't do substitution of entities here, this
2432 * will be handled later in xmlStringGetNodeList, unless it was
2433 * asked for ctxt->replaceEntities != 0
2434 *
2435 * Returns the AttValue parsed or NULL.
2436 */
2437
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002438static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002439htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2440 xmlChar *ret = NULL;
2441
2442 if (CUR == '"') {
2443 NEXT;
2444 ret = htmlParseHTMLAttribute(ctxt, '"');
2445 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002446 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2447 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002448 } else
2449 NEXT;
2450 } else if (CUR == '\'') {
2451 NEXT;
2452 ret = htmlParseHTMLAttribute(ctxt, '\'');
2453 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002454 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2455 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002456 } else
2457 NEXT;
2458 } else {
2459 /*
2460 * That's an HTMLism, the attribute value may not be quoted
2461 */
2462 ret = htmlParseHTMLAttribute(ctxt, 0);
2463 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002464 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2465 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002466 }
2467 }
2468 return(ret);
2469}
2470
2471/**
2472 * htmlParseSystemLiteral:
2473 * @ctxt: an HTML parser context
2474 *
2475 * parse an HTML Literal
2476 *
2477 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2478 *
2479 * Returns the SystemLiteral parsed or NULL
2480 */
2481
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002482static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002483htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2484 const xmlChar *q;
2485 xmlChar *ret = NULL;
2486
2487 if (CUR == '"') {
2488 NEXT;
2489 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002490 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002491 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002492 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002493 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2494 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002495 } else {
2496 ret = xmlStrndup(q, CUR_PTR - q);
2497 NEXT;
2498 }
2499 } else if (CUR == '\'') {
2500 NEXT;
2501 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002502 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002503 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002504 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002505 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2506 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002507 } else {
2508 ret = xmlStrndup(q, CUR_PTR - q);
2509 NEXT;
2510 }
2511 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002512 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2513 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002514 }
2515
2516 return(ret);
2517}
2518
2519/**
2520 * htmlParsePubidLiteral:
2521 * @ctxt: an HTML parser context
2522 *
2523 * parse an HTML public literal
2524 *
2525 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2526 *
2527 * Returns the PubidLiteral parsed or NULL.
2528 */
2529
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002530static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002531htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2532 const xmlChar *q;
2533 xmlChar *ret = NULL;
2534 /*
2535 * Name ::= (Letter | '_') (NameChar)*
2536 */
2537 if (CUR == '"') {
2538 NEXT;
2539 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002540 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002541 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002542 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2543 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002544 } else {
2545 ret = xmlStrndup(q, CUR_PTR - q);
2546 NEXT;
2547 }
2548 } else if (CUR == '\'') {
2549 NEXT;
2550 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002551 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002552 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002553 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002554 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2555 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002556 } else {
2557 ret = xmlStrndup(q, CUR_PTR - q);
2558 NEXT;
2559 }
2560 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002561 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2562 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002563 }
2564
2565 return(ret);
2566}
2567
2568/**
2569 * htmlParseScript:
2570 * @ctxt: an HTML parser context
2571 *
2572 * parse the content of an HTML SCRIPT or STYLE element
2573 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2574 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2575 * http://www.w3.org/TR/html4/types.html#type-script
2576 * http://www.w3.org/TR/html4/types.html#h-6.15
2577 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2578 *
2579 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2580 * element and the value of intrinsic event attributes. User agents must
2581 * not evaluate script data as HTML markup but instead must pass it on as
2582 * data to a script engine.
2583 * NOTES:
2584 * - The content is passed like CDATA
2585 * - the attributes for style and scripting "onXXX" are also described
2586 * as CDATA but SGML allows entities references in attributes so their
2587 * processing is identical as other attributes
2588 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002589static void
Owen Taylor3473f882001-02-23 17:55:21 +00002590htmlParseScript(htmlParserCtxtPtr ctxt) {
2591 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2592 int nbchar = 0;
2593 xmlChar cur;
2594
2595 SHRINK;
2596 cur = CUR;
William M. Brack76e95df2003-10-18 16:20:14 +00002597 while (IS_CHAR_CH(cur)) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00002598 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2599 (NXT(3) == '-')) {
2600 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2601 if (ctxt->sax->cdataBlock!= NULL) {
2602 /*
2603 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2604 */
2605 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002606 } else if (ctxt->sax->characters != NULL) {
2607 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Daniel Veillardc1f78342001-11-10 11:43:05 +00002608 }
2609 }
2610 nbchar = 0;
2611 htmlParseComment(ctxt);
2612 cur = CUR;
2613 continue;
2614 } else if ((cur == '<') && (NXT(1) == '/')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002615 /*
2616 * One should break here, the specification is clear:
2617 * Authors should therefore escape "</" within the content.
2618 * Escape mechanisms are specific to each scripting or
2619 * style sheet language.
2620 */
2621 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2622 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2623 break; /* while */
2624 }
2625 buf[nbchar++] = cur;
2626 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2627 if (ctxt->sax->cdataBlock!= NULL) {
2628 /*
2629 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2630 */
2631 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002632 } else if (ctxt->sax->characters != NULL) {
2633 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002634 }
2635 nbchar = 0;
2636 }
2637 NEXT;
2638 cur = CUR;
2639 }
William M. Brack76e95df2003-10-18 16:20:14 +00002640 if (!(IS_CHAR_CH(cur))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002641 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2642 "Invalid char in CDATA 0x%X\n", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002643 NEXT;
2644 }
2645
2646 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2647 if (ctxt->sax->cdataBlock!= NULL) {
2648 /*
2649 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2650 */
2651 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002652 } else if (ctxt->sax->characters != NULL) {
2653 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002654 }
2655 }
2656}
2657
2658
2659/**
2660 * htmlParseCharData:
2661 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002662 *
2663 * parse a CharData section.
2664 * if we are within a CDATA section ']]>' marks an end of section.
2665 *
2666 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2667 */
2668
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002669static void
2670htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002671 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2672 int nbchar = 0;
2673 int cur, l;
2674
2675 SHRINK;
2676 cur = CUR_CHAR(l);
2677 while (((cur != '<') || (ctxt->token == '<')) &&
2678 ((cur != '&') || (ctxt->token == '&')) &&
2679 (IS_CHAR(cur))) {
2680 COPY_BUF(l,buf,nbchar,cur);
2681 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2682 /*
2683 * Ok the segment is to be consumed as chars.
2684 */
2685 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2686 if (areBlanks(ctxt, buf, nbchar)) {
2687 if (ctxt->sax->ignorableWhitespace != NULL)
2688 ctxt->sax->ignorableWhitespace(ctxt->userData,
2689 buf, nbchar);
2690 } else {
2691 htmlCheckParagraph(ctxt);
2692 if (ctxt->sax->characters != NULL)
2693 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2694 }
2695 }
2696 nbchar = 0;
2697 }
2698 NEXTL(l);
2699 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002700 if (cur == 0) {
2701 SHRINK;
2702 GROW;
2703 cur = CUR_CHAR(l);
2704 }
Owen Taylor3473f882001-02-23 17:55:21 +00002705 }
2706 if (nbchar != 0) {
2707 /*
2708 * Ok the segment is to be consumed as chars.
2709 */
2710 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2711 if (areBlanks(ctxt, buf, nbchar)) {
2712 if (ctxt->sax->ignorableWhitespace != NULL)
2713 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2714 } else {
2715 htmlCheckParagraph(ctxt);
2716 if (ctxt->sax->characters != NULL)
2717 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2718 }
2719 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002720 } else {
2721 /*
2722 * Loop detection
2723 */
2724 if (cur == 0)
2725 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002726 }
2727}
2728
2729/**
2730 * htmlParseExternalID:
2731 * @ctxt: an HTML parser context
2732 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00002733 *
2734 * Parse an External ID or a Public ID
2735 *
Owen Taylor3473f882001-02-23 17:55:21 +00002736 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2737 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2738 *
2739 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2740 *
2741 * Returns the function returns SystemLiteral and in the second
2742 * case publicID receives PubidLiteral, is strict is off
2743 * it is possible to return NULL and have publicID set.
2744 */
2745
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002746static xmlChar *
2747htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00002748 xmlChar *URI = NULL;
2749
2750 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2751 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2752 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2753 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002754 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002755 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2756 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002757 }
2758 SKIP_BLANKS;
2759 URI = htmlParseSystemLiteral(ctxt);
2760 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002761 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2762 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002763 }
2764 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2765 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2766 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2767 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00002768 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002769 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2770 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002771 }
2772 SKIP_BLANKS;
2773 *publicID = htmlParsePubidLiteral(ctxt);
2774 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002775 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2776 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2777 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002778 }
2779 SKIP_BLANKS;
2780 if ((CUR == '"') || (CUR == '\'')) {
2781 URI = htmlParseSystemLiteral(ctxt);
2782 }
2783 }
2784 return(URI);
2785}
2786
2787/**
2788 * htmlParseComment:
2789 * @ctxt: an HTML parser context
2790 *
2791 * Parse an XML (SGML) comment <!-- .... -->
2792 *
2793 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2794 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002795static void
Owen Taylor3473f882001-02-23 17:55:21 +00002796htmlParseComment(htmlParserCtxtPtr ctxt) {
2797 xmlChar *buf = NULL;
2798 int len;
2799 int size = HTML_PARSER_BUFFER_SIZE;
2800 int q, ql;
2801 int r, rl;
2802 int cur, l;
2803 xmlParserInputState state;
2804
2805 /*
2806 * Check that there is a comment right here.
2807 */
2808 if ((RAW != '<') || (NXT(1) != '!') ||
2809 (NXT(2) != '-') || (NXT(3) != '-')) return;
2810
2811 state = ctxt->instate;
2812 ctxt->instate = XML_PARSER_COMMENT;
2813 SHRINK;
2814 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002815 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002816 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002817 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002818 ctxt->instate = state;
2819 return;
2820 }
2821 q = CUR_CHAR(ql);
2822 NEXTL(ql);
2823 r = CUR_CHAR(rl);
2824 NEXTL(rl);
2825 cur = CUR_CHAR(l);
2826 len = 0;
2827 while (IS_CHAR(cur) &&
2828 ((cur != '>') ||
2829 (r != '-') || (q != '-'))) {
2830 if (len + 5 >= size) {
2831 size *= 2;
2832 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2833 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002834 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002835 ctxt->instate = state;
2836 return;
2837 }
2838 }
2839 COPY_BUF(ql,buf,len,q);
2840 q = r;
2841 ql = rl;
2842 r = cur;
2843 rl = l;
2844 NEXTL(l);
2845 cur = CUR_CHAR(l);
2846 if (cur == 0) {
2847 SHRINK;
2848 GROW;
2849 cur = CUR_CHAR(l);
2850 }
2851 }
2852 buf[len] = 0;
2853 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002854 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
2855 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002856 xmlFree(buf);
2857 } else {
2858 NEXT;
2859 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2860 (!ctxt->disableSAX))
2861 ctxt->sax->comment(ctxt->userData, buf);
2862 xmlFree(buf);
2863 }
2864 ctxt->instate = state;
2865}
2866
2867/**
2868 * htmlParseCharRef:
2869 * @ctxt: an HTML parser context
2870 *
2871 * parse Reference declarations
2872 *
2873 * [66] CharRef ::= '&#' [0-9]+ ';' |
2874 * '&#x' [0-9a-fA-F]+ ';'
2875 *
2876 * Returns the value parsed (as an int)
2877 */
2878int
2879htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2880 int val = 0;
2881
2882 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00002883 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002884 SKIP(3);
2885 while (CUR != ';') {
2886 if ((CUR >= '0') && (CUR <= '9'))
2887 val = val * 16 + (CUR - '0');
2888 else if ((CUR >= 'a') && (CUR <= 'f'))
2889 val = val * 16 + (CUR - 'a') + 10;
2890 else if ((CUR >= 'A') && (CUR <= 'F'))
2891 val = val * 16 + (CUR - 'A') + 10;
2892 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002893 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
2894 "htmlParseCharRef: invalid hexadecimal value\n",
2895 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002896 return(0);
2897 }
2898 NEXT;
2899 }
2900 if (CUR == ';')
2901 NEXT;
2902 } else if ((CUR == '&') && (NXT(1) == '#')) {
2903 SKIP(2);
2904 while (CUR != ';') {
2905 if ((CUR >= '0') && (CUR <= '9'))
2906 val = val * 10 + (CUR - '0');
2907 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002908 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
2909 "htmlParseCharRef: invalid decimal value\n",
2910 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002911 return(0);
2912 }
2913 NEXT;
2914 }
2915 if (CUR == ';')
2916 NEXT;
2917 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002918 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
2919 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002920 }
2921 /*
2922 * Check the value IS_CHAR ...
2923 */
2924 if (IS_CHAR(val)) {
2925 return(val);
2926 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002927 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2928 "htmlParseCharRef: invalid xmlChar value %d\n",
2929 val);
Owen Taylor3473f882001-02-23 17:55:21 +00002930 }
2931 return(0);
2932}
2933
2934
2935/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002936 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00002937 * @ctxt: an HTML parser context
2938 *
2939 * parse a DOCTYPE declaration
2940 *
2941 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2942 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2943 */
2944
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002945static void
Owen Taylor3473f882001-02-23 17:55:21 +00002946htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002947 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00002948 xmlChar *ExternalID = NULL;
2949 xmlChar *URI = NULL;
2950
2951 /*
2952 * We know that '<!DOCTYPE' has been detected.
2953 */
2954 SKIP(9);
2955
2956 SKIP_BLANKS;
2957
2958 /*
2959 * Parse the DOCTYPE name.
2960 */
2961 name = htmlParseName(ctxt);
2962 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002963 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2964 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
2965 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002966 }
2967 /*
2968 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2969 */
2970
2971 SKIP_BLANKS;
2972
2973 /*
2974 * Check for SystemID and ExternalID
2975 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002976 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00002977 SKIP_BLANKS;
2978
2979 /*
2980 * We should be at the end of the DOCTYPE declaration.
2981 */
2982 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002983 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
2984 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002985 /* We shouldn't try to resynchronize ... */
2986 }
2987 NEXT;
2988
2989 /*
2990 * Create or update the document accordingly to the DOCTYPE
2991 */
2992 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2993 (!ctxt->disableSAX))
2994 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2995
2996 /*
2997 * Cleanup, since we don't use all those identifiers
2998 */
2999 if (URI != NULL) xmlFree(URI);
3000 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003001}
3002
3003/**
3004 * htmlParseAttribute:
3005 * @ctxt: an HTML parser context
3006 * @value: a xmlChar ** used to store the value of the attribute
3007 *
3008 * parse an attribute
3009 *
3010 * [41] Attribute ::= Name Eq AttValue
3011 *
3012 * [25] Eq ::= S? '=' S?
3013 *
3014 * With namespace:
3015 *
3016 * [NS 11] Attribute ::= QName Eq AttValue
3017 *
3018 * Also the case QName == xmlns:??? is handled independently as a namespace
3019 * definition.
3020 *
3021 * Returns the attribute name, and the value in *value.
3022 */
3023
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003024static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003025htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003026 const xmlChar *name;
3027 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003028
3029 *value = NULL;
3030 name = htmlParseHTMLName(ctxt);
3031 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003032 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3033 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003034 return(NULL);
3035 }
3036
3037 /*
3038 * read the value
3039 */
3040 SKIP_BLANKS;
3041 if (CUR == '=') {
3042 NEXT;
3043 SKIP_BLANKS;
3044 val = htmlParseAttValue(ctxt);
3045 /******
3046 } else {
3047 * TODO : some attribute must have values, some may not
3048 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3049 ctxt->sax->warning(ctxt->userData,
3050 "No value for attribute %s\n", name); */
3051 }
3052
3053 *value = val;
3054 return(name);
3055}
3056
3057/**
3058 * htmlCheckEncoding:
3059 * @ctxt: an HTML parser context
3060 * @attvalue: the attribute value
3061 *
3062 * Checks an http-equiv attribute from a Meta tag to detect
3063 * the encoding
3064 * If a new encoding is detected the parser is switched to decode
3065 * it and pass UTF8
3066 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003067static void
Owen Taylor3473f882001-02-23 17:55:21 +00003068htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3069 const xmlChar *encoding;
3070
3071 if ((ctxt == NULL) || (attvalue == NULL))
3072 return;
3073
3074 /* do not change encoding */
3075 if (ctxt->input->encoding != NULL)
3076 return;
3077
3078 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3079 if (encoding != NULL) {
3080 encoding += 8;
3081 } else {
3082 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3083 if (encoding != NULL)
3084 encoding += 9;
3085 }
3086 if (encoding != NULL) {
3087 xmlCharEncoding enc;
3088 xmlCharEncodingHandlerPtr handler;
3089
3090 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3091
3092 if (ctxt->input->encoding != NULL)
3093 xmlFree((xmlChar *) ctxt->input->encoding);
3094 ctxt->input->encoding = xmlStrdup(encoding);
3095
3096 enc = xmlParseCharEncoding((const char *) encoding);
3097 /*
3098 * registered set of known encodings
3099 */
3100 if (enc != XML_CHAR_ENCODING_ERROR) {
3101 xmlSwitchEncoding(ctxt, enc);
3102 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3103 } else {
3104 /*
3105 * fallback for unknown encodings
3106 */
3107 handler = xmlFindCharEncodingHandler((const char *) encoding);
3108 if (handler != NULL) {
3109 xmlSwitchToEncoding(ctxt, handler);
3110 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3111 } else {
3112 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3113 }
3114 }
3115
3116 if ((ctxt->input->buf != NULL) &&
3117 (ctxt->input->buf->encoder != NULL) &&
3118 (ctxt->input->buf->raw != NULL) &&
3119 (ctxt->input->buf->buffer != NULL)) {
3120 int nbchars;
3121 int processed;
3122
3123 /*
3124 * convert as much as possible to the parser reading buffer.
3125 */
3126 processed = ctxt->input->cur - ctxt->input->base;
3127 xmlBufferShrink(ctxt->input->buf->buffer, processed);
3128 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3129 ctxt->input->buf->buffer,
3130 ctxt->input->buf->raw);
3131 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003132 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3133 "htmlCheckEncoding: encoder error\n",
3134 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003135 }
3136 ctxt->input->base =
3137 ctxt->input->cur = ctxt->input->buf->buffer->content;
3138 }
3139 }
3140}
3141
3142/**
3143 * htmlCheckMeta:
3144 * @ctxt: an HTML parser context
3145 * @atts: the attributes values
3146 *
3147 * Checks an attributes from a Meta tag
3148 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003149static void
Owen Taylor3473f882001-02-23 17:55:21 +00003150htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3151 int i;
3152 const xmlChar *att, *value;
3153 int http = 0;
3154 const xmlChar *content = NULL;
3155
3156 if ((ctxt == NULL) || (atts == NULL))
3157 return;
3158
3159 i = 0;
3160 att = atts[i++];
3161 while (att != NULL) {
3162 value = atts[i++];
3163 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3164 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3165 http = 1;
3166 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3167 content = value;
3168 att = atts[i++];
3169 }
3170 if ((http) && (content != NULL))
3171 htmlCheckEncoding(ctxt, content);
3172
3173}
3174
3175/**
3176 * htmlParseStartTag:
3177 * @ctxt: an HTML parser context
3178 *
3179 * parse a start of tag either for rule element or
3180 * EmptyElement. In both case we don't parse the tag closing chars.
3181 *
3182 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3183 *
3184 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3185 *
3186 * With namespace:
3187 *
3188 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3189 *
3190 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3191 *
3192 */
3193
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003194static void
Owen Taylor3473f882001-02-23 17:55:21 +00003195htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003196 const xmlChar *name;
3197 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003198 xmlChar *attvalue;
Daniel Veillardf403d292003-10-05 13:51:35 +00003199 const xmlChar **atts = ctxt->atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003200 int nbatts = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +00003201 int maxatts = ctxt->maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003202 int meta = 0;
3203 int i;
3204
3205 if (CUR != '<') return;
3206 NEXT;
3207
3208 GROW;
3209 name = htmlParseHTMLName(ctxt);
3210 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003211 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3212 "htmlParseStartTag: invalid element name\n",
3213 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003214 /* Dump the bogus tag like browsers do */
William M. Brack76e95df2003-10-18 16:20:14 +00003215 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Owen Taylor3473f882001-02-23 17:55:21 +00003216 NEXT;
3217 return;
3218 }
3219 if (xmlStrEqual(name, BAD_CAST"meta"))
3220 meta = 1;
3221
3222 /*
3223 * Check for auto-closure of HTML elements.
3224 */
3225 htmlAutoClose(ctxt, name);
3226
3227 /*
3228 * Check for implied HTML elements.
3229 */
3230 htmlCheckImplied(ctxt, name);
3231
3232 /*
3233 * Avoid html at any level > 0, head at any level != 1
3234 * or any attempt to recurse body
3235 */
3236 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003237 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3238 "htmlParseStartTag: misplaced <html> tag\n",
3239 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003240 return;
3241 }
3242 if ((ctxt->nameNr != 1) &&
3243 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003244 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3245 "htmlParseStartTag: misplaced <head> tag\n",
3246 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003247 return;
3248 }
3249 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003250 int indx;
3251 for (indx = 0;indx < ctxt->nameNr;indx++) {
3252 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003253 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3254 "htmlParseStartTag: misplaced <body> tag\n",
3255 name, NULL);
Daniel Veillardc59d8262003-11-20 21:59:12 +00003256 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3257 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003258 return;
3259 }
3260 }
3261 }
3262
3263 /*
3264 * Now parse the attributes, it ends up with the ending
3265 *
3266 * (S Attribute)* S?
3267 */
3268 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003269 while ((IS_CHAR_CH(CUR)) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003270 (CUR != '>') &&
3271 ((CUR != '/') || (NXT(1) != '>'))) {
3272 long cons = ctxt->nbChars;
3273
3274 GROW;
3275 attname = htmlParseAttribute(ctxt, &attvalue);
3276 if (attname != NULL) {
3277
3278 /*
3279 * Well formedness requires at most one declaration of an attribute
3280 */
3281 for (i = 0; i < nbatts;i += 2) {
3282 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003283 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3284 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003285 if (attvalue != NULL)
3286 xmlFree(attvalue);
3287 goto failed;
3288 }
3289 }
3290
3291 /*
3292 * Add the pair to atts
3293 */
3294 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003295 maxatts = 22; /* allow for 10 attrs by default */
3296 atts = (const xmlChar **)
3297 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003298 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003299 htmlErrMemory(ctxt, NULL);
3300 if (attvalue != NULL)
3301 xmlFree(attvalue);
3302 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003303 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003304 ctxt->atts = atts;
3305 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003306 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003307 const xmlChar **n;
3308
Owen Taylor3473f882001-02-23 17:55:21 +00003309 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003310 n = (const xmlChar **) xmlRealloc((void *) atts,
3311 maxatts * sizeof(const xmlChar *));
3312 if (n == NULL) {
3313 htmlErrMemory(ctxt, NULL);
3314 if (attvalue != NULL)
3315 xmlFree(attvalue);
3316 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003317 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003318 atts = n;
3319 ctxt->atts = atts;
3320 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003321 }
3322 atts[nbatts++] = attname;
3323 atts[nbatts++] = attvalue;
3324 atts[nbatts] = NULL;
3325 atts[nbatts + 1] = NULL;
3326 }
3327 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003328 if (attvalue != NULL)
3329 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003330 /* Dump the bogus attribute string up to the next blank or
3331 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003332 while ((IS_CHAR_CH(CUR)) &&
3333 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003334 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003335 NEXT;
3336 }
3337
3338failed:
3339 SKIP_BLANKS;
3340 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003341 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3342 "htmlParseStartTag: problem parsing attributes\n",
3343 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003344 break;
3345 }
3346 }
3347
3348 /*
3349 * Handle specific association to the META tag
3350 */
3351 if (meta)
3352 htmlCheckMeta(ctxt, atts);
3353
3354 /*
3355 * SAX: Start of Element !
3356 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003357 htmlnamePush(ctxt, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003358 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3359 if (nbatts != 0)
3360 ctxt->sax->startElement(ctxt->userData, name, atts);
3361 else
3362 ctxt->sax->startElement(ctxt->userData, name, NULL);
3363 }
Owen Taylor3473f882001-02-23 17:55:21 +00003364
3365 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003366 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003367 if (atts[i] != NULL)
3368 xmlFree((xmlChar *) atts[i]);
3369 }
Owen Taylor3473f882001-02-23 17:55:21 +00003370 }
Owen Taylor3473f882001-02-23 17:55:21 +00003371}
3372
3373/**
3374 * htmlParseEndTag:
3375 * @ctxt: an HTML parser context
3376 *
3377 * parse an end of tag
3378 *
3379 * [42] ETag ::= '</' Name S? '>'
3380 *
3381 * With namespace
3382 *
3383 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003384 *
3385 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003386 */
3387
Daniel Veillardf420ac52001-07-04 16:04:09 +00003388static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003389htmlParseEndTag(htmlParserCtxtPtr ctxt)
3390{
3391 const xmlChar *name;
3392 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003393 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003394
3395 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003396 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3397 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003398 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003399 }
3400 SKIP(2);
3401
3402 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003403 if (name == NULL)
3404 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003405
3406 /*
3407 * We should definitely be at the ending "S? '>'" part
3408 */
3409 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003410 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003411 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3412 "End tag : expected '>'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003413 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003414 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003415
3416 /*
3417 * If the name read is not one of the element in the parsing stack
3418 * then return, it's just an error.
3419 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003420 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3421 if (xmlStrEqual(name, ctxt->nameTab[i]))
3422 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003423 }
3424 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003425 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3426 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003427 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003428 }
3429
3430
3431 /*
3432 * Check for auto-closure of HTML elements.
3433 */
3434
3435 htmlAutoCloseOnClose(ctxt, name);
3436
3437 /*
3438 * Well formedness constraints, opening and closing must match.
3439 * With the exception that the autoclose may have popped stuff out
3440 * of the stack.
3441 */
3442 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003443 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003444 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3445 "Opening and ending tag mismatch: %s and %s\n",
3446 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003447 }
3448 }
3449
3450 /*
3451 * SAX: End of Tag
3452 */
3453 oldname = ctxt->name;
3454 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003455 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3456 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003457 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003458 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003459 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003460 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003461 }
3462
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003463 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003464}
3465
3466
3467/**
3468 * htmlParseReference:
3469 * @ctxt: an HTML parser context
3470 *
3471 * parse and handle entity references in content,
3472 * this will end-up in a call to character() since this is either a
3473 * CharRef, or a predefined entity.
3474 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003475static void
Owen Taylor3473f882001-02-23 17:55:21 +00003476htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003477 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003478 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003479 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003480 if (CUR != '&') return;
3481
3482 if (NXT(1) == '#') {
3483 unsigned int c;
3484 int bits, i = 0;
3485
3486 c = htmlParseCharRef(ctxt);
3487 if (c == 0)
3488 return;
3489
3490 if (c < 0x80) { out[i++]= c; bits= -6; }
3491 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3492 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3493 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3494
3495 for ( ; bits >= 0; bits-= 6) {
3496 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3497 }
3498 out[i] = 0;
3499
3500 htmlCheckParagraph(ctxt);
3501 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3502 ctxt->sax->characters(ctxt->userData, out, i);
3503 } else {
3504 ent = htmlParseEntityRef(ctxt, &name);
3505 if (name == NULL) {
3506 htmlCheckParagraph(ctxt);
3507 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3508 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3509 return;
3510 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003511 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003512 htmlCheckParagraph(ctxt);
3513 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3514 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3515 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3516 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3517 }
3518 } else {
3519 unsigned int c;
3520 int bits, i = 0;
3521
3522 c = ent->value;
3523 if (c < 0x80)
3524 { out[i++]= c; bits= -6; }
3525 else if (c < 0x800)
3526 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3527 else if (c < 0x10000)
3528 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3529 else
3530 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3531
3532 for ( ; bits >= 0; bits-= 6) {
3533 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3534 }
3535 out[i] = 0;
3536
3537 htmlCheckParagraph(ctxt);
3538 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3539 ctxt->sax->characters(ctxt->userData, out, i);
3540 }
Owen Taylor3473f882001-02-23 17:55:21 +00003541 }
3542}
3543
3544/**
3545 * htmlParseContent:
3546 * @ctxt: an HTML parser context
3547 * @name: the node name
3548 *
3549 * Parse a content: comment, sub-element, reference or text.
3550 *
3551 */
3552
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003553static void
Owen Taylor3473f882001-02-23 17:55:21 +00003554htmlParseContent(htmlParserCtxtPtr ctxt) {
3555 xmlChar *currentNode;
3556 int depth;
3557
3558 currentNode = xmlStrdup(ctxt->name);
3559 depth = ctxt->nameNr;
3560 while (1) {
3561 long cons = ctxt->nbChars;
3562
3563 GROW;
3564 /*
3565 * Our tag or one of it's parent or children is ending.
3566 */
3567 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00003568 if (htmlParseEndTag(ctxt) &&
3569 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3570 if (currentNode != NULL)
3571 xmlFree(currentNode);
3572 return;
3573 }
3574 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00003575 }
3576
3577 /*
3578 * Has this node been popped out during parsing of
3579 * the next element
3580 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00003581 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3582 (!xmlStrEqual(currentNode, ctxt->name)))
3583 {
Owen Taylor3473f882001-02-23 17:55:21 +00003584 if (currentNode != NULL) xmlFree(currentNode);
3585 return;
3586 }
3587
Daniel Veillardf9533d12001-03-03 10:04:57 +00003588 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3589 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00003590 /*
3591 * Handle SCRIPT/STYLE separately
3592 */
3593 htmlParseScript(ctxt);
3594 } else {
3595 /*
3596 * Sometimes DOCTYPE arrives in the middle of the document
3597 */
3598 if ((CUR == '<') && (NXT(1) == '!') &&
3599 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3600 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3601 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3602 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003603 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3604 "Misplaced DOCTYPE declaration\n",
3605 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003606 htmlParseDocTypeDecl(ctxt);
3607 }
3608
3609 /*
3610 * First case : a comment
3611 */
3612 if ((CUR == '<') && (NXT(1) == '!') &&
3613 (NXT(2) == '-') && (NXT(3) == '-')) {
3614 htmlParseComment(ctxt);
3615 }
3616
3617 /*
3618 * Second case : a sub-element.
3619 */
3620 else if (CUR == '<') {
3621 htmlParseElement(ctxt);
3622 }
3623
3624 /*
3625 * Third case : a reference. If if has not been resolved,
3626 * parsing returns it's Name, create the node
3627 */
3628 else if (CUR == '&') {
3629 htmlParseReference(ctxt);
3630 }
3631
3632 /*
3633 * Fourth : end of the resource
3634 */
3635 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003636 htmlAutoCloseOnEnd(ctxt);
3637 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003638 }
3639
3640 /*
3641 * Last case, text. Note that References are handled directly.
3642 */
3643 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003644 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003645 }
3646
3647 if (cons == ctxt->nbChars) {
3648 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003649 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3650 "detected an error in element content\n",
3651 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003652 }
3653 break;
3654 }
3655 }
3656 GROW;
3657 }
3658 if (currentNode != NULL) xmlFree(currentNode);
3659}
3660
3661/**
3662 * htmlParseElement:
3663 * @ctxt: an HTML parser context
3664 *
3665 * parse an HTML element, this is highly recursive
3666 *
3667 * [39] element ::= EmptyElemTag | STag content ETag
3668 *
3669 * [41] Attribute ::= Name Eq AttValue
3670 */
3671
3672void
3673htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003674 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003675 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00003676 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00003677 htmlParserNodeInfo node_info;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003678 const xmlChar *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00003679 int depth = ctxt->nameNr;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00003680 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00003681
3682 /* Capture start position */
3683 if (ctxt->record_info) {
3684 node_info.begin_pos = ctxt->input->consumed +
3685 (CUR_PTR - ctxt->input->base);
3686 node_info.begin_line = ctxt->input->line;
3687 }
3688
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003689 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003690 htmlParseStartTag(ctxt);
3691 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00003692 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3693 (name == NULL)) {
3694 if (CUR == '>')
3695 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003696 return;
3697 }
Owen Taylor3473f882001-02-23 17:55:21 +00003698
3699 /*
3700 * Lookup the info for that element.
3701 */
3702 info = htmlTagLookup(name);
3703 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003704 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3705 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003706 }
3707
3708 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00003709 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00003710 */
3711 if ((CUR == '/') && (NXT(1) == '>')) {
3712 SKIP(2);
3713 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3714 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003715 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003716 return;
3717 }
3718
3719 if (CUR == '>') {
3720 NEXT;
3721 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003722 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3723 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003724
3725 /*
3726 * end of parsing of this node.
3727 */
3728 if (xmlStrEqual(name, ctxt->name)) {
3729 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003730 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003731 }
3732
3733 /*
3734 * Capture end position and add node
3735 */
3736 if ( currentNode != NULL && ctxt->record_info ) {
3737 node_info.end_pos = ctxt->input->consumed +
3738 (CUR_PTR - ctxt->input->base);
3739 node_info.end_line = ctxt->input->line;
3740 node_info.node = ctxt->node;
3741 xmlParserAddNodeInfo(ctxt, &node_info);
3742 }
3743 return;
3744 }
3745
3746 /*
3747 * Check for an Empty Element from DTD definition
3748 */
3749 if ((info != NULL) && (info->empty)) {
3750 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3751 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00003752 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003753 return;
3754 }
3755
3756 /*
3757 * Parse the content of the element:
3758 */
3759 currentNode = xmlStrdup(ctxt->name);
3760 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00003761 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00003762 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00003763 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00003764 if (oldptr==ctxt->input->cur) break;
Owen Taylor3473f882001-02-23 17:55:21 +00003765 if (ctxt->nameNr < depth) break;
3766 }
3767
Owen Taylor3473f882001-02-23 17:55:21 +00003768 /*
3769 * Capture end position and add node
3770 */
3771 if ( currentNode != NULL && ctxt->record_info ) {
3772 node_info.end_pos = ctxt->input->consumed +
3773 (CUR_PTR - ctxt->input->base);
3774 node_info.end_line = ctxt->input->line;
3775 node_info.node = ctxt->node;
3776 xmlParserAddNodeInfo(ctxt, &node_info);
3777 }
William M. Brack76e95df2003-10-18 16:20:14 +00003778 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003779 htmlAutoCloseOnEnd(ctxt);
3780 }
3781
Owen Taylor3473f882001-02-23 17:55:21 +00003782 if (currentNode != NULL)
3783 xmlFree(currentNode);
3784}
3785
3786/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003787 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00003788 * @ctxt: an HTML parser context
3789 *
3790 * parse an HTML document (and build a tree if using the standard SAX
3791 * interface).
3792 *
3793 * Returns 0, -1 in case of error. the parser context is augmented
3794 * as a result of the parsing.
3795 */
3796
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00003797int
Owen Taylor3473f882001-02-23 17:55:21 +00003798htmlParseDocument(htmlParserCtxtPtr ctxt) {
3799 xmlDtdPtr dtd;
3800
Daniel Veillardd0463562001-10-13 09:15:48 +00003801 xmlInitParser();
3802
Owen Taylor3473f882001-02-23 17:55:21 +00003803 htmlDefaultSAXHandlerInit();
3804 ctxt->html = 1;
3805
3806 GROW;
3807 /*
3808 * SAX: beginning of the document processing.
3809 */
3810 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3811 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3812
3813 /*
3814 * Wipe out everything which is before the first '<'
3815 */
3816 SKIP_BLANKS;
3817 if (CUR == 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003818 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3819 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003820 }
3821
3822 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3823 ctxt->sax->startDocument(ctxt->userData);
3824
3825
3826 /*
3827 * Parse possible comments before any content
3828 */
3829 while ((CUR == '<') && (NXT(1) == '!') &&
3830 (NXT(2) == '-') && (NXT(3) == '-')) {
3831 htmlParseComment(ctxt);
3832 SKIP_BLANKS;
3833 }
3834
3835
3836 /*
3837 * Then possibly doc type declaration(s) and more Misc
3838 * (doctypedecl Misc*)?
3839 */
3840 if ((CUR == '<') && (NXT(1) == '!') &&
3841 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3842 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3843 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3844 (UPP(8) == 'E')) {
3845 htmlParseDocTypeDecl(ctxt);
3846 }
3847 SKIP_BLANKS;
3848
3849 /*
3850 * Parse possible comments before any content
3851 */
3852 while ((CUR == '<') && (NXT(1) == '!') &&
3853 (NXT(2) == '-') && (NXT(3) == '-')) {
3854 htmlParseComment(ctxt);
3855 SKIP_BLANKS;
3856 }
3857
3858 /*
3859 * Time to start parsing the tree itself
3860 */
3861 htmlParseContent(ctxt);
3862
3863 /*
3864 * autoclose
3865 */
3866 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00003867 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003868
3869
3870 /*
3871 * SAX: end of the document processing.
3872 */
3873 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3874 ctxt->sax->endDocument(ctxt->userData);
3875
3876 if (ctxt->myDoc != NULL) {
3877 dtd = xmlGetIntSubset(ctxt->myDoc);
3878 if (dtd == NULL)
3879 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00003880 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00003881 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3882 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3883 }
3884 if (! ctxt->wellFormed) return(-1);
3885 return(0);
3886}
3887
3888
3889/************************************************************************
3890 * *
3891 * Parser contexts handling *
3892 * *
3893 ************************************************************************/
3894
3895/**
William M. Brackedb65a72004-02-06 07:36:04 +00003896 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00003897 * @ctxt: an HTML parser context
3898 *
3899 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00003900 *
3901 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00003902 */
3903
Daniel Veillardf403d292003-10-05 13:51:35 +00003904static int
Owen Taylor3473f882001-02-23 17:55:21 +00003905htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3906{
3907 htmlSAXHandler *sax;
3908
Daniel Veillardf403d292003-10-05 13:51:35 +00003909 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003910 memset(ctxt, 0, sizeof(htmlParserCtxt));
3911
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003912 ctxt->dict = xmlDictCreate();
3913 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003914 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3915 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003916 }
Owen Taylor3473f882001-02-23 17:55:21 +00003917 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3918 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003919 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3920 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003921 }
3922 else
3923 memset(sax, 0, sizeof(htmlSAXHandler));
3924
3925 /* Allocate the Input stack */
3926 ctxt->inputTab = (htmlParserInputPtr *)
3927 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3928 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003929 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003930 ctxt->inputNr = 0;
3931 ctxt->inputMax = 0;
3932 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003933 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003934 }
3935 ctxt->inputNr = 0;
3936 ctxt->inputMax = 5;
3937 ctxt->input = NULL;
3938 ctxt->version = NULL;
3939 ctxt->encoding = NULL;
3940 ctxt->standalone = -1;
3941 ctxt->instate = XML_PARSER_START;
3942
3943 /* Allocate the Node stack */
3944 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3945 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003946 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003947 ctxt->nodeNr = 0;
3948 ctxt->nodeMax = 0;
3949 ctxt->node = NULL;
3950 ctxt->inputNr = 0;
3951 ctxt->inputMax = 0;
3952 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003953 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003954 }
3955 ctxt->nodeNr = 0;
3956 ctxt->nodeMax = 10;
3957 ctxt->node = NULL;
3958
3959 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003960 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003961 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003962 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003963 ctxt->nameNr = 0;
3964 ctxt->nameMax = 10;
3965 ctxt->name = NULL;
3966 ctxt->nodeNr = 0;
3967 ctxt->nodeMax = 0;
3968 ctxt->node = NULL;
3969 ctxt->inputNr = 0;
3970 ctxt->inputMax = 0;
3971 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00003972 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00003973 }
3974 ctxt->nameNr = 0;
3975 ctxt->nameMax = 10;
3976 ctxt->name = NULL;
3977
Daniel Veillard092643b2003-09-25 14:29:29 +00003978 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00003979 else {
3980 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00003981 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00003982 }
3983 ctxt->userData = ctxt;
3984 ctxt->myDoc = NULL;
3985 ctxt->wellFormed = 1;
3986 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00003987 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00003988 ctxt->html = 1;
William M. Brackedb65a72004-02-06 07:36:04 +00003989 ctxt->vctxt.userData = ctxt;
3990 ctxt->vctxt.error = xmlParserValidityError;
3991 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00003992 ctxt->record_info = 0;
3993 ctxt->validate = 0;
3994 ctxt->nbChars = 0;
3995 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00003996 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003997 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00003998 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00003999}
4000
4001/**
4002 * htmlFreeParserCtxt:
4003 * @ctxt: an HTML parser context
4004 *
4005 * Free all the memory used by a parser context. However the parsed
4006 * document in ctxt->myDoc is not freed.
4007 */
4008
4009void
4010htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4011{
4012 xmlFreeParserCtxt(ctxt);
4013}
4014
4015/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004016 * htmlNewParserCtxt:
4017 *
4018 * Allocate and initialize a new parser context.
4019 *
4020 * Returns the xmlParserCtxtPtr or NULL
4021 */
4022
4023static htmlParserCtxtPtr
4024htmlNewParserCtxt(void)
4025{
4026 xmlParserCtxtPtr ctxt;
4027
4028 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4029 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004030 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004031 return(NULL);
4032 }
4033 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004034 if (htmlInitParserCtxt(ctxt) < 0) {
4035 htmlFreeParserCtxt(ctxt);
4036 return(NULL);
4037 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004038 return(ctxt);
4039}
4040
4041/**
4042 * htmlCreateMemoryParserCtxt:
4043 * @buffer: a pointer to a char array
4044 * @size: the size of the array
4045 *
4046 * Create a parser context for an HTML in-memory document.
4047 *
4048 * Returns the new parser context or NULL
4049 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004050htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004051htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4052 xmlParserCtxtPtr ctxt;
4053 xmlParserInputPtr input;
4054 xmlParserInputBufferPtr buf;
4055
4056 if (buffer == NULL)
4057 return(NULL);
4058 if (size <= 0)
4059 return(NULL);
4060
4061 ctxt = htmlNewParserCtxt();
4062 if (ctxt == NULL)
4063 return(NULL);
4064
4065 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4066 if (buf == NULL) return(NULL);
4067
4068 input = xmlNewInputStream(ctxt);
4069 if (input == NULL) {
4070 xmlFreeParserCtxt(ctxt);
4071 return(NULL);
4072 }
4073
4074 input->filename = NULL;
4075 input->buf = buf;
4076 input->base = input->buf->buffer->content;
4077 input->cur = input->buf->buffer->content;
4078 input->end = &input->buf->buffer->content[input->buf->buffer->use];
4079
4080 inputPush(ctxt, input);
4081 return(ctxt);
4082}
4083
4084/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004085 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004086 * @cur: a pointer to an array of xmlChar
4087 * @encoding: a free form C string describing the HTML document encoding, or NULL
4088 *
4089 * Create a parser context for an HTML document.
4090 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004091 * TODO: check the need to add encoding handling there
4092 *
Owen Taylor3473f882001-02-23 17:55:21 +00004093 * Returns the new parser context or NULL
4094 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004095static htmlParserCtxtPtr
Daniel Veillardc86a4fa2001-03-26 16:28:29 +00004096htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004097 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004098 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004099
Daniel Veillard1d995272002-07-22 16:43:32 +00004100 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004101 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004102 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004103 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4104
4105 if (encoding != NULL) {
4106 xmlCharEncoding enc;
4107 xmlCharEncodingHandlerPtr handler;
4108
4109 if (ctxt->input->encoding != NULL)
4110 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004111 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004112
4113 enc = xmlParseCharEncoding(encoding);
4114 /*
4115 * registered set of known encodings
4116 */
4117 if (enc != XML_CHAR_ENCODING_ERROR) {
4118 xmlSwitchEncoding(ctxt, enc);
4119 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004120 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4121 "Unsupported encoding %s\n",
4122 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004123 }
4124 } else {
4125 /*
4126 * fallback for unknown encodings
4127 */
4128 handler = xmlFindCharEncodingHandler((const char *) encoding);
4129 if (handler != NULL) {
4130 xmlSwitchToEncoding(ctxt, handler);
4131 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004132 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4133 "Unsupported encoding %s\n",
4134 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004135 }
4136 }
4137 }
4138 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004139}
4140
Daniel Veillard73b013f2003-09-30 12:36:01 +00004141#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004142/************************************************************************
4143 * *
4144 * Progressive parsing interfaces *
4145 * *
4146 ************************************************************************/
4147
4148/**
4149 * htmlParseLookupSequence:
4150 * @ctxt: an HTML parser context
4151 * @first: the first char to lookup
4152 * @next: the next char to lookup or zero
4153 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004154 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004155 *
4156 * Try to find if a sequence (first, next, third) or just (first next) or
4157 * (first) is available in the input stream.
4158 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4159 * to avoid rescanning sequences of bytes, it DOES change the state of the
4160 * parser, do not use liberally.
4161 * This is basically similar to xmlParseLookupSequence()
4162 *
4163 * Returns the index to the current parsing point if the full sequence
4164 * is available, -1 otherwise.
4165 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004166static int
Owen Taylor3473f882001-02-23 17:55:21 +00004167htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
William M. Brackc1939562003-08-05 15:52:22 +00004168 xmlChar next, xmlChar third, int iscomment) {
Owen Taylor3473f882001-02-23 17:55:21 +00004169 int base, len;
4170 htmlParserInputPtr in;
4171 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004172 int incomment = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004173
4174 in = ctxt->input;
4175 if (in == NULL) return(-1);
4176 base = in->cur - in->base;
4177 if (base < 0) return(-1);
4178 if (ctxt->checkIndex > base)
4179 base = ctxt->checkIndex;
4180 if (in->buf == NULL) {
4181 buf = in->base;
4182 len = in->length;
4183 } else {
4184 buf = in->buf->buffer->content;
4185 len = in->buf->buffer->use;
4186 }
4187 /* take into account the sequence length */
4188 if (third) len -= 2;
4189 else if (next) len --;
4190 for (;base < len;base++) {
William M. Brackc1939562003-08-05 15:52:22 +00004191 if (!incomment && (base + 4 < len) && !iscomment) {
Daniel Veillardc1f78342001-11-10 11:43:05 +00004192 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4193 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4194 incomment = 1;
Daniel Veillard97e01882003-07-30 18:59:19 +00004195 /* do not increment past <! - some people use <!--> */
4196 base += 2;
Daniel Veillardc1f78342001-11-10 11:43:05 +00004197 }
Daniel Veillardc1f78342001-11-10 11:43:05 +00004198 }
4199 if (incomment) {
William M. Brack4a557d92003-07-29 04:28:04 +00004200 if (base + 3 > len)
Daniel Veillardc1f78342001-11-10 11:43:05 +00004201 return(-1);
4202 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4203 (buf[base + 2] == '>')) {
4204 incomment = 0;
4205 base += 2;
4206 }
4207 continue;
4208 }
Owen Taylor3473f882001-02-23 17:55:21 +00004209 if (buf[base] == first) {
4210 if (third != 0) {
4211 if ((buf[base + 1] != next) ||
4212 (buf[base + 2] != third)) continue;
4213 } else if (next != 0) {
4214 if (buf[base + 1] != next) continue;
4215 }
4216 ctxt->checkIndex = 0;
4217#ifdef DEBUG_PUSH
4218 if (next == 0)
4219 xmlGenericError(xmlGenericErrorContext,
4220 "HPP: lookup '%c' found at %d\n",
4221 first, base);
4222 else if (third == 0)
4223 xmlGenericError(xmlGenericErrorContext,
4224 "HPP: lookup '%c%c' found at %d\n",
4225 first, next, base);
4226 else
4227 xmlGenericError(xmlGenericErrorContext,
4228 "HPP: lookup '%c%c%c' found at %d\n",
4229 first, next, third, base);
4230#endif
4231 return(base - (in->cur - in->base));
4232 }
4233 }
4234 ctxt->checkIndex = base;
4235#ifdef DEBUG_PUSH
4236 if (next == 0)
4237 xmlGenericError(xmlGenericErrorContext,
4238 "HPP: lookup '%c' failed\n", first);
4239 else if (third == 0)
4240 xmlGenericError(xmlGenericErrorContext,
4241 "HPP: lookup '%c%c' failed\n", first, next);
4242 else
4243 xmlGenericError(xmlGenericErrorContext,
4244 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4245#endif
4246 return(-1);
4247}
4248
4249/**
4250 * htmlParseTryOrFinish:
4251 * @ctxt: an HTML parser context
4252 * @terminate: last chunk indicator
4253 *
4254 * Try to progress on parsing
4255 *
4256 * Returns zero if no parsing was possible
4257 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004258static int
Owen Taylor3473f882001-02-23 17:55:21 +00004259htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4260 int ret = 0;
4261 htmlParserInputPtr in;
4262 int avail = 0;
4263 xmlChar cur, next;
4264
4265#ifdef DEBUG_PUSH
4266 switch (ctxt->instate) {
4267 case XML_PARSER_EOF:
4268 xmlGenericError(xmlGenericErrorContext,
4269 "HPP: try EOF\n"); break;
4270 case XML_PARSER_START:
4271 xmlGenericError(xmlGenericErrorContext,
4272 "HPP: try START\n"); break;
4273 case XML_PARSER_MISC:
4274 xmlGenericError(xmlGenericErrorContext,
4275 "HPP: try MISC\n");break;
4276 case XML_PARSER_COMMENT:
4277 xmlGenericError(xmlGenericErrorContext,
4278 "HPP: try COMMENT\n");break;
4279 case XML_PARSER_PROLOG:
4280 xmlGenericError(xmlGenericErrorContext,
4281 "HPP: try PROLOG\n");break;
4282 case XML_PARSER_START_TAG:
4283 xmlGenericError(xmlGenericErrorContext,
4284 "HPP: try START_TAG\n");break;
4285 case XML_PARSER_CONTENT:
4286 xmlGenericError(xmlGenericErrorContext,
4287 "HPP: try CONTENT\n");break;
4288 case XML_PARSER_CDATA_SECTION:
4289 xmlGenericError(xmlGenericErrorContext,
4290 "HPP: try CDATA_SECTION\n");break;
4291 case XML_PARSER_END_TAG:
4292 xmlGenericError(xmlGenericErrorContext,
4293 "HPP: try END_TAG\n");break;
4294 case XML_PARSER_ENTITY_DECL:
4295 xmlGenericError(xmlGenericErrorContext,
4296 "HPP: try ENTITY_DECL\n");break;
4297 case XML_PARSER_ENTITY_VALUE:
4298 xmlGenericError(xmlGenericErrorContext,
4299 "HPP: try ENTITY_VALUE\n");break;
4300 case XML_PARSER_ATTRIBUTE_VALUE:
4301 xmlGenericError(xmlGenericErrorContext,
4302 "HPP: try ATTRIBUTE_VALUE\n");break;
4303 case XML_PARSER_DTD:
4304 xmlGenericError(xmlGenericErrorContext,
4305 "HPP: try DTD\n");break;
4306 case XML_PARSER_EPILOG:
4307 xmlGenericError(xmlGenericErrorContext,
4308 "HPP: try EPILOG\n");break;
4309 case XML_PARSER_PI:
4310 xmlGenericError(xmlGenericErrorContext,
4311 "HPP: try PI\n");break;
4312 case XML_PARSER_SYSTEM_LITERAL:
4313 xmlGenericError(xmlGenericErrorContext,
4314 "HPP: try SYSTEM_LITERAL\n");break;
4315 }
4316#endif
4317
4318 while (1) {
4319
4320 in = ctxt->input;
4321 if (in == NULL) break;
4322 if (in->buf == NULL)
4323 avail = in->length - (in->cur - in->base);
4324 else
4325 avail = in->buf->buffer->use - (in->cur - in->base);
4326 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004327 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004328 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4329 /*
4330 * SAX: end of the document processing.
4331 */
4332 ctxt->instate = XML_PARSER_EOF;
4333 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4334 ctxt->sax->endDocument(ctxt->userData);
4335 }
4336 }
4337 if (avail < 1)
4338 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00004339 cur = in->cur[0];
4340 if (cur == 0) {
4341 SKIP(1);
4342 continue;
4343 }
4344
Owen Taylor3473f882001-02-23 17:55:21 +00004345 switch (ctxt->instate) {
4346 case XML_PARSER_EOF:
4347 /*
4348 * Document parsing is done !
4349 */
4350 goto done;
4351 case XML_PARSER_START:
4352 /*
4353 * Very first chars read from the document flow.
4354 */
4355 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004356 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004357 SKIP_BLANKS;
4358 if (in->buf == NULL)
4359 avail = in->length - (in->cur - in->base);
4360 else
4361 avail = in->buf->buffer->use - (in->cur - in->base);
4362 }
4363 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4364 ctxt->sax->setDocumentLocator(ctxt->userData,
4365 &xmlDefaultSAXLocator);
4366 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4367 (!ctxt->disableSAX))
4368 ctxt->sax->startDocument(ctxt->userData);
4369
4370 cur = in->cur[0];
4371 next = in->cur[1];
4372 if ((cur == '<') && (next == '!') &&
4373 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4374 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4375 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4376 (UPP(8) == 'E')) {
4377 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004378 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004379 goto done;
4380#ifdef DEBUG_PUSH
4381 xmlGenericError(xmlGenericErrorContext,
4382 "HPP: Parsing internal subset\n");
4383#endif
4384 htmlParseDocTypeDecl(ctxt);
4385 ctxt->instate = XML_PARSER_PROLOG;
4386#ifdef DEBUG_PUSH
4387 xmlGenericError(xmlGenericErrorContext,
4388 "HPP: entering PROLOG\n");
4389#endif
4390 } else {
4391 ctxt->instate = XML_PARSER_MISC;
4392 }
4393#ifdef DEBUG_PUSH
4394 xmlGenericError(xmlGenericErrorContext,
4395 "HPP: entering MISC\n");
4396#endif
4397 break;
4398 case XML_PARSER_MISC:
4399 SKIP_BLANKS;
4400 if (in->buf == NULL)
4401 avail = in->length - (in->cur - in->base);
4402 else
4403 avail = in->buf->buffer->use - (in->cur - in->base);
4404 if (avail < 2)
4405 goto done;
4406 cur = in->cur[0];
4407 next = in->cur[1];
4408 if ((cur == '<') && (next == '!') &&
4409 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4410 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004411 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004412 goto done;
4413#ifdef DEBUG_PUSH
4414 xmlGenericError(xmlGenericErrorContext,
4415 "HPP: Parsing Comment\n");
4416#endif
4417 htmlParseComment(ctxt);
4418 ctxt->instate = XML_PARSER_MISC;
4419 } else if ((cur == '<') && (next == '!') &&
4420 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4421 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4422 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4423 (UPP(8) == 'E')) {
4424 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004425 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004426 goto done;
4427#ifdef DEBUG_PUSH
4428 xmlGenericError(xmlGenericErrorContext,
4429 "HPP: Parsing internal subset\n");
4430#endif
4431 htmlParseDocTypeDecl(ctxt);
4432 ctxt->instate = XML_PARSER_PROLOG;
4433#ifdef DEBUG_PUSH
4434 xmlGenericError(xmlGenericErrorContext,
4435 "HPP: entering PROLOG\n");
4436#endif
4437 } else if ((cur == '<') && (next == '!') &&
4438 (avail < 9)) {
4439 goto done;
4440 } else {
4441 ctxt->instate = XML_PARSER_START_TAG;
4442#ifdef DEBUG_PUSH
4443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: entering START_TAG\n");
4445#endif
4446 }
4447 break;
4448 case XML_PARSER_PROLOG:
4449 SKIP_BLANKS;
4450 if (in->buf == NULL)
4451 avail = in->length - (in->cur - in->base);
4452 else
4453 avail = in->buf->buffer->use - (in->cur - in->base);
4454 if (avail < 2)
4455 goto done;
4456 cur = in->cur[0];
4457 next = in->cur[1];
4458 if ((cur == '<') && (next == '!') &&
4459 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4460 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004461 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004462 goto done;
4463#ifdef DEBUG_PUSH
4464 xmlGenericError(xmlGenericErrorContext,
4465 "HPP: Parsing Comment\n");
4466#endif
4467 htmlParseComment(ctxt);
4468 ctxt->instate = XML_PARSER_PROLOG;
4469 } else if ((cur == '<') && (next == '!') &&
4470 (avail < 4)) {
4471 goto done;
4472 } else {
4473 ctxt->instate = XML_PARSER_START_TAG;
4474#ifdef DEBUG_PUSH
4475 xmlGenericError(xmlGenericErrorContext,
4476 "HPP: entering START_TAG\n");
4477#endif
4478 }
4479 break;
4480 case XML_PARSER_EPILOG:
4481 if (in->buf == NULL)
4482 avail = in->length - (in->cur - in->base);
4483 else
4484 avail = in->buf->buffer->use - (in->cur - in->base);
4485 if (avail < 1)
4486 goto done;
4487 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00004488 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004489 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004490 goto done;
4491 }
4492 if (avail < 2)
4493 goto done;
4494 next = in->cur[1];
4495 if ((cur == '<') && (next == '!') &&
4496 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4497 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004498 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004499 goto done;
4500#ifdef DEBUG_PUSH
4501 xmlGenericError(xmlGenericErrorContext,
4502 "HPP: Parsing Comment\n");
4503#endif
4504 htmlParseComment(ctxt);
4505 ctxt->instate = XML_PARSER_EPILOG;
4506 } else if ((cur == '<') && (next == '!') &&
4507 (avail < 4)) {
4508 goto done;
4509 } else {
4510 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004511 ctxt->wellFormed = 0;
4512 ctxt->instate = XML_PARSER_EOF;
4513#ifdef DEBUG_PUSH
4514 xmlGenericError(xmlGenericErrorContext,
4515 "HPP: entering EOF\n");
4516#endif
4517 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4518 ctxt->sax->endDocument(ctxt->userData);
4519 goto done;
4520 }
4521 break;
4522 case XML_PARSER_START_TAG: {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004523 const xmlChar *name, *oldname;
Owen Taylor3473f882001-02-23 17:55:21 +00004524 int depth = ctxt->nameNr;
Daniel Veillardbb371292001-08-16 23:26:59 +00004525 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004526
4527 if (avail < 2)
4528 goto done;
4529 cur = in->cur[0];
4530 if (cur != '<') {
4531 ctxt->instate = XML_PARSER_CONTENT;
4532#ifdef DEBUG_PUSH
4533 xmlGenericError(xmlGenericErrorContext,
4534 "HPP: entering CONTENT\n");
4535#endif
4536 break;
4537 }
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00004538 if (in->cur[1] == '/') {
4539 ctxt->instate = XML_PARSER_END_TAG;
4540 ctxt->checkIndex = 0;
4541#ifdef DEBUG_PUSH
4542 xmlGenericError(xmlGenericErrorContext,
4543 "HPP: entering END_TAG\n");
4544#endif
4545 break;
4546 }
Owen Taylor3473f882001-02-23 17:55:21 +00004547 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004548 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004549 goto done;
4550
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004551 oldname = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004552 htmlParseStartTag(ctxt);
4553 name = ctxt->name;
Owen Taylor3473f882001-02-23 17:55:21 +00004554 if (((depth == ctxt->nameNr) &&
4555 (xmlStrEqual(oldname, ctxt->name))) ||
4556 (name == NULL)) {
4557 if (CUR == '>')
4558 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004559 break;
4560 }
Owen Taylor3473f882001-02-23 17:55:21 +00004561
4562 /*
4563 * Lookup the info for that element.
4564 */
4565 info = htmlTagLookup(name);
4566 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004567 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4568 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004569 }
4570
4571 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004572 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004573 */
4574 if ((CUR == '/') && (NXT(1) == '>')) {
4575 SKIP(2);
4576 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4577 ctxt->sax->endElement(ctxt->userData, name);
4578 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004579 ctxt->instate = XML_PARSER_CONTENT;
4580#ifdef DEBUG_PUSH
4581 xmlGenericError(xmlGenericErrorContext,
4582 "HPP: entering CONTENT\n");
4583#endif
4584 break;
4585 }
4586
4587 if (CUR == '>') {
4588 NEXT;
4589 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004590 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4591 "Couldn't find end of Start Tag %s\n",
4592 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004593
4594 /*
4595 * end of parsing of this node.
4596 */
4597 if (xmlStrEqual(name, ctxt->name)) {
4598 nodePop(ctxt);
4599 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004600 }
4601
4602 ctxt->instate = XML_PARSER_CONTENT;
4603#ifdef DEBUG_PUSH
4604 xmlGenericError(xmlGenericErrorContext,
4605 "HPP: entering CONTENT\n");
4606#endif
4607 break;
4608 }
4609
4610 /*
4611 * Check for an Empty Element from DTD definition
4612 */
4613 if ((info != NULL) && (info->empty)) {
4614 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4615 ctxt->sax->endElement(ctxt->userData, name);
4616 oldname = htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004617 }
4618 ctxt->instate = XML_PARSER_CONTENT;
4619#ifdef DEBUG_PUSH
4620 xmlGenericError(xmlGenericErrorContext,
4621 "HPP: entering CONTENT\n");
4622#endif
4623 break;
4624 }
4625 case XML_PARSER_CONTENT: {
4626 long cons;
4627 /*
4628 * Handle preparsed entities and charRef
4629 */
4630 if (ctxt->token != 0) {
4631 xmlChar chr[2] = { 0 , 0 } ;
4632
4633 chr[0] = (xmlChar) ctxt->token;
4634 htmlCheckParagraph(ctxt);
4635 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4636 ctxt->sax->characters(ctxt->userData, chr, 1);
4637 ctxt->token = 0;
4638 ctxt->checkIndex = 0;
4639 }
4640 if ((avail == 1) && (terminate)) {
4641 cur = in->cur[0];
4642 if ((cur != '<') && (cur != '&')) {
4643 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00004644 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004645 if (ctxt->sax->ignorableWhitespace != NULL)
4646 ctxt->sax->ignorableWhitespace(
4647 ctxt->userData, &cur, 1);
4648 } else {
4649 htmlCheckParagraph(ctxt);
4650 if (ctxt->sax->characters != NULL)
4651 ctxt->sax->characters(
4652 ctxt->userData, &cur, 1);
4653 }
4654 }
4655 ctxt->token = 0;
4656 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00004657 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00004658 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004659 }
Owen Taylor3473f882001-02-23 17:55:21 +00004660 }
4661 if (avail < 2)
4662 goto done;
4663 cur = in->cur[0];
4664 next = in->cur[1];
4665 cons = ctxt->nbChars;
4666 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4667 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4668 /*
4669 * Handle SCRIPT/STYLE separately
4670 */
4671 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004672 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004673 goto done;
4674 htmlParseScript(ctxt);
4675 if ((cur == '<') && (next == '/')) {
4676 ctxt->instate = XML_PARSER_END_TAG;
4677 ctxt->checkIndex = 0;
4678#ifdef DEBUG_PUSH
4679 xmlGenericError(xmlGenericErrorContext,
4680 "HPP: entering END_TAG\n");
4681#endif
4682 break;
4683 }
4684 } else {
4685 /*
4686 * Sometimes DOCTYPE arrives in the middle of the document
4687 */
4688 if ((cur == '<') && (next == '!') &&
4689 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4690 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4691 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4692 (UPP(8) == 'E')) {
4693 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004694 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004695 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00004696 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4697 "Misplaced DOCTYPE declaration\n",
4698 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004699 htmlParseDocTypeDecl(ctxt);
4700 } else if ((cur == '<') && (next == '!') &&
4701 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4702 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004703 (htmlParseLookupSequence(
4704 ctxt, '-', '-', '>', 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004705 goto done;
4706#ifdef DEBUG_PUSH
4707 xmlGenericError(xmlGenericErrorContext,
4708 "HPP: Parsing Comment\n");
4709#endif
4710 htmlParseComment(ctxt);
4711 ctxt->instate = XML_PARSER_CONTENT;
4712 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4713 goto done;
4714 } else if ((cur == '<') && (next == '/')) {
4715 ctxt->instate = XML_PARSER_END_TAG;
4716 ctxt->checkIndex = 0;
4717#ifdef DEBUG_PUSH
4718 xmlGenericError(xmlGenericErrorContext,
4719 "HPP: entering END_TAG\n");
4720#endif
4721 break;
4722 } else if (cur == '<') {
4723 ctxt->instate = XML_PARSER_START_TAG;
4724 ctxt->checkIndex = 0;
4725#ifdef DEBUG_PUSH
4726 xmlGenericError(xmlGenericErrorContext,
4727 "HPP: entering START_TAG\n");
4728#endif
4729 break;
4730 } else if (cur == '&') {
4731 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004732 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004733 goto done;
4734#ifdef DEBUG_PUSH
4735 xmlGenericError(xmlGenericErrorContext,
4736 "HPP: Parsing Reference\n");
4737#endif
4738 /* TODO: check generation of subtrees if noent !!! */
4739 htmlParseReference(ctxt);
4740 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004741 /*
4742 * check that the text sequence is complete
4743 * before handing out the data to the parser
4744 * to avoid problems with erroneous end of
4745 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00004746 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00004747 if ((!terminate) &&
4748 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4749 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00004750 ctxt->checkIndex = 0;
4751#ifdef DEBUG_PUSH
4752 xmlGenericError(xmlGenericErrorContext,
4753 "HPP: Parsing char data\n");
4754#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004755 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004756 }
4757 }
4758 if (cons == ctxt->nbChars) {
4759 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004760 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4761 "detected an error in element content\n",
4762 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004763 }
4764 NEXT;
4765 break;
4766 }
4767
4768 break;
4769 }
4770 case XML_PARSER_END_TAG:
4771 if (avail < 2)
4772 goto done;
4773 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00004774 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00004775 goto done;
4776 htmlParseEndTag(ctxt);
4777 if (ctxt->nameNr == 0) {
4778 ctxt->instate = XML_PARSER_EPILOG;
4779 } else {
4780 ctxt->instate = XML_PARSER_CONTENT;
4781 }
4782 ctxt->checkIndex = 0;
4783#ifdef DEBUG_PUSH
4784 xmlGenericError(xmlGenericErrorContext,
4785 "HPP: entering CONTENT\n");
4786#endif
4787 break;
4788 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00004789 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4790 "HPP: internal error, state == CDATA\n",
4791 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004792 ctxt->instate = XML_PARSER_CONTENT;
4793 ctxt->checkIndex = 0;
4794#ifdef DEBUG_PUSH
4795 xmlGenericError(xmlGenericErrorContext,
4796 "HPP: entering CONTENT\n");
4797#endif
4798 break;
4799 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00004800 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4801 "HPP: internal error, state == DTD\n",
4802 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004803 ctxt->instate = XML_PARSER_CONTENT;
4804 ctxt->checkIndex = 0;
4805#ifdef DEBUG_PUSH
4806 xmlGenericError(xmlGenericErrorContext,
4807 "HPP: entering CONTENT\n");
4808#endif
4809 break;
4810 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00004811 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4812 "HPP: internal error, state == COMMENT\n",
4813 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004814 ctxt->instate = XML_PARSER_CONTENT;
4815 ctxt->checkIndex = 0;
4816#ifdef DEBUG_PUSH
4817 xmlGenericError(xmlGenericErrorContext,
4818 "HPP: entering CONTENT\n");
4819#endif
4820 break;
4821 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00004822 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4823 "HPP: internal error, state == PI\n",
4824 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004825 ctxt->instate = XML_PARSER_CONTENT;
4826 ctxt->checkIndex = 0;
4827#ifdef DEBUG_PUSH
4828 xmlGenericError(xmlGenericErrorContext,
4829 "HPP: entering CONTENT\n");
4830#endif
4831 break;
4832 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004833 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4834 "HPP: internal error, state == ENTITY_DECL\n",
4835 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004836 ctxt->instate = XML_PARSER_CONTENT;
4837 ctxt->checkIndex = 0;
4838#ifdef DEBUG_PUSH
4839 xmlGenericError(xmlGenericErrorContext,
4840 "HPP: entering CONTENT\n");
4841#endif
4842 break;
4843 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004844 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4845 "HPP: internal error, state == ENTITY_VALUE\n",
4846 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004847 ctxt->instate = XML_PARSER_CONTENT;
4848 ctxt->checkIndex = 0;
4849#ifdef DEBUG_PUSH
4850 xmlGenericError(xmlGenericErrorContext,
4851 "HPP: entering DTD\n");
4852#endif
4853 break;
4854 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004855 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4856 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
4857 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004858 ctxt->instate = XML_PARSER_START_TAG;
4859 ctxt->checkIndex = 0;
4860#ifdef DEBUG_PUSH
4861 xmlGenericError(xmlGenericErrorContext,
4862 "HPP: entering START_TAG\n");
4863#endif
4864 break;
4865 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004866 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4867 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
4868 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004869 ctxt->instate = XML_PARSER_CONTENT;
4870 ctxt->checkIndex = 0;
4871#ifdef DEBUG_PUSH
4872 xmlGenericError(xmlGenericErrorContext,
4873 "HPP: entering CONTENT\n");
4874#endif
4875 break;
4876 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00004877 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4878 "HPP: internal error, state == XML_PARSER_IGNORE\n",
4879 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004880 ctxt->instate = XML_PARSER_CONTENT;
4881 ctxt->checkIndex = 0;
4882#ifdef DEBUG_PUSH
4883 xmlGenericError(xmlGenericErrorContext,
4884 "HPP: entering CONTENT\n");
4885#endif
4886 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004887 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00004888 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4889 "HPP: internal error, state == XML_PARSER_LITERAL\n",
4890 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00004891 ctxt->instate = XML_PARSER_CONTENT;
4892 ctxt->checkIndex = 0;
4893#ifdef DEBUG_PUSH
4894 xmlGenericError(xmlGenericErrorContext,
4895 "HPP: entering CONTENT\n");
4896#endif
4897 break;
4898
Owen Taylor3473f882001-02-23 17:55:21 +00004899 }
4900 }
4901done:
4902 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004903 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004904 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4905 /*
4906 * SAX: end of the document processing.
4907 */
4908 ctxt->instate = XML_PARSER_EOF;
4909 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4910 ctxt->sax->endDocument(ctxt->userData);
4911 }
4912 }
4913 if ((ctxt->myDoc != NULL) &&
4914 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4915 (ctxt->instate == XML_PARSER_EPILOG))) {
4916 xmlDtdPtr dtd;
4917 dtd = xmlGetIntSubset(ctxt->myDoc);
4918 if (dtd == NULL)
4919 ctxt->myDoc->intSubset =
Daniel Veillard40412cd2003-09-03 13:28:32 +00004920 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004921 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4922 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4923 }
4924#ifdef DEBUG_PUSH
4925 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4926#endif
4927 return(ret);
4928}
4929
4930/**
Owen Taylor3473f882001-02-23 17:55:21 +00004931 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00004932 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004933 * @chunk: an char array
4934 * @size: the size in byte of the chunk
4935 * @terminate: last chunk indicator
4936 *
4937 * Parse a Chunk of memory
4938 *
4939 * Returns zero if no error, the xmlParserErrors otherwise.
4940 */
4941int
4942htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4943 int terminate) {
4944 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4945 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4946 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4947 int cur = ctxt->input->cur - ctxt->input->base;
4948
4949 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4950 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4951 ctxt->input->cur = ctxt->input->base + cur;
4952#ifdef DEBUG_PUSH
4953 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4954#endif
4955
Daniel Veillard14f752c2003-08-09 11:44:50 +00004956#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00004957 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4958 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004959#endif
Owen Taylor3473f882001-02-23 17:55:21 +00004960 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00004961 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
4962 xmlParserInputBufferPtr in = ctxt->input->buf;
4963 if ((in->encoder != NULL) && (in->buffer != NULL) &&
4964 (in->raw != NULL)) {
4965 int nbchars;
4966
4967 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
4968 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004969 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
4970 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00004971 return(XML_ERR_INVALID_ENCODING);
4972 }
4973 }
4974 }
Owen Taylor3473f882001-02-23 17:55:21 +00004975 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00004976 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00004977 if (terminate) {
4978 if ((ctxt->instate != XML_PARSER_EOF) &&
4979 (ctxt->instate != XML_PARSER_EPILOG) &&
4980 (ctxt->instate != XML_PARSER_MISC)) {
4981 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00004982 ctxt->wellFormed = 0;
4983 }
4984 if (ctxt->instate != XML_PARSER_EOF) {
4985 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4986 ctxt->sax->endDocument(ctxt->userData);
4987 }
4988 ctxt->instate = XML_PARSER_EOF;
4989 }
4990 return((xmlParserErrors) ctxt->errNo);
4991}
Daniel Veillard73b013f2003-09-30 12:36:01 +00004992#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00004993
4994/************************************************************************
4995 * *
4996 * User entry points *
4997 * *
4998 ************************************************************************/
4999
5000/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005001 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005002 * @sax: a SAX handler
5003 * @user_data: The user data returned on SAX callbacks
5004 * @chunk: a pointer to an array of chars
5005 * @size: number of chars in the array
5006 * @filename: an optional file name or URI
5007 * @enc: an optional encoding
5008 *
5009 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00005010 * The value of @filename is used for fetching external entities
5011 * and error/warning reports.
5012 *
5013 * Returns the new parser context or NULL
5014 */
5015htmlParserCtxtPtr
5016htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5017 const char *chunk, int size, const char *filename,
5018 xmlCharEncoding enc) {
5019 htmlParserCtxtPtr ctxt;
5020 htmlParserInputPtr inputStream;
5021 xmlParserInputBufferPtr buf;
5022
Daniel Veillardd0463562001-10-13 09:15:48 +00005023 xmlInitParser();
5024
Owen Taylor3473f882001-02-23 17:55:21 +00005025 buf = xmlAllocParserInputBuffer(enc);
5026 if (buf == NULL) return(NULL);
5027
Daniel Veillardf403d292003-10-05 13:51:35 +00005028 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005029 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005030 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005031 return(NULL);
5032 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00005033 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5034 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00005035 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00005036 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00005037 xmlFree(ctxt->sax);
5038 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5039 if (ctxt->sax == NULL) {
5040 xmlFree(buf);
5041 xmlFree(ctxt);
5042 return(NULL);
5043 }
5044 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5045 if (user_data != NULL)
5046 ctxt->userData = user_data;
5047 }
5048 if (filename == NULL) {
5049 ctxt->directory = NULL;
5050 } else {
5051 ctxt->directory = xmlParserGetDirectory(filename);
5052 }
5053
5054 inputStream = htmlNewInputStream(ctxt);
5055 if (inputStream == NULL) {
5056 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00005057 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00005058 return(NULL);
5059 }
5060
5061 if (filename == NULL)
5062 inputStream->filename = NULL;
5063 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00005064 inputStream->filename = (char *)
5065 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00005066 inputStream->buf = buf;
5067 inputStream->base = inputStream->buf->buffer->content;
5068 inputStream->cur = inputStream->buf->buffer->content;
Daniel Veillard5f704af2003-03-05 10:01:43 +00005069 inputStream->end =
5070 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005071
5072 inputPush(ctxt, inputStream);
5073
5074 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5075 (ctxt->input->buf != NULL)) {
Daniel Veillard5f704af2003-03-05 10:01:43 +00005076 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5077 int cur = ctxt->input->cur - ctxt->input->base;
5078
Owen Taylor3473f882001-02-23 17:55:21 +00005079 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00005080
5081 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5082 ctxt->input->cur = ctxt->input->base + cur;
5083 ctxt->input->end =
5084 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
Owen Taylor3473f882001-02-23 17:55:21 +00005085#ifdef DEBUG_PUSH
5086 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5087#endif
5088 }
5089
5090 return(ctxt);
5091}
5092
5093/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005094 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005095 * @cur: a pointer to an array of xmlChar
5096 * @encoding: a free form C string describing the HTML document encoding, or NULL
5097 * @sax: the SAX handler block
5098 * @userData: if using SAX, this pointer will be provided on callbacks.
5099 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005100 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5101 * to handle parse events. If sax is NULL, fallback to the default DOM
5102 * behavior and return a tree.
Owen Taylor3473f882001-02-23 17:55:21 +00005103 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005104 * Returns the resulting document tree unless SAX is NULL or the document is
5105 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005106 */
5107
5108htmlDocPtr
5109htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5110 htmlDocPtr ret;
5111 htmlParserCtxtPtr ctxt;
5112
Daniel Veillardd0463562001-10-13 09:15:48 +00005113 xmlInitParser();
5114
Owen Taylor3473f882001-02-23 17:55:21 +00005115 if (cur == NULL) return(NULL);
5116
5117
5118 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5119 if (ctxt == NULL) return(NULL);
5120 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00005121 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00005122 ctxt->sax = sax;
5123 ctxt->userData = userData;
5124 }
5125
5126 htmlParseDocument(ctxt);
5127 ret = ctxt->myDoc;
5128 if (sax != NULL) {
5129 ctxt->sax = NULL;
5130 ctxt->userData = NULL;
5131 }
5132 htmlFreeParserCtxt(ctxt);
5133
5134 return(ret);
5135}
5136
5137/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005138 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00005139 * @cur: a pointer to an array of xmlChar
5140 * @encoding: a free form C string describing the HTML document encoding, or NULL
5141 *
5142 * parse an HTML in-memory document and build a tree.
5143 *
5144 * Returns the resulting document tree
5145 */
5146
5147htmlDocPtr
5148htmlParseDoc(xmlChar *cur, const char *encoding) {
5149 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5150}
5151
5152
5153/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005154 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005155 * @filename: the filename
5156 * @encoding: a free form C string describing the HTML document encoding, or NULL
5157 *
5158 * Create a parser context for a file content.
5159 * Automatic support for ZLIB/Compress compressed document is provided
5160 * by default if found at compile-time.
5161 *
5162 * Returns the new parser context or NULL
5163 */
5164htmlParserCtxtPtr
5165htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5166{
5167 htmlParserCtxtPtr ctxt;
5168 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005169 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00005170 /* htmlCharEncoding enc; */
5171 xmlChar *content, *content_line = (xmlChar *) "charset=";
5172
Daniel Veillardf403d292003-10-05 13:51:35 +00005173 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00005174 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00005175 return(NULL);
5176 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005177 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5178 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00005179#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005180 if (xmlDefaultSAXHandler.error != NULL) {
5181 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5182 }
Daniel Veillard87247e82004-01-13 20:42:02 +00005183#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00005184 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005185 return(NULL);
5186 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005187
5188 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5189 xmlFree(canonicFilename);
5190 if (inputStream == NULL) {
5191 xmlFreeParserCtxt(ctxt);
5192 return(NULL);
5193 }
Owen Taylor3473f882001-02-23 17:55:21 +00005194
5195 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00005196
Owen Taylor3473f882001-02-23 17:55:21 +00005197 /* set encoding */
5198 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005199 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005200 if (content) {
5201 strcpy ((char *)content, (char *)content_line);
5202 strcat ((char *)content, (char *)encoding);
5203 htmlCheckEncoding (ctxt, content);
5204 xmlFree (content);
5205 }
5206 }
5207
5208 return(ctxt);
5209}
5210
5211/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005212 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005213 * @filename: the filename
5214 * @encoding: a free form C string describing the HTML document encoding, or NULL
5215 * @sax: the SAX handler block
5216 * @userData: if using SAX, this pointer will be provided on callbacks.
5217 *
5218 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5219 * compressed document is provided by default if found at compile-time.
5220 * It use the given SAX function block to handle the parsing callback.
5221 * If sax is NULL, fallback to the default DOM tree building routines.
5222 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00005223 * Returns the resulting document tree unless SAX is NULL or the document is
5224 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00005225 */
5226
5227htmlDocPtr
5228htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5229 void *userData) {
5230 htmlDocPtr ret;
5231 htmlParserCtxtPtr ctxt;
5232 htmlSAXHandlerPtr oldsax = NULL;
5233
Daniel Veillardd0463562001-10-13 09:15:48 +00005234 xmlInitParser();
5235
Owen Taylor3473f882001-02-23 17:55:21 +00005236 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5237 if (ctxt == NULL) return(NULL);
5238 if (sax != NULL) {
5239 oldsax = ctxt->sax;
5240 ctxt->sax = sax;
5241 ctxt->userData = userData;
5242 }
5243
5244 htmlParseDocument(ctxt);
5245
5246 ret = ctxt->myDoc;
5247 if (sax != NULL) {
5248 ctxt->sax = oldsax;
5249 ctxt->userData = NULL;
5250 }
5251 htmlFreeParserCtxt(ctxt);
5252
5253 return(ret);
5254}
5255
5256/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005257 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00005258 * @filename: the filename
5259 * @encoding: a free form C string describing the HTML document encoding, or NULL
5260 *
5261 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5262 * compressed document is provided by default if found at compile-time.
5263 *
5264 * Returns the resulting document tree
5265 */
5266
5267htmlDocPtr
5268htmlParseFile(const char *filename, const char *encoding) {
5269 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5270}
5271
5272/**
5273 * htmlHandleOmittedElem:
5274 * @val: int 0 or 1
5275 *
5276 * Set and return the previous value for handling HTML omitted tags.
5277 *
5278 * Returns the last value for 0 for no handling, 1 for auto insertion.
5279 */
5280
5281int
5282htmlHandleOmittedElem(int val) {
5283 int old = htmlOmittedDefaultValue;
5284
5285 htmlOmittedDefaultValue = val;
5286 return(old);
5287}
5288
Daniel Veillard930dfb62003-02-05 10:17:38 +00005289/**
5290 * htmlElementAllowedHere:
5291 * @parent: HTML parent element
5292 * @elt: HTML element
5293 *
5294 * Checks whether an HTML element may be a direct child of a parent element.
5295 * Note - doesn't check for deprecated elements
5296 *
5297 * Returns 1 if allowed; 0 otherwise.
5298 */
5299int
5300htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5301 const char** p ;
5302
5303 if ( ! elt || ! parent || ! parent->subelts )
5304 return 0 ;
5305
5306 for ( p = parent->subelts; *p; ++p )
5307 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5308 return 1 ;
5309
5310 return 0 ;
5311}
5312/**
5313 * htmlElementStatusHere:
5314 * @parent: HTML parent element
5315 * @elt: HTML element
5316 *
5317 * Checks whether an HTML element may be a direct child of a parent element.
5318 * and if so whether it is valid or deprecated.
5319 *
5320 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5321 */
5322htmlStatus
5323htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5324 if ( ! parent || ! elt )
5325 return HTML_INVALID ;
5326 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5327 return HTML_INVALID ;
5328
5329 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5330}
5331/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005332 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00005333 * @elt: HTML element
5334 * @attr: HTML attribute
5335 * @legacy: whether to allow deprecated attributes
5336 *
5337 * Checks whether an attribute is valid for an element
5338 * Has full knowledge of Required and Deprecated attributes
5339 *
5340 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5341 */
5342htmlStatus
5343htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5344 const char** p ;
5345
5346 if ( !elt || ! attr )
5347 return HTML_INVALID ;
5348
5349 if ( elt->attrs_req )
5350 for ( p = elt->attrs_req; *p; ++p)
5351 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5352 return HTML_REQUIRED ;
5353
5354 if ( elt->attrs_opt )
5355 for ( p = elt->attrs_opt; *p; ++p)
5356 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5357 return HTML_VALID ;
5358
5359 if ( legacy && elt->attrs_depr )
5360 for ( p = elt->attrs_depr; *p; ++p)
5361 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5362 return HTML_DEPRECATED ;
5363
5364 return HTML_INVALID ;
5365}
5366/**
Daniel Veillard71531f32003-02-05 13:19:53 +00005367 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00005368 * @node: an htmlNodePtr in a tree
5369 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00005370 * for Element nodes)
5371 *
5372 * Checks whether the tree node is valid. Experimental (the author
5373 * only uses the HTML enhancements in a SAX parser)
5374 *
5375 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5376 * legacy allowed) or htmlElementStatusHere (otherwise).
5377 * for Attribute nodes, a return from htmlAttrAllowed
5378 * for other nodes, HTML_NA (no checks performed)
5379 */
5380htmlStatus
5381htmlNodeStatus(const htmlNodePtr node, int legacy) {
5382 if ( ! node )
5383 return HTML_INVALID ;
5384
5385 switch ( node->type ) {
5386 case XML_ELEMENT_NODE:
5387 return legacy
5388 ? ( htmlElementAllowedHere (
5389 htmlTagLookup(node->parent->name) , node->name
5390 ) ? HTML_VALID : HTML_INVALID )
5391 : htmlElementStatusHere(
5392 htmlTagLookup(node->parent->name) ,
5393 htmlTagLookup(node->name) )
5394 ;
5395 case XML_ATTRIBUTE_NODE:
5396 return htmlAttrAllowed(
5397 htmlTagLookup(node->parent->name) , node->name, legacy) ;
5398 default: return HTML_NA ;
5399 }
5400}
Daniel Veillard9475a352003-09-26 12:47:50 +00005401/************************************************************************
5402 * *
5403 * New set (2.6.0) of simpler and more flexible APIs *
5404 * *
5405 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A NULL @str is ignored.  The expansion is a single
 * statement, so the macro is safe inside unbraced if/else; callers
 * supply the terminating semicolon.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
5417
5418/**
5419 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00005420 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00005421 *
5422 * Reset a parser context
5423 */
5424void
5425htmlCtxtReset(htmlParserCtxtPtr ctxt)
5426{
5427 xmlParserInputPtr input;
5428 xmlDictPtr dict = ctxt->dict;
5429
5430 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5431 xmlFreeInputStream(input);
5432 }
5433 ctxt->inputNr = 0;
5434 ctxt->input = NULL;
5435
5436 ctxt->spaceNr = 0;
5437 ctxt->spaceTab[0] = -1;
5438 ctxt->space = &ctxt->spaceTab[0];
5439
5440
5441 ctxt->nodeNr = 0;
5442 ctxt->node = NULL;
5443
5444 ctxt->nameNr = 0;
5445 ctxt->name = NULL;
5446
5447 DICT_FREE(ctxt->version);
5448 ctxt->version = NULL;
5449 DICT_FREE(ctxt->encoding);
5450 ctxt->encoding = NULL;
5451 DICT_FREE(ctxt->directory);
5452 ctxt->directory = NULL;
5453 DICT_FREE(ctxt->extSubURI);
5454 ctxt->extSubURI = NULL;
5455 DICT_FREE(ctxt->extSubSystem);
5456 ctxt->extSubSystem = NULL;
5457 if (ctxt->myDoc != NULL)
5458 xmlFreeDoc(ctxt->myDoc);
5459 ctxt->myDoc = NULL;
5460
5461 ctxt->standalone = -1;
5462 ctxt->hasExternalSubset = 0;
5463 ctxt->hasPErefs = 0;
5464 ctxt->html = 1;
5465 ctxt->external = 0;
5466 ctxt->instate = XML_PARSER_START;
5467 ctxt->token = 0;
5468
5469 ctxt->wellFormed = 1;
5470 ctxt->nsWellFormed = 1;
5471 ctxt->valid = 1;
5472 ctxt->vctxt.userData = ctxt;
5473 ctxt->vctxt.error = xmlParserValidityError;
5474 ctxt->vctxt.warning = xmlParserValidityWarning;
5475 ctxt->record_info = 0;
5476 ctxt->nbChars = 0;
5477 ctxt->checkIndex = 0;
5478 ctxt->inSubset = 0;
5479 ctxt->errNo = XML_ERR_OK;
5480 ctxt->depth = 0;
5481 ctxt->charset = XML_CHAR_ENCODING_UTF8;
5482 ctxt->catalogs = NULL;
5483 xmlInitNodeInfoSeq(&ctxt->node_seq);
5484
5485 if (ctxt->attsDefault != NULL) {
5486 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5487 ctxt->attsDefault = NULL;
5488 }
5489 if (ctxt->attsSpecial != NULL) {
5490 xmlHashFree(ctxt->attsSpecial, NULL);
5491 ctxt->attsSpecial = NULL;
5492 }
5493}
5494
5495/**
5496 * htmlCtxtUseOptions:
5497 * @ctxt: an HTML parser context
5498 * @options: a combination of htmlParserOption(s)
5499 *
5500 * Applies the options to the parser context
5501 *
5502 * Returns 0 in case of success, the set of unknown or unimplemented options
5503 * in case of error.
5504 */
5505int
5506htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5507{
5508 if (options & HTML_PARSE_NOWARNING) {
5509 ctxt->sax->warning = NULL;
5510 options -= XML_PARSE_NOWARNING;
5511 }
5512 if (options & HTML_PARSE_NOERROR) {
5513 ctxt->sax->error = NULL;
5514 ctxt->sax->fatalError = NULL;
5515 options -= XML_PARSE_NOERROR;
5516 }
5517 if (options & HTML_PARSE_PEDANTIC) {
5518 ctxt->pedantic = 1;
5519 options -= XML_PARSE_PEDANTIC;
5520 } else
5521 ctxt->pedantic = 0;
5522 if (options & XML_PARSE_NOBLANKS) {
5523 ctxt->keepBlanks = 0;
5524 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5525 options -= XML_PARSE_NOBLANKS;
5526 } else
5527 ctxt->keepBlanks = 1;
5528 ctxt->dictNames = 0;
5529 return (options);
5530}
5531
5532/**
5533 * htmlDoRead:
5534 * @ctxt: an HTML parser context
5535 * @URL: the base URL to use for the document
5536 * @encoding: the document encoding, or NULL
5537 * @options: a combination of htmlParserOption(s)
5538 * @reuse: keep the context for reuse
5539 *
5540 * Common front-end for the htmlRead functions
5541 *
5542 * Returns the resulting document tree or NULL
5543 */
5544static htmlDocPtr
5545htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5546 int options, int reuse)
5547{
5548 htmlDocPtr ret;
5549
5550 htmlCtxtUseOptions(ctxt, options);
5551 ctxt->html = 1;
5552 if (encoding != NULL) {
5553 xmlCharEncodingHandlerPtr hdlr;
5554
5555 hdlr = xmlFindCharEncodingHandler(encoding);
5556 if (hdlr != NULL)
5557 xmlSwitchToEncoding(ctxt, hdlr);
5558 }
5559 if ((URL != NULL) && (ctxt->input != NULL) &&
5560 (ctxt->input->filename == NULL))
5561 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5562 htmlParseDocument(ctxt);
5563 ret = ctxt->myDoc;
5564 ctxt->myDoc = NULL;
5565 if (!reuse) {
5566 if ((ctxt->dictNames) &&
5567 (ret != NULL) &&
5568 (ret->dict == ctxt->dict))
5569 ctxt->dict = NULL;
5570 xmlFreeParserCtxt(ctxt);
5571 } else {
5572 /* Must duplicate the reference to the dictionary */
5573 if ((ctxt->dictNames) &&
5574 (ret != NULL) &&
5575 (ret->dict == ctxt->dict))
5576 xmlDictReference(ctxt->dict);
5577 }
5578 return (ret);
5579}
5580
5581/**
5582 * htmlReadDoc:
5583 * @cur: a pointer to a zero terminated string
5584 * @URL: the base URL to use for the document
5585 * @encoding: the document encoding, or NULL
5586 * @options: a combination of htmlParserOption(s)
5587 *
5588 * parse an XML in-memory document and build a tree.
5589 *
5590 * Returns the resulting document tree
5591 */
5592htmlDocPtr
5593htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5594{
5595 htmlParserCtxtPtr ctxt;
5596
5597 if (cur == NULL)
5598 return (NULL);
5599
5600 ctxt = xmlCreateDocParserCtxt(cur);
5601 if (ctxt == NULL)
5602 return (NULL);
5603 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5604}
5605
5606/**
5607 * htmlReadFile:
5608 * @filename: a file or URL
5609 * @encoding: the document encoding, or NULL
5610 * @options: a combination of htmlParserOption(s)
5611 *
5612 * parse an XML file from the filesystem or the network.
5613 *
5614 * Returns the resulting document tree
5615 */
5616htmlDocPtr
5617htmlReadFile(const char *filename, const char *encoding, int options)
5618{
5619 htmlParserCtxtPtr ctxt;
5620
5621 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5622 if (ctxt == NULL)
5623 return (NULL);
5624 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5625}
5626
5627/**
5628 * htmlReadMemory:
5629 * @buffer: a pointer to a char array
5630 * @size: the size of the array
5631 * @URL: the base URL to use for the document
5632 * @encoding: the document encoding, or NULL
5633 * @options: a combination of htmlParserOption(s)
5634 *
5635 * parse an XML in-memory document and build a tree.
5636 *
5637 * Returns the resulting document tree
5638 */
5639htmlDocPtr
5640htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5641{
5642 htmlParserCtxtPtr ctxt;
5643
5644 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5645 if (ctxt == NULL)
5646 return (NULL);
5647 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5648}
5649
5650/**
5651 * htmlReadFd:
5652 * @fd: an open file descriptor
5653 * @URL: the base URL to use for the document
5654 * @encoding: the document encoding, or NULL
5655 * @options: a combination of htmlParserOption(s)
5656 *
5657 * parse an XML from a file descriptor and build a tree.
5658 *
5659 * Returns the resulting document tree
5660 */
5661htmlDocPtr
5662htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5663{
5664 htmlParserCtxtPtr ctxt;
5665 xmlParserInputBufferPtr input;
5666 xmlParserInputPtr stream;
5667
5668 if (fd < 0)
5669 return (NULL);
5670
5671 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5672 if (input == NULL)
5673 return (NULL);
5674 ctxt = xmlNewParserCtxt();
5675 if (ctxt == NULL) {
5676 xmlFreeParserInputBuffer(input);
5677 return (NULL);
5678 }
5679 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5680 if (stream == NULL) {
5681 xmlFreeParserInputBuffer(input);
5682 xmlFreeParserCtxt(ctxt);
5683 return (NULL);
5684 }
5685 inputPush(ctxt, stream);
5686 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5687}
5688
5689/**
5690 * htmlReadIO:
5691 * @ioread: an I/O read function
5692 * @ioclose: an I/O close function
5693 * @ioctx: an I/O handler
5694 * @URL: the base URL to use for the document
5695 * @encoding: the document encoding, or NULL
5696 * @options: a combination of htmlParserOption(s)
5697 *
5698 * parse an HTML document from I/O functions and source and build a tree.
5699 *
5700 * Returns the resulting document tree
5701 */
5702htmlDocPtr
5703htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5704 void *ioctx, const char *URL, const char *encoding, int options)
5705{
5706 htmlParserCtxtPtr ctxt;
5707 xmlParserInputBufferPtr input;
5708 xmlParserInputPtr stream;
5709
5710 if (ioread == NULL)
5711 return (NULL);
5712
5713 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5714 XML_CHAR_ENCODING_NONE);
5715 if (input == NULL)
5716 return (NULL);
5717 ctxt = xmlNewParserCtxt();
5718 if (ctxt == NULL) {
5719 xmlFreeParserInputBuffer(input);
5720 return (NULL);
5721 }
5722 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5723 if (stream == NULL) {
5724 xmlFreeParserInputBuffer(input);
5725 xmlFreeParserCtxt(ctxt);
5726 return (NULL);
5727 }
5728 inputPush(ctxt, stream);
5729 return (htmlDoRead(ctxt, URL, encoding, options, 0));
5730}
5731
5732/**
5733 * htmlCtxtReadDoc:
5734 * @ctxt: an HTML parser context
5735 * @cur: a pointer to a zero terminated string
5736 * @URL: the base URL to use for the document
5737 * @encoding: the document encoding, or NULL
5738 * @options: a combination of htmlParserOption(s)
5739 *
5740 * parse an XML in-memory document and build a tree.
5741 * This reuses the existing @ctxt parser context
5742 *
5743 * Returns the resulting document tree
5744 */
5745htmlDocPtr
5746htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5747 const char *URL, const char *encoding, int options)
5748{
5749 xmlParserInputPtr stream;
5750
5751 if (cur == NULL)
5752 return (NULL);
5753 if (ctxt == NULL)
5754 return (NULL);
5755
5756 htmlCtxtReset(ctxt);
5757
5758 stream = xmlNewStringInputStream(ctxt, cur);
5759 if (stream == NULL) {
5760 return (NULL);
5761 }
5762 inputPush(ctxt, stream);
5763 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5764}
5765
5766/**
5767 * htmlCtxtReadFile:
5768 * @ctxt: an HTML parser context
5769 * @filename: a file or URL
5770 * @encoding: the document encoding, or NULL
5771 * @options: a combination of htmlParserOption(s)
5772 *
5773 * parse an XML file from the filesystem or the network.
5774 * This reuses the existing @ctxt parser context
5775 *
5776 * Returns the resulting document tree
5777 */
5778htmlDocPtr
5779htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5780 const char *encoding, int options)
5781{
5782 xmlParserInputPtr stream;
5783
5784 if (filename == NULL)
5785 return (NULL);
5786 if (ctxt == NULL)
5787 return (NULL);
5788
5789 htmlCtxtReset(ctxt);
5790
5791 stream = xmlNewInputFromFile(ctxt, filename);
5792 if (stream == NULL) {
5793 return (NULL);
5794 }
5795 inputPush(ctxt, stream);
5796 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5797}
5798
5799/**
5800 * htmlCtxtReadMemory:
5801 * @ctxt: an HTML parser context
5802 * @buffer: a pointer to a char array
5803 * @size: the size of the array
5804 * @URL: the base URL to use for the document
5805 * @encoding: the document encoding, or NULL
5806 * @options: a combination of htmlParserOption(s)
5807 *
5808 * parse an XML in-memory document and build a tree.
5809 * This reuses the existing @ctxt parser context
5810 *
5811 * Returns the resulting document tree
5812 */
5813htmlDocPtr
5814htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5815 const char *URL, const char *encoding, int options)
5816{
5817 xmlParserInputBufferPtr input;
5818 xmlParserInputPtr stream;
5819
5820 if (ctxt == NULL)
5821 return (NULL);
5822 if (buffer == NULL)
5823 return (NULL);
5824
5825 htmlCtxtReset(ctxt);
5826
5827 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5828 if (input == NULL) {
5829 return(NULL);
5830 }
5831
5832 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5833 if (stream == NULL) {
5834 xmlFreeParserInputBuffer(input);
5835 return(NULL);
5836 }
5837
5838 inputPush(ctxt, stream);
5839 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5840}
5841
5842/**
5843 * htmlCtxtReadFd:
5844 * @ctxt: an HTML parser context
5845 * @fd: an open file descriptor
5846 * @URL: the base URL to use for the document
5847 * @encoding: the document encoding, or NULL
5848 * @options: a combination of htmlParserOption(s)
5849 *
5850 * parse an XML from a file descriptor and build a tree.
5851 * This reuses the existing @ctxt parser context
5852 *
5853 * Returns the resulting document tree
5854 */
5855htmlDocPtr
5856htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
5857 const char *URL, const char *encoding, int options)
5858{
5859 xmlParserInputBufferPtr input;
5860 xmlParserInputPtr stream;
5861
5862 if (fd < 0)
5863 return (NULL);
5864 if (ctxt == NULL)
5865 return (NULL);
5866
5867 htmlCtxtReset(ctxt);
5868
5869
5870 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5871 if (input == NULL)
5872 return (NULL);
5873 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5874 if (stream == NULL) {
5875 xmlFreeParserInputBuffer(input);
5876 return (NULL);
5877 }
5878 inputPush(ctxt, stream);
5879 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5880}
5881
5882/**
5883 * htmlCtxtReadIO:
5884 * @ctxt: an HTML parser context
5885 * @ioread: an I/O read function
5886 * @ioclose: an I/O close function
5887 * @ioctx: an I/O handler
5888 * @URL: the base URL to use for the document
5889 * @encoding: the document encoding, or NULL
5890 * @options: a combination of htmlParserOption(s)
5891 *
5892 * parse an HTML document from I/O functions and source and build a tree.
5893 * This reuses the existing @ctxt parser context
5894 *
5895 * Returns the resulting document tree
5896 */
5897htmlDocPtr
5898htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
5899 xmlInputCloseCallback ioclose, void *ioctx,
5900 const char *URL,
5901 const char *encoding, int options)
5902{
5903 xmlParserInputBufferPtr input;
5904 xmlParserInputPtr stream;
5905
5906 if (ioread == NULL)
5907 return (NULL);
5908 if (ctxt == NULL)
5909 return (NULL);
5910
5911 htmlCtxtReset(ctxt);
5912
5913 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5914 XML_CHAR_ENCODING_NONE);
5915 if (input == NULL)
5916 return (NULL);
5917 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5918 if (stream == NULL) {
5919 xmlFreeParserInputBuffer(input);
5920 return (NULL);
5921 }
5922 inputPush(ctxt, stream);
5923 return (htmlDoRead(ctxt, URL, encoding, options, 1));
5924}
5925
Owen Taylor3473f882001-02-23 17:55:21 +00005926#endif /* LIBXML_HTML_ENABLED */