blob: 4d366175ba1bc88e7e9176f9c4c14937c1225ea2 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
Daniel Veillarda78d8032012-07-16 14:56:50 +080047#include "buf.h"
48#include "enc.h"
49
Owen Taylor3473f882001-02-23 17:55:21 +000050#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
Daniel Veillard22090732001-07-16 00:06:07 +000057static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000058
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000061static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000062
63/************************************************************************
64 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020065 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000066 * *
67 ************************************************************************/
68
69/**
William M. Brackedb65a72004-02-06 07:36:04 +000070 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000071 * @ctxt: an HTML parser context
72 * @extra: extra informations
73 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
Daniel Veillard157fee02003-10-31 10:36:03 +000079 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000082 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000088 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000089 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000093 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000094 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
Daniel Veillard157fee02003-10-31 10:36:03 +0000112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000115 if (ctxt != NULL)
116 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
Daniel Veillard157fee02003-10-31 10:36:03 +0000139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000142 if (ctxt != NULL)
143 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000149}
150
151/************************************************************************
152 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200153 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000154 * *
155 ************************************************************************/
156
Daniel Veillard1c732d22002-11-30 11:22:59 +0000157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000165 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000168{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000175 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000176 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000180 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000199 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000200
Daniel Veillard1c732d22002-11-30 11:22:59 +0000201 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000205 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000211 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000212 return (ret);
213}
Owen Taylor3473f882001-02-23 17:55:21 +0000214
Eugene Pimenov615904f2010-03-15 15:16:02 +0100215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
/*
 * Macros for accessing the content. Those should be used only by the
 * parser, and not exported.
 *
 * All of them expect a variable named ctxt (the parser context) to be in
 * scope at the point of use. They work on raw xmlChar values of the
 * current input, so comparisons should be limited to ASCII characters.
 */

/* Current input byte, converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance by val xmlChars, updating the character and column counters.
 * The line counter is not touched, so only use it to skip ASCII strings
 * without newlines.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* The val'th xmlChar after the current position. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current position in the input. */
#define CUR_PTR ctxt->input->cur

/*
 * Shrink the input when more than 2 * INPUT_CHUNK has been consumed and
 * less than 2 * INPUT_CHUNK is left. Expands to a bare if statement.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

/*
 * Grow the input when not in progressive mode and less than INPUT_CHUNK
 * is left. Expands to a bare if statement.
 */
#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input xmlChar as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blank characters, yields the number skipped. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* Current input xmlChar as an int. */
#define CUR ((int) (*ctxt->input->cur))

/* Move to the next character through xmlNextChar(). */
#define NEXT xmlNextChar(ctxt)

/* Current input xmlChar, or -1 while a token is pending in the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Step over the current character, which is l xmlChars long: maintain
 * the line and column counters, clear any pending token and count one
 * more character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

/* Current character from the input; its length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)

/* Current character of the string s; its length is stored in l. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append the character v, which is l xmlChars long, to buffer b at
 * index i and advance i. Expands to a bare if/else statement.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
347
348/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200349 * htmlFindEncoding:
350 * @the HTML parser context
351 *
352 * Ty to find and encoding in the current data available in the input
353 * buffer this is needed to try to switch to the proper encoding when
354 * one face a character error.
355 * That's an heuristic, since it's operating outside of parsing it could
356 * try to use a meta which had been commented out, that's the reason it
357 * should only be used in case of error, not as a default.
358 *
359 * Returns an encoding string or NULL if not found, the string need to
360 * be freed
361 */
362static xmlChar *
363htmlFindEncoding(xmlParserCtxtPtr ctxt) {
364 const xmlChar *start, *cur, *end;
365
366 if ((ctxt == NULL) || (ctxt->input == NULL) ||
367 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
368 (ctxt->input->buf->encoder != NULL))
369 return(NULL);
370 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
371 return(NULL);
372
373 start = ctxt->input->cur;
374 end = ctxt->input->end;
375 /* we also expect the input buffer to be zero terminated */
376 if (*end != 0)
377 return(NULL);
378
379 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
380 if (cur == NULL)
381 return(NULL);
382 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
383 if (cur == NULL)
384 return(NULL);
385 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
386 if (cur == NULL)
387 return(NULL);
388 cur += 8;
389 start = cur;
390 while (((*cur >= 'A') && (*cur <= 'Z')) ||
391 ((*cur >= 'a') && (*cur <= 'z')) ||
392 ((*cur >= '0') && (*cur <= '9')) ||
393 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
394 cur++;
395 if (cur == start)
396 return(NULL);
397 return(xmlStrndup(start, cur - start));
398}
399
400/**
Owen Taylor3473f882001-02-23 17:55:21 +0000401 * htmlCurrentChar:
402 * @ctxt: the HTML parser context
403 * @len: pointer to the length of the char read
404 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000405 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000406 * bytes in the input buffer. Implement the end of line normalization:
407 * 2.11 End-of-Line Handling
408 * If the encoding is unspecified, in the case we find an ISO-Latin-1
409 * char, then the encoding converter is plugged in automatically.
410 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000411 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000412 */
413
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000414static int
Owen Taylor3473f882001-02-23 17:55:21 +0000415htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
416 if (ctxt->instate == XML_PARSER_EOF)
417 return(0);
418
419 if (ctxt->token != 0) {
420 *len = 0;
421 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200422 }
Owen Taylor3473f882001-02-23 17:55:21 +0000423 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
424 /*
425 * We are supposed to handle UTF8, check it's valid
426 * From rfc2044: encoding of the Unicode values on UTF-8:
427 *
428 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
429 * 0000 0000-0000 007F 0xxxxxxx
430 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200431 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000432 *
433 * Check for the 0x110000 limit too
434 */
435 const unsigned char *cur = ctxt->input->cur;
436 unsigned char c;
437 unsigned int val;
438
439 c = *cur;
440 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200441 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000442 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200443 cur = ctxt->input->cur;
444 }
Owen Taylor3473f882001-02-23 17:55:21 +0000445 if ((cur[1] & 0xc0) != 0x80)
446 goto encoding_error;
447 if ((c & 0xe0) == 0xe0) {
448
Adiel Mittmann8a103792009-08-25 11:27:13 +0200449 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000450 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200451 cur = ctxt->input->cur;
452 }
Owen Taylor3473f882001-02-23 17:55:21 +0000453 if ((cur[2] & 0xc0) != 0x80)
454 goto encoding_error;
455 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200456 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000457 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200458 cur = ctxt->input->cur;
459 }
Owen Taylor3473f882001-02-23 17:55:21 +0000460 if (((c & 0xf8) != 0xf0) ||
461 ((cur[3] & 0xc0) != 0x80))
462 goto encoding_error;
463 /* 4-byte code */
464 *len = 4;
465 val = (cur[0] & 0x7) << 18;
466 val |= (cur[1] & 0x3f) << 12;
467 val |= (cur[2] & 0x3f) << 6;
468 val |= cur[3] & 0x3f;
469 } else {
470 /* 3-byte code */
471 *len = 3;
472 val = (cur[0] & 0xf) << 12;
473 val |= (cur[1] & 0x3f) << 6;
474 val |= cur[2] & 0x3f;
475 }
476 } else {
477 /* 2-byte code */
478 *len = 2;
479 val = (cur[0] & 0x1f) << 6;
480 val |= cur[1] & 0x3f;
481 }
482 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000483 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
484 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200485 }
Owen Taylor3473f882001-02-23 17:55:21 +0000486 return(val);
487 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200488 if ((*ctxt->input->cur == 0) &&
489 (ctxt->input->cur < ctxt->input->end)) {
490 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
491 "Char 0x%X out of allowed range\n", 0);
492 *len = 1;
493 return(' ');
494 }
Owen Taylor3473f882001-02-23 17:55:21 +0000495 /* 1-byte code */
496 *len = 1;
497 return((int) *ctxt->input->cur);
498 }
499 }
500 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000501 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000502 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000503 * XML constructs only use < 128 chars
504 */
505 *len = 1;
506 if ((int) *ctxt->input->cur < 0x80)
507 return((int) *ctxt->input->cur);
508
509 /*
510 * Humm this is bad, do an automatic flow conversion
511 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200512 {
513 xmlChar * guess;
514 xmlCharEncodingHandlerPtr handler;
515
516 guess = htmlFindEncoding(ctxt);
517 if (guess == NULL) {
518 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
519 } else {
520 if (ctxt->input->encoding != NULL)
521 xmlFree((xmlChar *) ctxt->input->encoding);
522 ctxt->input->encoding = guess;
523 handler = xmlFindCharEncodingHandler((const char *) guess);
524 if (handler != NULL) {
525 xmlSwitchToEncoding(ctxt, handler);
526 } else {
527 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
528 "Unsupported encoding %s", guess, NULL);
529 }
530 }
531 ctxt->charset = XML_CHAR_ENCODING_UTF8;
532 }
533
Owen Taylor3473f882001-02-23 17:55:21 +0000534 return(xmlCurrentChar(ctxt, len));
535
536encoding_error:
537 /*
538 * If we detect an UTF8 error that probably mean that the
539 * input encoding didn't get properly advertized in the
540 * declaration header. Report the error and switch the encoding
541 * to ISO-Latin-1 (if you don't like this policy, just declare the
542 * encoding !)
543 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000544 {
545 char buffer[150];
546
Daniel Veillard861101d2007-06-12 08:38:57 +0000547 if (ctxt->input->end - ctxt->input->cur >= 4) {
548 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
549 ctxt->input->cur[0], ctxt->input->cur[1],
550 ctxt->input->cur[2], ctxt->input->cur[3]);
551 } else {
552 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
553 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000554 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
555 "Input is not proper UTF-8, indicate encoding !\n",
556 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000557 }
558
Daniel Veillarde77db162009-08-22 11:32:38 +0200559 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000560 *len = 1;
561 return((int) *ctxt->input->cur);
562}
563
564/**
Owen Taylor3473f882001-02-23 17:55:21 +0000565 * htmlSkipBlankChars:
566 * @ctxt: the HTML parser context
567 *
568 * skip all blanks character found at that point in the input streams.
569 *
570 * Returns the number of space chars skipped
571 */
572
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000573static int
Owen Taylor3473f882001-02-23 17:55:21 +0000574htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
575 int res = 0;
576
William M. Brack76e95df2003-10-18 16:20:14 +0000577 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000578 if ((*ctxt->input->cur == 0) &&
579 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
580 xmlPopInput(ctxt);
581 } else {
582 if (*(ctxt->input->cur) == '\n') {
583 ctxt->input->line++; ctxt->input->col = 1;
584 } else ctxt->input->col++;
585 ctxt->input->cur++;
586 ctxt->nbChars++;
587 if (*ctxt->input->cur == 0)
588 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
589 }
590 res++;
591 }
592 return(res);
593}
594
595
596
597/************************************************************************
598 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200599 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000600 * *
601 ************************************************************************/
602
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000615
/* Definitions and a couple of vars for HTML Elements */

/*
 * Basic groups of element names. Each NB_xxx macro holds the number of
 * names in the matching xxx list.
 */
#define HEADING		"h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING	6
#define LIST		"ul", "ol", "dir", "menu"
#define NB_LIST		4
#define FONTSTYLE	"tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE	8
#define PHRASE		"em", "strong", "dfn", "code", "samp", "kbd", \
			"var", "cite", "abbr", "acronym"
#define NB_PHRASE	10
#define SPECIAL		"a", "img", "applet", "embed", "object", "font", \
			"basefont", "br", "script", "map", "q", "sub", \
			"sup", "span", "bdo", "iframe"
#define NB_SPECIAL	16
#define FORMCTRL	"input", "select", "textarea", "label", "button"
#define NB_FORMCTRL	5

/* Groups that contribute no element names. */
#define PCDATA
#define NB_PCDATA	0
#define MODIFIER
#define NB_MODIFIER	0

/* Groups assembled from the ones above. */
#define INLINE		FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE	NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
#define BLOCK		HEADING, LIST, "pre", "p", "dl", "div", "center", \
			"noscript", "noframes", "blockquote", "form", \
			"isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK	NB_HEADING + NB_LIST + 14
#define FLOW		BLOCK,INLINE
#define NB_FLOW		NB_BLOCK + NB_INLINE
#define EMPTY		NULL


/* NULL terminated lists of element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
649
650
/* ... and for HTML Attributes */

/*
 * Groups of attribute names. Each NB_xxx macro holds the number of
 * names in the matching xxx list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Sum of the three groups making up ATTRS. */
#define NB_ATTRS NB_COREATTRS + NB_I18N + NB_EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists of attribute names */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000670
671
672/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000673static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000674 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
675 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000676static const char* const target_attr[] = { "target", NULL } ;
677static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
678static const char* const alt_attr[] = { "alt", NULL } ;
679static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
680static const char* const href_attrs[] = { "href", NULL } ;
681static const char* const clear_attrs[] = { "clear", NULL } ;
682static const char* const inline_p[] = { INLINE, "p", NULL } ;
683
684static const char* const flow_param[] = { FLOW, "param", NULL } ;
685static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000686 "archive", "alt", "name", "height", "width", "align",
687 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000688static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000689 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000690static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000691 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000692static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
693static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
694static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
695static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000696 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000697static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000698 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
699
700
Daniel Veillard065abe82006-07-03 08:55:04 +0000701static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
702static const char* const col_elt[] = { "col", NULL } ;
703static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
704static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
705static const char* const dl_contents[] = { "dt", "dd", NULL } ;
706static const char* const compact_attr[] = { "compact", NULL } ;
707static const char* const label_attr[] = { "label", NULL } ;
708static const char* const fieldset_contents[] = { FLOW, "legend" } ;
709static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
710static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
711static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
712static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
713static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
714static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
715static const char* const head_attrs[] = { I18N, "profile", NULL } ;
716static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
717static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
718static const char* const version_attr[] = { "version", NULL } ;
719static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
720static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
721static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000722static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000723static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
724static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
725static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
726static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
727static const char* const align_attr[] = { "align", NULL } ;
728static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
729static const char* const map_contents[] = { BLOCK, "area", NULL } ;
730static const char* const name_attr[] = { "name", NULL } ;
731static const char* const action_attr[] = { "action", NULL } ;
732static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
Denis Pauk868d92d2012-05-10 15:34:57 +0800733static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000734static const char* const content_attr[] = { "content", NULL } ;
735static const char* const type_attr[] = { "type", NULL } ;
736static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
737static const char* const object_contents[] = { FLOW, "param", NULL } ;
738static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
739static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
740static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
741static const char* const option_elt[] = { "option", NULL } ;
742static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
743static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
744static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
745static const char* const width_attr[] = { "width", NULL } ;
746static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
747static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
748static const char* const language_attr[] = { "language", NULL } ;
749static const char* const select_content[] = { "optgroup", "option", NULL } ;
750static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
751static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200752static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000753static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
754static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
755static const char* const tr_elt[] = { "tr", NULL } ;
756static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
757static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
758static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
759static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
760static const char* const tr_contents[] = { "th", "td", NULL } ;
761static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
762static const char* const li_elt[] = { "li", NULL } ;
763static const char* const ul_depr[] = { "type", "compact", NULL} ;
764static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000765
766#define DECL (const char**)
767
Daniel Veillard22090732001-07-16 00:06:07 +0000768static const htmlElemDesc
769html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000770{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
771 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
772},
773{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
774 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
775},
776{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
777 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
778},
779{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
780 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
781},
782{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
783 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
784},
785{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
786 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
787},
788{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
789 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
790},
791{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
792 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
793},
794{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
795 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
796},
797{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
798 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
799},
800{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
801 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802},
803{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
804 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
805},
806{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
807 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
808},
809{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
810 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
811},
812{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
813 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
814},
815{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
816 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817},
818{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
819 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
820},
821{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
822 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
823},
824{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
825 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
826},
827{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
828 EMPTY , NULL , DECL col_attrs , NULL, NULL
829},
830{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
831 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
832},
833{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
834 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
835},
836{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
837 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
838},
839{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
840 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
841},
842{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
843 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
844},
845{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
846 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
847},
848{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000849 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000850},
851{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
852 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
853},
854{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
855 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
856},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000857{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000858 EMPTY, NULL, DECL embed_attrs, NULL, NULL
859},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000860{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
861 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
862},
863{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
864 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
865},
866{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
867 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
868},
869{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
870 EMPTY, NULL, NULL, DECL frame_attrs, NULL
871},
872{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
873 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
874},
875{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
876 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
877},
878{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
879 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
880},
881{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
882 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
883},
884{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
885 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
886},
887{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
888 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
889},
890{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
891 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
892},
893{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
894 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
895},
896{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
897 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
898},
899{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
900 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
901},
902{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
903 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
904},
905{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
906 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
907},
908{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000909 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000910},
911{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
912 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
913},
914{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
915 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
916},
917{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
918 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
919},
920{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
921 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
922},
923{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
924 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
925},
926{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
927 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
928},
929{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
930 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
931},
932{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
933 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
934},
935{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000936 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000937},
938{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
939 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
940},
941{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
942 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
943},
944{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
945 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
946},
947{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
948 DECL html_flow, "div", DECL html_attrs, NULL, NULL
949},
950{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
951 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
952},
953{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
954 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
955},
956{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000957 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000958},
959{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
960 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
961},
962{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
963 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
964},
965{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000966 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000967},
968{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
969 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
970},
971{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
972 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
973},
974{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
975 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
976},
977{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
978 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
979},
980{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
981 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
982},
983{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
984 DECL select_content, NULL, DECL select_attrs, NULL, NULL
985},
986{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
987 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
988},
989{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
990 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
991},
992{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
993 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
994},
995{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
996 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
997},
998{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
999 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1000},
1001{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1002 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003},
1004{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1005 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006},
1007{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1008 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1009},
1010{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1011 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1012},
1013{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1014 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1015},
1016{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1017 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1018},
1019{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1020 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1021},
1022{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1023 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1024},
1025{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1026 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1027},
1028{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1029 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1030},
1031{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1032 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1033},
1034{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1035 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1036},
1037{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1038 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1039},
1040{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1041 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1042},
1043{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1044 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1045}
Owen Taylor3473f882001-02-23 17:55:21 +00001046};
1047
1048/*
Owen Taylor3473f882001-02-23 17:55:21 +00001049 * start tags that imply the end of current element
1050 */
Daniel Veillard065abe82006-07-03 08:55:04 +00001051static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001052"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1053 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1054 "listing", "xmp", "head", NULL,
1055"head", "p", NULL,
1056"title", "p", NULL,
1057"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +00001058"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001059"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1060 "pre", "listing", "xmp", "head", "li", NULL,
1061"hr", "p", "head", NULL,
1062"h1", "p", "head", NULL,
1063"h2", "p", "head", NULL,
1064"h3", "p", "head", NULL,
1065"h4", "p", "head", NULL,
1066"h5", "p", "head", NULL,
1067"h6", "p", "head", NULL,
1068"dir", "p", "head", NULL,
1069"address", "p", "head", "ul", NULL,
1070"pre", "p", "head", "ul", NULL,
1071"listing", "p", "head", NULL,
1072"xmp", "p", "head", NULL,
1073"blockquote", "p", "head", NULL,
1074"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1075 "xmp", "head", NULL,
1076"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1077 "head", "dd", NULL,
1078"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1079 "head", "dt", NULL,
1080"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1081 "listing", "xmp", NULL,
1082"ol", "p", "head", "ul", NULL,
1083"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001084"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001085"div", "p", "head", NULL,
Denis Pauka0cd0752012-05-11 19:31:12 +08001086"noscript", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001087"center", "font", "b", "i", "p", "head", NULL,
1088"a", "a", NULL,
1089"caption", "p", NULL,
1090"colgroup", "caption", "colgroup", "col", "p", NULL,
1091"col", "caption", "col", "p", NULL,
1092"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1093 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001094"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001095"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001096"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1097"thead", "caption", "col", "colgroup", NULL,
1098"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1099 "tbody", "p", NULL,
1100"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1101 "tfoot", "tbody", "p", NULL,
1102"optgroup", "option", NULL,
1103"option", "option", NULL,
1104"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1105 "pre", "listing", "xmp", "a", NULL,
1106NULL
1107};
1108
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content themselves and for which a p element is implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1121
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * NOTE: the array has no NULL terminator; presumably its users size it
 *       with sizeof -- confirm before iterating it any other way.
 */
static const char *const htmlScriptAttributes[] = {
	"onclick",
	"ondblclick",
	"onmousedown",
	"onmouseup",
	"onmouseover",
	"onmousemove",
	"onmouseout",
	"onkeypress",
	"onkeydown",
	"onkeyup",
	"onload",
	"onunload",
	"onfocus",
	"onblur",
	"onsubmit",
	"onreset",	/* was misspelled "onrest", so onreset never matched */
	"onchange",
	"onselect"
};
1147
/*
 * End-tag priority table, used by the HTML parser to cope with broken
 * pages.  Each element gets a priority; a stray end tag may only close
 * open elements whose priority is lower than or equal to its own.
 * Names not listed fall through to the terminating { NULL, 100 } entry,
 * which supplies the default priority.
 */

typedef struct {
    const char *name;	/* element name, NULL on the sentinel entry */
    int priority;	/* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001175
/* Start of each htmlStartClose entry; populated by htmlInitAutoClose(). */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1178
1179/************************************************************************
1180 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001181 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001182 * *
1183 ************************************************************************/
1184
1185/**
1186 * htmlInitAutoClose:
1187 *
1188 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1189 * This is not reentrant. Call xmlInitParser() once before processing in
1190 * case of use in multithreaded programs.
1191 */
1192void
1193htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001194 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001195
1196 if (htmlStartCloseIndexinitialized) return;
1197
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001198 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1199 indx = 0;
1200 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001201 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001202 while (htmlStartClose[i] != NULL) i++;
1203 i++;
1204 }
1205 htmlStartCloseIndexinitialized = 1;
1206}
1207
1208/**
1209 * htmlTagLookup:
1210 * @tag: The tag name in lowercase
1211 *
1212 * Lookup the HTML tag in the ElementTable
1213 *
1214 * Returns the related htmlElemDescPtr or NULL if not found.
1215 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001216const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001217htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001218 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001219
1220 for (i = 0; i < (sizeof(html40ElementTable) /
1221 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001222 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001223 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 return(NULL);
1226}
1227
1228/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001229 * htmlGetEndPriority:
1230 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001231 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001232 * Return value: The "endtag" priority.
1233 **/
1234static int
1235htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001236 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001237
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001238 while ((htmlEndPriority[i].name != NULL) &&
1239 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1240 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001241
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001242 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001243}
1244
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001245
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001246/**
Owen Taylor3473f882001-02-23 17:55:21 +00001247 * htmlCheckAutoClose:
1248 * @newtag: The new tag name
1249 * @oldtag: The old tag name
1250 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001251 * Checks whether the new tag is one of the registered valid tags for
1252 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001253 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1254 *
1255 * Returns 0 if no, 1 if yes.
1256 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001257static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001258htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1259{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001260 int i, indx;
1261 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001262
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001263 if (htmlStartCloseIndexinitialized == 0)
1264 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001265
1266 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001267 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001268 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001269 if (closed == NULL)
1270 return (0);
1271 if (xmlStrEqual(BAD_CAST * closed, newtag))
1272 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001273 }
1274
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001275 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001276 i++;
1277 while (htmlStartClose[i] != NULL) {
1278 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001279 return (1);
1280 }
1281 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001282 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001283 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001284}
1285
1286/**
1287 * htmlAutoCloseOnClose:
1288 * @ctxt: an HTML parser context
1289 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001290 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001291 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001292 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001293 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001294static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001295htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1296{
1297 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001298 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001299
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001300 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001301
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001302 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001303
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001304 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1305 break;
1306 /*
1307 * A missplaced endtag can only close elements with lower
1308 * or equal priority, so if we find an element with higher
1309 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001310 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001311 */
1312 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1313 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001314 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001315 if (i < 0)
1316 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001317
1318 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001319 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001320 if ((info != NULL) && (info->endTag == 3)) {
1321 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1322 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001323 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001324 }
1325 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1326 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001327 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001328 }
1329}
1330
1331/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001332 * htmlAutoCloseOnEnd:
1333 * @ctxt: an HTML parser context
1334 *
1335 * Close all remaining tags at the end of the stream
1336 */
1337static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001338htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1339{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001340 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001341
William M. Brack899e64a2003-09-26 18:03:42 +00001342 if (ctxt->nameNr == 0)
1343 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001344 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001345 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1346 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001347 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001348 }
1349}
1350
1351/**
Owen Taylor3473f882001-02-23 17:55:21 +00001352 * htmlAutoClose:
1353 * @ctxt: an HTML parser context
1354 * @newtag: The new tag name or NULL
1355 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001356 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001357 * The list is kept in htmlStartClose array. This function is
1358 * called when a new tag has been detected and generates the
1359 * appropriates closes if possible/needed.
1360 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001361 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001362 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001363static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001364htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1365{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001366 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001367 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001368 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1369 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001370 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001371 }
1372 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001373 htmlAutoCloseOnEnd(ctxt);
1374 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001375 }
1376 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001377 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1378 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1379 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001380 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1381 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001382 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 }
Owen Taylor3473f882001-02-23 17:55:21 +00001384}
1385
1386/**
1387 * htmlAutoCloseTag:
1388 * @doc: the HTML document
1389 * @name: The tag name
1390 * @elem: the HTML element
1391 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001392 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001393 * The list is kept in htmlStartClose array. This function checks
1394 * if the element or one of it's children would autoclose the
1395 * given tag.
1396 *
1397 * Returns 1 if autoclose, 0 otherwise
1398 */
1399int
1400htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1401 htmlNodePtr child;
1402
1403 if (elem == NULL) return(1);
1404 if (xmlStrEqual(name, elem->name)) return(0);
1405 if (htmlCheckAutoClose(elem->name, name)) return(1);
1406 child = elem->children;
1407 while (child != NULL) {
1408 if (htmlAutoCloseTag(doc, name, child)) return(1);
1409 child = child->next;
1410 }
1411 return(0);
1412}
1413
1414/**
1415 * htmlIsAutoClosed:
1416 * @doc: the HTML document
1417 * @elem: the HTML element
1418 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001419 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001420 * The list is kept in htmlStartClose array. This function checks
1421 * if a tag is autoclosed by one of it's child
1422 *
1423 * Returns 1 if autoclosed, 0 otherwise
1424 */
1425int
1426htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1427 htmlNodePtr child;
1428
1429 if (elem == NULL) return(1);
1430 child = elem->children;
1431 while (child != NULL) {
1432 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1433 child = child->next;
1434 }
1435 return(0);
1436}
1437
1438/**
1439 * htmlCheckImplied:
1440 * @ctxt: an HTML parser context
1441 * @newtag: The new tag name
1442 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001443 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001444 * called when a new tag has been detected and generates the
1445 * appropriates implicit tags if missing
1446 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001447static void
Owen Taylor3473f882001-02-23 17:55:21 +00001448htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001449 int i;
1450
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001451 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1452 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001453 if (!htmlOmittedDefaultValue)
1454 return;
1455 if (xmlStrEqual(newtag, BAD_CAST"html"))
1456 return;
1457 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001458 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001459 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1460 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1461 }
1462 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1463 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001464 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001465 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1466 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1467 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1468 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1469 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1470 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001471 if (ctxt->html >= 3) {
1472 /* we already saw or generated an <head> before */
1473 return;
1474 }
1475 /*
1476 * dropped OBJECT ... i you put it first BODY will be
1477 * assumed !
1478 */
1479 htmlnamePush(ctxt, BAD_CAST"head");
1480 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1481 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001482 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1483 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1484 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001485 if (ctxt->html >= 10) {
1486 /* we already saw or generated a <body> before */
1487 return;
1488 }
Owen Taylor3473f882001-02-23 17:55:21 +00001489 for (i = 0;i < ctxt->nameNr;i++) {
1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1491 return;
1492 }
1493 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1494 return;
1495 }
1496 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001497
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001498 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001499 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1500 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1501 }
1502}
1503
1504/**
1505 * htmlCheckParagraph
1506 * @ctxt: an HTML parser context
1507 *
1508 * Check whether a p element need to be implied before inserting
1509 * characters in the current element.
1510 *
1511 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1512 * in case of error.
1513 */
1514
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001515static int
Owen Taylor3473f882001-02-23 17:55:21 +00001516htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1517 const xmlChar *tag;
1518 int i;
1519
1520 if (ctxt == NULL)
1521 return(-1);
1522 tag = ctxt->name;
1523 if (tag == NULL) {
1524 htmlAutoClose(ctxt, BAD_CAST"p");
1525 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001526 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001527 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1528 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1529 return(1);
1530 }
1531 if (!htmlOmittedDefaultValue)
1532 return(0);
1533 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1534 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001535 htmlAutoClose(ctxt, BAD_CAST"p");
1536 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001537 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001538 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1539 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1540 return(1);
1541 }
1542 }
1543 return(0);
1544}
1545
1546/**
1547 * htmlIsScriptAttribute:
1548 * @name: an attribute name
1549 *
1550 * Check if an attribute is of content type Script
1551 *
1552 * Returns 1 is the attribute is a script 0 otherwise
1553 */
1554int
1555htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001556 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001557
1558 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001559 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001560 /*
1561 * all script attributes start with 'on'
1562 */
1563 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001564 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001565 for (i = 0;
1566 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1567 i++) {
1568 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1569 return(1);
1570 }
1571 return(0);
1572}
1573
1574/************************************************************************
1575 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001576 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001577 * *
1578 ************************************************************************/
1579
1580
Daniel Veillard22090732001-07-16 00:06:07 +00001581static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001582/*
1583 * the 4 absolute ones, plus apostrophe.
1584 */
1585{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1586{ 38, "amp", "ampersand, U+0026 ISOnum" },
1587{ 39, "apos", "single quote" },
1588{ 60, "lt", "less-than sign, U+003C ISOnum" },
1589{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1590
1591/*
1592 * A bunch still in the 128-255 range
1593 * Replacing them depend really on the charset used.
1594 */
1595{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1596{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1597{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1598{ 163, "pound","pound sign, U+00A3 ISOnum" },
1599{ 164, "curren","currency sign, U+00A4 ISOnum" },
1600{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1601{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1602{ 167, "sect", "section sign, U+00A7 ISOnum" },
1603{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1604{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1605{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1606{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1607{ 172, "not", "not sign, U+00AC ISOnum" },
1608{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1609{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1610{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1611{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1612{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1613{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1614{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1615{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1616{ 181, "micro","micro sign, U+00B5 ISOnum" },
1617{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1618{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1619{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1620{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1621{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1622{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1623{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1624{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1625{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1626{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1627{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1628{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1629{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1630{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1631{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1632{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1633{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1634{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1635{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1636{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1637{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1638{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1639{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1640{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1641{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1642{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1643{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1644{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1645{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1646{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1647{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1648{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1649{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1650{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1651{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1652{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1653{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1654{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1655{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1656{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1657{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1658{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1659{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1660{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1661{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1662{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1663{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1664{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1665{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1666{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1667{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1668{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1669{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1670{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1671{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1672{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1673{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1674{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1675{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1676{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1677{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1678{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1679{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1680{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1681{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1682{ 247, "divide","division sign, U+00F7 ISOnum" },
1683{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1684{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1685{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1686{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1687{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1688{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1689{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1690{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1691
1692{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1693{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1694{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1695{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1696{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1697
1698/*
1699 * Anything below should really be kept as entities references
1700 */
1701{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1702
1703{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1704{ 732, "tilde","small tilde, U+02DC ISOdia" },
1705
1706{ 913, "Alpha","greek capital letter alpha, U+0391" },
1707{ 914, "Beta", "greek capital letter beta, U+0392" },
1708{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1709{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1710{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1711{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1712{ 919, "Eta", "greek capital letter eta, U+0397" },
1713{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1714{ 921, "Iota", "greek capital letter iota, U+0399" },
1715{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001716{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001717{ 924, "Mu", "greek capital letter mu, U+039C" },
1718{ 925, "Nu", "greek capital letter nu, U+039D" },
1719{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1720{ 927, "Omicron","greek capital letter omicron, U+039F" },
1721{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1722{ 929, "Rho", "greek capital letter rho, U+03A1" },
1723{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1724{ 932, "Tau", "greek capital letter tau, U+03A4" },
1725{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1726{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1727{ 935, "Chi", "greek capital letter chi, U+03A7" },
1728{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1729{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1730
1731{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1732{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1733{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1734{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1735{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1736{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1737{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1738{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1739{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1740{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1741{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1742{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1743{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1744{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1745{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1746{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1747{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1748{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1749{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1750{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1751{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1752{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1753{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1754{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1755{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1756{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1757{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1758{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1759
1760{ 8194, "ensp", "en space, U+2002 ISOpub" },
1761{ 8195, "emsp", "em space, U+2003 ISOpub" },
1762{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1763{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1764{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1765{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1766{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1767{ 8211, "ndash","en dash, U+2013 ISOpub" },
1768{ 8212, "mdash","em dash, U+2014 ISOpub" },
1769{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1770{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1771{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1772{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1773{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1774{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1775{ 8224, "dagger","dagger, U+2020 ISOpub" },
1776{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1777
1778{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1779{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1780
1781{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1782
1783{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1784{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1785
1786{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1787{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1788
1789{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1790{ 8260, "frasl","fraction slash, U+2044 NEW" },
1791
1792{ 8364, "euro", "euro sign, U+20AC NEW" },
1793
1794{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1795{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1796{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1797{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1798{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1799{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1800{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1801{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1802{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1803{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1804{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1805{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1806{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1807{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1808{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1809{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1810
1811{ 8704, "forall","for all, U+2200 ISOtech" },
1812{ 8706, "part", "partial differential, U+2202 ISOtech" },
1813{ 8707, "exist","there exists, U+2203 ISOtech" },
1814{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1815{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1816{ 8712, "isin", "element of, U+2208 ISOtech" },
1817{ 8713, "notin","not an element of, U+2209 ISOtech" },
1818{ 8715, "ni", "contains as member, U+220B ISOtech" },
1819{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001820{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001821{ 8722, "minus","minus sign, U+2212 ISOtech" },
1822{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1823{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1824{ 8733, "prop", "proportional to, U+221D ISOtech" },
1825{ 8734, "infin","infinity, U+221E ISOtech" },
1826{ 8736, "ang", "angle, U+2220 ISOamso" },
1827{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1828{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1829{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1830{ 8746, "cup", "union = cup, U+222A ISOtech" },
1831{ 8747, "int", "integral, U+222B ISOtech" },
1832{ 8756, "there4","therefore, U+2234 ISOtech" },
1833{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1834{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1835{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1836{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1837{ 8801, "equiv","identical to, U+2261 ISOtech" },
1838{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1839{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1840{ 8834, "sub", "subset of, U+2282 ISOtech" },
1841{ 8835, "sup", "superset of, U+2283 ISOtech" },
1842{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1843{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1844{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1845{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1846{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1847{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1848{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1849{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1850{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1851{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1852{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1853{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1854{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1855{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1856
1857{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1858{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1859{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1860{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1861
1862};
1863
1864/************************************************************************
1865 * *
1866 * Commodity functions to handle entities *
1867 * *
1868 ************************************************************************/
1869
/*
 * Macro used to grow the current buffer: doubles buffer##_size and
 * reallocates @buffer accordingly.
 *
 * The expansion relies on a variable named ctxt being in scope for the
 * error report, and on being used inside a function returning a
 * pointer: when the reallocation fails the old buffer is freed and the
 * enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1884
1885/**
1886 * htmlEntityLookup:
1887 * @name: the entity name
1888 *
1889 * Lookup the given entity in EntitiesTable
1890 *
1891 * TODO: the linear scan is really ugly, an hash table is really needed.
1892 *
1893 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1894 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001895const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001896htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001897 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001898
1899 for (i = 0;i < (sizeof(html40EntitiesTable)/
1900 sizeof(html40EntitiesTable[0]));i++) {
1901 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001902 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001903 }
1904 }
1905 return(NULL);
1906}
1907
1908/**
1909 * htmlEntityValueLookup:
1910 * @value: the entity's unicode value
1911 *
1912 * Lookup the given entity in EntitiesTable
1913 *
1914 * TODO: the linear scan is really ugly, an hash table is really needed.
1915 *
1916 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1917 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001918const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001919htmlEntityValueLookup(unsigned int value) {
1920 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001921
1922 for (i = 0;i < (sizeof(html40EntitiesTable)/
1923 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001924 if (html40EntitiesTable[i].value >= value) {
1925 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001926 break;
William M. Brack78637da2003-07-31 14:47:38 +00001927 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001928 }
Owen Taylor3473f882001-02-23 17:55:21 +00001929 }
1930 return(NULL);
1931}
1932
1933/**
1934 * UTF8ToHtml:
1935 * @out: a pointer to an array of bytes to store the result
1936 * @outlen: the length of @out
1937 * @in: a pointer to an array of UTF-8 chars
1938 * @inlen: the length of @in
1939 *
1940 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1941 * plus HTML entities block of chars out.
1942 *
1943 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1944 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001945 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001946 * The value of @outlen after return is the number of octets consumed.
1947 */
1948int
1949UTF8ToHtml(unsigned char* out, int *outlen,
1950 const unsigned char* in, int *inlen) {
1951 const unsigned char* processed = in;
1952 const unsigned char* outend;
1953 const unsigned char* outstart = out;
1954 const unsigned char* instart = in;
1955 const unsigned char* inend;
1956 unsigned int c, d;
1957 int trailing;
1958
Daniel Veillardce682bc2004-11-05 17:22:25 +00001959 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001960 if (in == NULL) {
1961 /*
1962 * initialization nothing to do
1963 */
1964 *outlen = 0;
1965 *inlen = 0;
1966 return(0);
1967 }
1968 inend = in + (*inlen);
1969 outend = out + (*outlen);
1970 while (in < inend) {
1971 d = *in++;
1972 if (d < 0x80) { c= d; trailing= 0; }
1973 else if (d < 0xC0) {
1974 /* trailing byte in leading position */
1975 *outlen = out - outstart;
1976 *inlen = processed - instart;
1977 return(-2);
1978 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1979 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1980 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1981 else {
1982 /* no chance for this in Ascii */
1983 *outlen = out - outstart;
1984 *inlen = processed - instart;
1985 return(-2);
1986 }
1987
1988 if (inend - in < trailing) {
1989 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001990 }
Owen Taylor3473f882001-02-23 17:55:21 +00001991
1992 for ( ; trailing; trailing--) {
1993 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1994 break;
1995 c <<= 6;
1996 c |= d & 0x3F;
1997 }
1998
1999 /* assertion: c is a single UTF-4 value */
2000 if (c < 0x80) {
2001 if (out + 1 >= outend)
2002 break;
2003 *out++ = c;
2004 } else {
2005 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00002006 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00002007 const char *cp;
2008 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00002009
2010 /*
2011 * Try to lookup a predefined HTML entity for it
2012 */
2013
2014 ent = htmlEntityValueLookup(c);
2015 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00002016 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2017 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00002018 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00002019 else
2020 cp = ent->name;
2021 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00002022 if (out + 2 + len >= outend)
2023 break;
2024 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00002025 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00002026 out += len;
2027 *out++ = ';';
2028 }
2029 processed = in;
2030 }
2031 *outlen = out - outstart;
2032 *inlen = processed - instart;
2033 return(0);
2034}
2035
2036/**
2037 * htmlEncodeEntities:
2038 * @out: a pointer to an array of bytes to store the result
2039 * @outlen: the length of @out
2040 * @in: a pointer to an array of UTF-8 chars
2041 * @inlen: the length of @in
2042 * @quoteChar: the quote character to escape (' or ") or zero.
2043 *
2044 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2045 * plus HTML entities block of chars out.
2046 *
2047 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2048 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002049 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00002050 * The value of @outlen after return is the number of octets consumed.
2051 */
2052int
2053htmlEncodeEntities(unsigned char* out, int *outlen,
2054 const unsigned char* in, int *inlen, int quoteChar) {
2055 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002056 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002057 const unsigned char* outstart = out;
2058 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002059 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002060 unsigned int c, d;
2061 int trailing;
2062
Daniel Veillardce682bc2004-11-05 17:22:25 +00002063 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2064 return(-1);
2065 outend = out + (*outlen);
2066 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002067 while (in < inend) {
2068 d = *in++;
2069 if (d < 0x80) { c= d; trailing= 0; }
2070 else if (d < 0xC0) {
2071 /* trailing byte in leading position */
2072 *outlen = out - outstart;
2073 *inlen = processed - instart;
2074 return(-2);
2075 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2076 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2077 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2078 else {
2079 /* no chance for this in Ascii */
2080 *outlen = out - outstart;
2081 *inlen = processed - instart;
2082 return(-2);
2083 }
2084
2085 if (inend - in < trailing)
2086 break;
2087
2088 while (trailing--) {
2089 if (((d= *in++) & 0xC0) != 0x80) {
2090 *outlen = out - outstart;
2091 *inlen = processed - instart;
2092 return(-2);
2093 }
2094 c <<= 6;
2095 c |= d & 0x3F;
2096 }
2097
2098 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002099 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2100 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002101 if (out >= outend)
2102 break;
2103 *out++ = c;
2104 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002105 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002106 const char *cp;
2107 char nbuf[16];
2108 int len;
2109
2110 /*
2111 * Try to lookup a predefined HTML entity for it
2112 */
2113 ent = htmlEntityValueLookup(c);
2114 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002115 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002116 cp = nbuf;
2117 }
2118 else
2119 cp = ent->name;
2120 len = strlen(cp);
2121 if (out + 2 + len > outend)
2122 break;
2123 *out++ = '&';
2124 memcpy(out, cp, len);
2125 out += len;
2126 *out++ = ';';
2127 }
2128 processed = in;
2129 }
2130 *outlen = out - outstart;
2131 *inlen = processed - instart;
2132 return(0);
2133}
2134
Owen Taylor3473f882001-02-23 17:55:21 +00002135/************************************************************************
2136 * *
2137 * Commodity functions to handle streams *
2138 * *
2139 ************************************************************************/
2140
2141/**
Owen Taylor3473f882001-02-23 17:55:21 +00002142 * htmlNewInputStream:
2143 * @ctxt: an HTML parser context
2144 *
2145 * Create a new input stream structure
2146 * Returns the new input stream or NULL
2147 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002148static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002149htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2150 htmlParserInputPtr input;
2151
2152 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2153 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002154 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002155 return(NULL);
2156 }
2157 memset(input, 0, sizeof(htmlParserInput));
2158 input->filename = NULL;
2159 input->directory = NULL;
2160 input->base = NULL;
2161 input->cur = NULL;
2162 input->buf = NULL;
2163 input->line = 1;
2164 input->col = 1;
2165 input->buf = NULL;
2166 input->free = NULL;
2167 input->version = NULL;
2168 input->consumed = 0;
2169 input->length = 0;
2170 return(input);
2171}
2172
2173
2174/************************************************************************
2175 * *
2176 * Commodity functions, cleanup needed ? *
2177 * *
2178 ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002193
2194/**
2195 * areBlanks:
2196 * @ctxt: an HTML parser context
2197 * @str: a xmlChar *
2198 * @len: the size of @str
2199 *
2200 * Is this a sequence of blank chars that one can ignore ?
2201 *
2202 * Returns 1 if ignorable 0 otherwise.
2203 */
2204
2205static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002206 unsigned int i;
2207 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002208 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002209 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002210
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002211 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002212 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002213
2214 if (CUR == 0) return(1);
2215 if (CUR != '<') return(0);
2216 if (ctxt->name == NULL)
2217 return(1);
2218 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2219 return(1);
2220 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2221 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002222
2223 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2224 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2225 dtd = xmlGetIntSubset(ctxt->myDoc);
2226 if (dtd != NULL && dtd->ExternalID != NULL) {
2227 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2228 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2229 return(1);
2230 }
2231 }
2232
Owen Taylor3473f882001-02-23 17:55:21 +00002233 if (ctxt->node == NULL) return(0);
2234 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002235 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2236 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002237 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002238 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2239 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002240 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002241 for all tags "b" allowing PCDATA */
2242 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2243 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2244 return(0);
2245 }
2246 }
Owen Taylor3473f882001-02-23 17:55:21 +00002247 } else if (xmlNodeIsText(lastChild)) {
2248 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002249 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002250 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002251 for all tags "p" allowing PCDATA */
2252 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2253 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2254 return(0);
2255 }
2256 }
Owen Taylor3473f882001-02-23 17:55:21 +00002257 }
2258 return(1);
2259}
2260
2261/**
Owen Taylor3473f882001-02-23 17:55:21 +00002262 * htmlNewDocNoDtD:
2263 * @URI: URI for the dtd, or NULL
2264 * @ExternalID: the external ID of the DTD, or NULL
2265 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002266 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2267 * are NULL
2268 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002269 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002270 */
2271htmlDocPtr
2272htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2273 xmlDocPtr cur;
2274
2275 /*
2276 * Allocate a new document and fill the fields.
2277 */
2278 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2279 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002280 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002281 return(NULL);
2282 }
2283 memset(cur, 0, sizeof(xmlDoc));
2284
2285 cur->type = XML_HTML_DOCUMENT_NODE;
2286 cur->version = NULL;
2287 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002288 cur->doc = cur;
2289 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002290 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002291 cur->extSubset = NULL;
2292 cur->oldNs = NULL;
2293 cur->encoding = NULL;
2294 cur->standalone = 1;
2295 cur->compression = 0;
2296 cur->ids = NULL;
2297 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002298 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002299 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002300 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002301 if ((ExternalID != NULL) ||
2302 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002303 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002304 return(cur);
2305}
2306
2307/**
2308 * htmlNewDoc:
2309 * @URI: URI for the dtd, or NULL
2310 * @ExternalID: the external ID of the DTD, or NULL
2311 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002312 * Creates a new HTML document
2313 *
Owen Taylor3473f882001-02-23 17:55:21 +00002314 * Returns a new document
2315 */
2316htmlDocPtr
2317htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2318 if ((URI == NULL) && (ExternalID == NULL))
2319 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002320 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2321 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002322
2323 return(htmlNewDocNoDtD(URI, ExternalID));
2324}
2325
2326
2327/************************************************************************
2328 * *
2329 * The parser itself *
2330 * Relates to http://www.w3.org/TR/html40 *
2331 * *
2332 ************************************************************************/
2333
2334/************************************************************************
2335 * *
2336 * The parser itself *
2337 * *
2338 ************************************************************************/
2339
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002340static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002341
Owen Taylor3473f882001-02-23 17:55:21 +00002342/**
2343 * htmlParseHTMLName:
2344 * @ctxt: an HTML parser context
2345 *
2346 * parse an HTML tag or attribute name, note that we convert it to lowercase
2347 * since HTML names are not case-sensitive.
2348 *
2349 * Returns the Tag Name parsed or NULL
2350 */
2351
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002352static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002353htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002354 int i = 0;
2355 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2356
William M. Brackd1757ab2004-10-02 22:07:48 +00002357 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002358 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002359
2360 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002361 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002362 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2363 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002364 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2365 else loc[i] = CUR;
2366 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002367
Owen Taylor3473f882001-02-23 17:55:21 +00002368 NEXT;
2369 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002370
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002371 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002372}
2373
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002374
2375/**
2376 * htmlParseHTMLName_nonInvasive:
2377 * @ctxt: an HTML parser context
2378 *
2379 * parse an HTML tag or attribute name, note that we convert it to lowercase
2380 * since HTML names are not case-sensitive, this doesn't consume the data
2381 * from the stream, it's a look-ahead
2382 *
2383 * Returns the Tag Name parsed or NULL
2384 */
2385
2386static const xmlChar *
2387htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2388 int i = 0;
2389 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2390
2391 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2392 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002393
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002394 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2395 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2396 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2397 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2398 else loc[i] = NXT(1+i);
2399 i++;
2400 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002401
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002402 return(xmlDictLookup(ctxt->dict, loc, i));
2403}
2404
2405
Owen Taylor3473f882001-02-23 17:55:21 +00002406/**
2407 * htmlParseName:
2408 * @ctxt: an HTML parser context
2409 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002410 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002411 *
2412 * Returns the Name parsed or NULL
2413 */
2414
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002415static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002416htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002417 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002418 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002419 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002420
2421 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002422
2423 /*
2424 * Accelerator for simple ASCII names
2425 */
2426 in = ctxt->input->cur;
2427 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2428 ((*in >= 0x41) && (*in <= 0x5A)) ||
2429 (*in == '_') || (*in == ':')) {
2430 in++;
2431 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2432 ((*in >= 0x41) && (*in <= 0x5A)) ||
2433 ((*in >= 0x30) && (*in <= 0x39)) ||
2434 (*in == '_') || (*in == '-') ||
2435 (*in == ':') || (*in == '.'))
2436 in++;
2437 if ((*in > 0) && (*in < 0x80)) {
2438 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002439 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002440 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002441 ctxt->nbChars += count;
2442 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002443 return(ret);
2444 }
2445 }
2446 return(htmlParseNameComplex(ctxt));
2447}
2448
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002449static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002450htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002451 int len = 0, l;
2452 int c;
2453 int count = 0;
2454
2455 /*
2456 * Handler for more complex cases
2457 */
2458 GROW;
2459 c = CUR_CHAR(l);
2460 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2461 (!IS_LETTER(c) && (c != '_') &&
2462 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002463 return(NULL);
2464 }
2465
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002466 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2467 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2468 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002469 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002470 (IS_COMBINING(c)) ||
2471 (IS_EXTENDER(c)))) {
2472 if (count++ > 100) {
2473 count = 0;
2474 GROW;
2475 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002476 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002477 NEXTL(l);
2478 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002479 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002480 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002481}
2482
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002483
Owen Taylor3473f882001-02-23 17:55:21 +00002484/**
2485 * htmlParseHTMLAttribute:
2486 * @ctxt: an HTML parser context
2487 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002488 *
Owen Taylor3473f882001-02-23 17:55:21 +00002489 * parse an HTML attribute value till the stop (quote), if
2490 * stop is 0 then it stops at the first space
2491 *
2492 * Returns the attribute parsed or NULL
2493 */
2494
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002495static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002496htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2497 xmlChar *buffer = NULL;
2498 int buffer_size = 0;
2499 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002500 const xmlChar *name = NULL;
2501 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002502 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002503
2504 /*
2505 * allocate a translation buffer.
2506 */
2507 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002508 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002509 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002510 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002511 return(NULL);
2512 }
2513 out = buffer;
2514
2515 /*
2516 * Ok loop until we reach one of the ending chars
2517 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002518 while ((CUR != 0) && (CUR != stop)) {
2519 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002520 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002521 if (CUR == '&') {
2522 if (NXT(1) == '#') {
2523 unsigned int c;
2524 int bits;
2525
2526 c = htmlParseCharRef(ctxt);
2527 if (c < 0x80)
2528 { *out++ = c; bits= -6; }
2529 else if (c < 0x800)
2530 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2531 else if (c < 0x10000)
2532 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002533 else
Owen Taylor3473f882001-02-23 17:55:21 +00002534 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002535
Owen Taylor3473f882001-02-23 17:55:21 +00002536 for ( ; bits >= 0; bits-= 6) {
2537 *out++ = ((c >> bits) & 0x3F) | 0x80;
2538 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002539
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002540 if (out - buffer > buffer_size - 100) {
2541 int indx = out - buffer;
2542
2543 growBuffer(buffer);
2544 out = &buffer[indx];
2545 }
Owen Taylor3473f882001-02-23 17:55:21 +00002546 } else {
2547 ent = htmlParseEntityRef(ctxt, &name);
2548 if (name == NULL) {
2549 *out++ = '&';
2550 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002551 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002552
2553 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002554 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002555 }
2556 } else if (ent == NULL) {
2557 *out++ = '&';
2558 cur = name;
2559 while (*cur != 0) {
2560 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002561 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002562
2563 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002564 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002565 }
2566 *out++ = *cur++;
2567 }
Owen Taylor3473f882001-02-23 17:55:21 +00002568 } else {
2569 unsigned int c;
2570 int bits;
2571
2572 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002573 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002574
2575 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002576 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002577 }
Daniel Veillard48519092006-10-17 15:56:35 +00002578 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002579 if (c < 0x80)
2580 { *out++ = c; bits= -6; }
2581 else if (c < 0x800)
2582 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2583 else if (c < 0x10000)
2584 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002585 else
Owen Taylor3473f882001-02-23 17:55:21 +00002586 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002587
Owen Taylor3473f882001-02-23 17:55:21 +00002588 for ( ; bits >= 0; bits-= 6) {
2589 *out++ = ((c >> bits) & 0x3F) | 0x80;
2590 }
Owen Taylor3473f882001-02-23 17:55:21 +00002591 }
2592 }
2593 } else {
2594 unsigned int c;
2595 int bits, l;
2596
2597 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002598 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002599
2600 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002601 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002602 }
2603 c = CUR_CHAR(l);
2604 if (c < 0x80)
2605 { *out++ = c; bits= -6; }
2606 else if (c < 0x800)
2607 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2608 else if (c < 0x10000)
2609 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002610 else
Owen Taylor3473f882001-02-23 17:55:21 +00002611 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002612
Owen Taylor3473f882001-02-23 17:55:21 +00002613 for ( ; bits >= 0; bits-= 6) {
2614 *out++ = ((c >> bits) & 0x3F) | 0x80;
2615 }
2616 NEXT;
2617 }
2618 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002619 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002620 return(buffer);
2621}
2622
2623/**
Owen Taylor3473f882001-02-23 17:55:21 +00002624 * htmlParseEntityRef:
2625 * @ctxt: an HTML parser context
2626 * @str: location to store the entity name
2627 *
2628 * parse an HTML ENTITY references
2629 *
2630 * [68] EntityRef ::= '&' Name ';'
2631 *
2632 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2633 * if non-NULL *str will have to be freed by the caller.
2634 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002635const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002636htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2637 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002638 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002639
2640 if (str != NULL) *str = NULL;
2641 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002642
2643 if (CUR == '&') {
2644 NEXT;
2645 name = htmlParseName(ctxt);
2646 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002647 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2648 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002649 } else {
2650 GROW;
2651 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002652 if (str != NULL)
2653 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002654
2655 /*
2656 * Lookup the entity in the table.
2657 */
2658 ent = htmlEntityLookup(name);
2659 if (ent != NULL) /* OK that's ugly !!! */
2660 NEXT;
2661 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002662 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2663 "htmlParseEntityRef: expecting ';'\n",
2664 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002665 if (str != NULL)
2666 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002667 }
2668 }
2669 }
2670 return(ent);
2671}
2672
2673/**
2674 * htmlParseAttValue:
2675 * @ctxt: an HTML parser context
2676 *
2677 * parse a value for an attribute
2678 * Note: the parser won't do substitution of entities here, this
2679 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002680 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002681 *
2682 * Returns the AttValue parsed or NULL.
2683 */
2684
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002685static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002686htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2687 xmlChar *ret = NULL;
2688
2689 if (CUR == '"') {
2690 NEXT;
2691 ret = htmlParseHTMLAttribute(ctxt, '"');
2692 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002693 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2694 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002695 } else
2696 NEXT;
2697 } else if (CUR == '\'') {
2698 NEXT;
2699 ret = htmlParseHTMLAttribute(ctxt, '\'');
2700 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002701 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2702 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002703 } else
2704 NEXT;
2705 } else {
2706 /*
2707 * That's an HTMLism, the attribute value may not be quoted
2708 */
2709 ret = htmlParseHTMLAttribute(ctxt, 0);
2710 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002711 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2712 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002713 }
2714 }
2715 return(ret);
2716}
2717
2718/**
2719 * htmlParseSystemLiteral:
2720 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002721 *
Owen Taylor3473f882001-02-23 17:55:21 +00002722 * parse an HTML Literal
2723 *
2724 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2725 *
2726 * Returns the SystemLiteral parsed or NULL
2727 */
2728
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002729static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002730htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2731 const xmlChar *q;
2732 xmlChar *ret = NULL;
2733
2734 if (CUR == '"') {
2735 NEXT;
2736 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002737 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002738 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002739 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002740 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2741 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002742 } else {
2743 ret = xmlStrndup(q, CUR_PTR - q);
2744 NEXT;
2745 }
2746 } else if (CUR == '\'') {
2747 NEXT;
2748 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002749 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002750 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002751 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002752 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2753 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002754 } else {
2755 ret = xmlStrndup(q, CUR_PTR - q);
2756 NEXT;
2757 }
2758 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002759 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2760 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002761 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002762
Owen Taylor3473f882001-02-23 17:55:21 +00002763 return(ret);
2764}
2765
2766/**
2767 * htmlParsePubidLiteral:
2768 * @ctxt: an HTML parser context
2769 *
2770 * parse an HTML public literal
2771 *
2772 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2773 *
2774 * Returns the PubidLiteral parsed or NULL.
2775 */
2776
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002777static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002778htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2779 const xmlChar *q;
2780 xmlChar *ret = NULL;
2781 /*
2782 * Name ::= (Letter | '_') (NameChar)*
2783 */
2784 if (CUR == '"') {
2785 NEXT;
2786 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002787 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002788 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002789 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2790 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002791 } else {
2792 ret = xmlStrndup(q, CUR_PTR - q);
2793 NEXT;
2794 }
2795 } else if (CUR == '\'') {
2796 NEXT;
2797 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002798 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002799 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002800 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002801 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2802 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002803 } else {
2804 ret = xmlStrndup(q, CUR_PTR - q);
2805 NEXT;
2806 }
2807 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002808 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2809 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002810 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002811
Owen Taylor3473f882001-02-23 17:55:21 +00002812 return(ret);
2813}
2814
2815/**
2816 * htmlParseScript:
2817 * @ctxt: an HTML parser context
2818 *
2819 * parse the content of an HTML SCRIPT or STYLE element
2820 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2821 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2822 * http://www.w3.org/TR/html4/types.html#type-script
2823 * http://www.w3.org/TR/html4/types.html#h-6.15
2824 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2825 *
2826 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2827 * element and the value of intrinsic event attributes. User agents must
2828 * not evaluate script data as HTML markup but instead must pass it on as
2829 * data to a script engine.
2830 * NOTES:
2831 * - The content is passed like CDATA
2832 * - the attributes for style and scripting "onXXX" are also described
2833 * as CDATA but SGML allows entities references in attributes so their
2834 * processing is identical as other attributes
2835 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002836static void
Owen Taylor3473f882001-02-23 17:55:21 +00002837htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002838 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002839 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002840 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002841
2842 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002843 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002844 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002845 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002846 /*
2847 * One should break here, the specification is clear:
2848 * Authors should therefore escape "</" within the content.
2849 * Escape mechanisms are specific to each scripting or
2850 * style sheet language.
2851 *
2852 * In recovery mode, only break if end tag match the
2853 * current tag, effectively ignoring all tags inside the
2854 * script/style block and treating the entire block as
2855 * CDATA.
2856 */
2857 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002858 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2859 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002860 {
2861 break; /* while */
2862 } else {
2863 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002864 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002865 ctxt->name, NULL);
2866 }
2867 } else {
2868 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002869 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002870 {
2871 break; /* while */
2872 }
2873 }
Owen Taylor3473f882001-02-23 17:55:21 +00002874 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002875 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002876 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2877 if (ctxt->sax->cdataBlock!= NULL) {
2878 /*
2879 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2880 */
2881 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002882 } else if (ctxt->sax->characters != NULL) {
2883 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002884 }
2885 nbchar = 0;
2886 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002887 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002888 NEXTL(l);
2889 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002890 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002891
Daniel Veillard68716a72006-10-16 09:32:17 +00002892 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Pierre Belziled4b54472010-11-04 10:18:17 +01002893 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2894 "Invalid char in CDATA 0x%X\n", cur);
2895 if (ctxt->input->cur < ctxt->input->end) {
2896 NEXT;
2897 }
Owen Taylor3473f882001-02-23 17:55:21 +00002898 }
2899
2900 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2901 if (ctxt->sax->cdataBlock!= NULL) {
2902 /*
2903 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2904 */
2905 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002906 } else if (ctxt->sax->characters != NULL) {
2907 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002908 }
2909 }
2910}
2911
2912
2913/**
2914 * htmlParseCharData:
2915 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002916 *
2917 * parse a CharData section.
2918 * if we are within a CDATA section ']]>' marks an end of section.
2919 *
2920 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2921 */
2922
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002923static void
2924htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002925 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2926 int nbchar = 0;
2927 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002928 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002929
2930 SHRINK;
2931 cur = CUR_CHAR(l);
2932 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002933 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002934 (cur != 0)) {
2935 if (!(IS_CHAR(cur))) {
2936 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2937 "Invalid char in CDATA 0x%X\n", cur);
2938 } else {
2939 COPY_BUF(l,buf,nbchar,cur);
2940 }
Owen Taylor3473f882001-02-23 17:55:21 +00002941 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2942 /*
2943 * Ok the segment is to be consumed as chars.
2944 */
2945 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2946 if (areBlanks(ctxt, buf, nbchar)) {
2947 if (ctxt->sax->ignorableWhitespace != NULL)
2948 ctxt->sax->ignorableWhitespace(ctxt->userData,
2949 buf, nbchar);
2950 } else {
2951 htmlCheckParagraph(ctxt);
2952 if (ctxt->sax->characters != NULL)
2953 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2954 }
2955 }
2956 nbchar = 0;
2957 }
2958 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002959 chunk++;
2960 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2961 chunk = 0;
2962 SHRINK;
2963 GROW;
2964 }
Owen Taylor3473f882001-02-23 17:55:21 +00002965 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002966 if (cur == 0) {
2967 SHRINK;
2968 GROW;
2969 cur = CUR_CHAR(l);
2970 }
Owen Taylor3473f882001-02-23 17:55:21 +00002971 }
2972 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002973 buf[nbchar] = 0;
2974
Owen Taylor3473f882001-02-23 17:55:21 +00002975 /*
2976 * Ok the segment is to be consumed as chars.
2977 */
2978 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2979 if (areBlanks(ctxt, buf, nbchar)) {
2980 if (ctxt->sax->ignorableWhitespace != NULL)
2981 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2982 } else {
2983 htmlCheckParagraph(ctxt);
2984 if (ctxt->sax->characters != NULL)
2985 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2986 }
2987 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002988 } else {
2989 /*
2990 * Loop detection
2991 */
2992 if (cur == 0)
2993 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002994 }
2995}
2996
2997/**
2998 * htmlParseExternalID:
2999 * @ctxt: an HTML parser context
3000 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00003001 *
3002 * Parse an External ID or a Public ID
3003 *
Owen Taylor3473f882001-02-23 17:55:21 +00003004 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3005 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3006 *
3007 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3008 *
3009 * Returns the function returns SystemLiteral and in the second
3010 * case publicID receives PubidLiteral, is strict is off
3011 * it is possible to return NULL and have publicID set.
3012 */
3013
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003014static xmlChar *
3015htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00003016 xmlChar *URI = NULL;
3017
3018 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3019 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3020 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3021 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003022 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003023 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3024 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003025 }
3026 SKIP_BLANKS;
3027 URI = htmlParseSystemLiteral(ctxt);
3028 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003029 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3030 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003031 }
3032 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3033 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3034 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3035 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003036 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003037 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3038 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003039 }
3040 SKIP_BLANKS;
3041 *publicID = htmlParsePubidLiteral(ctxt);
3042 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003043 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3044 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3045 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003046 }
3047 SKIP_BLANKS;
3048 if ((CUR == '"') || (CUR == '\'')) {
3049 URI = htmlParseSystemLiteral(ctxt);
3050 }
3051 }
3052 return(URI);
3053}
3054
3055/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003056 * xmlParsePI:
3057 * @ctxt: an XML parser context
3058 *
3059 * parse an XML Processing Instruction.
3060 *
3061 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3062 */
3063static void
3064htmlParsePI(htmlParserCtxtPtr ctxt) {
3065 xmlChar *buf = NULL;
3066 int len = 0;
3067 int size = HTML_PARSER_BUFFER_SIZE;
3068 int cur, l;
3069 const xmlChar *target;
3070 xmlParserInputState state;
3071 int count = 0;
3072
3073 if ((RAW == '<') && (NXT(1) == '?')) {
3074 state = ctxt->instate;
3075 ctxt->instate = XML_PARSER_PI;
3076 /*
3077 * this is a Processing Instruction.
3078 */
3079 SKIP(2);
3080 SHRINK;
3081
3082 /*
3083 * Parse the target name and check for special support like
3084 * namespace.
3085 */
3086 target = htmlParseName(ctxt);
3087 if (target != NULL) {
3088 if (RAW == '>') {
3089 SKIP(1);
3090
3091 /*
3092 * SAX: PI detected.
3093 */
3094 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3095 (ctxt->sax->processingInstruction != NULL))
3096 ctxt->sax->processingInstruction(ctxt->userData,
3097 target, NULL);
3098 ctxt->instate = state;
3099 return;
3100 }
3101 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3102 if (buf == NULL) {
3103 htmlErrMemory(ctxt, NULL);
3104 ctxt->instate = state;
3105 return;
3106 }
3107 cur = CUR;
3108 if (!IS_BLANK(cur)) {
3109 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3110 "ParsePI: PI %s space expected\n", target, NULL);
3111 }
3112 SKIP_BLANKS;
3113 cur = CUR_CHAR(l);
3114 while (IS_CHAR(cur) && (cur != '>')) {
3115 if (len + 5 >= size) {
3116 xmlChar *tmp;
3117
3118 size *= 2;
3119 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3120 if (tmp == NULL) {
3121 htmlErrMemory(ctxt, NULL);
3122 xmlFree(buf);
3123 ctxt->instate = state;
3124 return;
3125 }
3126 buf = tmp;
3127 }
3128 count++;
3129 if (count > 50) {
3130 GROW;
3131 count = 0;
3132 }
3133 COPY_BUF(l,buf,len,cur);
3134 NEXTL(l);
3135 cur = CUR_CHAR(l);
3136 if (cur == 0) {
3137 SHRINK;
3138 GROW;
3139 cur = CUR_CHAR(l);
3140 }
3141 }
3142 buf[len] = 0;
3143 if (cur != '>') {
3144 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3145 "ParsePI: PI %s never end ...\n", target, NULL);
3146 } else {
3147 SKIP(1);
3148
3149 /*
3150 * SAX: PI detected.
3151 */
3152 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3153 (ctxt->sax->processingInstruction != NULL))
3154 ctxt->sax->processingInstruction(ctxt->userData,
3155 target, buf);
3156 }
3157 xmlFree(buf);
3158 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003159 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003160 "PI is not started correctly", NULL, NULL);
3161 }
3162 ctxt->instate = state;
3163 }
3164}
3165
3166/**
Owen Taylor3473f882001-02-23 17:55:21 +00003167 * htmlParseComment:
3168 * @ctxt: an HTML parser context
3169 *
3170 * Parse an XML (SGML) comment <!-- .... -->
3171 *
3172 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3173 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003174static void
Owen Taylor3473f882001-02-23 17:55:21 +00003175htmlParseComment(htmlParserCtxtPtr ctxt) {
3176 xmlChar *buf = NULL;
3177 int len;
3178 int size = HTML_PARSER_BUFFER_SIZE;
3179 int q, ql;
3180 int r, rl;
3181 int cur, l;
3182 xmlParserInputState state;
3183
3184 /*
3185 * Check that there is a comment right here.
3186 */
3187 if ((RAW != '<') || (NXT(1) != '!') ||
3188 (NXT(2) != '-') || (NXT(3) != '-')) return;
3189
3190 state = ctxt->instate;
3191 ctxt->instate = XML_PARSER_COMMENT;
3192 SHRINK;
3193 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003194 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003195 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003196 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003197 ctxt->instate = state;
3198 return;
3199 }
3200 q = CUR_CHAR(ql);
3201 NEXTL(ql);
3202 r = CUR_CHAR(rl);
3203 NEXTL(rl);
3204 cur = CUR_CHAR(l);
3205 len = 0;
3206 while (IS_CHAR(cur) &&
3207 ((cur != '>') ||
3208 (r != '-') || (q != '-'))) {
3209 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003210 xmlChar *tmp;
3211
Owen Taylor3473f882001-02-23 17:55:21 +00003212 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003213 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3214 if (tmp == NULL) {
3215 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003216 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003217 ctxt->instate = state;
3218 return;
3219 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003220 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003221 }
3222 COPY_BUF(ql,buf,len,q);
3223 q = r;
3224 ql = rl;
3225 r = cur;
3226 rl = l;
3227 NEXTL(l);
3228 cur = CUR_CHAR(l);
3229 if (cur == 0) {
3230 SHRINK;
3231 GROW;
3232 cur = CUR_CHAR(l);
3233 }
3234 }
3235 buf[len] = 0;
3236 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003237 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3238 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003239 xmlFree(buf);
3240 } else {
3241 NEXT;
3242 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3243 (!ctxt->disableSAX))
3244 ctxt->sax->comment(ctxt->userData, buf);
3245 xmlFree(buf);
3246 }
3247 ctxt->instate = state;
3248}
3249
3250/**
3251 * htmlParseCharRef:
3252 * @ctxt: an HTML parser context
3253 *
3254 * parse Reference declarations
3255 *
3256 * [66] CharRef ::= '&#' [0-9]+ ';' |
3257 * '&#x' [0-9a-fA-F]+ ';'
3258 *
3259 * Returns the value parsed (as an int)
3260 */
3261int
3262htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3263 int val = 0;
3264
Daniel Veillarda03e3652004-11-02 18:45:30 +00003265 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3266 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3267 "htmlParseCharRef: context error\n",
3268 NULL, NULL);
3269 return(0);
3270 }
Owen Taylor3473f882001-02-23 17:55:21 +00003271 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003272 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003273 SKIP(3);
3274 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003275 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003276 val = val * 16 + (CUR - '0');
3277 else if ((CUR >= 'a') && (CUR <= 'f'))
3278 val = val * 16 + (CUR - 'a') + 10;
3279 else if ((CUR >= 'A') && (CUR <= 'F'))
3280 val = val * 16 + (CUR - 'A') + 10;
3281 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003282 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003283 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003284 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003285 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003286 }
3287 NEXT;
3288 }
3289 if (CUR == ';')
3290 NEXT;
3291 } else if ((CUR == '&') && (NXT(1) == '#')) {
3292 SKIP(2);
3293 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003294 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003295 val = val * 10 + (CUR - '0');
3296 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003297 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003298 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003299 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003300 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003301 }
3302 NEXT;
3303 }
3304 if (CUR == ';')
3305 NEXT;
3306 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003307 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3308 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003309 }
3310 /*
3311 * Check the value IS_CHAR ...
3312 */
3313 if (IS_CHAR(val)) {
3314 return(val);
3315 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003316 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3317 "htmlParseCharRef: invalid xmlChar value %d\n",
3318 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003319 }
3320 return(0);
3321}
3322
3323
3324/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003325 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003326 * @ctxt: an HTML parser context
3327 *
3328 * parse a DOCTYPE declaration
3329 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003330 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003331 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3332 */
3333
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003334static void
Owen Taylor3473f882001-02-23 17:55:21 +00003335htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003336 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003337 xmlChar *ExternalID = NULL;
3338 xmlChar *URI = NULL;
3339
3340 /*
3341 * We know that '<!DOCTYPE' has been detected.
3342 */
3343 SKIP(9);
3344
3345 SKIP_BLANKS;
3346
3347 /*
3348 * Parse the DOCTYPE name.
3349 */
3350 name = htmlParseName(ctxt);
3351 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003352 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3353 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3354 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003355 }
3356 /*
3357 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3358 */
3359
3360 SKIP_BLANKS;
3361
3362 /*
3363 * Check for SystemID and ExternalID
3364 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003365 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003366 SKIP_BLANKS;
3367
3368 /*
3369 * We should be at the end of the DOCTYPE declaration.
3370 */
3371 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003372 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3373 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003374 /* We shouldn't try to resynchronize ... */
3375 }
3376 NEXT;
3377
3378 /*
3379 * Create or update the document accordingly to the DOCTYPE
3380 */
3381 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3382 (!ctxt->disableSAX))
3383 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3384
3385 /*
3386 * Cleanup, since we don't use all those identifiers
3387 */
3388 if (URI != NULL) xmlFree(URI);
3389 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003390}
3391
3392/**
3393 * htmlParseAttribute:
3394 * @ctxt: an HTML parser context
3395 * @value: a xmlChar ** used to store the value of the attribute
3396 *
3397 * parse an attribute
3398 *
3399 * [41] Attribute ::= Name Eq AttValue
3400 *
3401 * [25] Eq ::= S? '=' S?
3402 *
3403 * With namespace:
3404 *
3405 * [NS 11] Attribute ::= QName Eq AttValue
3406 *
3407 * Also the case QName == xmlns:??? is handled independently as a namespace
3408 * definition.
3409 *
3410 * Returns the attribute name, and the value in *value.
3411 */
3412
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003413static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003414htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003415 const xmlChar *name;
3416 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003417
3418 *value = NULL;
3419 name = htmlParseHTMLName(ctxt);
3420 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003421 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3422 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003423 return(NULL);
3424 }
3425
3426 /*
3427 * read the value
3428 */
3429 SKIP_BLANKS;
3430 if (CUR == '=') {
3431 NEXT;
3432 SKIP_BLANKS;
3433 val = htmlParseAttValue(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003434 }
3435
3436 *value = val;
3437 return(name);
3438}
3439
3440/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003441 * htmlCheckEncodingDirect:
Owen Taylor3473f882001-02-23 17:55:21 +00003442 * @ctxt: an HTML parser context
3443 * @attvalue: the attribute value
3444 *
Denis Pauk868d92d2012-05-10 15:34:57 +08003445 * Checks an attribute value to detect
Owen Taylor3473f882001-02-23 17:55:21 +00003446 * the encoding
3447 * If a new encoding is detected the parser is switched to decode
3448 * it and pass UTF8
3449 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003450static void
Denis Pauk868d92d2012-05-10 15:34:57 +08003451htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +00003452
Denis Pauk868d92d2012-05-10 15:34:57 +08003453 if ((ctxt == NULL) || (encoding == NULL) ||
Daniel Veillardc62efc82011-05-16 16:03:50 +08003454 (ctxt->options & HTML_PARSE_IGNORE_ENC))
Owen Taylor3473f882001-02-23 17:55:21 +00003455 return;
3456
Daniel Veillarde77db162009-08-22 11:32:38 +02003457 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003458 if (ctxt->input->encoding != NULL)
3459 return;
3460
Owen Taylor3473f882001-02-23 17:55:21 +00003461 if (encoding != NULL) {
3462 xmlCharEncoding enc;
3463 xmlCharEncodingHandlerPtr handler;
3464
3465 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3466
3467 if (ctxt->input->encoding != NULL)
3468 xmlFree((xmlChar *) ctxt->input->encoding);
3469 ctxt->input->encoding = xmlStrdup(encoding);
3470
3471 enc = xmlParseCharEncoding((const char *) encoding);
3472 /*
3473 * registered set of known encodings
3474 */
3475 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003476 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003477 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3478 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3479 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3480 (ctxt->input->buf != NULL) &&
3481 (ctxt->input->buf->encoder == NULL)) {
3482 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3483 "htmlCheckEncoding: wrong encoding meta\n",
3484 NULL, NULL);
3485 } else {
3486 xmlSwitchEncoding(ctxt, enc);
3487 }
Owen Taylor3473f882001-02-23 17:55:21 +00003488 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3489 } else {
3490 /*
3491 * fallback for unknown encodings
3492 */
3493 handler = xmlFindCharEncodingHandler((const char *) encoding);
3494 if (handler != NULL) {
3495 xmlSwitchToEncoding(ctxt, handler);
3496 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3497 } else {
Daniel Veillardc62efc82011-05-16 16:03:50 +08003498 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3499 "htmlCheckEncoding: unknown encoding %s\n",
3500 encoding, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003501 }
3502 }
3503
3504 if ((ctxt->input->buf != NULL) &&
3505 (ctxt->input->buf->encoder != NULL) &&
3506 (ctxt->input->buf->raw != NULL) &&
3507 (ctxt->input->buf->buffer != NULL)) {
3508 int nbchars;
3509 int processed;
3510
3511 /*
3512 * convert as much as possible to the parser reading buffer.
3513 */
3514 processed = ctxt->input->cur - ctxt->input->base;
Daniel Veillarda78d8032012-07-16 14:56:50 +08003515 xmlBufShrink(ctxt->input->buf->buffer, processed);
3516 nbchars = xmlCharEncInput(ctxt->input->buf);
Owen Taylor3473f882001-02-23 17:55:21 +00003517 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003518 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3519 "htmlCheckEncoding: encoder error\n",
3520 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003521 }
3522 ctxt->input->base =
Daniel Veillarda78d8032012-07-16 14:56:50 +08003523 ctxt->input->cur = xmlBufContent(ctxt->input->buf->buffer);
3524 ctxt->input->end = xmlBufEnd(ctxt->input->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00003525 }
3526 }
3527}
3528
3529/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003530 * htmlCheckEncoding:
3531 * @ctxt: an HTML parser context
3532 * @attvalue: the attribute value
3533 *
3534 * Checks an http-equiv attribute from a Meta tag to detect
3535 * the encoding
3536 * If a new encoding is detected the parser is switched to decode
3537 * it and pass UTF8
3538 */
3539static void
3540htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3541 const xmlChar *encoding;
3542
3543 if (!attvalue)
3544 return;
3545
3546 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3547 if (encoding != NULL) {
3548 encoding += 7;
3549 }
3550 /*
3551 * skip blank
3552 */
3553 if (encoding && IS_BLANK_CH(*encoding))
3554 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3555 if (encoding && *encoding == '=') {
3556 encoding ++;
3557 htmlCheckEncodingDirect(ctxt, encoding);
3558 }
3559}
3560
3561/**
Owen Taylor3473f882001-02-23 17:55:21 +00003562 * htmlCheckMeta:
3563 * @ctxt: an HTML parser context
3564 * @atts: the attributes values
3565 *
3566 * Checks an attributes from a Meta tag
3567 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003568static void
Owen Taylor3473f882001-02-23 17:55:21 +00003569htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3570 int i;
3571 const xmlChar *att, *value;
3572 int http = 0;
3573 const xmlChar *content = NULL;
3574
3575 if ((ctxt == NULL) || (atts == NULL))
3576 return;
3577
3578 i = 0;
3579 att = atts[i++];
3580 while (att != NULL) {
3581 value = atts[i++];
3582 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3583 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3584 http = 1;
Denis Pauk868d92d2012-05-10 15:34:57 +08003585 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3586 htmlCheckEncodingDirect(ctxt, value);
Owen Taylor3473f882001-02-23 17:55:21 +00003587 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3588 content = value;
3589 att = atts[i++];
3590 }
3591 if ((http) && (content != NULL))
3592 htmlCheckEncoding(ctxt, content);
3593
3594}
3595
3596/**
3597 * htmlParseStartTag:
3598 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003599 *
Owen Taylor3473f882001-02-23 17:55:21 +00003600 * parse a start of tag either for rule element or
3601 * EmptyElement. In both case we don't parse the tag closing chars.
3602 *
3603 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3604 *
3605 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3606 *
3607 * With namespace:
3608 *
3609 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3610 *
3611 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3612 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003613 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003614 */
3615
Daniel Veillard597f1c12005-07-03 23:00:18 +00003616static int
Owen Taylor3473f882001-02-23 17:55:21 +00003617htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003618 const xmlChar *name;
3619 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003620 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003621 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003622 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003623 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003624 int meta = 0;
3625 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003626 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003627
Daniel Veillarde77db162009-08-22 11:32:38 +02003628 if (ctxt->instate == XML_PARSER_EOF)
3629 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003630 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3631 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3632 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003633 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003634 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003635 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003636 NEXT;
3637
Daniel Veillard30e76072006-03-09 14:13:55 +00003638 atts = ctxt->atts;
3639 maxatts = ctxt->maxatts;
3640
Owen Taylor3473f882001-02-23 17:55:21 +00003641 GROW;
3642 name = htmlParseHTMLName(ctxt);
3643 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003644 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3645 "htmlParseStartTag: invalid element name\n",
3646 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003647 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003648 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3649 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003650 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003651 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003652 }
3653 if (xmlStrEqual(name, BAD_CAST"meta"))
3654 meta = 1;
3655
3656 /*
3657 * Check for auto-closure of HTML elements.
3658 */
3659 htmlAutoClose(ctxt, name);
3660
3661 /*
3662 * Check for implied HTML elements.
3663 */
3664 htmlCheckImplied(ctxt, name);
3665
3666 /*
3667 * Avoid html at any level > 0, head at any level != 1
3668 * or any attempt to recurse body
3669 */
3670 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003671 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3672 "htmlParseStartTag: misplaced <html> tag\n",
3673 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003674 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003675 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003676 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003677 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003678 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003679 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3680 "htmlParseStartTag: misplaced <head> tag\n",
3681 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003682 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003683 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003684 }
3685 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003686 int indx;
3687 for (indx = 0;indx < ctxt->nameNr;indx++) {
3688 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003689 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3690 "htmlParseStartTag: misplaced <body> tag\n",
3691 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003692 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003693 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003694 }
3695 }
3696 }
3697
3698 /*
3699 * Now parse the attributes, it ends up with the ending
3700 *
3701 * (S Attribute)* S?
3702 */
3703 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003704 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003705 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003706 ((CUR != '/') || (NXT(1) != '>'))) {
3707 long cons = ctxt->nbChars;
3708
3709 GROW;
3710 attname = htmlParseAttribute(ctxt, &attvalue);
3711 if (attname != NULL) {
3712
3713 /*
3714 * Well formedness requires at most one declaration of an attribute
3715 */
3716 for (i = 0; i < nbatts;i += 2) {
3717 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003718 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3719 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003720 if (attvalue != NULL)
3721 xmlFree(attvalue);
3722 goto failed;
3723 }
3724 }
3725
3726 /*
3727 * Add the pair to atts
3728 */
3729 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003730 maxatts = 22; /* allow for 10 attrs by default */
3731 atts = (const xmlChar **)
3732 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003733 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003734 htmlErrMemory(ctxt, NULL);
3735 if (attvalue != NULL)
3736 xmlFree(attvalue);
3737 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003738 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003739 ctxt->atts = atts;
3740 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003741 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003742 const xmlChar **n;
3743
Owen Taylor3473f882001-02-23 17:55:21 +00003744 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003745 n = (const xmlChar **) xmlRealloc((void *) atts,
3746 maxatts * sizeof(const xmlChar *));
3747 if (n == NULL) {
3748 htmlErrMemory(ctxt, NULL);
3749 if (attvalue != NULL)
3750 xmlFree(attvalue);
3751 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003752 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003753 atts = n;
3754 ctxt->atts = atts;
3755 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003756 }
3757 atts[nbatts++] = attname;
3758 atts[nbatts++] = attvalue;
3759 atts[nbatts] = NULL;
3760 atts[nbatts + 1] = NULL;
3761 }
3762 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003763 if (attvalue != NULL)
3764 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003765 /* Dump the bogus attribute string up to the next blank or
3766 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003767 while ((IS_CHAR_CH(CUR)) &&
3768 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003769 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003770 NEXT;
3771 }
3772
3773failed:
3774 SKIP_BLANKS;
3775 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003776 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3777 "htmlParseStartTag: problem parsing attributes\n",
3778 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003779 break;
3780 }
3781 }
3782
3783 /*
3784 * Handle specific association to the META tag
3785 */
William M. Bracke978ae22007-03-21 06:16:02 +00003786 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003787 htmlCheckMeta(ctxt, atts);
3788
3789 /*
3790 * SAX: Start of Element !
3791 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003792 if (!discardtag) {
3793 htmlnamePush(ctxt, name);
3794 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3795 if (nbatts != 0)
3796 ctxt->sax->startElement(ctxt->userData, name, atts);
3797 else
3798 ctxt->sax->startElement(ctxt->userData, name, NULL);
3799 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003800 }
Owen Taylor3473f882001-02-23 17:55:21 +00003801
3802 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003803 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003804 if (atts[i] != NULL)
3805 xmlFree((xmlChar *) atts[i]);
3806 }
Owen Taylor3473f882001-02-23 17:55:21 +00003807 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003808
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003809 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003810}
3811
3812/**
3813 * htmlParseEndTag:
3814 * @ctxt: an HTML parser context
3815 *
3816 * parse an end of tag
3817 *
3818 * [42] ETag ::= '</' Name S? '>'
3819 *
3820 * With namespace
3821 *
3822 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003823 *
3824 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003825 */
3826
Daniel Veillardf420ac52001-07-04 16:04:09 +00003827static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003828htmlParseEndTag(htmlParserCtxtPtr ctxt)
3829{
3830 const xmlChar *name;
3831 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003832 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003833
3834 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003835 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3836 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003837 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003838 }
3839 SKIP(2);
3840
3841 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003842 if (name == NULL)
3843 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003844 /*
3845 * We should definitely be at the ending "S? '>'" part
3846 */
3847 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003848 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003849 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3850 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003851 if (ctxt->recovery) {
3852 /*
3853 * We're not at the ending > !!
3854 * Error, unless in recover mode where we search forwards
3855 * until we find a >
3856 */
3857 while (CUR != '\0' && CUR != '>') NEXT;
3858 NEXT;
3859 }
Owen Taylor3473f882001-02-23 17:55:21 +00003860 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003861 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003862
3863 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003864 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3865 * out now.
3866 */
3867 if ((ctxt->depth > 0) &&
3868 (xmlStrEqual(name, BAD_CAST "html") ||
3869 xmlStrEqual(name, BAD_CAST "body") ||
3870 xmlStrEqual(name, BAD_CAST "head"))) {
3871 ctxt->depth--;
3872 return (0);
3873 }
3874
3875 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003876 * If the name read is not one of the element in the parsing stack
3877 * then return, it's just an error.
3878 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003879 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3880 if (xmlStrEqual(name, ctxt->nameTab[i]))
3881 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003882 }
3883 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003884 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3885 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003886 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003887 }
3888
3889
3890 /*
3891 * Check for auto-closure of HTML elements.
3892 */
3893
3894 htmlAutoCloseOnClose(ctxt, name);
3895
3896 /*
3897 * Well formedness constraints, opening and closing must match.
3898 * With the exception that the autoclose may have popped stuff out
3899 * of the stack.
3900 */
3901 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003902 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003903 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3904 "Opening and ending tag mismatch: %s and %s\n",
3905 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003906 }
3907 }
3908
3909 /*
3910 * SAX: End of Tag
3911 */
3912 oldname = ctxt->name;
3913 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003914 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3915 ctxt->sax->endElement(ctxt->userData, name);
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08003916 htmlNodeInfoPop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003917 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003918 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003919 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003920 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003921 }
3922
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003923 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003924}
3925
3926
3927/**
3928 * htmlParseReference:
3929 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003930 *
Owen Taylor3473f882001-02-23 17:55:21 +00003931 * parse and handle entity references in content,
3932 * this will end-up in a call to character() since this is either a
3933 * CharRef, or a predefined entity.
3934 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003935static void
Owen Taylor3473f882001-02-23 17:55:21 +00003936htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003937 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003938 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003939 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003940 if (CUR != '&') return;
3941
3942 if (NXT(1) == '#') {
3943 unsigned int c;
3944 int bits, i = 0;
3945
3946 c = htmlParseCharRef(ctxt);
3947 if (c == 0)
3948 return;
3949
3950 if (c < 0x80) { out[i++]= c; bits= -6; }
3951 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3952 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3953 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003954
Owen Taylor3473f882001-02-23 17:55:21 +00003955 for ( ; bits >= 0; bits-= 6) {
3956 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3957 }
3958 out[i] = 0;
3959
3960 htmlCheckParagraph(ctxt);
3961 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3962 ctxt->sax->characters(ctxt->userData, out, i);
3963 } else {
3964 ent = htmlParseEntityRef(ctxt, &name);
3965 if (name == NULL) {
3966 htmlCheckParagraph(ctxt);
3967 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3968 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3969 return;
3970 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003971 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003972 htmlCheckParagraph(ctxt);
3973 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3974 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3975 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3976 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3977 }
3978 } else {
3979 unsigned int c;
3980 int bits, i = 0;
3981
3982 c = ent->value;
3983 if (c < 0x80)
3984 { out[i++]= c; bits= -6; }
3985 else if (c < 0x800)
3986 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3987 else if (c < 0x10000)
3988 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003989 else
Owen Taylor3473f882001-02-23 17:55:21 +00003990 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003991
Owen Taylor3473f882001-02-23 17:55:21 +00003992 for ( ; bits >= 0; bits-= 6) {
3993 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3994 }
3995 out[i] = 0;
3996
3997 htmlCheckParagraph(ctxt);
3998 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3999 ctxt->sax->characters(ctxt->userData, out, i);
4000 }
Owen Taylor3473f882001-02-23 17:55:21 +00004001 }
4002}
4003
4004/**
4005 * htmlParseContent:
4006 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004007 *
4008 * Parse a content: comment, sub-element, reference or text.
Eugene Pimenov615904f2010-03-15 15:16:02 +01004009 * Kept for compatibility with old code
Owen Taylor3473f882001-02-23 17:55:21 +00004010 */
4011
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004012static void
Owen Taylor3473f882001-02-23 17:55:21 +00004013htmlParseContent(htmlParserCtxtPtr ctxt) {
4014 xmlChar *currentNode;
4015 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004016 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004017
4018 currentNode = xmlStrdup(ctxt->name);
4019 depth = ctxt->nameNr;
4020 while (1) {
4021 long cons = ctxt->nbChars;
4022
4023 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02004024
4025 if (ctxt->instate == XML_PARSER_EOF)
4026 break;
4027
Owen Taylor3473f882001-02-23 17:55:21 +00004028 /*
4029 * Our tag or one of it's parent or children is ending.
4030 */
4031 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00004032 if (htmlParseEndTag(ctxt) &&
4033 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4034 if (currentNode != NULL)
4035 xmlFree(currentNode);
4036 return;
4037 }
4038 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00004039 }
4040
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004041 else if ((CUR == '<') &&
4042 ((IS_ASCII_LETTER(NXT(1))) ||
4043 (NXT(1) == '_') || (NXT(1) == ':'))) {
4044 name = htmlParseHTMLName_nonInvasive(ctxt);
4045 if (name == NULL) {
4046 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4047 "htmlParseStartTag: invalid element name\n",
4048 NULL, NULL);
4049 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02004050 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004051 NEXT;
4052
4053 if (currentNode != NULL)
4054 xmlFree(currentNode);
4055 return;
4056 }
4057
4058 if (ctxt->name != NULL) {
4059 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4060 htmlAutoClose(ctxt, name);
4061 continue;
4062 }
Daniel Veillarde77db162009-08-22 11:32:38 +02004063 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004064 }
4065
Owen Taylor3473f882001-02-23 17:55:21 +00004066 /*
4067 * Has this node been popped out during parsing of
4068 * the next element
4069 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00004070 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4071 (!xmlStrEqual(currentNode, ctxt->name)))
4072 {
Owen Taylor3473f882001-02-23 17:55:21 +00004073 if (currentNode != NULL) xmlFree(currentNode);
4074 return;
4075 }
4076
Daniel Veillardf9533d12001-03-03 10:04:57 +00004077 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4078 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004079 /*
4080 * Handle SCRIPT/STYLE separately
4081 */
4082 htmlParseScript(ctxt);
4083 } else {
4084 /*
4085 * Sometimes DOCTYPE arrives in the middle of the document
4086 */
4087 if ((CUR == '<') && (NXT(1) == '!') &&
4088 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4089 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4090 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4091 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004092 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4093 "Misplaced DOCTYPE declaration\n",
4094 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004095 htmlParseDocTypeDecl(ctxt);
4096 }
4097
4098 /*
4099 * First case : a comment
4100 */
4101 if ((CUR == '<') && (NXT(1) == '!') &&
4102 (NXT(2) == '-') && (NXT(3) == '-')) {
4103 htmlParseComment(ctxt);
4104 }
4105
4106 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004107 * Second case : a Processing Instruction.
4108 */
4109 else if ((CUR == '<') && (NXT(1) == '?')) {
4110 htmlParsePI(ctxt);
4111 }
4112
4113 /*
4114 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004115 */
4116 else if (CUR == '<') {
4117 htmlParseElement(ctxt);
4118 }
4119
4120 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004121 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004122 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004123 */
4124 else if (CUR == '&') {
4125 htmlParseReference(ctxt);
4126 }
4127
4128 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004129 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004130 */
4131 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004132 htmlAutoCloseOnEnd(ctxt);
4133 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004134 }
4135
4136 /*
4137 * Last case, text. Note that References are handled directly.
4138 */
4139 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004140 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004141 }
4142
4143 if (cons == ctxt->nbChars) {
4144 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004145 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4146 "detected an error in element content\n",
4147 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004148 }
4149 break;
4150 }
4151 }
4152 GROW;
4153 }
4154 if (currentNode != NULL) xmlFree(currentNode);
4155}
4156
4157/**
4158 * htmlParseElement:
4159 * @ctxt: an HTML parser context
4160 *
4161 * parse an HTML element, this is highly recursive
Eugene Pimenov615904f2010-03-15 15:16:02 +01004162 * this is kept for compatibility with previous code versions
Owen Taylor3473f882001-02-23 17:55:21 +00004163 *
4164 * [39] element ::= EmptyElemTag | STag content ETag
4165 *
4166 * [41] Attribute ::= Name Eq AttValue
4167 */
4168
4169void
4170htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004171 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004172 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004173 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004174 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004175 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004176 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004177 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004178
Daniel Veillarda03e3652004-11-02 18:45:30 +00004179 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4180 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004181 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004182 return;
4183 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004184
4185 if (ctxt->instate == XML_PARSER_EOF)
4186 return;
4187
Owen Taylor3473f882001-02-23 17:55:21 +00004188 /* Capture start position */
4189 if (ctxt->record_info) {
4190 node_info.begin_pos = ctxt->input->consumed +
4191 (CUR_PTR - ctxt->input->base);
4192 node_info.begin_line = ctxt->input->line;
4193 }
4194
Daniel Veillard597f1c12005-07-03 23:00:18 +00004195 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004196 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004197 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004198 if (CUR == '>')
4199 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004200 return;
4201 }
Owen Taylor3473f882001-02-23 17:55:21 +00004202
4203 /*
4204 * Lookup the info for that element.
4205 */
4206 info = htmlTagLookup(name);
4207 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004208 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4209 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004210 }
4211
4212 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004213 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004214 */
4215 if ((CUR == '/') && (NXT(1) == '>')) {
4216 SKIP(2);
4217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4218 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004219 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004220 return;
4221 }
4222
4223 if (CUR == '>') {
4224 NEXT;
4225 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004226 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4227 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004228
4229 /*
4230 * end of parsing of this node.
4231 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004232 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004233 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004234 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004235 }
Owen Taylor3473f882001-02-23 17:55:21 +00004236
4237 /*
4238 * Capture end position and add node
4239 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004240 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004241 node_info.end_pos = ctxt->input->consumed +
4242 (CUR_PTR - ctxt->input->base);
4243 node_info.end_line = ctxt->input->line;
4244 node_info.node = ctxt->node;
4245 xmlParserAddNodeInfo(ctxt, &node_info);
4246 }
4247 return;
4248 }
4249
4250 /*
4251 * Check for an Empty Element from DTD definition
4252 */
4253 if ((info != NULL) && (info->empty)) {
4254 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4255 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004256 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004257 return;
4258 }
4259
4260 /*
4261 * Parse the content of the element:
4262 */
4263 currentNode = xmlStrdup(ctxt->name);
4264 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004265 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004266 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004267 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004268 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004269 if (ctxt->nameNr < depth) break;
4270 }
Owen Taylor3473f882001-02-23 17:55:21 +00004271
Owen Taylor3473f882001-02-23 17:55:21 +00004272 /*
4273 * Capture end position and add node
4274 */
4275 if ( currentNode != NULL && ctxt->record_info ) {
4276 node_info.end_pos = ctxt->input->consumed +
4277 (CUR_PTR - ctxt->input->base);
4278 node_info.end_line = ctxt->input->line;
4279 node_info.node = ctxt->node;
4280 xmlParserAddNodeInfo(ctxt, &node_info);
4281 }
William M. Brack76e95df2003-10-18 16:20:14 +00004282 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004283 htmlAutoCloseOnEnd(ctxt);
4284 }
4285
Owen Taylor3473f882001-02-23 17:55:21 +00004286 if (currentNode != NULL)
4287 xmlFree(currentNode);
4288}
4289
Eugene Pimenov615904f2010-03-15 15:16:02 +01004290static void
4291htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4292 /*
4293 * Capture end position and add node
4294 */
4295 if ( ctxt->node != NULL && ctxt->record_info ) {
4296 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4297 (CUR_PTR - ctxt->input->base);
4298 ctxt->nodeInfo->end_line = ctxt->input->line;
4299 ctxt->nodeInfo->node = ctxt->node;
4300 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4301 htmlNodeInfoPop(ctxt);
4302 }
4303 if (!IS_CHAR_CH(CUR)) {
4304 htmlAutoCloseOnEnd(ctxt);
4305 }
4306}
4307
4308/**
4309 * htmlParseElementInternal:
4310 * @ctxt: an HTML parser context
4311 *
4312 * parse an HTML element, new version, non recursive
4313 *
4314 * [39] element ::= EmptyElemTag | STag content ETag
4315 *
4316 * [41] Attribute ::= Name Eq AttValue
4317 */
4318
4319static void
4320htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4321 const xmlChar *name;
4322 const htmlElemDesc * info;
4323 htmlParserNodeInfo node_info;
4324 int failed;
Eugene Pimenov615904f2010-03-15 15:16:02 +01004325
4326 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4327 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4328 "htmlParseElementInternal: context error\n", NULL, NULL);
4329 return;
4330 }
4331
4332 if (ctxt->instate == XML_PARSER_EOF)
4333 return;
4334
4335 /* Capture start position */
4336 if (ctxt->record_info) {
4337 node_info.begin_pos = ctxt->input->consumed +
4338 (CUR_PTR - ctxt->input->base);
4339 node_info.begin_line = ctxt->input->line;
4340 }
4341
4342 failed = htmlParseStartTag(ctxt);
4343 name = ctxt->name;
4344 if ((failed == -1) || (name == NULL)) {
4345 if (CUR == '>')
4346 NEXT;
4347 return;
4348 }
4349
4350 /*
4351 * Lookup the info for that element.
4352 */
4353 info = htmlTagLookup(name);
4354 if (info == NULL) {
4355 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4356 "Tag %s invalid\n", name, NULL);
4357 }
4358
4359 /*
4360 * Check for an Empty Element labeled the XML/SGML way
4361 */
4362 if ((CUR == '/') && (NXT(1) == '>')) {
4363 SKIP(2);
4364 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4365 ctxt->sax->endElement(ctxt->userData, name);
4366 htmlnamePop(ctxt);
4367 return;
4368 }
4369
4370 if (CUR == '>') {
4371 NEXT;
4372 } else {
4373 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4374 "Couldn't find end of Start Tag %s\n", name, NULL);
4375
4376 /*
4377 * end of parsing of this node.
4378 */
4379 if (xmlStrEqual(name, ctxt->name)) {
4380 nodePop(ctxt);
4381 htmlnamePop(ctxt);
4382 }
4383
4384 if (ctxt->record_info)
4385 htmlNodeInfoPush(ctxt, &node_info);
4386 htmlParserFinishElementParsing(ctxt);
4387 return;
4388 }
4389
4390 /*
4391 * Check for an Empty Element from DTD definition
4392 */
4393 if ((info != NULL) && (info->empty)) {
4394 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4395 ctxt->sax->endElement(ctxt->userData, name);
4396 htmlnamePop(ctxt);
4397 return;
4398 }
4399
4400 if (ctxt->record_info)
4401 htmlNodeInfoPush(ctxt, &node_info);
4402}
4403
4404/**
4405 * htmlParseContentInternal:
4406 * @ctxt: an HTML parser context
4407 *
4408 * Parse a content: comment, sub-element, reference or text.
4409 * New version for non recursive htmlParseElementInternal
4410 */
4411
4412static void
4413htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4414 xmlChar *currentNode;
4415 int depth;
4416 const xmlChar *name;
4417
4418 currentNode = xmlStrdup(ctxt->name);
4419 depth = ctxt->nameNr;
4420 while (1) {
4421 long cons = ctxt->nbChars;
4422
4423 GROW;
4424
4425 if (ctxt->instate == XML_PARSER_EOF)
4426 break;
4427
4428 /*
4429 * Our tag or one of it's parent or children is ending.
4430 */
4431 if ((CUR == '<') && (NXT(1) == '/')) {
4432 if (htmlParseEndTag(ctxt) &&
4433 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4434 if (currentNode != NULL)
4435 xmlFree(currentNode);
4436
4437 currentNode = xmlStrdup(ctxt->name);
4438 depth = ctxt->nameNr;
4439 }
4440 continue; /* while */
4441 }
4442
4443 else if ((CUR == '<') &&
4444 ((IS_ASCII_LETTER(NXT(1))) ||
4445 (NXT(1) == '_') || (NXT(1) == ':'))) {
4446 name = htmlParseHTMLName_nonInvasive(ctxt);
4447 if (name == NULL) {
4448 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4449 "htmlParseStartTag: invalid element name\n",
4450 NULL, NULL);
4451 /* Dump the bogus tag like browsers do */
4452 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4453 NEXT;
4454
4455 htmlParserFinishElementParsing(ctxt);
4456 if (currentNode != NULL)
4457 xmlFree(currentNode);
4458
4459 currentNode = xmlStrdup(ctxt->name);
4460 depth = ctxt->nameNr;
4461 continue;
4462 }
4463
4464 if (ctxt->name != NULL) {
4465 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4466 htmlAutoClose(ctxt, name);
4467 continue;
4468 }
4469 }
4470 }
4471
4472 /*
4473 * Has this node been popped out during parsing of
4474 * the next element
4475 */
4476 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4477 (!xmlStrEqual(currentNode, ctxt->name)))
4478 {
4479 htmlParserFinishElementParsing(ctxt);
4480 if (currentNode != NULL) xmlFree(currentNode);
4481
4482 currentNode = xmlStrdup(ctxt->name);
4483 depth = ctxt->nameNr;
4484 continue;
4485 }
4486
4487 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4488 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4489 /*
4490 * Handle SCRIPT/STYLE separately
4491 */
4492 htmlParseScript(ctxt);
4493 } else {
4494 /*
4495 * Sometimes DOCTYPE arrives in the middle of the document
4496 */
4497 if ((CUR == '<') && (NXT(1) == '!') &&
4498 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4499 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4500 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4501 (UPP(8) == 'E')) {
4502 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4503 "Misplaced DOCTYPE declaration\n",
4504 BAD_CAST "DOCTYPE" , NULL);
4505 htmlParseDocTypeDecl(ctxt);
4506 }
4507
4508 /*
4509 * First case : a comment
4510 */
4511 if ((CUR == '<') && (NXT(1) == '!') &&
4512 (NXT(2) == '-') && (NXT(3) == '-')) {
4513 htmlParseComment(ctxt);
4514 }
4515
4516 /*
4517 * Second case : a Processing Instruction.
4518 */
4519 else if ((CUR == '<') && (NXT(1) == '?')) {
4520 htmlParsePI(ctxt);
4521 }
4522
4523 /*
4524 * Third case : a sub-element.
4525 */
4526 else if (CUR == '<') {
4527 htmlParseElementInternal(ctxt);
4528 if (currentNode != NULL) xmlFree(currentNode);
4529
4530 currentNode = xmlStrdup(ctxt->name);
4531 depth = ctxt->nameNr;
4532 }
4533
4534 /*
4535 * Fourth case : a reference. If if has not been resolved,
4536 * parsing returns it's Name, create the node
4537 */
4538 else if (CUR == '&') {
4539 htmlParseReference(ctxt);
4540 }
4541
4542 /*
4543 * Fifth case : end of the resource
4544 */
4545 else if (CUR == 0) {
4546 htmlAutoCloseOnEnd(ctxt);
4547 break;
4548 }
4549
4550 /*
4551 * Last case, text. Note that References are handled directly.
4552 */
4553 else {
4554 htmlParseCharData(ctxt);
4555 }
4556
4557 if (cons == ctxt->nbChars) {
4558 if (ctxt->node != NULL) {
4559 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4560 "detected an error in element content\n",
4561 NULL, NULL);
4562 }
4563 break;
4564 }
4565 }
4566 GROW;
4567 }
4568 if (currentNode != NULL) xmlFree(currentNode);
4569}
4570
4571/**
4572 * htmlParseContent:
4573 * @ctxt: an HTML parser context
4574 *
4575 * Parse a content: comment, sub-element, reference or text.
4576 * This is the entry point when called from parser.c
4577 */
4578
4579void
4580__htmlParseContent(void *ctxt) {
4581 if (ctxt != NULL)
4582 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4583}
4584
Owen Taylor3473f882001-02-23 17:55:21 +00004585/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004586 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004587 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004588 *
Owen Taylor3473f882001-02-23 17:55:21 +00004589 * parse an HTML document (and build a tree if using the standard SAX
4590 * interface).
4591 *
4592 * Returns 0, -1 in case of error. the parser context is augmented
4593 * as a result of the parsing.
4594 */
4595
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004596int
Owen Taylor3473f882001-02-23 17:55:21 +00004597htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004598 xmlChar start[4];
4599 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004600 xmlDtdPtr dtd;
4601
Daniel Veillardd0463562001-10-13 09:15:48 +00004602 xmlInitParser();
4603
Owen Taylor3473f882001-02-23 17:55:21 +00004604 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004605
Daniel Veillarda03e3652004-11-02 18:45:30 +00004606 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4607 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4608 "htmlParseDocument: context error\n", NULL, NULL);
4609 return(XML_ERR_INTERNAL_ERROR);
4610 }
4611 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004612 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004613 GROW;
4614 /*
4615 * SAX: beginning of the document processing.
4616 */
4617 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4618 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4619
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004620 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4621 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4622 /*
4623 * Get the 4 first bytes and decode the charset
4624 * if enc != XML_CHAR_ENCODING_NONE
4625 * plug some encoding conversion routines.
4626 */
4627 start[0] = RAW;
4628 start[1] = NXT(1);
4629 start[2] = NXT(2);
4630 start[3] = NXT(3);
4631 enc = xmlDetectCharEncoding(&start[0], 4);
4632 if (enc != XML_CHAR_ENCODING_NONE) {
4633 xmlSwitchEncoding(ctxt, enc);
4634 }
4635 }
4636
Owen Taylor3473f882001-02-23 17:55:21 +00004637 /*
4638 * Wipe out everything which is before the first '<'
4639 */
4640 SKIP_BLANKS;
4641 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004642 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004643 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004644 }
4645
4646 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4647 ctxt->sax->startDocument(ctxt->userData);
4648
4649
4650 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004651 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004652 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004653 while (((CUR == '<') && (NXT(1) == '!') &&
4654 (NXT(2) == '-') && (NXT(3) == '-')) ||
4655 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004656 htmlParseComment(ctxt);
4657 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004658 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004659 }
Owen Taylor3473f882001-02-23 17:55:21 +00004660
4661
4662 /*
4663 * Then possibly doc type declaration(s) and more Misc
4664 * (doctypedecl Misc*)?
4665 */
4666 if ((CUR == '<') && (NXT(1) == '!') &&
4667 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4668 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4669 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4670 (UPP(8) == 'E')) {
4671 htmlParseDocTypeDecl(ctxt);
4672 }
4673 SKIP_BLANKS;
4674
4675 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004676 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004677 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004678 while (((CUR == '<') && (NXT(1) == '!') &&
4679 (NXT(2) == '-') && (NXT(3) == '-')) ||
4680 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004681 htmlParseComment(ctxt);
4682 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004683 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004684 }
Owen Taylor3473f882001-02-23 17:55:21 +00004685
4686 /*
4687 * Time to start parsing the tree itself
4688 */
Eugene Pimenov615904f2010-03-15 15:16:02 +01004689 htmlParseContentInternal(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004690
4691 /*
4692 * autoclose
4693 */
4694 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004695 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004696
4697
4698 /*
4699 * SAX: end of the document processing.
4700 */
4701 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4702 ctxt->sax->endDocument(ctxt->userData);
4703
Daniel Veillardf1121c42010-07-26 14:02:42 +02004704 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004705 dtd = xmlGetIntSubset(ctxt->myDoc);
4706 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004707 ctxt->myDoc->intSubset =
4708 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004709 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4710 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4711 }
4712 if (! ctxt->wellFormed) return(-1);
4713 return(0);
4714}
4715
4716
4717/************************************************************************
4718 * *
4719 * Parser contexts handling *
4720 * *
4721 ************************************************************************/
4722
4723/**
William M. Brackedb65a72004-02-06 07:36:04 +00004724 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004725 * @ctxt: an HTML parser context
4726 *
4727 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004728 *
4729 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004730 */
4731
Daniel Veillardf403d292003-10-05 13:51:35 +00004732static int
Owen Taylor3473f882001-02-23 17:55:21 +00004733htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4734{
4735 htmlSAXHandler *sax;
4736
Daniel Veillardf403d292003-10-05 13:51:35 +00004737 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004738 memset(ctxt, 0, sizeof(htmlParserCtxt));
4739
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004740 ctxt->dict = xmlDictCreate();
4741 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004742 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4743 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004744 }
Owen Taylor3473f882001-02-23 17:55:21 +00004745 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4746 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004747 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4748 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004749 }
4750 else
4751 memset(sax, 0, sizeof(htmlSAXHandler));
4752
4753 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004754 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004755 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4756 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004757 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004758 ctxt->inputNr = 0;
4759 ctxt->inputMax = 0;
4760 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004761 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004762 }
4763 ctxt->inputNr = 0;
4764 ctxt->inputMax = 5;
4765 ctxt->input = NULL;
4766 ctxt->version = NULL;
4767 ctxt->encoding = NULL;
4768 ctxt->standalone = -1;
4769 ctxt->instate = XML_PARSER_START;
4770
4771 /* Allocate the Node stack */
4772 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4773 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004774 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004775 ctxt->nodeNr = 0;
4776 ctxt->nodeMax = 0;
4777 ctxt->node = NULL;
4778 ctxt->inputNr = 0;
4779 ctxt->inputMax = 0;
4780 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004781 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004782 }
4783 ctxt->nodeNr = 0;
4784 ctxt->nodeMax = 10;
4785 ctxt->node = NULL;
4786
4787 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004788 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004789 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004790 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004791 ctxt->nameNr = 0;
Eugene Pimenovef9c6362010-03-15 11:37:48 +01004792 ctxt->nameMax = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004793 ctxt->name = NULL;
4794 ctxt->nodeNr = 0;
4795 ctxt->nodeMax = 0;
4796 ctxt->node = NULL;
4797 ctxt->inputNr = 0;
4798 ctxt->inputMax = 0;
4799 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004800 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004801 }
4802 ctxt->nameNr = 0;
4803 ctxt->nameMax = 10;
4804 ctxt->name = NULL;
4805
Eugene Pimenov615904f2010-03-15 15:16:02 +01004806 ctxt->nodeInfoTab = NULL;
4807 ctxt->nodeInfoNr = 0;
4808 ctxt->nodeInfoMax = 0;
4809
Daniel Veillard092643b2003-09-25 14:29:29 +00004810 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004811 else {
4812 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004813 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004814 }
4815 ctxt->userData = ctxt;
4816 ctxt->myDoc = NULL;
4817 ctxt->wellFormed = 1;
4818 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004819 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004820 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004821 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004822 ctxt->vctxt.userData = ctxt;
4823 ctxt->vctxt.error = xmlParserValidityError;
4824 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004825 ctxt->record_info = 0;
4826 ctxt->validate = 0;
4827 ctxt->nbChars = 0;
4828 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004829 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004830 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004831 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004832}
4833
4834/**
4835 * htmlFreeParserCtxt:
4836 * @ctxt: an HTML parser context
4837 *
4838 * Free all the memory used by a parser context. However the parsed
4839 * document in ctxt->myDoc is not freed.
4840 */
4841
4842void
4843htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4844{
4845 xmlFreeParserCtxt(ctxt);
4846}
4847
4848/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004849 * htmlNewParserCtxt:
4850 *
4851 * Allocate and initialize a new parser context.
4852 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004853 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004854 */
4855
Daniel Veillard34c647c2006-09-21 06:53:59 +00004856htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004857htmlNewParserCtxt(void)
4858{
4859 xmlParserCtxtPtr ctxt;
4860
4861 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4862 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004863 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004864 return(NULL);
4865 }
4866 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004867 if (htmlInitParserCtxt(ctxt) < 0) {
4868 htmlFreeParserCtxt(ctxt);
4869 return(NULL);
4870 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004871 return(ctxt);
4872}
4873
4874/**
4875 * htmlCreateMemoryParserCtxt:
4876 * @buffer: a pointer to a char array
4877 * @size: the size of the array
4878 *
4879 * Create a parser context for an HTML in-memory document.
4880 *
4881 * Returns the new parser context or NULL
4882 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004883htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004884htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4885 xmlParserCtxtPtr ctxt;
4886 xmlParserInputPtr input;
4887 xmlParserInputBufferPtr buf;
4888
4889 if (buffer == NULL)
4890 return(NULL);
4891 if (size <= 0)
4892 return(NULL);
4893
4894 ctxt = htmlNewParserCtxt();
4895 if (ctxt == NULL)
4896 return(NULL);
4897
4898 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4899 if (buf == NULL) return(NULL);
4900
4901 input = xmlNewInputStream(ctxt);
4902 if (input == NULL) {
4903 xmlFreeParserCtxt(ctxt);
4904 return(NULL);
4905 }
4906
4907 input->filename = NULL;
4908 input->buf = buf;
Daniel Veillarda78d8032012-07-16 14:56:50 +08004909 input->cur =
4910 input->base = xmlBufContent(input->buf->buffer);
4911 input->end = xmlBufEnd(input->buf->buffer);
Daniel Veillard1d995272002-07-22 16:43:32 +00004912
4913 inputPush(ctxt, input);
4914 return(ctxt);
4915}
4916
4917/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004918 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004919 * @cur: a pointer to an array of xmlChar
4920 * @encoding: a free form C string describing the HTML document encoding, or NULL
4921 *
4922 * Create a parser context for an HTML document.
4923 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004924 * TODO: check the need to add encoding handling there
4925 *
Owen Taylor3473f882001-02-23 17:55:21 +00004926 * Returns the new parser context or NULL
4927 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004928static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004929htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004930 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004931 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004932
Daniel Veillard1d995272002-07-22 16:43:32 +00004933 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004934 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004935 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004936 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004937 if (ctxt == NULL)
4938 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004939
4940 if (encoding != NULL) {
4941 xmlCharEncoding enc;
4942 xmlCharEncodingHandlerPtr handler;
4943
4944 if (ctxt->input->encoding != NULL)
4945 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004946 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004947
4948 enc = xmlParseCharEncoding(encoding);
4949 /*
4950 * registered set of known encodings
4951 */
4952 if (enc != XML_CHAR_ENCODING_ERROR) {
4953 xmlSwitchEncoding(ctxt, enc);
4954 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004955 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004956 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004957 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004958 }
4959 } else {
4960 /*
4961 * fallback for unknown encodings
4962 */
4963 handler = xmlFindCharEncodingHandler((const char *) encoding);
4964 if (handler != NULL) {
4965 xmlSwitchToEncoding(ctxt, handler);
4966 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004967 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4968 "Unsupported encoding %s\n",
4969 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004970 }
4971 }
4972 }
4973 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004974}
4975
Daniel Veillard73b013f2003-09-30 12:36:01 +00004976#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004977/************************************************************************
4978 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004979 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004980 * *
4981 ************************************************************************/
4982
4983/**
4984 * htmlParseLookupSequence:
4985 * @ctxt: an HTML parser context
4986 * @first: the first char to lookup
4987 * @next: the next char to lookup or zero
4988 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004989 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004990 *
4991 * Try to find if a sequence (first, next, third) or just (first next) or
4992 * (first) is available in the input stream.
4993 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4994 * to avoid rescanning sequences of bytes, it DOES change the state of the
4995 * parser, do not use liberally.
4996 * This is basically similar to xmlParseLookupSequence()
4997 *
4998 * Returns the index to the current parsing point if the full sequence
4999 * is available, -1 otherwise.
5000 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005001static int
Owen Taylor3473f882001-02-23 17:55:21 +00005002htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02005003 xmlChar next, xmlChar third, int iscomment,
Daniel Veillardeeb99322009-08-25 14:42:16 +02005004 int ignoreattrval)
5005{
Owen Taylor3473f882001-02-23 17:55:21 +00005006 int base, len;
5007 htmlParserInputPtr in;
5008 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00005009 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02005010 int invalue = 0;
5011 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00005012
5013 in = ctxt->input;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005014 if (in == NULL)
5015 return (-1);
5016
Owen Taylor3473f882001-02-23 17:55:21 +00005017 base = in->cur - in->base;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005018 if (base < 0)
5019 return (-1);
5020
Owen Taylor3473f882001-02-23 17:55:21 +00005021 if (ctxt->checkIndex > base)
5022 base = ctxt->checkIndex;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005023
Owen Taylor3473f882001-02-23 17:55:21 +00005024 if (in->buf == NULL) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005025 buf = in->base;
5026 len = in->length;
Owen Taylor3473f882001-02-23 17:55:21 +00005027 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005028 buf = xmlBufContent(in->buf->buffer);
5029 len = xmlBufUse(in->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00005030 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005031
Owen Taylor3473f882001-02-23 17:55:21 +00005032 /* take into account the sequence length */
Daniel Veillardeeb99322009-08-25 14:42:16 +02005033 if (third)
5034 len -= 2;
5035 else if (next)
5036 len--;
5037 for (; base < len; base++) {
5038 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5039 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5040 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5041 incomment = 1;
5042 /* do not increment past <! - some people use <!--> */
5043 base += 2;
5044 }
5045 }
5046 if (ignoreattrval) {
5047 if (buf[base] == '"' || buf[base] == '\'') {
5048 if (invalue) {
5049 if (buf[base] == valdellim) {
5050 invalue = 0;
5051 continue;
5052 }
5053 } else {
5054 valdellim = buf[base];
5055 invalue = 1;
5056 continue;
5057 }
5058 } else if (invalue) {
5059 continue;
5060 }
5061 }
5062 if (incomment) {
5063 if (base + 3 > len)
5064 return (-1);
5065 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5066 (buf[base + 2] == '>')) {
5067 incomment = 0;
5068 base += 2;
5069 }
5070 continue;
5071 }
Owen Taylor3473f882001-02-23 17:55:21 +00005072 if (buf[base] == first) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005073 if (third != 0) {
5074 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5075 continue;
5076 } else if (next != 0) {
5077 if (buf[base + 1] != next)
5078 continue;
5079 }
5080 ctxt->checkIndex = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00005081#ifdef DEBUG_PUSH
Daniel Veillardeeb99322009-08-25 14:42:16 +02005082 if (next == 0)
5083 xmlGenericError(xmlGenericErrorContext,
5084 "HPP: lookup '%c' found at %d\n",
5085 first, base);
5086 else if (third == 0)
5087 xmlGenericError(xmlGenericErrorContext,
5088 "HPP: lookup '%c%c' found at %d\n",
5089 first, next, base);
5090 else
5091 xmlGenericError(xmlGenericErrorContext,
5092 "HPP: lookup '%c%c%c' found at %d\n",
5093 first, next, third, base);
Owen Taylor3473f882001-02-23 17:55:21 +00005094#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005095 return (base - (in->cur - in->base));
5096 }
Owen Taylor3473f882001-02-23 17:55:21 +00005097 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005098 if ((!incomment) && (!invalue))
5099 ctxt->checkIndex = base;
Owen Taylor3473f882001-02-23 17:55:21 +00005100#ifdef DEBUG_PUSH
5101 if (next == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005102 xmlGenericError(xmlGenericErrorContext,
5103 "HPP: lookup '%c' failed\n", first);
Owen Taylor3473f882001-02-23 17:55:21 +00005104 else if (third == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005105 xmlGenericError(xmlGenericErrorContext,
5106 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02005107 else
Daniel Veillardeeb99322009-08-25 14:42:16 +02005108 xmlGenericError(xmlGenericErrorContext,
5109 "HPP: lookup '%c%c%c' failed\n", first, next,
5110 third);
Owen Taylor3473f882001-02-23 17:55:21 +00005111#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005112 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00005113}
5114
5115/**
Markus Kull56a03032009-08-24 19:00:23 +02005116 * htmlParseLookupChars:
5117 * @ctxt: an HTML parser context
5118 * @stop: Array of chars, which stop the lookup.
5119 * @stopLen: Length of stop-Array
5120 *
5121 * Try to find if any char of the stop-Array is available in the input
5122 * stream.
5123 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5124 * to avoid rescanning sequences of bytes, it DOES change the state of the
5125 * parser, do not use liberally.
5126 *
5127 * Returns the index to the current parsing point if a stopChar
5128 * is available, -1 otherwise.
5129 */
5130static int
5131htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5132 int stopLen)
5133{
5134 int base, len;
5135 htmlParserInputPtr in;
5136 const xmlChar *buf;
5137 int incomment = 0;
5138 int i;
5139
5140 in = ctxt->input;
5141 if (in == NULL)
5142 return (-1);
5143
5144 base = in->cur - in->base;
5145 if (base < 0)
5146 return (-1);
5147
5148 if (ctxt->checkIndex > base)
5149 base = ctxt->checkIndex;
5150
5151 if (in->buf == NULL) {
5152 buf = in->base;
5153 len = in->length;
5154 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005155 buf = xmlBufContent(in->buf->buffer);
5156 len = xmlBufUse(in->buf->buffer);
Markus Kull56a03032009-08-24 19:00:23 +02005157 }
5158
5159 for (; base < len; base++) {
5160 if (!incomment && (base + 4 < len)) {
5161 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5162 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5163 incomment = 1;
5164 /* do not increment past <! - some people use <!--> */
5165 base += 2;
5166 }
5167 }
5168 if (incomment) {
5169 if (base + 3 > len)
5170 return (-1);
5171 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5172 (buf[base + 2] == '>')) {
5173 incomment = 0;
5174 base += 2;
5175 }
5176 continue;
5177 }
5178 for (i = 0; i < stopLen; ++i) {
5179 if (buf[base] == stop[i]) {
5180 ctxt->checkIndex = 0;
5181 return (base - (in->cur - in->base));
5182 }
5183 }
5184 }
5185 ctxt->checkIndex = base;
5186 return (-1);
5187}
5188
5189/**
Owen Taylor3473f882001-02-23 17:55:21 +00005190 * htmlParseTryOrFinish:
5191 * @ctxt: an HTML parser context
5192 * @terminate: last chunk indicator
5193 *
5194 * Try to progress on parsing
5195 *
5196 * Returns zero if no parsing was possible
5197 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005198static int
Owen Taylor3473f882001-02-23 17:55:21 +00005199htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5200 int ret = 0;
5201 htmlParserInputPtr in;
5202 int avail = 0;
5203 xmlChar cur, next;
5204
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005205 htmlParserNodeInfo node_info;
5206
Owen Taylor3473f882001-02-23 17:55:21 +00005207#ifdef DEBUG_PUSH
5208 switch (ctxt->instate) {
5209 case XML_PARSER_EOF:
5210 xmlGenericError(xmlGenericErrorContext,
5211 "HPP: try EOF\n"); break;
5212 case XML_PARSER_START:
5213 xmlGenericError(xmlGenericErrorContext,
5214 "HPP: try START\n"); break;
5215 case XML_PARSER_MISC:
5216 xmlGenericError(xmlGenericErrorContext,
5217 "HPP: try MISC\n");break;
5218 case XML_PARSER_COMMENT:
5219 xmlGenericError(xmlGenericErrorContext,
5220 "HPP: try COMMENT\n");break;
5221 case XML_PARSER_PROLOG:
5222 xmlGenericError(xmlGenericErrorContext,
5223 "HPP: try PROLOG\n");break;
5224 case XML_PARSER_START_TAG:
5225 xmlGenericError(xmlGenericErrorContext,
5226 "HPP: try START_TAG\n");break;
5227 case XML_PARSER_CONTENT:
5228 xmlGenericError(xmlGenericErrorContext,
5229 "HPP: try CONTENT\n");break;
5230 case XML_PARSER_CDATA_SECTION:
5231 xmlGenericError(xmlGenericErrorContext,
5232 "HPP: try CDATA_SECTION\n");break;
5233 case XML_PARSER_END_TAG:
5234 xmlGenericError(xmlGenericErrorContext,
5235 "HPP: try END_TAG\n");break;
5236 case XML_PARSER_ENTITY_DECL:
5237 xmlGenericError(xmlGenericErrorContext,
5238 "HPP: try ENTITY_DECL\n");break;
5239 case XML_PARSER_ENTITY_VALUE:
5240 xmlGenericError(xmlGenericErrorContext,
5241 "HPP: try ENTITY_VALUE\n");break;
5242 case XML_PARSER_ATTRIBUTE_VALUE:
5243 xmlGenericError(xmlGenericErrorContext,
5244 "HPP: try ATTRIBUTE_VALUE\n");break;
5245 case XML_PARSER_DTD:
5246 xmlGenericError(xmlGenericErrorContext,
5247 "HPP: try DTD\n");break;
5248 case XML_PARSER_EPILOG:
5249 xmlGenericError(xmlGenericErrorContext,
5250 "HPP: try EPILOG\n");break;
5251 case XML_PARSER_PI:
5252 xmlGenericError(xmlGenericErrorContext,
5253 "HPP: try PI\n");break;
5254 case XML_PARSER_SYSTEM_LITERAL:
5255 xmlGenericError(xmlGenericErrorContext,
5256 "HPP: try SYSTEM_LITERAL\n");break;
5257 }
5258#endif
5259
5260 while (1) {
5261
5262 in = ctxt->input;
5263 if (in == NULL) break;
5264 if (in->buf == NULL)
5265 avail = in->length - (in->cur - in->base);
5266 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005267 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005268 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005269 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005270 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005271 /*
5272 * SAX: end of the document processing.
5273 */
5274 ctxt->instate = XML_PARSER_EOF;
5275 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5276 ctxt->sax->endDocument(ctxt->userData);
5277 }
5278 }
5279 if (avail < 1)
5280 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00005281 cur = in->cur[0];
5282 if (cur == 0) {
5283 SKIP(1);
5284 continue;
5285 }
5286
Owen Taylor3473f882001-02-23 17:55:21 +00005287 switch (ctxt->instate) {
5288 case XML_PARSER_EOF:
5289 /*
5290 * Document parsing is done !
5291 */
5292 goto done;
5293 case XML_PARSER_START:
5294 /*
5295 * Very first chars read from the document flow.
5296 */
5297 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005298 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005299 SKIP_BLANKS;
5300 if (in->buf == NULL)
5301 avail = in->length - (in->cur - in->base);
5302 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005303 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005304 }
5305 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5306 ctxt->sax->setDocumentLocator(ctxt->userData,
5307 &xmlDefaultSAXLocator);
5308 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5309 (!ctxt->disableSAX))
5310 ctxt->sax->startDocument(ctxt->userData);
5311
5312 cur = in->cur[0];
5313 next = in->cur[1];
5314 if ((cur == '<') && (next == '!') &&
5315 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5316 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5317 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5318 (UPP(8) == 'E')) {
5319 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005320 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005321 goto done;
5322#ifdef DEBUG_PUSH
5323 xmlGenericError(xmlGenericErrorContext,
5324 "HPP: Parsing internal subset\n");
5325#endif
5326 htmlParseDocTypeDecl(ctxt);
5327 ctxt->instate = XML_PARSER_PROLOG;
5328#ifdef DEBUG_PUSH
5329 xmlGenericError(xmlGenericErrorContext,
5330 "HPP: entering PROLOG\n");
5331#endif
5332 } else {
5333 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005334#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00005335 xmlGenericError(xmlGenericErrorContext,
5336 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005337#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00005338 }
Owen Taylor3473f882001-02-23 17:55:21 +00005339 break;
5340 case XML_PARSER_MISC:
5341 SKIP_BLANKS;
5342 if (in->buf == NULL)
5343 avail = in->length - (in->cur - in->base);
5344 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005345 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Denis Paukfdf990c2012-05-10 20:40:49 +08005346 /*
5347 * no chars in buffer
5348 */
5349 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005350 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005351 /*
5352 * not enouth chars in buffer
5353 */
5354 if (avail < 2) {
5355 if (!terminate)
5356 goto done;
5357 else
5358 next = ' ';
5359 } else {
5360 next = in->cur[1];
5361 }
Owen Taylor3473f882001-02-23 17:55:21 +00005362 cur = in->cur[0];
Owen Taylor3473f882001-02-23 17:55:21 +00005363 if ((cur == '<') && (next == '!') &&
5364 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5365 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005366 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005367 goto done;
5368#ifdef DEBUG_PUSH
5369 xmlGenericError(xmlGenericErrorContext,
5370 "HPP: Parsing Comment\n");
5371#endif
5372 htmlParseComment(ctxt);
5373 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005374 } else if ((cur == '<') && (next == '?')) {
5375 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005376 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005377 goto done;
5378#ifdef DEBUG_PUSH
5379 xmlGenericError(xmlGenericErrorContext,
5380 "HPP: Parsing PI\n");
5381#endif
5382 htmlParsePI(ctxt);
5383 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005384 } else if ((cur == '<') && (next == '!') &&
5385 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5386 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5387 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5388 (UPP(8) == 'E')) {
5389 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005390 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005391 goto done;
5392#ifdef DEBUG_PUSH
5393 xmlGenericError(xmlGenericErrorContext,
5394 "HPP: Parsing internal subset\n");
5395#endif
5396 htmlParseDocTypeDecl(ctxt);
5397 ctxt->instate = XML_PARSER_PROLOG;
5398#ifdef DEBUG_PUSH
5399 xmlGenericError(xmlGenericErrorContext,
5400 "HPP: entering PROLOG\n");
5401#endif
5402 } else if ((cur == '<') && (next == '!') &&
5403 (avail < 9)) {
5404 goto done;
5405 } else {
5406 ctxt->instate = XML_PARSER_START_TAG;
5407#ifdef DEBUG_PUSH
5408 xmlGenericError(xmlGenericErrorContext,
5409 "HPP: entering START_TAG\n");
5410#endif
5411 }
5412 break;
5413 case XML_PARSER_PROLOG:
5414 SKIP_BLANKS;
5415 if (in->buf == NULL)
5416 avail = in->length - (in->cur - in->base);
5417 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005418 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005419 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005420 goto done;
5421 cur = in->cur[0];
5422 next = in->cur[1];
5423 if ((cur == '<') && (next == '!') &&
5424 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5425 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005426 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005427 goto done;
5428#ifdef DEBUG_PUSH
5429 xmlGenericError(xmlGenericErrorContext,
5430 "HPP: Parsing Comment\n");
5431#endif
5432 htmlParseComment(ctxt);
5433 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005434 } else if ((cur == '<') && (next == '?')) {
5435 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005436 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005437 goto done;
5438#ifdef DEBUG_PUSH
5439 xmlGenericError(xmlGenericErrorContext,
5440 "HPP: Parsing PI\n");
5441#endif
5442 htmlParsePI(ctxt);
5443 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005444 } else if ((cur == '<') && (next == '!') &&
5445 (avail < 4)) {
5446 goto done;
5447 } else {
5448 ctxt->instate = XML_PARSER_START_TAG;
5449#ifdef DEBUG_PUSH
5450 xmlGenericError(xmlGenericErrorContext,
5451 "HPP: entering START_TAG\n");
5452#endif
5453 }
5454 break;
5455 case XML_PARSER_EPILOG:
5456 if (in->buf == NULL)
5457 avail = in->length - (in->cur - in->base);
5458 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005459 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005460 if (avail < 1)
5461 goto done;
5462 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005463 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005464 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005465 goto done;
5466 }
5467 if (avail < 2)
5468 goto done;
5469 next = in->cur[1];
5470 if ((cur == '<') && (next == '!') &&
5471 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5472 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005473 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005474 goto done;
5475#ifdef DEBUG_PUSH
5476 xmlGenericError(xmlGenericErrorContext,
5477 "HPP: Parsing Comment\n");
5478#endif
5479 htmlParseComment(ctxt);
5480 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005481 } else if ((cur == '<') && (next == '?')) {
5482 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005483 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005484 goto done;
5485#ifdef DEBUG_PUSH
5486 xmlGenericError(xmlGenericErrorContext,
5487 "HPP: Parsing PI\n");
5488#endif
5489 htmlParsePI(ctxt);
5490 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005491 } else if ((cur == '<') && (next == '!') &&
5492 (avail < 4)) {
5493 goto done;
5494 } else {
5495 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005496 ctxt->wellFormed = 0;
5497 ctxt->instate = XML_PARSER_EOF;
5498#ifdef DEBUG_PUSH
5499 xmlGenericError(xmlGenericErrorContext,
5500 "HPP: entering EOF\n");
5501#endif
5502 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5503 ctxt->sax->endDocument(ctxt->userData);
5504 goto done;
5505 }
5506 break;
5507 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005508 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005509 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005510 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005511
Denis Paukfdf990c2012-05-10 20:40:49 +08005512 /*
5513 * no chars in buffer
5514 */
5515 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005516 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005517 /*
5518 * not enough chars in buffer
5519 */
5520 if (avail < 2) {
5521 if (!terminate)
5522 goto done;
5523 else
5524 next = ' ';
5525 } else {
5526 next = in->cur[1];
5527 }
Owen Taylor3473f882001-02-23 17:55:21 +00005528 cur = in->cur[0];
5529 if (cur != '<') {
5530 ctxt->instate = XML_PARSER_CONTENT;
5531#ifdef DEBUG_PUSH
5532 xmlGenericError(xmlGenericErrorContext,
5533 "HPP: entering CONTENT\n");
5534#endif
5535 break;
5536 }
Denis Paukfdf990c2012-05-10 20:40:49 +08005537 if (next == '/') {
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005538 ctxt->instate = XML_PARSER_END_TAG;
5539 ctxt->checkIndex = 0;
5540#ifdef DEBUG_PUSH
5541 xmlGenericError(xmlGenericErrorContext,
5542 "HPP: entering END_TAG\n");
5543#endif
5544 break;
5545 }
Owen Taylor3473f882001-02-23 17:55:21 +00005546 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005547 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005548 goto done;
5549
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005550 /* Capture start position */
5551 if (ctxt->record_info) {
5552 node_info.begin_pos = ctxt->input->consumed +
5553 (CUR_PTR - ctxt->input->base);
5554 node_info.begin_line = ctxt->input->line;
5555 }
5556
5557
Daniel Veillard597f1c12005-07-03 23:00:18 +00005558 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005559 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005560 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005561 (name == NULL)) {
5562 if (CUR == '>')
5563 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005564 break;
5565 }
Owen Taylor3473f882001-02-23 17:55:21 +00005566
5567 /*
5568 * Lookup the info for that element.
5569 */
5570 info = htmlTagLookup(name);
5571 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005572 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5573 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005574 }
5575
5576 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005577 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005578 */
5579 if ((CUR == '/') && (NXT(1) == '>')) {
5580 SKIP(2);
5581 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5582 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005583 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005584 ctxt->instate = XML_PARSER_CONTENT;
5585#ifdef DEBUG_PUSH
5586 xmlGenericError(xmlGenericErrorContext,
5587 "HPP: entering CONTENT\n");
5588#endif
5589 break;
5590 }
5591
5592 if (CUR == '>') {
5593 NEXT;
5594 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005595 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5596 "Couldn't find end of Start Tag %s\n",
5597 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005598
5599 /*
5600 * end of parsing of this node.
5601 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005602 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005603 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005604 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005605 }
Owen Taylor3473f882001-02-23 17:55:21 +00005606
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005607 if (ctxt->record_info)
5608 htmlNodeInfoPush(ctxt, &node_info);
5609
Owen Taylor3473f882001-02-23 17:55:21 +00005610 ctxt->instate = XML_PARSER_CONTENT;
5611#ifdef DEBUG_PUSH
5612 xmlGenericError(xmlGenericErrorContext,
5613 "HPP: entering CONTENT\n");
5614#endif
5615 break;
5616 }
5617
5618 /*
5619 * Check for an Empty Element from DTD definition
5620 */
5621 if ((info != NULL) && (info->empty)) {
5622 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5623 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005624 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005625 }
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005626
5627 if (ctxt->record_info)
5628 htmlNodeInfoPush(ctxt, &node_info);
5629
Owen Taylor3473f882001-02-23 17:55:21 +00005630 ctxt->instate = XML_PARSER_CONTENT;
5631#ifdef DEBUG_PUSH
5632 xmlGenericError(xmlGenericErrorContext,
5633 "HPP: entering CONTENT\n");
5634#endif
5635 break;
5636 }
5637 case XML_PARSER_CONTENT: {
5638 long cons;
5639 /*
5640 * Handle preparsed entities and charRef
5641 */
5642 if (ctxt->token != 0) {
5643 xmlChar chr[2] = { 0 , 0 } ;
5644
5645 chr[0] = (xmlChar) ctxt->token;
5646 htmlCheckParagraph(ctxt);
5647 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5648 ctxt->sax->characters(ctxt->userData, chr, 1);
5649 ctxt->token = 0;
5650 ctxt->checkIndex = 0;
5651 }
5652 if ((avail == 1) && (terminate)) {
5653 cur = in->cur[0];
5654 if ((cur != '<') && (cur != '&')) {
5655 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005656 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005657 if (ctxt->sax->ignorableWhitespace != NULL)
5658 ctxt->sax->ignorableWhitespace(
5659 ctxt->userData, &cur, 1);
5660 } else {
5661 htmlCheckParagraph(ctxt);
5662 if (ctxt->sax->characters != NULL)
5663 ctxt->sax->characters(
5664 ctxt->userData, &cur, 1);
5665 }
5666 }
5667 ctxt->token = 0;
5668 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005669 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005670 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005671 }
Owen Taylor3473f882001-02-23 17:55:21 +00005672 }
5673 if (avail < 2)
5674 goto done;
5675 cur = in->cur[0];
5676 next = in->cur[1];
5677 cons = ctxt->nbChars;
5678 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5679 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5680 /*
5681 * Handle SCRIPT/STYLE separately
5682 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005683 if (!terminate) {
5684 int idx;
5685 xmlChar val;
5686
Denis Pauk91d239c2010-11-04 12:39:18 +01005687 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
Daniel Veillard68716a72006-10-16 09:32:17 +00005688 if (idx < 0)
5689 goto done;
5690 val = in->cur[idx + 2];
5691 if (val == 0) /* bad cut of input */
5692 goto done;
5693 }
Owen Taylor3473f882001-02-23 17:55:21 +00005694 htmlParseScript(ctxt);
5695 if ((cur == '<') && (next == '/')) {
5696 ctxt->instate = XML_PARSER_END_TAG;
5697 ctxt->checkIndex = 0;
5698#ifdef DEBUG_PUSH
5699 xmlGenericError(xmlGenericErrorContext,
5700 "HPP: entering END_TAG\n");
5701#endif
5702 break;
5703 }
5704 } else {
5705 /*
5706 * Sometimes DOCTYPE arrives in the middle of the document
5707 */
5708 if ((cur == '<') && (next == '!') &&
5709 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5710 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5711 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5712 (UPP(8) == 'E')) {
5713 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005714 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005715 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005716 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5717 "Misplaced DOCTYPE declaration\n",
5718 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005719 htmlParseDocTypeDecl(ctxt);
5720 } else if ((cur == '<') && (next == '!') &&
5721 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5722 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005723 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005724 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005725 goto done;
5726#ifdef DEBUG_PUSH
5727 xmlGenericError(xmlGenericErrorContext,
5728 "HPP: Parsing Comment\n");
5729#endif
5730 htmlParseComment(ctxt);
5731 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005732 } else if ((cur == '<') && (next == '?')) {
5733 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005734 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005735 goto done;
5736#ifdef DEBUG_PUSH
5737 xmlGenericError(xmlGenericErrorContext,
5738 "HPP: Parsing PI\n");
5739#endif
5740 htmlParsePI(ctxt);
5741 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005742 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5743 goto done;
5744 } else if ((cur == '<') && (next == '/')) {
5745 ctxt->instate = XML_PARSER_END_TAG;
5746 ctxt->checkIndex = 0;
5747#ifdef DEBUG_PUSH
5748 xmlGenericError(xmlGenericErrorContext,
5749 "HPP: entering END_TAG\n");
5750#endif
5751 break;
5752 } else if (cur == '<') {
5753 ctxt->instate = XML_PARSER_START_TAG;
5754 ctxt->checkIndex = 0;
5755#ifdef DEBUG_PUSH
5756 xmlGenericError(xmlGenericErrorContext,
5757 "HPP: entering START_TAG\n");
5758#endif
5759 break;
5760 } else if (cur == '&') {
5761 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005762 (htmlParseLookupChars(ctxt,
5763 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005764 goto done;
5765#ifdef DEBUG_PUSH
5766 xmlGenericError(xmlGenericErrorContext,
5767 "HPP: Parsing Reference\n");
5768#endif
5769 /* TODO: check generation of subtrees if noent !!! */
5770 htmlParseReference(ctxt);
5771 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005772 /*
5773 * check that the text sequence is complete
5774 * before handing out the data to the parser
5775 * to avoid problems with erroneous end of
5776 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005777 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005778 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005779 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005780 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005781 ctxt->checkIndex = 0;
5782#ifdef DEBUG_PUSH
5783 xmlGenericError(xmlGenericErrorContext,
5784 "HPP: Parsing char data\n");
5785#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005786 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005787 }
5788 }
5789 if (cons == ctxt->nbChars) {
5790 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005791 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5792 "detected an error in element content\n",
5793 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005794 }
5795 NEXT;
5796 break;
5797 }
5798
5799 break;
5800 }
5801 case XML_PARSER_END_TAG:
5802 if (avail < 2)
5803 goto done;
5804 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005805 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005806 goto done;
5807 htmlParseEndTag(ctxt);
5808 if (ctxt->nameNr == 0) {
5809 ctxt->instate = XML_PARSER_EPILOG;
5810 } else {
5811 ctxt->instate = XML_PARSER_CONTENT;
5812 }
5813 ctxt->checkIndex = 0;
5814#ifdef DEBUG_PUSH
5815 xmlGenericError(xmlGenericErrorContext,
5816 "HPP: entering CONTENT\n");
5817#endif
5818 break;
5819 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005820 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5821 "HPP: internal error, state == CDATA\n",
5822 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005823 ctxt->instate = XML_PARSER_CONTENT;
5824 ctxt->checkIndex = 0;
5825#ifdef DEBUG_PUSH
5826 xmlGenericError(xmlGenericErrorContext,
5827 "HPP: entering CONTENT\n");
5828#endif
5829 break;
5830 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005831 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5832 "HPP: internal error, state == DTD\n",
5833 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005834 ctxt->instate = XML_PARSER_CONTENT;
5835 ctxt->checkIndex = 0;
5836#ifdef DEBUG_PUSH
5837 xmlGenericError(xmlGenericErrorContext,
5838 "HPP: entering CONTENT\n");
5839#endif
5840 break;
5841 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005842 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5843 "HPP: internal error, state == COMMENT\n",
5844 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005845 ctxt->instate = XML_PARSER_CONTENT;
5846 ctxt->checkIndex = 0;
5847#ifdef DEBUG_PUSH
5848 xmlGenericError(xmlGenericErrorContext,
5849 "HPP: entering CONTENT\n");
5850#endif
5851 break;
5852 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005853 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5854 "HPP: internal error, state == PI\n",
5855 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005856 ctxt->instate = XML_PARSER_CONTENT;
5857 ctxt->checkIndex = 0;
5858#ifdef DEBUG_PUSH
5859 xmlGenericError(xmlGenericErrorContext,
5860 "HPP: entering CONTENT\n");
5861#endif
5862 break;
5863 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005864 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5865 "HPP: internal error, state == ENTITY_DECL\n",
5866 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005867 ctxt->instate = XML_PARSER_CONTENT;
5868 ctxt->checkIndex = 0;
5869#ifdef DEBUG_PUSH
5870 xmlGenericError(xmlGenericErrorContext,
5871 "HPP: entering CONTENT\n");
5872#endif
5873 break;
5874 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005875 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5876 "HPP: internal error, state == ENTITY_VALUE\n",
5877 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005878 ctxt->instate = XML_PARSER_CONTENT;
5879 ctxt->checkIndex = 0;
5880#ifdef DEBUG_PUSH
5881 xmlGenericError(xmlGenericErrorContext,
5882 "HPP: entering DTD\n");
5883#endif
5884 break;
5885 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005886 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5887 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5888 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005889 ctxt->instate = XML_PARSER_START_TAG;
5890 ctxt->checkIndex = 0;
5891#ifdef DEBUG_PUSH
5892 xmlGenericError(xmlGenericErrorContext,
5893 "HPP: entering START_TAG\n");
5894#endif
5895 break;
5896 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005897 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5898 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5899 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005900 ctxt->instate = XML_PARSER_CONTENT;
5901 ctxt->checkIndex = 0;
5902#ifdef DEBUG_PUSH
5903 xmlGenericError(xmlGenericErrorContext,
5904 "HPP: entering CONTENT\n");
5905#endif
5906 break;
5907 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005908 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5909 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5910 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005911 ctxt->instate = XML_PARSER_CONTENT;
5912 ctxt->checkIndex = 0;
5913#ifdef DEBUG_PUSH
5914 xmlGenericError(xmlGenericErrorContext,
5915 "HPP: entering CONTENT\n");
5916#endif
5917 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005918 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005919 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5920 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5921 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005922 ctxt->instate = XML_PARSER_CONTENT;
5923 ctxt->checkIndex = 0;
5924#ifdef DEBUG_PUSH
5925 xmlGenericError(xmlGenericErrorContext,
5926 "HPP: entering CONTENT\n");
5927#endif
5928 break;
5929
Owen Taylor3473f882001-02-23 17:55:21 +00005930 }
5931 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005932done:
Owen Taylor3473f882001-02-23 17:55:21 +00005933 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005934 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005935 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005936 /*
5937 * SAX: end of the document processing.
5938 */
5939 ctxt->instate = XML_PARSER_EOF;
5940 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5941 ctxt->sax->endDocument(ctxt->userData);
5942 }
5943 }
5944 if ((ctxt->myDoc != NULL) &&
5945 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5946 (ctxt->instate == XML_PARSER_EPILOG))) {
5947 xmlDtdPtr dtd;
5948 dtd = xmlGetIntSubset(ctxt->myDoc);
5949 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005950 ctxt->myDoc->intSubset =
5951 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005952 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5953 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5954 }
5955#ifdef DEBUG_PUSH
5956 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5957#endif
5958 return(ret);
5959}
5960
5961/**
Owen Taylor3473f882001-02-23 17:55:21 +00005962 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005963 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005964 * @chunk: an char array
5965 * @size: the size in byte of the chunk
5966 * @terminate: last chunk indicator
5967 *
5968 * Parse a Chunk of memory
5969 *
5970 * Returns zero if no error, the xmlParserErrors otherwise.
5971 */
5972int
5973htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5974 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005975 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5976 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5977 "htmlParseChunk: context error\n", NULL, NULL);
5978 return(XML_ERR_INTERNAL_ERROR);
5979 }
Owen Taylor3473f882001-02-23 17:55:21 +00005980 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5981 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005982 int base = ctxt->input->base - xmlBufContent(ctxt->input->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00005983 int cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005984 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005985
5986 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005987 if (res < 0) {
5988 ctxt->errNo = XML_PARSER_EOF;
5989 ctxt->disableSAX = 1;
5990 return (XML_PARSER_EOF);
5991 }
Daniel Veillarda78d8032012-07-16 14:56:50 +08005992 ctxt->input->base = xmlBufContent(ctxt->input->buf->buffer) + base;
Owen Taylor3473f882001-02-23 17:55:21 +00005993 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillarda78d8032012-07-16 14:56:50 +08005994 ctxt->input->end = xmlBufEnd(ctxt->input->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00005995#ifdef DEBUG_PUSH
5996 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5997#endif
5998
Daniel Veillard14f752c2003-08-09 11:44:50 +00005999#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00006000 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6001 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006002#endif
Owen Taylor3473f882001-02-23 17:55:21 +00006003 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00006004 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6005 xmlParserInputBufferPtr in = ctxt->input->buf;
6006 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6007 (in->raw != NULL)) {
6008 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02006009
Daniel Veillarda78d8032012-07-16 14:56:50 +08006010 nbchars = xmlCharEncInput(in);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006011 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006012 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6013 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006014 return(XML_ERR_INVALID_ENCODING);
6015 }
6016 }
6017 }
Owen Taylor3473f882001-02-23 17:55:21 +00006018 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00006019 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00006020 if (terminate) {
6021 if ((ctxt->instate != XML_PARSER_EOF) &&
6022 (ctxt->instate != XML_PARSER_EPILOG) &&
6023 (ctxt->instate != XML_PARSER_MISC)) {
6024 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00006025 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02006026 }
Owen Taylor3473f882001-02-23 17:55:21 +00006027 if (ctxt->instate != XML_PARSER_EOF) {
6028 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6029 ctxt->sax->endDocument(ctxt->userData);
6030 }
6031 ctxt->instate = XML_PARSER_EOF;
6032 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006033 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00006034}
6035
6036/************************************************************************
6037 * *
6038 * User entry points *
6039 * *
6040 ************************************************************************/
6041
6042/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006043 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006044 * @sax: a SAX handler
6045 * @user_data: The user data returned on SAX callbacks
6046 * @chunk: a pointer to an array of chars
6047 * @size: number of chars in the array
6048 * @filename: an optional file name or URI
6049 * @enc: an optional encoding
6050 *
6051 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00006052 * The value of @filename is used for fetching external entities
6053 * and error/warning reports.
6054 *
6055 * Returns the new parser context or NULL
6056 */
6057htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006058htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00006059 const char *chunk, int size, const char *filename,
6060 xmlCharEncoding enc) {
6061 htmlParserCtxtPtr ctxt;
6062 htmlParserInputPtr inputStream;
6063 xmlParserInputBufferPtr buf;
6064
Daniel Veillardd0463562001-10-13 09:15:48 +00006065 xmlInitParser();
6066
Owen Taylor3473f882001-02-23 17:55:21 +00006067 buf = xmlAllocParserInputBuffer(enc);
6068 if (buf == NULL) return(NULL);
6069
Daniel Veillardf403d292003-10-05 13:51:35 +00006070 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006071 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006072 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006073 return(NULL);
6074 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00006075 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6076 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00006077 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00006078 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00006079 xmlFree(ctxt->sax);
6080 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6081 if (ctxt->sax == NULL) {
6082 xmlFree(buf);
6083 xmlFree(ctxt);
6084 return(NULL);
6085 }
6086 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6087 if (user_data != NULL)
6088 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02006089 }
Owen Taylor3473f882001-02-23 17:55:21 +00006090 if (filename == NULL) {
6091 ctxt->directory = NULL;
6092 } else {
6093 ctxt->directory = xmlParserGetDirectory(filename);
6094 }
6095
6096 inputStream = htmlNewInputStream(ctxt);
6097 if (inputStream == NULL) {
6098 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00006099 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006100 return(NULL);
6101 }
6102
6103 if (filename == NULL)
6104 inputStream->filename = NULL;
6105 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00006106 inputStream->filename = (char *)
6107 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00006108 inputStream->buf = buf;
Daniel Veillarda78d8032012-07-16 14:56:50 +08006109 inputStream->cur =
6110 inputStream->base = xmlBufContent(buf->buffer);
6111 inputStream->end = xmlBufEnd(buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00006112
6113 inputPush(ctxt, inputStream);
6114
6115 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02006116 (ctxt->input->buf != NULL)) {
Daniel Veillarda78d8032012-07-16 14:56:50 +08006117 int base = ctxt->input->base - xmlBufContent(ctxt->input->buf->buffer);
Daniel Veillard5f704af2003-03-05 10:01:43 +00006118 int cur = ctxt->input->cur - ctxt->input->base;
6119
Daniel Veillarde77db162009-08-22 11:32:38 +02006120 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00006121
Daniel Veillarda78d8032012-07-16 14:56:50 +08006122 ctxt->input->base = xmlBufContent(ctxt->input->buf->buffer) + base;
Daniel Veillard5f704af2003-03-05 10:01:43 +00006123 ctxt->input->cur = ctxt->input->base + cur;
Daniel Veillarda78d8032012-07-16 14:56:50 +08006124 ctxt->input->end = xmlBufEnd(ctxt->input->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00006125#ifdef DEBUG_PUSH
6126 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6127#endif
6128 }
Daniel Veillard68716a72006-10-16 09:32:17 +00006129 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00006130
6131 return(ctxt);
6132}
William M. Brack21e4ef22005-01-02 09:53:13 +00006133#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00006134
6135/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006136 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006137 * @cur: a pointer to an array of xmlChar
6138 * @encoding: a free form C string describing the HTML document encoding, or NULL
6139 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006140 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006141 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006142 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6143 * to handle parse events. If sax is NULL, fallback to the default DOM
6144 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006145 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006146 * Returns the resulting document tree unless SAX is NULL or the document is
6147 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006148 */
6149
6150htmlDocPtr
6151htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6152 htmlDocPtr ret;
6153 htmlParserCtxtPtr ctxt;
6154
Daniel Veillardd0463562001-10-13 09:15:48 +00006155 xmlInitParser();
6156
Owen Taylor3473f882001-02-23 17:55:21 +00006157 if (cur == NULL) return(NULL);
6158
6159
6160 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6161 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02006162 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00006163 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00006164 ctxt->sax = sax;
6165 ctxt->userData = userData;
6166 }
6167
6168 htmlParseDocument(ctxt);
6169 ret = ctxt->myDoc;
6170 if (sax != NULL) {
6171 ctxt->sax = NULL;
6172 ctxt->userData = NULL;
6173 }
6174 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006175
Owen Taylor3473f882001-02-23 17:55:21 +00006176 return(ret);
6177}
6178
6179/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006180 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006181 * @cur: a pointer to an array of xmlChar
6182 * @encoding: a free form C string describing the HTML document encoding, or NULL
6183 *
6184 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006185 *
Owen Taylor3473f882001-02-23 17:55:21 +00006186 * Returns the resulting document tree
6187 */
6188
6189htmlDocPtr
6190htmlParseDoc(xmlChar *cur, const char *encoding) {
6191 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6192}
6193
6194
6195/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006196 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006197 * @filename: the filename
6198 * @encoding: a free form C string describing the HTML document encoding, or NULL
6199 *
Daniel Veillarde77db162009-08-22 11:32:38 +02006200 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00006201 * Automatic support for ZLIB/Compress compressed document is provided
6202 * by default if found at compile-time.
6203 *
6204 * Returns the new parser context or NULL
6205 */
6206htmlParserCtxtPtr
6207htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6208{
6209 htmlParserCtxtPtr ctxt;
6210 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006211 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00006212 /* htmlCharEncoding enc; */
6213 xmlChar *content, *content_line = (xmlChar *) "charset=";
6214
Daniel Veillarda03e3652004-11-02 18:45:30 +00006215 if (filename == NULL)
6216 return(NULL);
6217
Daniel Veillardf403d292003-10-05 13:51:35 +00006218 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006219 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00006220 return(NULL);
6221 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006222 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6223 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00006224#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006225 if (xmlDefaultSAXHandler.error != NULL) {
6226 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6227 }
Daniel Veillard87247e82004-01-13 20:42:02 +00006228#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00006229 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00006230 return(NULL);
6231 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006232
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006233 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6234 xmlFree(canonicFilename);
6235 if (inputStream == NULL) {
6236 xmlFreeParserCtxt(ctxt);
6237 return(NULL);
6238 }
Owen Taylor3473f882001-02-23 17:55:21 +00006239
6240 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006241
Owen Taylor3473f882001-02-23 17:55:21 +00006242 /* set encoding */
6243 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00006244 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02006245 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00006246 strcpy ((char *)content, (char *)content_line);
6247 strcat ((char *)content, (char *)encoding);
6248 htmlCheckEncoding (ctxt, content);
6249 xmlFree (content);
6250 }
6251 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006252
Owen Taylor3473f882001-02-23 17:55:21 +00006253 return(ctxt);
6254}
6255
6256/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006257 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006258 * @filename: the filename
6259 * @encoding: a free form C string describing the HTML document encoding, or NULL
6260 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006261 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006262 *
6263 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6264 * compressed document is provided by default if found at compile-time.
6265 * It use the given SAX function block to handle the parsing callback.
6266 * If sax is NULL, fallback to the default DOM tree building routines.
6267 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006268 * Returns the resulting document tree unless SAX is NULL or the document is
6269 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006270 */
6271
6272htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006273htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00006274 void *userData) {
6275 htmlDocPtr ret;
6276 htmlParserCtxtPtr ctxt;
6277 htmlSAXHandlerPtr oldsax = NULL;
6278
Daniel Veillardd0463562001-10-13 09:15:48 +00006279 xmlInitParser();
6280
Owen Taylor3473f882001-02-23 17:55:21 +00006281 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6282 if (ctxt == NULL) return(NULL);
6283 if (sax != NULL) {
6284 oldsax = ctxt->sax;
6285 ctxt->sax = sax;
6286 ctxt->userData = userData;
6287 }
6288
6289 htmlParseDocument(ctxt);
6290
6291 ret = ctxt->myDoc;
6292 if (sax != NULL) {
6293 ctxt->sax = oldsax;
6294 ctxt->userData = NULL;
6295 }
6296 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006297
Owen Taylor3473f882001-02-23 17:55:21 +00006298 return(ret);
6299}
6300
6301/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006302 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006303 * @filename: the filename
6304 * @encoding: a free form C string describing the HTML document encoding, or NULL
6305 *
6306 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6307 * compressed document is provided by default if found at compile-time.
6308 *
6309 * Returns the resulting document tree
6310 */
6311
6312htmlDocPtr
6313htmlParseFile(const char *filename, const char *encoding) {
6314 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6315}
6316
6317/**
6318 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02006319 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00006320 *
6321 * Set and return the previous value for handling HTML omitted tags.
6322 *
6323 * Returns the last value for 0 for no handling, 1 for auto insertion.
6324 */
6325
6326int
6327htmlHandleOmittedElem(int val) {
6328 int old = htmlOmittedDefaultValue;
6329
6330 htmlOmittedDefaultValue = val;
6331 return(old);
6332}
6333
Daniel Veillard930dfb62003-02-05 10:17:38 +00006334/**
6335 * htmlElementAllowedHere:
6336 * @parent: HTML parent element
6337 * @elt: HTML element
6338 *
6339 * Checks whether an HTML element may be a direct child of a parent element.
6340 * Note - doesn't check for deprecated elements
6341 *
6342 * Returns 1 if allowed; 0 otherwise.
6343 */
6344int
6345htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6346 const char** p ;
6347
6348 if ( ! elt || ! parent || ! parent->subelts )
6349 return 0 ;
6350
6351 for ( p = parent->subelts; *p; ++p )
6352 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6353 return 1 ;
6354
6355 return 0 ;
6356}
6357/**
6358 * htmlElementStatusHere:
6359 * @parent: HTML parent element
6360 * @elt: HTML element
6361 *
6362 * Checks whether an HTML element may be a direct child of a parent element.
6363 * and if so whether it is valid or deprecated.
6364 *
6365 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6366 */
6367htmlStatus
6368htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6369 if ( ! parent || ! elt )
6370 return HTML_INVALID ;
6371 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6372 return HTML_INVALID ;
6373
6374 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6375}
6376/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006377 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00006378 * @elt: HTML element
6379 * @attr: HTML attribute
6380 * @legacy: whether to allow deprecated attributes
6381 *
6382 * Checks whether an attribute is valid for an element
6383 * Has full knowledge of Required and Deprecated attributes
6384 *
6385 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6386 */
6387htmlStatus
6388htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6389 const char** p ;
6390
6391 if ( !elt || ! attr )
6392 return HTML_INVALID ;
6393
6394 if ( elt->attrs_req )
6395 for ( p = elt->attrs_req; *p; ++p)
6396 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6397 return HTML_REQUIRED ;
6398
6399 if ( elt->attrs_opt )
6400 for ( p = elt->attrs_opt; *p; ++p)
6401 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6402 return HTML_VALID ;
6403
6404 if ( legacy && elt->attrs_depr )
6405 for ( p = elt->attrs_depr; *p; ++p)
6406 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6407 return HTML_DEPRECATED ;
6408
6409 return HTML_INVALID ;
6410}
6411/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006412 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006413 * @node: an htmlNodePtr in a tree
6414 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006415 * for Element nodes)
6416 *
6417 * Checks whether the tree node is valid. Experimental (the author
6418 * only uses the HTML enhancements in a SAX parser)
6419 *
6420 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6421 * legacy allowed) or htmlElementStatusHere (otherwise).
6422 * for Attribute nodes, a return from htmlAttrAllowed
6423 * for other nodes, HTML_NA (no checks performed)
6424 */
6425htmlStatus
6426htmlNodeStatus(const htmlNodePtr node, int legacy) {
6427 if ( ! node )
6428 return HTML_INVALID ;
6429
6430 switch ( node->type ) {
6431 case XML_ELEMENT_NODE:
6432 return legacy
6433 ? ( htmlElementAllowedHere (
6434 htmlTagLookup(node->parent->name) , node->name
6435 ) ? HTML_VALID : HTML_INVALID )
6436 : htmlElementStatusHere(
6437 htmlTagLookup(node->parent->name) ,
6438 htmlTagLookup(node->name) )
6439 ;
6440 case XML_ATTRIBUTE_NODE:
6441 return htmlAttrAllowed(
6442 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6443 default: return HTML_NA ;
6444 }
6445}
Daniel Veillard9475a352003-09-26 12:47:50 +00006446/************************************************************************
6447 * *
6448 * New set (2.6.0) of simpler and more flexible APIs *
6449 * *
6450 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used. @str is evaluated more than once, so it must not have side
 * effects. The body is wrapped in do { } while (0) so that the macro
 * behaves as a single statement, including inside if/else.
 */
#define DICT_FREE(str)                                          \
    do {                                                        \
        if ((str) && ((!dict) ||                                \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
            xmlFree((char *)(str));                             \
    } while (0)
6462
6463/**
6464 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006465 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006466 *
6467 * Reset a parser context
6468 */
6469void
6470htmlCtxtReset(htmlParserCtxtPtr ctxt)
6471{
6472 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006473 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006474
Daniel Veillarda03e3652004-11-02 18:45:30 +00006475 if (ctxt == NULL)
6476 return;
6477
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006478 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006479 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006480
6481 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6482 xmlFreeInputStream(input);
6483 }
6484 ctxt->inputNr = 0;
6485 ctxt->input = NULL;
6486
6487 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006488 if (ctxt->spaceTab != NULL) {
6489 ctxt->spaceTab[0] = -1;
6490 ctxt->space = &ctxt->spaceTab[0];
6491 } else {
6492 ctxt->space = NULL;
6493 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006494
6495
6496 ctxt->nodeNr = 0;
6497 ctxt->node = NULL;
6498
6499 ctxt->nameNr = 0;
6500 ctxt->name = NULL;
6501
6502 DICT_FREE(ctxt->version);
6503 ctxt->version = NULL;
6504 DICT_FREE(ctxt->encoding);
6505 ctxt->encoding = NULL;
6506 DICT_FREE(ctxt->directory);
6507 ctxt->directory = NULL;
6508 DICT_FREE(ctxt->extSubURI);
6509 ctxt->extSubURI = NULL;
6510 DICT_FREE(ctxt->extSubSystem);
6511 ctxt->extSubSystem = NULL;
6512 if (ctxt->myDoc != NULL)
6513 xmlFreeDoc(ctxt->myDoc);
6514 ctxt->myDoc = NULL;
6515
6516 ctxt->standalone = -1;
6517 ctxt->hasExternalSubset = 0;
6518 ctxt->hasPErefs = 0;
6519 ctxt->html = 1;
6520 ctxt->external = 0;
6521 ctxt->instate = XML_PARSER_START;
6522 ctxt->token = 0;
6523
6524 ctxt->wellFormed = 1;
6525 ctxt->nsWellFormed = 1;
Daniel Veillard8ad29302010-10-28 11:51:22 +02006526 ctxt->disableSAX = 0;
Daniel Veillard9475a352003-09-26 12:47:50 +00006527 ctxt->valid = 1;
6528 ctxt->vctxt.userData = ctxt;
6529 ctxt->vctxt.error = xmlParserValidityError;
6530 ctxt->vctxt.warning = xmlParserValidityWarning;
6531 ctxt->record_info = 0;
6532 ctxt->nbChars = 0;
6533 ctxt->checkIndex = 0;
6534 ctxt->inSubset = 0;
6535 ctxt->errNo = XML_ERR_OK;
6536 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006537 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006538 ctxt->catalogs = NULL;
6539 xmlInitNodeInfoSeq(&ctxt->node_seq);
6540
6541 if (ctxt->attsDefault != NULL) {
6542 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6543 ctxt->attsDefault = NULL;
6544 }
6545 if (ctxt->attsSpecial != NULL) {
6546 xmlHashFree(ctxt->attsSpecial, NULL);
6547 ctxt->attsSpecial = NULL;
6548 }
6549}
6550
6551/**
6552 * htmlCtxtUseOptions:
6553 * @ctxt: an HTML parser context
6554 * @options: a combination of htmlParserOption(s)
6555 *
6556 * Applies the options to the parser context
6557 *
6558 * Returns 0 in case of success, the set of unknown or unimplemented options
6559 * in case of error.
6560 */
6561int
6562htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6563{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006564 if (ctxt == NULL)
6565 return(-1);
6566
Daniel Veillard9475a352003-09-26 12:47:50 +00006567 if (options & HTML_PARSE_NOWARNING) {
6568 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006569 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006570 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006571 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006572 }
6573 if (options & HTML_PARSE_NOERROR) {
6574 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006575 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006576 ctxt->sax->fatalError = NULL;
6577 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006578 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006579 }
6580 if (options & HTML_PARSE_PEDANTIC) {
6581 ctxt->pedantic = 1;
6582 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006583 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006584 } else
6585 ctxt->pedantic = 0;
6586 if (options & XML_PARSE_NOBLANKS) {
6587 ctxt->keepBlanks = 0;
6588 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6589 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006590 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006591 } else
6592 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006593 if (options & HTML_PARSE_RECOVER) {
6594 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006595 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006596 } else
6597 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006598 if (options & HTML_PARSE_COMPACT) {
6599 ctxt->options |= HTML_PARSE_COMPACT;
6600 options -= HTML_PARSE_COMPACT;
6601 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006602 if (options & XML_PARSE_HUGE) {
6603 ctxt->options |= XML_PARSE_HUGE;
6604 options -= XML_PARSE_HUGE;
6605 }
Daniel Veillardf1121c42010-07-26 14:02:42 +02006606 if (options & HTML_PARSE_NODEFDTD) {
6607 ctxt->options |= HTML_PARSE_NODEFDTD;
6608 options -= HTML_PARSE_NODEFDTD;
6609 }
Daniel Veillardc62efc82011-05-16 16:03:50 +08006610 if (options & HTML_PARSE_IGNORE_ENC) {
6611 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6612 options -= HTML_PARSE_IGNORE_ENC;
6613 }
Martin Schröderb91111b2012-05-10 18:52:37 +08006614 if (options & HTML_PARSE_NOIMPLIED) {
6615 ctxt->options |= HTML_PARSE_NOIMPLIED;
6616 options -= HTML_PARSE_NOIMPLIED;
6617 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006618 ctxt->dictNames = 0;
6619 return (options);
6620}
6621
6622/**
6623 * htmlDoRead:
6624 * @ctxt: an HTML parser context
6625 * @URL: the base URL to use for the document
6626 * @encoding: the document encoding, or NULL
6627 * @options: a combination of htmlParserOption(s)
6628 * @reuse: keep the context for reuse
6629 *
6630 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006631 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006632 * Returns the resulting document tree or NULL
6633 */
6634static htmlDocPtr
6635htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6636 int options, int reuse)
6637{
6638 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006639
Daniel Veillard9475a352003-09-26 12:47:50 +00006640 htmlCtxtUseOptions(ctxt, options);
6641 ctxt->html = 1;
6642 if (encoding != NULL) {
6643 xmlCharEncodingHandlerPtr hdlr;
6644
6645 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006646 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006647 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006648 if (ctxt->input->encoding != NULL)
6649 xmlFree((xmlChar *) ctxt->input->encoding);
6650 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6651 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006652 }
6653 if ((URL != NULL) && (ctxt->input != NULL) &&
6654 (ctxt->input->filename == NULL))
6655 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6656 htmlParseDocument(ctxt);
6657 ret = ctxt->myDoc;
6658 ctxt->myDoc = NULL;
6659 if (!reuse) {
6660 if ((ctxt->dictNames) &&
6661 (ret != NULL) &&
6662 (ret->dict == ctxt->dict))
6663 ctxt->dict = NULL;
6664 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006665 }
6666 return (ret);
6667}
6668
6669/**
6670 * htmlReadDoc:
6671 * @cur: a pointer to a zero terminated string
6672 * @URL: the base URL to use for the document
6673 * @encoding: the document encoding, or NULL
6674 * @options: a combination of htmlParserOption(s)
6675 *
6676 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006677 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006678 * Returns the resulting document tree
6679 */
6680htmlDocPtr
6681htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6682{
6683 htmlParserCtxtPtr ctxt;
6684
6685 if (cur == NULL)
6686 return (NULL);
6687
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006688 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006689 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006690 if (ctxt == NULL)
6691 return (NULL);
6692 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6693}
6694
6695/**
6696 * htmlReadFile:
6697 * @filename: a file or URL
6698 * @encoding: the document encoding, or NULL
6699 * @options: a combination of htmlParserOption(s)
6700 *
6701 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006702 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006703 * Returns the resulting document tree
6704 */
6705htmlDocPtr
6706htmlReadFile(const char *filename, const char *encoding, int options)
6707{
6708 htmlParserCtxtPtr ctxt;
6709
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006710 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006711 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6712 if (ctxt == NULL)
6713 return (NULL);
6714 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6715}
6716
6717/**
6718 * htmlReadMemory:
6719 * @buffer: a pointer to a char array
6720 * @size: the size of the array
6721 * @URL: the base URL to use for the document
6722 * @encoding: the document encoding, or NULL
6723 * @options: a combination of htmlParserOption(s)
6724 *
6725 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006726 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006727 * Returns the resulting document tree
6728 */
6729htmlDocPtr
6730htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6731{
6732 htmlParserCtxtPtr ctxt;
6733
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006734 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006735 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6736 if (ctxt == NULL)
6737 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006738 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006739 if (ctxt->sax != NULL)
6740 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006741 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6742}
6743
6744/**
6745 * htmlReadFd:
6746 * @fd: an open file descriptor
6747 * @URL: the base URL to use for the document
6748 * @encoding: the document encoding, or NULL
6749 * @options: a combination of htmlParserOption(s)
6750 *
6751 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006752 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006753 * Returns the resulting document tree
6754 */
6755htmlDocPtr
6756htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6757{
6758 htmlParserCtxtPtr ctxt;
6759 xmlParserInputBufferPtr input;
6760 xmlParserInputPtr stream;
6761
6762 if (fd < 0)
6763 return (NULL);
6764
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006765 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006766 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6767 if (input == NULL)
6768 return (NULL);
6769 ctxt = xmlNewParserCtxt();
6770 if (ctxt == NULL) {
6771 xmlFreeParserInputBuffer(input);
6772 return (NULL);
6773 }
6774 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6775 if (stream == NULL) {
6776 xmlFreeParserInputBuffer(input);
6777 xmlFreeParserCtxt(ctxt);
6778 return (NULL);
6779 }
6780 inputPush(ctxt, stream);
6781 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6782}
6783
6784/**
6785 * htmlReadIO:
6786 * @ioread: an I/O read function
6787 * @ioclose: an I/O close function
6788 * @ioctx: an I/O handler
6789 * @URL: the base URL to use for the document
6790 * @encoding: the document encoding, or NULL
6791 * @options: a combination of htmlParserOption(s)
6792 *
6793 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006794 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006795 * Returns the resulting document tree
6796 */
6797htmlDocPtr
6798htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6799 void *ioctx, const char *URL, const char *encoding, int options)
6800{
6801 htmlParserCtxtPtr ctxt;
6802 xmlParserInputBufferPtr input;
6803 xmlParserInputPtr stream;
6804
6805 if (ioread == NULL)
6806 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006807 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006808
6809 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6810 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006811 if (input == NULL) {
6812 if (ioclose != NULL)
6813 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00006814 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006815 }
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006816 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006817 if (ctxt == NULL) {
6818 xmlFreeParserInputBuffer(input);
6819 return (NULL);
6820 }
6821 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6822 if (stream == NULL) {
6823 xmlFreeParserInputBuffer(input);
6824 xmlFreeParserCtxt(ctxt);
6825 return (NULL);
6826 }
6827 inputPush(ctxt, stream);
6828 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6829}
6830
6831/**
6832 * htmlCtxtReadDoc:
6833 * @ctxt: an HTML parser context
6834 * @cur: a pointer to a zero terminated string
6835 * @URL: the base URL to use for the document
6836 * @encoding: the document encoding, or NULL
6837 * @options: a combination of htmlParserOption(s)
6838 *
6839 * parse an XML in-memory document and build a tree.
6840 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006841 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006842 * Returns the resulting document tree
6843 */
6844htmlDocPtr
6845htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6846 const char *URL, const char *encoding, int options)
6847{
6848 xmlParserInputPtr stream;
6849
6850 if (cur == NULL)
6851 return (NULL);
6852 if (ctxt == NULL)
6853 return (NULL);
6854
6855 htmlCtxtReset(ctxt);
6856
6857 stream = xmlNewStringInputStream(ctxt, cur);
6858 if (stream == NULL) {
6859 return (NULL);
6860 }
6861 inputPush(ctxt, stream);
6862 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6863}
6864
6865/**
6866 * htmlCtxtReadFile:
6867 * @ctxt: an HTML parser context
6868 * @filename: a file or URL
6869 * @encoding: the document encoding, or NULL
6870 * @options: a combination of htmlParserOption(s)
6871 *
6872 * parse an XML file from the filesystem or the network.
6873 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006874 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006875 * Returns the resulting document tree
6876 */
6877htmlDocPtr
6878htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6879 const char *encoding, int options)
6880{
6881 xmlParserInputPtr stream;
6882
6883 if (filename == NULL)
6884 return (NULL);
6885 if (ctxt == NULL)
6886 return (NULL);
6887
6888 htmlCtxtReset(ctxt);
6889
Daniel Veillard29614c72004-11-26 10:47:26 +00006890 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006891 if (stream == NULL) {
6892 return (NULL);
6893 }
6894 inputPush(ctxt, stream);
6895 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6896}
6897
6898/**
6899 * htmlCtxtReadMemory:
6900 * @ctxt: an HTML parser context
6901 * @buffer: a pointer to a char array
6902 * @size: the size of the array
6903 * @URL: the base URL to use for the document
6904 * @encoding: the document encoding, or NULL
6905 * @options: a combination of htmlParserOption(s)
6906 *
6907 * parse an XML in-memory document and build a tree.
6908 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006909 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006910 * Returns the resulting document tree
6911 */
6912htmlDocPtr
6913htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6914 const char *URL, const char *encoding, int options)
6915{
6916 xmlParserInputBufferPtr input;
6917 xmlParserInputPtr stream;
6918
6919 if (ctxt == NULL)
6920 return (NULL);
6921 if (buffer == NULL)
6922 return (NULL);
6923
6924 htmlCtxtReset(ctxt);
6925
6926 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6927 if (input == NULL) {
6928 return(NULL);
6929 }
6930
6931 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6932 if (stream == NULL) {
6933 xmlFreeParserInputBuffer(input);
6934 return(NULL);
6935 }
6936
6937 inputPush(ctxt, stream);
6938 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6939}
6940
6941/**
6942 * htmlCtxtReadFd:
6943 * @ctxt: an HTML parser context
6944 * @fd: an open file descriptor
6945 * @URL: the base URL to use for the document
6946 * @encoding: the document encoding, or NULL
6947 * @options: a combination of htmlParserOption(s)
6948 *
6949 * parse an XML from a file descriptor and build a tree.
6950 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006951 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006952 * Returns the resulting document tree
6953 */
6954htmlDocPtr
6955htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6956 const char *URL, const char *encoding, int options)
6957{
6958 xmlParserInputBufferPtr input;
6959 xmlParserInputPtr stream;
6960
6961 if (fd < 0)
6962 return (NULL);
6963 if (ctxt == NULL)
6964 return (NULL);
6965
6966 htmlCtxtReset(ctxt);
6967
6968
6969 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6970 if (input == NULL)
6971 return (NULL);
6972 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6973 if (stream == NULL) {
6974 xmlFreeParserInputBuffer(input);
6975 return (NULL);
6976 }
6977 inputPush(ctxt, stream);
6978 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6979}
6980
6981/**
6982 * htmlCtxtReadIO:
6983 * @ctxt: an HTML parser context
6984 * @ioread: an I/O read function
6985 * @ioclose: an I/O close function
6986 * @ioctx: an I/O handler
6987 * @URL: the base URL to use for the document
6988 * @encoding: the document encoding, or NULL
6989 * @options: a combination of htmlParserOption(s)
6990 *
6991 * parse an HTML document from I/O functions and source and build a tree.
6992 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006993 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006994 * Returns the resulting document tree
6995 */
6996htmlDocPtr
6997htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6998 xmlInputCloseCallback ioclose, void *ioctx,
6999 const char *URL,
7000 const char *encoding, int options)
7001{
7002 xmlParserInputBufferPtr input;
7003 xmlParserInputPtr stream;
7004
7005 if (ioread == NULL)
7006 return (NULL);
7007 if (ctxt == NULL)
7008 return (NULL);
7009
7010 htmlCtxtReset(ctxt);
7011
7012 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7013 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007014 if (input == NULL) {
7015 if (ioclose != NULL)
7016 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00007017 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007018 }
Daniel Veillard9475a352003-09-26 12:47:50 +00007019 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7020 if (stream == NULL) {
7021 xmlFreeParserInputBuffer(input);
7022 return (NULL);
7023 }
7024 inputPush(ctxt, stream);
7025 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7026}
7027
Daniel Veillard5d4644e2005-04-01 13:11:58 +00007028#define bottom_HTMLparser
7029#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00007030#endif /* LIBXML_HTML_ENABLED */