blob: 63befed976e18526f96a7a1bdfc7666d35bfcb80 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
Daniel Veillarda78d8032012-07-16 14:56:50 +080047#include "buf.h"
48#include "enc.h"
49
Owen Taylor3473f882001-02-23 17:55:21 +000050#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
Daniel Veillard22090732001-07-16 00:06:07 +000057static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000058
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000061static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000062
63/************************************************************************
64 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020065 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000066 * *
67 ************************************************************************/
68
69/**
William M. Brackedb65a72004-02-06 07:36:04 +000070 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000071 * @ctxt: an HTML parser context
72 * @extra: extra informations
73 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
Daniel Veillard157fee02003-10-31 10:36:03 +000079 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000082 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000088 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000089 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000093 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000094 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
108static void
109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
Daniel Veillard157fee02003-10-31 10:36:03 +0000112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000115 if (ctxt != NULL)
116 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
135static void
136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
Daniel Veillard157fee02003-10-31 10:36:03 +0000139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000142 if (ctxt != NULL)
143 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000149}
150
151/************************************************************************
152 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200153 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000154 * *
155 ************************************************************************/
156
Daniel Veillard1c732d22002-11-30 11:22:59 +0000157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000165 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000168{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000175 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000176 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000180 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000199 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000200
Daniel Veillard1c732d22002-11-30 11:22:59 +0000201 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000205 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000211 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000212 return (ret);
213}
Owen Taylor3473f882001-02-23 17:55:21 +0000214
Eugene Pimenov615904f2010-03-15 15:16:02 +0100215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
296
/*
 * All macros below expect a variable named `ctxt` (the parser context) to
 * be in scope at the point of use.
 */

/* Current byte converted to upper case; ASCII comparisons only. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance by val bytes, updating the char and column counters. The
 * expansion is a single parenthesized expression so it composes safely.
 */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

/* Byte located val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT() but converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Current input pointer (usable as an lvalue). */
#define CUR_PTR ctxt->input->cur

/*
 * Shrink the input buffer when a lot was consumed and little remains.
 * Wrapped in do/while(0) so that a following `else` cannot bind to the
 * macro's own `if`.
 */
#define SHRINK do {                                                     \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&     \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))        \
        xmlParserInputShrink(ctxt->input);                              \
  } while (0)

/*
 * Pull in more input when not in progressive mode and fewer than
 * INPUT_CHUNK bytes remain. Wrapped in do/while(0) for the same reason
 * as SHRINK.
 */
#define GROW do {                                                       \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))            \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                   \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a pending token is set. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Skip the current character, which is l bytes long: a newline bumps the
 * line counter and resets the column, anything else bumps the column.
 * Any pending token is cleared.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)
Daniel Veillarde77db162009-08-22 11:32:38 +0200334
Owen Taylor3473f882001-02-23 17:55:21 +0000335/************
336 \
337 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
338 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
339 ************/
340
/* Decode the current input character; its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Decode the character at s; its byte length is stored in l. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append the character v, encoded on l bytes, to buffer b at index i and
 * advance i. Arguments are parenthesized and the body is wrapped in
 * do/while(0) so the macro behaves as a single statement.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
347
348/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200349 * htmlFindEncoding:
350 * @the HTML parser context
351 *
352 * Ty to find and encoding in the current data available in the input
353 * buffer this is needed to try to switch to the proper encoding when
354 * one face a character error.
355 * That's an heuristic, since it's operating outside of parsing it could
356 * try to use a meta which had been commented out, that's the reason it
357 * should only be used in case of error, not as a default.
358 *
359 * Returns an encoding string or NULL if not found, the string need to
360 * be freed
361 */
362static xmlChar *
363htmlFindEncoding(xmlParserCtxtPtr ctxt) {
364 const xmlChar *start, *cur, *end;
365
366 if ((ctxt == NULL) || (ctxt->input == NULL) ||
367 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
368 (ctxt->input->buf->encoder != NULL))
369 return(NULL);
370 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
371 return(NULL);
372
373 start = ctxt->input->cur;
374 end = ctxt->input->end;
375 /* we also expect the input buffer to be zero terminated */
376 if (*end != 0)
377 return(NULL);
378
379 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
380 if (cur == NULL)
381 return(NULL);
382 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
383 if (cur == NULL)
384 return(NULL);
385 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
386 if (cur == NULL)
387 return(NULL);
388 cur += 8;
389 start = cur;
390 while (((*cur >= 'A') && (*cur <= 'Z')) ||
391 ((*cur >= 'a') && (*cur <= 'z')) ||
392 ((*cur >= '0') && (*cur <= '9')) ||
393 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
394 cur++;
395 if (cur == start)
396 return(NULL);
397 return(xmlStrndup(start, cur - start));
398}
399
400/**
Owen Taylor3473f882001-02-23 17:55:21 +0000401 * htmlCurrentChar:
402 * @ctxt: the HTML parser context
403 * @len: pointer to the length of the char read
404 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000405 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000406 * bytes in the input buffer. Implement the end of line normalization:
407 * 2.11 End-of-Line Handling
408 * If the encoding is unspecified, in the case we find an ISO-Latin-1
409 * char, then the encoding converter is plugged in automatically.
410 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000411 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000412 */
413
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000414static int
Owen Taylor3473f882001-02-23 17:55:21 +0000415htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
416 if (ctxt->instate == XML_PARSER_EOF)
417 return(0);
418
419 if (ctxt->token != 0) {
420 *len = 0;
421 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200422 }
Owen Taylor3473f882001-02-23 17:55:21 +0000423 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
424 /*
425 * We are supposed to handle UTF8, check it's valid
426 * From rfc2044: encoding of the Unicode values on UTF-8:
427 *
428 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
429 * 0000 0000-0000 007F 0xxxxxxx
430 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200431 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000432 *
433 * Check for the 0x110000 limit too
434 */
435 const unsigned char *cur = ctxt->input->cur;
436 unsigned char c;
437 unsigned int val;
438
439 c = *cur;
440 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200441 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000442 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200443 cur = ctxt->input->cur;
444 }
Owen Taylor3473f882001-02-23 17:55:21 +0000445 if ((cur[1] & 0xc0) != 0x80)
446 goto encoding_error;
447 if ((c & 0xe0) == 0xe0) {
448
Adiel Mittmann8a103792009-08-25 11:27:13 +0200449 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000450 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200451 cur = ctxt->input->cur;
452 }
Owen Taylor3473f882001-02-23 17:55:21 +0000453 if ((cur[2] & 0xc0) != 0x80)
454 goto encoding_error;
455 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200456 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000457 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200458 cur = ctxt->input->cur;
459 }
Owen Taylor3473f882001-02-23 17:55:21 +0000460 if (((c & 0xf8) != 0xf0) ||
461 ((cur[3] & 0xc0) != 0x80))
462 goto encoding_error;
463 /* 4-byte code */
464 *len = 4;
465 val = (cur[0] & 0x7) << 18;
466 val |= (cur[1] & 0x3f) << 12;
467 val |= (cur[2] & 0x3f) << 6;
468 val |= cur[3] & 0x3f;
469 } else {
470 /* 3-byte code */
471 *len = 3;
472 val = (cur[0] & 0xf) << 12;
473 val |= (cur[1] & 0x3f) << 6;
474 val |= cur[2] & 0x3f;
475 }
476 } else {
477 /* 2-byte code */
478 *len = 2;
479 val = (cur[0] & 0x1f) << 6;
480 val |= cur[1] & 0x3f;
481 }
482 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000483 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
484 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200485 }
Owen Taylor3473f882001-02-23 17:55:21 +0000486 return(val);
487 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200488 if ((*ctxt->input->cur == 0) &&
489 (ctxt->input->cur < ctxt->input->end)) {
490 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
491 "Char 0x%X out of allowed range\n", 0);
492 *len = 1;
493 return(' ');
494 }
Owen Taylor3473f882001-02-23 17:55:21 +0000495 /* 1-byte code */
496 *len = 1;
497 return((int) *ctxt->input->cur);
498 }
499 }
500 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000501 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000502 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000503 * XML constructs only use < 128 chars
504 */
505 *len = 1;
506 if ((int) *ctxt->input->cur < 0x80)
507 return((int) *ctxt->input->cur);
508
509 /*
510 * Humm this is bad, do an automatic flow conversion
511 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200512 {
513 xmlChar * guess;
514 xmlCharEncodingHandlerPtr handler;
515
516 guess = htmlFindEncoding(ctxt);
517 if (guess == NULL) {
518 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
519 } else {
520 if (ctxt->input->encoding != NULL)
521 xmlFree((xmlChar *) ctxt->input->encoding);
522 ctxt->input->encoding = guess;
523 handler = xmlFindCharEncodingHandler((const char *) guess);
524 if (handler != NULL) {
525 xmlSwitchToEncoding(ctxt, handler);
526 } else {
527 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
528 "Unsupported encoding %s", guess, NULL);
529 }
530 }
531 ctxt->charset = XML_CHAR_ENCODING_UTF8;
532 }
533
Owen Taylor3473f882001-02-23 17:55:21 +0000534 return(xmlCurrentChar(ctxt, len));
535
536encoding_error:
537 /*
538 * If we detect an UTF8 error that probably mean that the
539 * input encoding didn't get properly advertized in the
540 * declaration header. Report the error and switch the encoding
541 * to ISO-Latin-1 (if you don't like this policy, just declare the
542 * encoding !)
543 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000544 {
545 char buffer[150];
546
Daniel Veillard861101d2007-06-12 08:38:57 +0000547 if (ctxt->input->end - ctxt->input->cur >= 4) {
548 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
549 ctxt->input->cur[0], ctxt->input->cur[1],
550 ctxt->input->cur[2], ctxt->input->cur[3]);
551 } else {
552 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
553 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000554 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
555 "Input is not proper UTF-8, indicate encoding !\n",
556 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000557 }
558
Daniel Veillarde77db162009-08-22 11:32:38 +0200559 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000560 *len = 1;
561 return((int) *ctxt->input->cur);
562}
563
564/**
Owen Taylor3473f882001-02-23 17:55:21 +0000565 * htmlSkipBlankChars:
566 * @ctxt: the HTML parser context
567 *
568 * skip all blanks character found at that point in the input streams.
569 *
570 * Returns the number of space chars skipped
571 */
572
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000573static int
Owen Taylor3473f882001-02-23 17:55:21 +0000574htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
575 int res = 0;
576
William M. Brack76e95df2003-10-18 16:20:14 +0000577 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000578 if ((*ctxt->input->cur == 0) &&
579 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
580 xmlPopInput(ctxt);
581 } else {
582 if (*(ctxt->input->cur) == '\n') {
583 ctxt->input->line++; ctxt->input->col = 1;
584 } else ctxt->input->col++;
585 ctxt->input->cur++;
586 ctxt->nbChars++;
587 if (*ctxt->input->cur == 0)
588 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
589 }
590 res++;
591 }
592 return(res);
593}
594
595
596
597/************************************************************************
598 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200599 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000600 * *
601 ************************************************************************/
602
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000615
616/* Definitions and a couple of vars for HTML Elements */
617
/*
 * Element groups. Each list macro expands to a comma separated run of
 * element names; the matching NB_ macro is the number of names in it.
 * The NB_ sums are parenthesized so they can be used inside larger
 * expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated name lists built from the groups above. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
649
650
651/* ... and for HTML Attributes */
652
/*
 * Attribute groups. Each list macro expands to a comma separated run of
 * attribute names; the matching NB_ macro is the number of names in it.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Sum of the three groups above (parenthesized for safe reuse). */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
Daniel Veillard930dfb62003-02-05 10:17:38 +0000665
Daniel Veillard065abe82006-07-03 08:55:04 +0000666static const char* const html_attrs[] = { ATTRS, NULL } ;
667static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
668static const char* const core_attrs[] = { COREATTRS, NULL } ;
669static const char* const i18n_attrs[] = { I18N, NULL } ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000670
671
672/* Other declarations that should go inline ... */
Daniel Veillard065abe82006-07-03 08:55:04 +0000673static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000674 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
675 "tabindex", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000676static const char* const target_attr[] = { "target", NULL } ;
677static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
678static const char* const alt_attr[] = { "alt", NULL } ;
679static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
680static const char* const href_attrs[] = { "href", NULL } ;
681static const char* const clear_attrs[] = { "clear", NULL } ;
682static const char* const inline_p[] = { INLINE, "p", NULL } ;
683
684static const char* const flow_param[] = { FLOW, "param", NULL } ;
685static const char* const applet_attrs[] = { COREATTRS , "codebase",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000686 "archive", "alt", "name", "height", "width", "align",
687 "hspace", "vspace", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000688static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000689 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000690static const char* const basefont_attrs[] =
Daniel Veillard930dfb62003-02-05 10:17:38 +0000691 { "id", "size", "color", "face", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000692static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
693static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
694static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
695static const char* const body_depr[] = { "background", "bgcolor", "text",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000696 "link", "vlink", "alink", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000697static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
Daniel Veillard930dfb62003-02-05 10:17:38 +0000698 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
699
700
Daniel Veillard065abe82006-07-03 08:55:04 +0000701static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
702static const char* const col_elt[] = { "col", NULL } ;
703static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
704static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
705static const char* const dl_contents[] = { "dt", "dd", NULL } ;
706static const char* const compact_attr[] = { "compact", NULL } ;
707static const char* const label_attr[] = { "label", NULL } ;
708static const char* const fieldset_contents[] = { FLOW, "legend" } ;
709static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
710static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
711static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
712static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
713static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
714static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
715static const char* const head_attrs[] = { I18N, "profile", NULL } ;
716static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
717static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
718static const char* const version_attr[] = { "version", NULL } ;
719static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
720static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
721static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
Daniel Veillard491e58e2007-05-02 16:15:18 +0000722static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000723static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
724static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
725static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
726static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
727static const char* const align_attr[] = { "align", NULL } ;
728static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
729static const char* const map_contents[] = { BLOCK, "area", NULL } ;
730static const char* const name_attr[] = { "name", NULL } ;
731static const char* const action_attr[] = { "action", NULL } ;
732static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
Denis Pauk868d92d2012-05-10 15:34:57 +0800733static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000734static const char* const content_attr[] = { "content", NULL } ;
735static const char* const type_attr[] = { "type", NULL } ;
736static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
737static const char* const object_contents[] = { FLOW, "param", NULL } ;
738static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
739static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
740static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
741static const char* const option_elt[] = { "option", NULL } ;
742static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
743static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
744static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
745static const char* const width_attr[] = { "width", NULL } ;
746static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
747static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
748static const char* const language_attr[] = { "language", NULL } ;
749static const char* const select_content[] = { "optgroup", "option", NULL } ;
750static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
751static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200752static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000753static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
754static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
755static const char* const tr_elt[] = { "tr", NULL } ;
756static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
757static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
758static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
759static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
760static const char* const tr_contents[] = { "th", "td", NULL } ;
761static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
762static const char* const li_elt[] = { "li", NULL } ;
763static const char* const ul_depr[] = { "type", "compact", NULL} ;
764static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000765
766#define DECL (const char**)
767
Daniel Veillard22090732001-07-16 00:06:07 +0000768static const htmlElemDesc
769html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000770{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
771 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
772},
773{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
774 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
775},
776{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
777 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
778},
779{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
780 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
781},
782{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
783 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
784},
785{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
786 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
787},
788{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
789 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
790},
791{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
792 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
793},
794{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
795 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
796},
797{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
798 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
799},
800{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
801 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802},
803{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
804 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
805},
806{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
807 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
808},
809{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
810 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
811},
812{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
813 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
814},
815{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
816 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817},
818{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
819 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
820},
821{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
822 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
823},
824{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
825 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
826},
827{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
828 EMPTY , NULL , DECL col_attrs , NULL, NULL
829},
830{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
831 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
832},
833{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
834 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
835},
836{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
837 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
838},
839{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
840 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
841},
842{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
843 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
844},
845{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
846 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
847},
848{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000849 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000850},
851{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
852 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
853},
854{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
855 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
856},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000857{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000858 EMPTY, NULL, DECL embed_attrs, NULL, NULL
859},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000860{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
861 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
862},
863{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
864 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
865},
866{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
867 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
868},
869{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
870 EMPTY, NULL, NULL, DECL frame_attrs, NULL
871},
872{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
873 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
874},
875{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
876 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
877},
878{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
879 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
880},
881{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
882 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
883},
884{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
885 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
886},
887{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
888 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
889},
890{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
891 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
892},
893{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
894 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
895},
896{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
897 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
898},
899{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
900 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
901},
902{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
903 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
904},
905{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
906 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
907},
908{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000909 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000910},
911{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
912 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
913},
914{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
915 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
916},
917{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
918 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
919},
920{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
921 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
922},
923{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
924 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
925},
926{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
927 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
928},
929{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
930 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
931},
932{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
933 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
934},
935{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000936 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000937},
938{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
939 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
940},
941{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
942 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
943},
944{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
945 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
946},
947{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
948 DECL html_flow, "div", DECL html_attrs, NULL, NULL
949},
950{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
951 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
952},
953{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
954 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
955},
956{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000957 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000958},
959{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
960 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
961},
962{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
963 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
964},
965{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000966 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000967},
968{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
969 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
970},
971{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
972 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
973},
974{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
975 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
976},
977{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
978 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
979},
980{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
981 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
982},
983{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
984 DECL select_content, NULL, DECL select_attrs, NULL, NULL
985},
986{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
987 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
988},
989{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
990 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
991},
992{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
993 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
994},
995{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
996 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
997},
998{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
999 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1000},
1001{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1002 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003},
1004{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1005 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006},
1007{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1008 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1009},
1010{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1011 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1012},
1013{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1014 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1015},
1016{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1017 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1018},
1019{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1020 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1021},
1022{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1023 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1024},
1025{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1026 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1027},
1028{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1029 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1030},
1031{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1032 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1033},
1034{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1035 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1036},
1037{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1038 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1039},
1040{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1041 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1042},
1043{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1044 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1045}
Owen Taylor3473f882001-02-23 17:55:21 +00001046};
1047
1048/*
Owen Taylor3473f882001-02-23 17:55:21 +00001049 * start tags that imply the end of current element
1050 */
Daniel Veillard065abe82006-07-03 08:55:04 +00001051static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001052"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1053 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1054 "listing", "xmp", "head", NULL,
1055"head", "p", NULL,
1056"title", "p", NULL,
1057"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +00001058"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001059"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1060 "pre", "listing", "xmp", "head", "li", NULL,
1061"hr", "p", "head", NULL,
1062"h1", "p", "head", NULL,
1063"h2", "p", "head", NULL,
1064"h3", "p", "head", NULL,
1065"h4", "p", "head", NULL,
1066"h5", "p", "head", NULL,
1067"h6", "p", "head", NULL,
1068"dir", "p", "head", NULL,
1069"address", "p", "head", "ul", NULL,
1070"pre", "p", "head", "ul", NULL,
1071"listing", "p", "head", NULL,
1072"xmp", "p", "head", NULL,
1073"blockquote", "p", "head", NULL,
1074"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1075 "xmp", "head", NULL,
1076"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1077 "head", "dd", NULL,
1078"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1079 "head", "dt", NULL,
1080"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1081 "listing", "xmp", NULL,
1082"ol", "p", "head", "ul", NULL,
1083"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001084"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001085"div", "p", "head", NULL,
Denis Pauka0cd0752012-05-11 19:31:12 +08001086"noscript", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001087"center", "font", "b", "i", "p", "head", NULL,
1088"a", "a", NULL,
1089"caption", "p", NULL,
1090"colgroup", "caption", "colgroup", "col", "p", NULL,
1091"col", "caption", "col", "p", NULL,
1092"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1093 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001094"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001095"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001096"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1097"thead", "caption", "col", "colgroup", NULL,
1098"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1099 "tbody", "p", NULL,
1100"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1101 "tfoot", "tbody", "p", NULL,
1102"optgroup", "option", NULL,
1103"option", "option", NULL,
1104"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1105 "pre", "listing", "xmp", "a", NULL,
1106NULL
1107};
1108
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * carry CDATA content directly and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1121
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 * NOTE: this table carries no NULL terminator, its length has to be
 *       derived from sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so the real attribute never matched */
    "onchange",
    "onselect"
};
1147
/*
 * Priorities used by the HTML parser to cope with broken pages.
 * Every element gets a priority; when an unexpected end tag shows up
 * the parser compares priorities to decide what it may close: an end
 * tag is only allowed to close elements whose priority is lower than
 * or equal to its own.
 */

typedef struct {
    const char *name;	/* element name, NULL in the sentinel entry */
    int priority;	/* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001175
/* Pointers to the first string of each group of htmlStartClose. */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1178
1179/************************************************************************
1180 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001181 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001182 * *
1183 ************************************************************************/
1184
1185/**
1186 * htmlInitAutoClose:
1187 *
1188 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1189 * This is not reentrant. Call xmlInitParser() once before processing in
1190 * case of use in multithreaded programs.
1191 */
1192void
1193htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001194 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001195
1196 if (htmlStartCloseIndexinitialized) return;
1197
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001198 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1199 indx = 0;
1200 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001201 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001202 while (htmlStartClose[i] != NULL) i++;
1203 i++;
1204 }
1205 htmlStartCloseIndexinitialized = 1;
1206}
1207
1208/**
1209 * htmlTagLookup:
1210 * @tag: The tag name in lowercase
1211 *
1212 * Lookup the HTML tag in the ElementTable
1213 *
1214 * Returns the related htmlElemDescPtr or NULL if not found.
1215 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001216const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001217htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001218 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001219
1220 for (i = 0; i < (sizeof(html40ElementTable) /
1221 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001222 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001223 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001224 }
1225 return(NULL);
1226}
1227
1228/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001229 * htmlGetEndPriority:
1230 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001231 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001232 * Return value: The "endtag" priority.
1233 **/
1234static int
1235htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001236 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001237
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001238 while ((htmlEndPriority[i].name != NULL) &&
1239 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1240 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001241
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001242 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001243}
1244
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001245
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001246/**
Owen Taylor3473f882001-02-23 17:55:21 +00001247 * htmlCheckAutoClose:
1248 * @newtag: The new tag name
1249 * @oldtag: The old tag name
1250 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001251 * Checks whether the new tag is one of the registered valid tags for
1252 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001253 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1254 *
1255 * Returns 0 if no, 1 if yes.
1256 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001257static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001258htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1259{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001260 int i, indx;
1261 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001262
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001263 if (htmlStartCloseIndexinitialized == 0)
1264 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001265
1266 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001267 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001268 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001269 if (closed == NULL)
1270 return (0);
1271 if (xmlStrEqual(BAD_CAST * closed, newtag))
1272 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001273 }
1274
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001275 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001276 i++;
1277 while (htmlStartClose[i] != NULL) {
1278 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001279 return (1);
1280 }
1281 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001282 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001283 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001284}
1285
1286/**
1287 * htmlAutoCloseOnClose:
1288 * @ctxt: an HTML parser context
1289 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001290 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001291 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001292 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001293 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001294static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001295htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1296{
1297 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001298 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001299
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001300 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001301
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001302 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001303
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001304 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1305 break;
1306 /*
1307 * A missplaced endtag can only close elements with lower
1308 * or equal priority, so if we find an element with higher
1309 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001310 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001311 */
1312 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1313 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001314 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001315 if (i < 0)
1316 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001317
1318 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001319 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001320 if ((info != NULL) && (info->endTag == 3)) {
1321 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1322 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001323 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001324 }
1325 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1326 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001327 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001328 }
1329}
1330
1331/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001332 * htmlAutoCloseOnEnd:
1333 * @ctxt: an HTML parser context
1334 *
1335 * Close all remaining tags at the end of the stream
1336 */
1337static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001338htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1339{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001340 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001341
William M. Brack899e64a2003-09-26 18:03:42 +00001342 if (ctxt->nameNr == 0)
1343 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001344 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001345 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1346 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001347 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001348 }
1349}
1350
1351/**
Owen Taylor3473f882001-02-23 17:55:21 +00001352 * htmlAutoClose:
1353 * @ctxt: an HTML parser context
1354 * @newtag: The new tag name or NULL
1355 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001356 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001357 * The list is kept in htmlStartClose array. This function is
1358 * called when a new tag has been detected and generates the
1359 * appropriates closes if possible/needed.
1360 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001361 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001362 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001363static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001364htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1365{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001366 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001367 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001368 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1369 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001370 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001371 }
1372 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001373 htmlAutoCloseOnEnd(ctxt);
1374 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001375 }
1376 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001377 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1378 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1379 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001380 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1381 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001382 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 }
Owen Taylor3473f882001-02-23 17:55:21 +00001384}
1385
1386/**
1387 * htmlAutoCloseTag:
1388 * @doc: the HTML document
1389 * @name: The tag name
1390 * @elem: the HTML element
1391 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001392 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001393 * The list is kept in htmlStartClose array. This function checks
1394 * if the element or one of it's children would autoclose the
1395 * given tag.
1396 *
1397 * Returns 1 if autoclose, 0 otherwise
1398 */
1399int
1400htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1401 htmlNodePtr child;
1402
1403 if (elem == NULL) return(1);
1404 if (xmlStrEqual(name, elem->name)) return(0);
1405 if (htmlCheckAutoClose(elem->name, name)) return(1);
1406 child = elem->children;
1407 while (child != NULL) {
1408 if (htmlAutoCloseTag(doc, name, child)) return(1);
1409 child = child->next;
1410 }
1411 return(0);
1412}
1413
1414/**
1415 * htmlIsAutoClosed:
1416 * @doc: the HTML document
1417 * @elem: the HTML element
1418 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001419 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001420 * The list is kept in htmlStartClose array. This function checks
1421 * if a tag is autoclosed by one of it's child
1422 *
1423 * Returns 1 if autoclosed, 0 otherwise
1424 */
1425int
1426htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1427 htmlNodePtr child;
1428
1429 if (elem == NULL) return(1);
1430 child = elem->children;
1431 while (child != NULL) {
1432 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1433 child = child->next;
1434 }
1435 return(0);
1436}
1437
1438/**
1439 * htmlCheckImplied:
1440 * @ctxt: an HTML parser context
1441 * @newtag: The new tag name
1442 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001443 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001444 * called when a new tag has been detected and generates the
1445 * appropriates implicit tags if missing
1446 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001447static void
Owen Taylor3473f882001-02-23 17:55:21 +00001448htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001449 int i;
1450
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001451 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1452 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001453 if (!htmlOmittedDefaultValue)
1454 return;
1455 if (xmlStrEqual(newtag, BAD_CAST"html"))
1456 return;
1457 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001458 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001459 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1460 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1461 }
1462 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1463 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001464 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001465 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1466 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1467 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1468 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1469 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1470 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001471 if (ctxt->html >= 3) {
1472 /* we already saw or generated an <head> before */
1473 return;
1474 }
1475 /*
1476 * dropped OBJECT ... i you put it first BODY will be
1477 * assumed !
1478 */
1479 htmlnamePush(ctxt, BAD_CAST"head");
1480 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1481 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001482 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1483 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1484 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001485 if (ctxt->html >= 10) {
1486 /* we already saw or generated a <body> before */
1487 return;
1488 }
Owen Taylor3473f882001-02-23 17:55:21 +00001489 for (i = 0;i < ctxt->nameNr;i++) {
1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1491 return;
1492 }
1493 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1494 return;
1495 }
1496 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001497
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001498 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001499 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1500 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1501 }
1502}
1503
1504/**
1505 * htmlCheckParagraph
1506 * @ctxt: an HTML parser context
1507 *
1508 * Check whether a p element need to be implied before inserting
1509 * characters in the current element.
1510 *
1511 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1512 * in case of error.
1513 */
1514
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001515static int
Owen Taylor3473f882001-02-23 17:55:21 +00001516htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1517 const xmlChar *tag;
1518 int i;
1519
1520 if (ctxt == NULL)
1521 return(-1);
1522 tag = ctxt->name;
1523 if (tag == NULL) {
1524 htmlAutoClose(ctxt, BAD_CAST"p");
1525 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001526 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001527 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1528 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1529 return(1);
1530 }
1531 if (!htmlOmittedDefaultValue)
1532 return(0);
1533 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1534 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001535 htmlAutoClose(ctxt, BAD_CAST"p");
1536 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001537 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001538 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1539 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1540 return(1);
1541 }
1542 }
1543 return(0);
1544}
1545
1546/**
1547 * htmlIsScriptAttribute:
1548 * @name: an attribute name
1549 *
1550 * Check if an attribute is of content type Script
1551 *
1552 * Returns 1 is the attribute is a script 0 otherwise
1553 */
1554int
1555htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001556 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001557
1558 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001559 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001560 /*
1561 * all script attributes start with 'on'
1562 */
1563 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001564 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001565 for (i = 0;
1566 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1567 i++) {
1568 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1569 return(1);
1570 }
1571 return(0);
1572}
1573
1574/************************************************************************
1575 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001576 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001577 * *
1578 ************************************************************************/
1579
1580
Daniel Veillard22090732001-07-16 00:06:07 +00001581static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001582/*
1583 * the 4 absolute ones, plus apostrophe.
1584 */
1585{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1586{ 38, "amp", "ampersand, U+0026 ISOnum" },
1587{ 39, "apos", "single quote" },
1588{ 60, "lt", "less-than sign, U+003C ISOnum" },
1589{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1590
1591/*
1592 * A bunch still in the 128-255 range
1593 * Replacing them depend really on the charset used.
1594 */
1595{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1596{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1597{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1598{ 163, "pound","pound sign, U+00A3 ISOnum" },
1599{ 164, "curren","currency sign, U+00A4 ISOnum" },
1600{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1601{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1602{ 167, "sect", "section sign, U+00A7 ISOnum" },
1603{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1604{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1605{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1606{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1607{ 172, "not", "not sign, U+00AC ISOnum" },
1608{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1609{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1610{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1611{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1612{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1613{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1614{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1615{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1616{ 181, "micro","micro sign, U+00B5 ISOnum" },
1617{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1618{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1619{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1620{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1621{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1622{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1623{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1624{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1625{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1626{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1627{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1628{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1629{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1630{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1631{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1632{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1633{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1634{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1635{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1636{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1637{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1638{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1639{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1640{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1641{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1642{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1643{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1644{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1645{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1646{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1647{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1648{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1649{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1650{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1651{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1652{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1653{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1654{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1655{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1656{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1657{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1658{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1659{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1660{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1661{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1662{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1663{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1664{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1665{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1666{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1667{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1668{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1669{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1670{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1671{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1672{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1673{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1674{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1675{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1676{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1677{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1678{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1679{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1680{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1681{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1682{ 247, "divide","division sign, U+00F7 ISOnum" },
1683{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1684{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1685{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1686{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1687{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1688{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1689{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1690{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1691
1692{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1693{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1694{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1695{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1696{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1697
1698/*
1699 * Anything below should really be kept as entities references
1700 */
1701{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1702
1703{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1704{ 732, "tilde","small tilde, U+02DC ISOdia" },
1705
1706{ 913, "Alpha","greek capital letter alpha, U+0391" },
1707{ 914, "Beta", "greek capital letter beta, U+0392" },
1708{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1709{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1710{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1711{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1712{ 919, "Eta", "greek capital letter eta, U+0397" },
1713{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1714{ 921, "Iota", "greek capital letter iota, U+0399" },
1715{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001716{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001717{ 924, "Mu", "greek capital letter mu, U+039C" },
1718{ 925, "Nu", "greek capital letter nu, U+039D" },
1719{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1720{ 927, "Omicron","greek capital letter omicron, U+039F" },
1721{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1722{ 929, "Rho", "greek capital letter rho, U+03A1" },
1723{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1724{ 932, "Tau", "greek capital letter tau, U+03A4" },
1725{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1726{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1727{ 935, "Chi", "greek capital letter chi, U+03A7" },
1728{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1729{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1730
1731{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1732{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1733{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1734{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1735{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1736{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1737{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1738{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1739{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1740{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1741{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1742{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1743{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1744{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1745{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1746{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1747{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1748{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1749{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1750{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1751{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1752{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1753{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1754{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1755{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1756{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1757{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1758{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1759
1760{ 8194, "ensp", "en space, U+2002 ISOpub" },
1761{ 8195, "emsp", "em space, U+2003 ISOpub" },
1762{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1763{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1764{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1765{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1766{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1767{ 8211, "ndash","en dash, U+2013 ISOpub" },
1768{ 8212, "mdash","em dash, U+2014 ISOpub" },
1769{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1770{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1771{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1772{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1773{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1774{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1775{ 8224, "dagger","dagger, U+2020 ISOpub" },
1776{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1777
1778{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1779{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1780
1781{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1782
1783{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1784{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1785
1786{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1787{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1788
1789{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1790{ 8260, "frasl","fraction slash, U+2044 NEW" },
1791
1792{ 8364, "euro", "euro sign, U+20AC NEW" },
1793
1794{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1795{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1796{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1797{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1798{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1799{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1800{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1801{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1802{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1803{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1804{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1805{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1806{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1807{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1808{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1809{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1810
1811{ 8704, "forall","for all, U+2200 ISOtech" },
1812{ 8706, "part", "partial differential, U+2202 ISOtech" },
1813{ 8707, "exist","there exists, U+2203 ISOtech" },
1814{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1815{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1816{ 8712, "isin", "element of, U+2208 ISOtech" },
1817{ 8713, "notin","not an element of, U+2209 ISOtech" },
1818{ 8715, "ni", "contains as member, U+220B ISOtech" },
1819{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001820{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001821{ 8722, "minus","minus sign, U+2212 ISOtech" },
1822{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1823{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1824{ 8733, "prop", "proportional to, U+221D ISOtech" },
1825{ 8734, "infin","infinity, U+221E ISOtech" },
1826{ 8736, "ang", "angle, U+2220 ISOamso" },
1827{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1828{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1829{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1830{ 8746, "cup", "union = cup, U+222A ISOtech" },
1831{ 8747, "int", "integral, U+222B ISOtech" },
1832{ 8756, "there4","therefore, U+2234 ISOtech" },
1833{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1834{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1835{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1836{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1837{ 8801, "equiv","identical to, U+2261 ISOtech" },
1838{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1839{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1840{ 8834, "sub", "subset of, U+2282 ISOtech" },
1841{ 8835, "sup", "superset of, U+2283 ISOtech" },
1842{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1843{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1844{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1845{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1846{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1847{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1848{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1849{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1850{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1851{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1852{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1853{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1854{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1855{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1856
1857{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1858{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1859{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1860{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1861
1862};
1863
1864/************************************************************************
1865 * *
1866 * Commodity functions to handle entities *
1867 * *
1868 ************************************************************************/
1869
/*
 * Macro used to grow the current buffer: doubles <buffer>_size and
 * reallocates <buffer> to the new size.
 *
 * The expansion site must have both <buffer> and <buffer>_size in scope,
 * as well as a variable named ctxt for the error report. If the
 * reallocation fails, the error is reported, the old buffer is freed
 * and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = grown;							\
}
1884
1885/**
1886 * htmlEntityLookup:
1887 * @name: the entity name
1888 *
1889 * Lookup the given entity in EntitiesTable
1890 *
1891 * TODO: the linear scan is really ugly, an hash table is really needed.
1892 *
1893 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1894 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001895const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001896htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001897 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001898
1899 for (i = 0;i < (sizeof(html40EntitiesTable)/
1900 sizeof(html40EntitiesTable[0]));i++) {
1901 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001902 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001903 }
1904 }
1905 return(NULL);
1906}
1907
1908/**
1909 * htmlEntityValueLookup:
1910 * @value: the entity's unicode value
1911 *
1912 * Lookup the given entity in EntitiesTable
1913 *
1914 * TODO: the linear scan is really ugly, an hash table is really needed.
1915 *
1916 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1917 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001918const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001919htmlEntityValueLookup(unsigned int value) {
1920 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001921
1922 for (i = 0;i < (sizeof(html40EntitiesTable)/
1923 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001924 if (html40EntitiesTable[i].value >= value) {
1925 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001926 break;
William M. Brack78637da2003-07-31 14:47:38 +00001927 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001928 }
Owen Taylor3473f882001-02-23 17:55:21 +00001929 }
1930 return(NULL);
1931}
1932
1933/**
1934 * UTF8ToHtml:
1935 * @out: a pointer to an array of bytes to store the result
1936 * @outlen: the length of @out
1937 * @in: a pointer to an array of UTF-8 chars
1938 * @inlen: the length of @in
1939 *
1940 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1941 * plus HTML entities block of chars out.
1942 *
1943 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1944 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001945 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001946 * The value of @outlen after return is the number of octets consumed.
1947 */
1948int
1949UTF8ToHtml(unsigned char* out, int *outlen,
1950 const unsigned char* in, int *inlen) {
1951 const unsigned char* processed = in;
1952 const unsigned char* outend;
1953 const unsigned char* outstart = out;
1954 const unsigned char* instart = in;
1955 const unsigned char* inend;
1956 unsigned int c, d;
1957 int trailing;
1958
Daniel Veillardce682bc2004-11-05 17:22:25 +00001959 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001960 if (in == NULL) {
1961 /*
1962 * initialization nothing to do
1963 */
1964 *outlen = 0;
1965 *inlen = 0;
1966 return(0);
1967 }
1968 inend = in + (*inlen);
1969 outend = out + (*outlen);
1970 while (in < inend) {
1971 d = *in++;
1972 if (d < 0x80) { c= d; trailing= 0; }
1973 else if (d < 0xC0) {
1974 /* trailing byte in leading position */
1975 *outlen = out - outstart;
1976 *inlen = processed - instart;
1977 return(-2);
1978 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1979 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1980 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1981 else {
1982 /* no chance for this in Ascii */
1983 *outlen = out - outstart;
1984 *inlen = processed - instart;
1985 return(-2);
1986 }
1987
1988 if (inend - in < trailing) {
1989 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02001990 }
Owen Taylor3473f882001-02-23 17:55:21 +00001991
1992 for ( ; trailing; trailing--) {
1993 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1994 break;
1995 c <<= 6;
1996 c |= d & 0x3F;
1997 }
1998
1999 /* assertion: c is a single UTF-4 value */
2000 if (c < 0x80) {
2001 if (out + 1 >= outend)
2002 break;
2003 *out++ = c;
2004 } else {
2005 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00002006 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00002007 const char *cp;
2008 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00002009
2010 /*
2011 * Try to lookup a predefined HTML entity for it
2012 */
2013
2014 ent = htmlEntityValueLookup(c);
2015 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00002016 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2017 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00002018 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00002019 else
2020 cp = ent->name;
2021 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00002022 if (out + 2 + len >= outend)
2023 break;
2024 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00002025 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00002026 out += len;
2027 *out++ = ';';
2028 }
2029 processed = in;
2030 }
2031 *outlen = out - outstart;
2032 *inlen = processed - instart;
2033 return(0);
2034}
2035
2036/**
2037 * htmlEncodeEntities:
2038 * @out: a pointer to an array of bytes to store the result
2039 * @outlen: the length of @out
2040 * @in: a pointer to an array of UTF-8 chars
2041 * @inlen: the length of @in
2042 * @quoteChar: the quote character to escape (' or ") or zero.
2043 *
2044 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2045 * plus HTML entities block of chars out.
2046 *
2047 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2048 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002049 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00002050 * The value of @outlen after return is the number of octets consumed.
2051 */
2052int
2053htmlEncodeEntities(unsigned char* out, int *outlen,
2054 const unsigned char* in, int *inlen, int quoteChar) {
2055 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002056 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002057 const unsigned char* outstart = out;
2058 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002059 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002060 unsigned int c, d;
2061 int trailing;
2062
Daniel Veillardce682bc2004-11-05 17:22:25 +00002063 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2064 return(-1);
2065 outend = out + (*outlen);
2066 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002067 while (in < inend) {
2068 d = *in++;
2069 if (d < 0x80) { c= d; trailing= 0; }
2070 else if (d < 0xC0) {
2071 /* trailing byte in leading position */
2072 *outlen = out - outstart;
2073 *inlen = processed - instart;
2074 return(-2);
2075 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2076 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2077 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2078 else {
2079 /* no chance for this in Ascii */
2080 *outlen = out - outstart;
2081 *inlen = processed - instart;
2082 return(-2);
2083 }
2084
2085 if (inend - in < trailing)
2086 break;
2087
2088 while (trailing--) {
2089 if (((d= *in++) & 0xC0) != 0x80) {
2090 *outlen = out - outstart;
2091 *inlen = processed - instart;
2092 return(-2);
2093 }
2094 c <<= 6;
2095 c |= d & 0x3F;
2096 }
2097
2098 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002099 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2100 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002101 if (out >= outend)
2102 break;
2103 *out++ = c;
2104 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002105 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002106 const char *cp;
2107 char nbuf[16];
2108 int len;
2109
2110 /*
2111 * Try to lookup a predefined HTML entity for it
2112 */
2113 ent = htmlEntityValueLookup(c);
2114 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002115 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002116 cp = nbuf;
2117 }
2118 else
2119 cp = ent->name;
2120 len = strlen(cp);
2121 if (out + 2 + len > outend)
2122 break;
2123 *out++ = '&';
2124 memcpy(out, cp, len);
2125 out += len;
2126 *out++ = ';';
2127 }
2128 processed = in;
2129 }
2130 *outlen = out - outstart;
2131 *inlen = processed - instart;
2132 return(0);
2133}
2134
Owen Taylor3473f882001-02-23 17:55:21 +00002135/************************************************************************
2136 * *
2137 * Commodity functions to handle streams *
2138 * *
2139 ************************************************************************/
2140
2141/**
Owen Taylor3473f882001-02-23 17:55:21 +00002142 * htmlNewInputStream:
2143 * @ctxt: an HTML parser context
2144 *
2145 * Create a new input stream structure
2146 * Returns the new input stream or NULL
2147 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002148static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002149htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2150 htmlParserInputPtr input;
2151
2152 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2153 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002154 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002155 return(NULL);
2156 }
2157 memset(input, 0, sizeof(htmlParserInput));
2158 input->filename = NULL;
2159 input->directory = NULL;
2160 input->base = NULL;
2161 input->cur = NULL;
2162 input->buf = NULL;
2163 input->line = 1;
2164 input->col = 1;
2165 input->buf = NULL;
2166 input->free = NULL;
2167 input->version = NULL;
2168 input->consumed = 0;
2169 input->length = 0;
2170 return(input);
2171}
2172
2173
2174/************************************************************************
2175 * *
2176 * Commodity functions, cleanup needed ? *
2177 * *
2178 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 * The list is scanned linearly by areBlanks(); entries are grouped
 * below by initial letter.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002193
2194/**
2195 * areBlanks:
2196 * @ctxt: an HTML parser context
2197 * @str: a xmlChar *
2198 * @len: the size of @str
2199 *
2200 * Is this a sequence of blank chars that one can ignore ?
2201 *
2202 * Returns 1 if ignorable 0 otherwise.
2203 */
2204
2205static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002206 unsigned int i;
2207 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002208 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002209 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002210
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002211 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002212 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002213
2214 if (CUR == 0) return(1);
2215 if (CUR != '<') return(0);
2216 if (ctxt->name == NULL)
2217 return(1);
2218 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2219 return(1);
2220 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2221 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002222
2223 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2224 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2225 dtd = xmlGetIntSubset(ctxt->myDoc);
2226 if (dtd != NULL && dtd->ExternalID != NULL) {
2227 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2228 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2229 return(1);
2230 }
2231 }
2232
Owen Taylor3473f882001-02-23 17:55:21 +00002233 if (ctxt->node == NULL) return(0);
2234 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002235 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2236 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002237 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002238 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2239 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002240 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002241 for all tags "b" allowing PCDATA */
2242 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2243 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2244 return(0);
2245 }
2246 }
Owen Taylor3473f882001-02-23 17:55:21 +00002247 } else if (xmlNodeIsText(lastChild)) {
2248 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002249 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002250 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002251 for all tags "p" allowing PCDATA */
2252 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2253 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2254 return(0);
2255 }
2256 }
Owen Taylor3473f882001-02-23 17:55:21 +00002257 }
2258 return(1);
2259}
2260
2261/**
Owen Taylor3473f882001-02-23 17:55:21 +00002262 * htmlNewDocNoDtD:
2263 * @URI: URI for the dtd, or NULL
2264 * @ExternalID: the external ID of the DTD, or NULL
2265 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002266 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2267 * are NULL
2268 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002269 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002270 */
2271htmlDocPtr
2272htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2273 xmlDocPtr cur;
2274
2275 /*
2276 * Allocate a new document and fill the fields.
2277 */
2278 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2279 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002280 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002281 return(NULL);
2282 }
2283 memset(cur, 0, sizeof(xmlDoc));
2284
2285 cur->type = XML_HTML_DOCUMENT_NODE;
2286 cur->version = NULL;
2287 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002288 cur->doc = cur;
2289 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002290 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002291 cur->extSubset = NULL;
2292 cur->oldNs = NULL;
2293 cur->encoding = NULL;
2294 cur->standalone = 1;
2295 cur->compression = 0;
2296 cur->ids = NULL;
2297 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002298 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002299 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002300 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002301 if ((ExternalID != NULL) ||
2302 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002303 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002304 return(cur);
2305}
2306
2307/**
2308 * htmlNewDoc:
2309 * @URI: URI for the dtd, or NULL
2310 * @ExternalID: the external ID of the DTD, or NULL
2311 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002312 * Creates a new HTML document
2313 *
Owen Taylor3473f882001-02-23 17:55:21 +00002314 * Returns a new document
2315 */
2316htmlDocPtr
2317htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2318 if ((URI == NULL) && (ExternalID == NULL))
2319 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002320 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2321 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002322
2323 return(htmlNewDocNoDtD(URI, ExternalID));
2324}
2325
2326
2327/************************************************************************
2328 * *
2329 * The parser itself *
2330 * Relates to http://www.w3.org/TR/html40 *
2331 * *
2332 ************************************************************************/
2333
2334/************************************************************************
2335 * *
2336 * The parser itself *
2337 * *
2338 ************************************************************************/
2339
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002340static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002341
Owen Taylor3473f882001-02-23 17:55:21 +00002342/**
2343 * htmlParseHTMLName:
2344 * @ctxt: an HTML parser context
2345 *
2346 * parse an HTML tag or attribute name, note that we convert it to lowercase
2347 * since HTML names are not case-sensitive.
2348 *
2349 * Returns the Tag Name parsed or NULL
2350 */
2351
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002352static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002353htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002354 int i = 0;
2355 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2356
William M. Brackd1757ab2004-10-02 22:07:48 +00002357 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002358 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002359
2360 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002361 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002362 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2363 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002364 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2365 else loc[i] = CUR;
2366 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002367
Owen Taylor3473f882001-02-23 17:55:21 +00002368 NEXT;
2369 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002370
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002371 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002372}
2373
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002374
2375/**
2376 * htmlParseHTMLName_nonInvasive:
2377 * @ctxt: an HTML parser context
2378 *
2379 * parse an HTML tag or attribute name, note that we convert it to lowercase
2380 * since HTML names are not case-sensitive, this doesn't consume the data
2381 * from the stream, it's a look-ahead
2382 *
2383 * Returns the Tag Name parsed or NULL
2384 */
2385
2386static const xmlChar *
2387htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2388 int i = 0;
2389 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2390
2391 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2392 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002393
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002394 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2395 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2396 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2397 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2398 else loc[i] = NXT(1+i);
2399 i++;
2400 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002401
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002402 return(xmlDictLookup(ctxt->dict, loc, i));
2403}
2404
2405
Owen Taylor3473f882001-02-23 17:55:21 +00002406/**
2407 * htmlParseName:
2408 * @ctxt: an HTML parser context
2409 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002410 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002411 *
2412 * Returns the Name parsed or NULL
2413 */
2414
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002415static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002416htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002417 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002418 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002419 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002420
2421 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002422
2423 /*
2424 * Accelerator for simple ASCII names
2425 */
2426 in = ctxt->input->cur;
2427 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2428 ((*in >= 0x41) && (*in <= 0x5A)) ||
2429 (*in == '_') || (*in == ':')) {
2430 in++;
2431 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2432 ((*in >= 0x41) && (*in <= 0x5A)) ||
2433 ((*in >= 0x30) && (*in <= 0x39)) ||
2434 (*in == '_') || (*in == '-') ||
2435 (*in == ':') || (*in == '.'))
2436 in++;
2437 if ((*in > 0) && (*in < 0x80)) {
2438 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002439 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002440 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002441 ctxt->nbChars += count;
2442 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002443 return(ret);
2444 }
2445 }
2446 return(htmlParseNameComplex(ctxt));
2447}
2448
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002449static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002450htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002451 int len = 0, l;
2452 int c;
2453 int count = 0;
2454
2455 /*
2456 * Handler for more complex cases
2457 */
2458 GROW;
2459 c = CUR_CHAR(l);
2460 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2461 (!IS_LETTER(c) && (c != '_') &&
2462 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002463 return(NULL);
2464 }
2465
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002466 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2467 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2468 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002469 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002470 (IS_COMBINING(c)) ||
2471 (IS_EXTENDER(c)))) {
2472 if (count++ > 100) {
2473 count = 0;
2474 GROW;
2475 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002476 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002477 NEXTL(l);
2478 c = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002479 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002480 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002481}
2482
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002483
Owen Taylor3473f882001-02-23 17:55:21 +00002484/**
2485 * htmlParseHTMLAttribute:
2486 * @ctxt: an HTML parser context
2487 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002488 *
Owen Taylor3473f882001-02-23 17:55:21 +00002489 * parse an HTML attribute value till the stop (quote), if
2490 * stop is 0 then it stops at the first space
2491 *
2492 * Returns the attribute parsed or NULL
2493 */
2494
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002495static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002496htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2497 xmlChar *buffer = NULL;
2498 int buffer_size = 0;
2499 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002500 const xmlChar *name = NULL;
2501 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002502 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002503
2504 /*
2505 * allocate a translation buffer.
2506 */
2507 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002508 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002509 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002510 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002511 return(NULL);
2512 }
2513 out = buffer;
2514
2515 /*
2516 * Ok loop until we reach one of the ending chars
2517 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002518 while ((CUR != 0) && (CUR != stop)) {
2519 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002520 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002521 if (CUR == '&') {
2522 if (NXT(1) == '#') {
2523 unsigned int c;
2524 int bits;
2525
2526 c = htmlParseCharRef(ctxt);
2527 if (c < 0x80)
2528 { *out++ = c; bits= -6; }
2529 else if (c < 0x800)
2530 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2531 else if (c < 0x10000)
2532 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002533 else
Owen Taylor3473f882001-02-23 17:55:21 +00002534 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002535
Owen Taylor3473f882001-02-23 17:55:21 +00002536 for ( ; bits >= 0; bits-= 6) {
2537 *out++ = ((c >> bits) & 0x3F) | 0x80;
2538 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002539
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002540 if (out - buffer > buffer_size - 100) {
2541 int indx = out - buffer;
2542
2543 growBuffer(buffer);
2544 out = &buffer[indx];
2545 }
Owen Taylor3473f882001-02-23 17:55:21 +00002546 } else {
2547 ent = htmlParseEntityRef(ctxt, &name);
2548 if (name == NULL) {
2549 *out++ = '&';
2550 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002551 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002552
2553 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002554 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002555 }
2556 } else if (ent == NULL) {
2557 *out++ = '&';
2558 cur = name;
2559 while (*cur != 0) {
2560 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002561 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002562
2563 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002564 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002565 }
2566 *out++ = *cur++;
2567 }
Owen Taylor3473f882001-02-23 17:55:21 +00002568 } else {
2569 unsigned int c;
2570 int bits;
2571
2572 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002573 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002574
2575 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002576 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002577 }
Daniel Veillard48519092006-10-17 15:56:35 +00002578 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002579 if (c < 0x80)
2580 { *out++ = c; bits= -6; }
2581 else if (c < 0x800)
2582 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2583 else if (c < 0x10000)
2584 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002585 else
Owen Taylor3473f882001-02-23 17:55:21 +00002586 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002587
Owen Taylor3473f882001-02-23 17:55:21 +00002588 for ( ; bits >= 0; bits-= 6) {
2589 *out++ = ((c >> bits) & 0x3F) | 0x80;
2590 }
Owen Taylor3473f882001-02-23 17:55:21 +00002591 }
2592 }
2593 } else {
2594 unsigned int c;
2595 int bits, l;
2596
2597 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002598 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002599
2600 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002601 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002602 }
2603 c = CUR_CHAR(l);
2604 if (c < 0x80)
2605 { *out++ = c; bits= -6; }
2606 else if (c < 0x800)
2607 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2608 else if (c < 0x10000)
2609 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002610 else
Owen Taylor3473f882001-02-23 17:55:21 +00002611 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002612
Owen Taylor3473f882001-02-23 17:55:21 +00002613 for ( ; bits >= 0; bits-= 6) {
2614 *out++ = ((c >> bits) & 0x3F) | 0x80;
2615 }
2616 NEXT;
2617 }
2618 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002619 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002620 return(buffer);
2621}
2622
2623/**
Owen Taylor3473f882001-02-23 17:55:21 +00002624 * htmlParseEntityRef:
2625 * @ctxt: an HTML parser context
2626 * @str: location to store the entity name
2627 *
2628 * parse an HTML ENTITY references
2629 *
2630 * [68] EntityRef ::= '&' Name ';'
2631 *
2632 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2633 * if non-NULL *str will have to be freed by the caller.
2634 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002635const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002636htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2637 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002638 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002639
2640 if (str != NULL) *str = NULL;
2641 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002642
2643 if (CUR == '&') {
2644 NEXT;
2645 name = htmlParseName(ctxt);
2646 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002647 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2648 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002649 } else {
2650 GROW;
2651 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002652 if (str != NULL)
2653 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002654
2655 /*
2656 * Lookup the entity in the table.
2657 */
2658 ent = htmlEntityLookup(name);
2659 if (ent != NULL) /* OK that's ugly !!! */
2660 NEXT;
2661 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002662 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2663 "htmlParseEntityRef: expecting ';'\n",
2664 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002665 if (str != NULL)
2666 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002667 }
2668 }
2669 }
2670 return(ent);
2671}
2672
2673/**
2674 * htmlParseAttValue:
2675 * @ctxt: an HTML parser context
2676 *
2677 * parse a value for an attribute
2678 * Note: the parser won't do substitution of entities here, this
2679 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002680 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002681 *
2682 * Returns the AttValue parsed or NULL.
2683 */
2684
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002685static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002686htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2687 xmlChar *ret = NULL;
2688
2689 if (CUR == '"') {
2690 NEXT;
2691 ret = htmlParseHTMLAttribute(ctxt, '"');
2692 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002693 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2694 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002695 } else
2696 NEXT;
2697 } else if (CUR == '\'') {
2698 NEXT;
2699 ret = htmlParseHTMLAttribute(ctxt, '\'');
2700 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002701 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2702 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002703 } else
2704 NEXT;
2705 } else {
2706 /*
2707 * That's an HTMLism, the attribute value may not be quoted
2708 */
2709 ret = htmlParseHTMLAttribute(ctxt, 0);
2710 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002711 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2712 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002713 }
2714 }
2715 return(ret);
2716}
2717
2718/**
2719 * htmlParseSystemLiteral:
2720 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002721 *
Owen Taylor3473f882001-02-23 17:55:21 +00002722 * parse an HTML Literal
2723 *
2724 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2725 *
2726 * Returns the SystemLiteral parsed or NULL
2727 */
2728
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002729static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002730htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2731 const xmlChar *q;
2732 xmlChar *ret = NULL;
2733
2734 if (CUR == '"') {
2735 NEXT;
2736 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002737 while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
Owen Taylor3473f882001-02-23 17:55:21 +00002738 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002739 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002740 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2741 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002742 } else {
2743 ret = xmlStrndup(q, CUR_PTR - q);
2744 NEXT;
2745 }
2746 } else if (CUR == '\'') {
2747 NEXT;
2748 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002749 while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002750 NEXT;
William M. Brack76e95df2003-10-18 16:20:14 +00002751 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002752 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2753 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002754 } else {
2755 ret = xmlStrndup(q, CUR_PTR - q);
2756 NEXT;
2757 }
2758 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002759 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2760 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002761 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002762
Owen Taylor3473f882001-02-23 17:55:21 +00002763 return(ret);
2764}
2765
2766/**
2767 * htmlParsePubidLiteral:
2768 * @ctxt: an HTML parser context
2769 *
2770 * parse an HTML public literal
2771 *
2772 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2773 *
2774 * Returns the PubidLiteral parsed or NULL.
2775 */
2776
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002777static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002778htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2779 const xmlChar *q;
2780 xmlChar *ret = NULL;
2781 /*
2782 * Name ::= (Letter | '_') (NameChar)*
2783 */
2784 if (CUR == '"') {
2785 NEXT;
2786 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002787 while (IS_PUBIDCHAR_CH(CUR)) NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00002788 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002789 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2790 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002791 } else {
2792 ret = xmlStrndup(q, CUR_PTR - q);
2793 NEXT;
2794 }
2795 } else if (CUR == '\'') {
2796 NEXT;
2797 q = CUR_PTR;
William M. Brack76e95df2003-10-18 16:20:14 +00002798 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
Owen Taylor3473f882001-02-23 17:55:21 +00002799 NEXT;
Daniel Veillard6560a422003-03-27 21:25:38 +00002800 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002801 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2802 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002803 } else {
2804 ret = xmlStrndup(q, CUR_PTR - q);
2805 NEXT;
2806 }
2807 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002808 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2809 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002810 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002811
Owen Taylor3473f882001-02-23 17:55:21 +00002812 return(ret);
2813}
2814
2815/**
2816 * htmlParseScript:
2817 * @ctxt: an HTML parser context
2818 *
2819 * parse the content of an HTML SCRIPT or STYLE element
2820 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2821 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2822 * http://www.w3.org/TR/html4/types.html#type-script
2823 * http://www.w3.org/TR/html4/types.html#h-6.15
2824 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2825 *
2826 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2827 * element and the value of intrinsic event attributes. User agents must
2828 * not evaluate script data as HTML markup but instead must pass it on as
2829 * data to a script engine.
2830 * NOTES:
2831 * - The content is passed like CDATA
2832 * - the attributes for style and scripting "onXXX" are also described
2833 * as CDATA but SGML allows entities references in attributes so their
2834 * processing is identical as other attributes
2835 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002836static void
Owen Taylor3473f882001-02-23 17:55:21 +00002837htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002838 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002839 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002840 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002841
2842 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002843 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002844 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002845 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002846 /*
2847 * One should break here, the specification is clear:
2848 * Authors should therefore escape "</" within the content.
2849 * Escape mechanisms are specific to each scripting or
2850 * style sheet language.
2851 *
2852 * In recovery mode, only break if end tag match the
2853 * current tag, effectively ignoring all tags inside the
2854 * script/style block and treating the entire block as
2855 * CDATA.
2856 */
2857 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002858 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2859 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002860 {
2861 break; /* while */
2862 } else {
2863 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002864 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002865 ctxt->name, NULL);
2866 }
2867 } else {
2868 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002869 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002870 {
2871 break; /* while */
2872 }
2873 }
Owen Taylor3473f882001-02-23 17:55:21 +00002874 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002875 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002876 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2877 if (ctxt->sax->cdataBlock!= NULL) {
2878 /*
2879 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2880 */
2881 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002882 } else if (ctxt->sax->characters != NULL) {
2883 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002884 }
2885 nbchar = 0;
2886 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002887 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002888 NEXTL(l);
2889 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002890 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002891
Daniel Veillard68716a72006-10-16 09:32:17 +00002892 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Pierre Belziled4b54472010-11-04 10:18:17 +01002893 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2894 "Invalid char in CDATA 0x%X\n", cur);
2895 if (ctxt->input->cur < ctxt->input->end) {
2896 NEXT;
2897 }
Owen Taylor3473f882001-02-23 17:55:21 +00002898 }
2899
2900 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2901 if (ctxt->sax->cdataBlock!= NULL) {
2902 /*
2903 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2904 */
2905 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002906 } else if (ctxt->sax->characters != NULL) {
2907 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002908 }
2909 }
2910}
2911
2912
2913/**
2914 * htmlParseCharData:
2915 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00002916 *
2917 * parse a CharData section.
2918 * if we are within a CDATA section ']]>' marks an end of section.
2919 *
2920 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2921 */
2922
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002923static void
2924htmlParseCharData(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002925 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2926 int nbchar = 0;
2927 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002928 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002929
2930 SHRINK;
2931 cur = CUR_CHAR(l);
2932 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02002933 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00002934 (cur != 0)) {
2935 if (!(IS_CHAR(cur))) {
2936 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2937 "Invalid char in CDATA 0x%X\n", cur);
2938 } else {
2939 COPY_BUF(l,buf,nbchar,cur);
2940 }
Owen Taylor3473f882001-02-23 17:55:21 +00002941 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2942 /*
2943 * Ok the segment is to be consumed as chars.
2944 */
2945 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2946 if (areBlanks(ctxt, buf, nbchar)) {
2947 if (ctxt->sax->ignorableWhitespace != NULL)
2948 ctxt->sax->ignorableWhitespace(ctxt->userData,
2949 buf, nbchar);
2950 } else {
2951 htmlCheckParagraph(ctxt);
2952 if (ctxt->sax->characters != NULL)
2953 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2954 }
2955 }
2956 nbchar = 0;
2957 }
2958 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00002959 chunk++;
2960 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2961 chunk = 0;
2962 SHRINK;
2963 GROW;
2964 }
Owen Taylor3473f882001-02-23 17:55:21 +00002965 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00002966 if (cur == 0) {
2967 SHRINK;
2968 GROW;
2969 cur = CUR_CHAR(l);
2970 }
Owen Taylor3473f882001-02-23 17:55:21 +00002971 }
2972 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00002973 buf[nbchar] = 0;
2974
Owen Taylor3473f882001-02-23 17:55:21 +00002975 /*
2976 * Ok the segment is to be consumed as chars.
2977 */
2978 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2979 if (areBlanks(ctxt, buf, nbchar)) {
2980 if (ctxt->sax->ignorableWhitespace != NULL)
2981 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2982 } else {
2983 htmlCheckParagraph(ctxt);
2984 if (ctxt->sax->characters != NULL)
2985 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2986 }
2987 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00002988 } else {
2989 /*
2990 * Loop detection
2991 */
2992 if (cur == 0)
2993 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00002994 }
2995}
2996
2997/**
2998 * htmlParseExternalID:
2999 * @ctxt: an HTML parser context
3000 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00003001 *
3002 * Parse an External ID or a Public ID
3003 *
Owen Taylor3473f882001-02-23 17:55:21 +00003004 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3005 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3006 *
3007 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3008 *
3009 * Returns the function returns SystemLiteral and in the second
3010 * case publicID receives PubidLiteral, is strict is off
3011 * it is possible to return NULL and have publicID set.
3012 */
3013
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003014static xmlChar *
3015htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00003016 xmlChar *URI = NULL;
3017
3018 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3019 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3020 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3021 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003022 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003023 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3024 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003025 }
3026 SKIP_BLANKS;
3027 URI = htmlParseSystemLiteral(ctxt);
3028 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003029 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3030 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003031 }
3032 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3033 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3034 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3035 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003036 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003037 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3038 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003039 }
3040 SKIP_BLANKS;
3041 *publicID = htmlParsePubidLiteral(ctxt);
3042 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003043 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3044 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3045 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003046 }
3047 SKIP_BLANKS;
3048 if ((CUR == '"') || (CUR == '\'')) {
3049 URI = htmlParseSystemLiteral(ctxt);
3050 }
3051 }
3052 return(URI);
3053}
3054
3055/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003056 * xmlParsePI:
3057 * @ctxt: an XML parser context
3058 *
3059 * parse an XML Processing Instruction.
3060 *
3061 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3062 */
3063static void
3064htmlParsePI(htmlParserCtxtPtr ctxt) {
3065 xmlChar *buf = NULL;
3066 int len = 0;
3067 int size = HTML_PARSER_BUFFER_SIZE;
3068 int cur, l;
3069 const xmlChar *target;
3070 xmlParserInputState state;
3071 int count = 0;
3072
3073 if ((RAW == '<') && (NXT(1) == '?')) {
3074 state = ctxt->instate;
3075 ctxt->instate = XML_PARSER_PI;
3076 /*
3077 * this is a Processing Instruction.
3078 */
3079 SKIP(2);
3080 SHRINK;
3081
3082 /*
3083 * Parse the target name and check for special support like
3084 * namespace.
3085 */
3086 target = htmlParseName(ctxt);
3087 if (target != NULL) {
3088 if (RAW == '>') {
3089 SKIP(1);
3090
3091 /*
3092 * SAX: PI detected.
3093 */
3094 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3095 (ctxt->sax->processingInstruction != NULL))
3096 ctxt->sax->processingInstruction(ctxt->userData,
3097 target, NULL);
3098 ctxt->instate = state;
3099 return;
3100 }
3101 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3102 if (buf == NULL) {
3103 htmlErrMemory(ctxt, NULL);
3104 ctxt->instate = state;
3105 return;
3106 }
3107 cur = CUR;
3108 if (!IS_BLANK(cur)) {
3109 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3110 "ParsePI: PI %s space expected\n", target, NULL);
3111 }
3112 SKIP_BLANKS;
3113 cur = CUR_CHAR(l);
3114 while (IS_CHAR(cur) && (cur != '>')) {
3115 if (len + 5 >= size) {
3116 xmlChar *tmp;
3117
3118 size *= 2;
3119 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3120 if (tmp == NULL) {
3121 htmlErrMemory(ctxt, NULL);
3122 xmlFree(buf);
3123 ctxt->instate = state;
3124 return;
3125 }
3126 buf = tmp;
3127 }
3128 count++;
3129 if (count > 50) {
3130 GROW;
3131 count = 0;
3132 }
3133 COPY_BUF(l,buf,len,cur);
3134 NEXTL(l);
3135 cur = CUR_CHAR(l);
3136 if (cur == 0) {
3137 SHRINK;
3138 GROW;
3139 cur = CUR_CHAR(l);
3140 }
3141 }
3142 buf[len] = 0;
3143 if (cur != '>') {
3144 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3145 "ParsePI: PI %s never end ...\n", target, NULL);
3146 } else {
3147 SKIP(1);
3148
3149 /*
3150 * SAX: PI detected.
3151 */
3152 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3153 (ctxt->sax->processingInstruction != NULL))
3154 ctxt->sax->processingInstruction(ctxt->userData,
3155 target, buf);
3156 }
3157 xmlFree(buf);
3158 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003159 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003160 "PI is not started correctly", NULL, NULL);
3161 }
3162 ctxt->instate = state;
3163 }
3164}
3165
3166/**
Owen Taylor3473f882001-02-23 17:55:21 +00003167 * htmlParseComment:
3168 * @ctxt: an HTML parser context
3169 *
3170 * Parse an XML (SGML) comment <!-- .... -->
3171 *
3172 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3173 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003174static void
Owen Taylor3473f882001-02-23 17:55:21 +00003175htmlParseComment(htmlParserCtxtPtr ctxt) {
3176 xmlChar *buf = NULL;
3177 int len;
3178 int size = HTML_PARSER_BUFFER_SIZE;
3179 int q, ql;
3180 int r, rl;
3181 int cur, l;
3182 xmlParserInputState state;
3183
3184 /*
3185 * Check that there is a comment right here.
3186 */
3187 if ((RAW != '<') || (NXT(1) != '!') ||
3188 (NXT(2) != '-') || (NXT(3) != '-')) return;
3189
3190 state = ctxt->instate;
3191 ctxt->instate = XML_PARSER_COMMENT;
3192 SHRINK;
3193 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003194 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003195 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003196 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003197 ctxt->instate = state;
3198 return;
3199 }
3200 q = CUR_CHAR(ql);
3201 NEXTL(ql);
3202 r = CUR_CHAR(rl);
3203 NEXTL(rl);
3204 cur = CUR_CHAR(l);
3205 len = 0;
3206 while (IS_CHAR(cur) &&
3207 ((cur != '>') ||
3208 (r != '-') || (q != '-'))) {
3209 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003210 xmlChar *tmp;
3211
Owen Taylor3473f882001-02-23 17:55:21 +00003212 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003213 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3214 if (tmp == NULL) {
3215 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003216 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003217 ctxt->instate = state;
3218 return;
3219 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003220 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003221 }
3222 COPY_BUF(ql,buf,len,q);
3223 q = r;
3224 ql = rl;
3225 r = cur;
3226 rl = l;
3227 NEXTL(l);
3228 cur = CUR_CHAR(l);
3229 if (cur == 0) {
3230 SHRINK;
3231 GROW;
3232 cur = CUR_CHAR(l);
3233 }
3234 }
3235 buf[len] = 0;
3236 if (!IS_CHAR(cur)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003237 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3238 "Comment not terminated \n<!--%.50s\n", buf, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003239 xmlFree(buf);
3240 } else {
3241 NEXT;
3242 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3243 (!ctxt->disableSAX))
3244 ctxt->sax->comment(ctxt->userData, buf);
3245 xmlFree(buf);
3246 }
3247 ctxt->instate = state;
3248}
3249
3250/**
3251 * htmlParseCharRef:
3252 * @ctxt: an HTML parser context
3253 *
3254 * parse Reference declarations
3255 *
3256 * [66] CharRef ::= '&#' [0-9]+ ';' |
3257 * '&#x' [0-9a-fA-F]+ ';'
3258 *
3259 * Returns the value parsed (as an int)
3260 */
3261int
3262htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3263 int val = 0;
3264
Daniel Veillarda03e3652004-11-02 18:45:30 +00003265 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3266 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3267 "htmlParseCharRef: context error\n",
3268 NULL, NULL);
3269 return(0);
3270 }
Owen Taylor3473f882001-02-23 17:55:21 +00003271 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003272 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003273 SKIP(3);
3274 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003275 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003276 val = val * 16 + (CUR - '0');
3277 else if ((CUR >= 'a') && (CUR <= 'f'))
3278 val = val * 16 + (CUR - 'a') + 10;
3279 else if ((CUR >= 'A') && (CUR <= 'F'))
3280 val = val * 16 + (CUR - 'A') + 10;
3281 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003282 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003283 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003284 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003285 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003286 }
3287 NEXT;
3288 }
3289 if (CUR == ';')
3290 NEXT;
3291 } else if ((CUR == '&') && (NXT(1) == '#')) {
3292 SKIP(2);
3293 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003294 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003295 val = val * 10 + (CUR - '0');
3296 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003297 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003298 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003299 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003300 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003301 }
3302 NEXT;
3303 }
3304 if (CUR == ';')
3305 NEXT;
3306 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003307 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3308 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003309 }
3310 /*
3311 * Check the value IS_CHAR ...
3312 */
3313 if (IS_CHAR(val)) {
3314 return(val);
3315 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003316 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3317 "htmlParseCharRef: invalid xmlChar value %d\n",
3318 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003319 }
3320 return(0);
3321}
3322
3323
3324/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003325 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003326 * @ctxt: an HTML parser context
3327 *
3328 * parse a DOCTYPE declaration
3329 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003330 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003331 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3332 */
3333
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003334static void
Owen Taylor3473f882001-02-23 17:55:21 +00003335htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003336 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003337 xmlChar *ExternalID = NULL;
3338 xmlChar *URI = NULL;
3339
3340 /*
3341 * We know that '<!DOCTYPE' has been detected.
3342 */
3343 SKIP(9);
3344
3345 SKIP_BLANKS;
3346
3347 /*
3348 * Parse the DOCTYPE name.
3349 */
3350 name = htmlParseName(ctxt);
3351 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003352 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3353 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3354 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003355 }
3356 /*
3357 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3358 */
3359
3360 SKIP_BLANKS;
3361
3362 /*
3363 * Check for SystemID and ExternalID
3364 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003365 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003366 SKIP_BLANKS;
3367
3368 /*
3369 * We should be at the end of the DOCTYPE declaration.
3370 */
3371 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003372 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3373 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003374 /* We shouldn't try to resynchronize ... */
3375 }
3376 NEXT;
3377
3378 /*
3379 * Create or update the document accordingly to the DOCTYPE
3380 */
3381 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3382 (!ctxt->disableSAX))
3383 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3384
3385 /*
3386 * Cleanup, since we don't use all those identifiers
3387 */
3388 if (URI != NULL) xmlFree(URI);
3389 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003390}
3391
3392/**
3393 * htmlParseAttribute:
3394 * @ctxt: an HTML parser context
3395 * @value: a xmlChar ** used to store the value of the attribute
3396 *
3397 * parse an attribute
3398 *
3399 * [41] Attribute ::= Name Eq AttValue
3400 *
3401 * [25] Eq ::= S? '=' S?
3402 *
3403 * With namespace:
3404 *
3405 * [NS 11] Attribute ::= QName Eq AttValue
3406 *
3407 * Also the case QName == xmlns:??? is handled independently as a namespace
3408 * definition.
3409 *
3410 * Returns the attribute name, and the value in *value.
3411 */
3412
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003413static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003414htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003415 const xmlChar *name;
3416 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003417
3418 *value = NULL;
3419 name = htmlParseHTMLName(ctxt);
3420 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003421 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3422 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003423 return(NULL);
3424 }
3425
3426 /*
3427 * read the value
3428 */
3429 SKIP_BLANKS;
3430 if (CUR == '=') {
3431 NEXT;
3432 SKIP_BLANKS;
3433 val = htmlParseAttValue(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003434 }
3435
3436 *value = val;
3437 return(name);
3438}
3439
3440/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003441 * htmlCheckEncodingDirect:
Owen Taylor3473f882001-02-23 17:55:21 +00003442 * @ctxt: an HTML parser context
3443 * @attvalue: the attribute value
3444 *
Denis Pauk868d92d2012-05-10 15:34:57 +08003445 * Checks an attribute value to detect
Owen Taylor3473f882001-02-23 17:55:21 +00003446 * the encoding
3447 * If a new encoding is detected the parser is switched to decode
3448 * it and pass UTF8
3449 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003450static void
Denis Pauk868d92d2012-05-10 15:34:57 +08003451htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +00003452
Denis Pauk868d92d2012-05-10 15:34:57 +08003453 if ((ctxt == NULL) || (encoding == NULL) ||
Daniel Veillardc62efc82011-05-16 16:03:50 +08003454 (ctxt->options & HTML_PARSE_IGNORE_ENC))
Owen Taylor3473f882001-02-23 17:55:21 +00003455 return;
3456
Daniel Veillarde77db162009-08-22 11:32:38 +02003457 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003458 if (ctxt->input->encoding != NULL)
3459 return;
3460
Owen Taylor3473f882001-02-23 17:55:21 +00003461 if (encoding != NULL) {
3462 xmlCharEncoding enc;
3463 xmlCharEncodingHandlerPtr handler;
3464
3465 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3466
3467 if (ctxt->input->encoding != NULL)
3468 xmlFree((xmlChar *) ctxt->input->encoding);
3469 ctxt->input->encoding = xmlStrdup(encoding);
3470
3471 enc = xmlParseCharEncoding((const char *) encoding);
3472 /*
3473 * registered set of known encodings
3474 */
3475 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003476 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003477 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3478 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3479 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3480 (ctxt->input->buf != NULL) &&
3481 (ctxt->input->buf->encoder == NULL)) {
3482 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3483 "htmlCheckEncoding: wrong encoding meta\n",
3484 NULL, NULL);
3485 } else {
3486 xmlSwitchEncoding(ctxt, enc);
3487 }
Owen Taylor3473f882001-02-23 17:55:21 +00003488 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3489 } else {
3490 /*
3491 * fallback for unknown encodings
3492 */
3493 handler = xmlFindCharEncodingHandler((const char *) encoding);
3494 if (handler != NULL) {
3495 xmlSwitchToEncoding(ctxt, handler);
3496 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3497 } else {
Daniel Veillardc62efc82011-05-16 16:03:50 +08003498 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3499 "htmlCheckEncoding: unknown encoding %s\n",
3500 encoding, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003501 }
3502 }
3503
3504 if ((ctxt->input->buf != NULL) &&
3505 (ctxt->input->buf->encoder != NULL) &&
3506 (ctxt->input->buf->raw != NULL) &&
3507 (ctxt->input->buf->buffer != NULL)) {
3508 int nbchars;
3509 int processed;
3510
3511 /*
3512 * convert as much as possible to the parser reading buffer.
3513 */
3514 processed = ctxt->input->cur - ctxt->input->base;
Daniel Veillarda78d8032012-07-16 14:56:50 +08003515 xmlBufShrink(ctxt->input->buf->buffer, processed);
3516 nbchars = xmlCharEncInput(ctxt->input->buf);
Owen Taylor3473f882001-02-23 17:55:21 +00003517 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003518 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3519 "htmlCheckEncoding: encoder error\n",
3520 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003521 }
Daniel Veillard61551a12012-07-16 16:28:47 +08003522 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
Owen Taylor3473f882001-02-23 17:55:21 +00003523 }
3524 }
3525}
3526
3527/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003528 * htmlCheckEncoding:
3529 * @ctxt: an HTML parser context
3530 * @attvalue: the attribute value
3531 *
3532 * Checks an http-equiv attribute from a Meta tag to detect
3533 * the encoding
3534 * If a new encoding is detected the parser is switched to decode
3535 * it and pass UTF8
3536 */
3537static void
3538htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3539 const xmlChar *encoding;
3540
3541 if (!attvalue)
3542 return;
3543
3544 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3545 if (encoding != NULL) {
3546 encoding += 7;
3547 }
3548 /*
3549 * skip blank
3550 */
3551 if (encoding && IS_BLANK_CH(*encoding))
3552 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3553 if (encoding && *encoding == '=') {
3554 encoding ++;
3555 htmlCheckEncodingDirect(ctxt, encoding);
3556 }
3557}
3558
3559/**
Owen Taylor3473f882001-02-23 17:55:21 +00003560 * htmlCheckMeta:
3561 * @ctxt: an HTML parser context
3562 * @atts: the attributes values
3563 *
3564 * Checks an attributes from a Meta tag
3565 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003566static void
Owen Taylor3473f882001-02-23 17:55:21 +00003567htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3568 int i;
3569 const xmlChar *att, *value;
3570 int http = 0;
3571 const xmlChar *content = NULL;
3572
3573 if ((ctxt == NULL) || (atts == NULL))
3574 return;
3575
3576 i = 0;
3577 att = atts[i++];
3578 while (att != NULL) {
3579 value = atts[i++];
3580 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3581 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3582 http = 1;
Denis Pauk868d92d2012-05-10 15:34:57 +08003583 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3584 htmlCheckEncodingDirect(ctxt, value);
Owen Taylor3473f882001-02-23 17:55:21 +00003585 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3586 content = value;
3587 att = atts[i++];
3588 }
3589 if ((http) && (content != NULL))
3590 htmlCheckEncoding(ctxt, content);
3591
3592}
3593
3594/**
3595 * htmlParseStartTag:
3596 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003597 *
Owen Taylor3473f882001-02-23 17:55:21 +00003598 * parse a start of tag either for rule element or
3599 * EmptyElement. In both case we don't parse the tag closing chars.
3600 *
3601 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3602 *
3603 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3604 *
3605 * With namespace:
3606 *
3607 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3608 *
3609 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3610 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003611 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003612 */
3613
Daniel Veillard597f1c12005-07-03 23:00:18 +00003614static int
Owen Taylor3473f882001-02-23 17:55:21 +00003615htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003616 const xmlChar *name;
3617 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003618 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003619 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003620 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003621 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003622 int meta = 0;
3623 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003624 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003625
Daniel Veillarde77db162009-08-22 11:32:38 +02003626 if (ctxt->instate == XML_PARSER_EOF)
3627 return(-1);
Daniel Veillarda03e3652004-11-02 18:45:30 +00003628 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3629 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3630 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003631 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003632 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003633 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003634 NEXT;
3635
Daniel Veillard30e76072006-03-09 14:13:55 +00003636 atts = ctxt->atts;
3637 maxatts = ctxt->maxatts;
3638
Owen Taylor3473f882001-02-23 17:55:21 +00003639 GROW;
3640 name = htmlParseHTMLName(ctxt);
3641 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003642 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3643 "htmlParseStartTag: invalid element name\n",
3644 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003645 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003646 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3647 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003648 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003649 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003650 }
3651 if (xmlStrEqual(name, BAD_CAST"meta"))
3652 meta = 1;
3653
3654 /*
3655 * Check for auto-closure of HTML elements.
3656 */
3657 htmlAutoClose(ctxt, name);
3658
3659 /*
3660 * Check for implied HTML elements.
3661 */
3662 htmlCheckImplied(ctxt, name);
3663
3664 /*
3665 * Avoid html at any level > 0, head at any level != 1
3666 * or any attempt to recurse body
3667 */
3668 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003669 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3670 "htmlParseStartTag: misplaced <html> tag\n",
3671 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003672 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003673 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003674 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003675 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003676 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003677 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3678 "htmlParseStartTag: misplaced <head> tag\n",
3679 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003680 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003681 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003682 }
3683 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003684 int indx;
3685 for (indx = 0;indx < ctxt->nameNr;indx++) {
3686 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003687 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3688 "htmlParseStartTag: misplaced <body> tag\n",
3689 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003690 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003691 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003692 }
3693 }
3694 }
3695
3696 /*
3697 * Now parse the attributes, it ends up with the ending
3698 *
3699 * (S Attribute)* S?
3700 */
3701 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003702 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003703 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003704 ((CUR != '/') || (NXT(1) != '>'))) {
3705 long cons = ctxt->nbChars;
3706
3707 GROW;
3708 attname = htmlParseAttribute(ctxt, &attvalue);
3709 if (attname != NULL) {
3710
3711 /*
3712 * Well formedness requires at most one declaration of an attribute
3713 */
3714 for (i = 0; i < nbatts;i += 2) {
3715 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003716 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3717 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003718 if (attvalue != NULL)
3719 xmlFree(attvalue);
3720 goto failed;
3721 }
3722 }
3723
3724 /*
3725 * Add the pair to atts
3726 */
3727 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003728 maxatts = 22; /* allow for 10 attrs by default */
3729 atts = (const xmlChar **)
3730 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003731 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003732 htmlErrMemory(ctxt, NULL);
3733 if (attvalue != NULL)
3734 xmlFree(attvalue);
3735 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003736 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003737 ctxt->atts = atts;
3738 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003739 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003740 const xmlChar **n;
3741
Owen Taylor3473f882001-02-23 17:55:21 +00003742 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003743 n = (const xmlChar **) xmlRealloc((void *) atts,
3744 maxatts * sizeof(const xmlChar *));
3745 if (n == NULL) {
3746 htmlErrMemory(ctxt, NULL);
3747 if (attvalue != NULL)
3748 xmlFree(attvalue);
3749 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003750 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003751 atts = n;
3752 ctxt->atts = atts;
3753 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003754 }
3755 atts[nbatts++] = attname;
3756 atts[nbatts++] = attvalue;
3757 atts[nbatts] = NULL;
3758 atts[nbatts + 1] = NULL;
3759 }
3760 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003761 if (attvalue != NULL)
3762 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003763 /* Dump the bogus attribute string up to the next blank or
3764 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003765 while ((IS_CHAR_CH(CUR)) &&
3766 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003767 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003768 NEXT;
3769 }
3770
3771failed:
3772 SKIP_BLANKS;
3773 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003774 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3775 "htmlParseStartTag: problem parsing attributes\n",
3776 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003777 break;
3778 }
3779 }
3780
3781 /*
3782 * Handle specific association to the META tag
3783 */
William M. Bracke978ae22007-03-21 06:16:02 +00003784 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003785 htmlCheckMeta(ctxt, atts);
3786
3787 /*
3788 * SAX: Start of Element !
3789 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003790 if (!discardtag) {
3791 htmlnamePush(ctxt, name);
3792 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3793 if (nbatts != 0)
3794 ctxt->sax->startElement(ctxt->userData, name, atts);
3795 else
3796 ctxt->sax->startElement(ctxt->userData, name, NULL);
3797 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003798 }
Owen Taylor3473f882001-02-23 17:55:21 +00003799
3800 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003801 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003802 if (atts[i] != NULL)
3803 xmlFree((xmlChar *) atts[i]);
3804 }
Owen Taylor3473f882001-02-23 17:55:21 +00003805 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003806
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003807 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003808}
3809
3810/**
3811 * htmlParseEndTag:
3812 * @ctxt: an HTML parser context
3813 *
3814 * parse an end of tag
3815 *
3816 * [42] ETag ::= '</' Name S? '>'
3817 *
3818 * With namespace
3819 *
3820 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003821 *
3822 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003823 */
3824
Daniel Veillardf420ac52001-07-04 16:04:09 +00003825static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003826htmlParseEndTag(htmlParserCtxtPtr ctxt)
3827{
3828 const xmlChar *name;
3829 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003830 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003831
3832 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003833 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3834 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003835 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003836 }
3837 SKIP(2);
3838
3839 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003840 if (name == NULL)
3841 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003842 /*
3843 * We should definitely be at the ending "S? '>'" part
3844 */
3845 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003846 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003847 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3848 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003849 if (ctxt->recovery) {
3850 /*
3851 * We're not at the ending > !!
3852 * Error, unless in recover mode where we search forwards
3853 * until we find a >
3854 */
3855 while (CUR != '\0' && CUR != '>') NEXT;
3856 NEXT;
3857 }
Owen Taylor3473f882001-02-23 17:55:21 +00003858 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003859 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003860
3861 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003862 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3863 * out now.
3864 */
3865 if ((ctxt->depth > 0) &&
3866 (xmlStrEqual(name, BAD_CAST "html") ||
3867 xmlStrEqual(name, BAD_CAST "body") ||
3868 xmlStrEqual(name, BAD_CAST "head"))) {
3869 ctxt->depth--;
3870 return (0);
3871 }
3872
3873 /*
Owen Taylor3473f882001-02-23 17:55:21 +00003874 * If the name read is not one of the element in the parsing stack
3875 * then return, it's just an error.
3876 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003877 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3878 if (xmlStrEqual(name, ctxt->nameTab[i]))
3879 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003880 }
3881 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003882 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3883 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003884 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003885 }
3886
3887
3888 /*
3889 * Check for auto-closure of HTML elements.
3890 */
3891
3892 htmlAutoCloseOnClose(ctxt, name);
3893
3894 /*
3895 * Well formedness constraints, opening and closing must match.
3896 * With the exception that the autoclose may have popped stuff out
3897 * of the stack.
3898 */
3899 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003900 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003901 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3902 "Opening and ending tag mismatch: %s and %s\n",
3903 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00003904 }
3905 }
3906
3907 /*
3908 * SAX: End of Tag
3909 */
3910 oldname = ctxt->name;
3911 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003912 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3913 ctxt->sax->endElement(ctxt->userData, name);
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08003914 htmlNodeInfoPop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00003915 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003916 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003917 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003918 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003919 }
3920
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003921 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00003922}
3923
3924
3925/**
3926 * htmlParseReference:
3927 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003928 *
Owen Taylor3473f882001-02-23 17:55:21 +00003929 * parse and handle entity references in content,
3930 * this will end-up in a call to character() since this is either a
3931 * CharRef, or a predefined entity.
3932 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003933static void
Owen Taylor3473f882001-02-23 17:55:21 +00003934htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00003935 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00003936 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003937 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003938 if (CUR != '&') return;
3939
3940 if (NXT(1) == '#') {
3941 unsigned int c;
3942 int bits, i = 0;
3943
3944 c = htmlParseCharRef(ctxt);
3945 if (c == 0)
3946 return;
3947
3948 if (c < 0x80) { out[i++]= c; bits= -6; }
3949 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3950 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3951 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003952
Owen Taylor3473f882001-02-23 17:55:21 +00003953 for ( ; bits >= 0; bits-= 6) {
3954 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3955 }
3956 out[i] = 0;
3957
3958 htmlCheckParagraph(ctxt);
3959 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3960 ctxt->sax->characters(ctxt->userData, out, i);
3961 } else {
3962 ent = htmlParseEntityRef(ctxt, &name);
3963 if (name == NULL) {
3964 htmlCheckParagraph(ctxt);
3965 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3966 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3967 return;
3968 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00003969 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003970 htmlCheckParagraph(ctxt);
3971 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3972 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3973 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3974 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3975 }
3976 } else {
3977 unsigned int c;
3978 int bits, i = 0;
3979
3980 c = ent->value;
3981 if (c < 0x80)
3982 { out[i++]= c; bits= -6; }
3983 else if (c < 0x800)
3984 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3985 else if (c < 0x10000)
3986 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003987 else
Owen Taylor3473f882001-02-23 17:55:21 +00003988 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02003989
Owen Taylor3473f882001-02-23 17:55:21 +00003990 for ( ; bits >= 0; bits-= 6) {
3991 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3992 }
3993 out[i] = 0;
3994
3995 htmlCheckParagraph(ctxt);
3996 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3997 ctxt->sax->characters(ctxt->userData, out, i);
3998 }
Owen Taylor3473f882001-02-23 17:55:21 +00003999 }
4000}
4001
4002/**
4003 * htmlParseContent:
4004 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004005 *
4006 * Parse a content: comment, sub-element, reference or text.
Eugene Pimenov615904f2010-03-15 15:16:02 +01004007 * Kept for compatibility with old code
Owen Taylor3473f882001-02-23 17:55:21 +00004008 */
4009
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004010static void
Owen Taylor3473f882001-02-23 17:55:21 +00004011htmlParseContent(htmlParserCtxtPtr ctxt) {
4012 xmlChar *currentNode;
4013 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004014 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004015
4016 currentNode = xmlStrdup(ctxt->name);
4017 depth = ctxt->nameNr;
4018 while (1) {
4019 long cons = ctxt->nbChars;
4020
4021 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02004022
4023 if (ctxt->instate == XML_PARSER_EOF)
4024 break;
4025
Owen Taylor3473f882001-02-23 17:55:21 +00004026 /*
4027 * Our tag or one of it's parent or children is ending.
4028 */
4029 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00004030 if (htmlParseEndTag(ctxt) &&
4031 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4032 if (currentNode != NULL)
4033 xmlFree(currentNode);
4034 return;
4035 }
4036 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00004037 }
4038
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004039 else if ((CUR == '<') &&
4040 ((IS_ASCII_LETTER(NXT(1))) ||
4041 (NXT(1) == '_') || (NXT(1) == ':'))) {
4042 name = htmlParseHTMLName_nonInvasive(ctxt);
4043 if (name == NULL) {
4044 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4045 "htmlParseStartTag: invalid element name\n",
4046 NULL, NULL);
4047 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02004048 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004049 NEXT;
4050
4051 if (currentNode != NULL)
4052 xmlFree(currentNode);
4053 return;
4054 }
4055
4056 if (ctxt->name != NULL) {
4057 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4058 htmlAutoClose(ctxt, name);
4059 continue;
4060 }
Daniel Veillarde77db162009-08-22 11:32:38 +02004061 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004062 }
4063
Owen Taylor3473f882001-02-23 17:55:21 +00004064 /*
4065 * Has this node been popped out during parsing of
4066 * the next element
4067 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00004068 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4069 (!xmlStrEqual(currentNode, ctxt->name)))
4070 {
Owen Taylor3473f882001-02-23 17:55:21 +00004071 if (currentNode != NULL) xmlFree(currentNode);
4072 return;
4073 }
4074
Daniel Veillardf9533d12001-03-03 10:04:57 +00004075 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4076 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004077 /*
4078 * Handle SCRIPT/STYLE separately
4079 */
4080 htmlParseScript(ctxt);
4081 } else {
4082 /*
4083 * Sometimes DOCTYPE arrives in the middle of the document
4084 */
4085 if ((CUR == '<') && (NXT(1) == '!') &&
4086 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4087 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4088 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4089 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004090 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4091 "Misplaced DOCTYPE declaration\n",
4092 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004093 htmlParseDocTypeDecl(ctxt);
4094 }
4095
4096 /*
4097 * First case : a comment
4098 */
4099 if ((CUR == '<') && (NXT(1) == '!') &&
4100 (NXT(2) == '-') && (NXT(3) == '-')) {
4101 htmlParseComment(ctxt);
4102 }
4103
4104 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004105 * Second case : a Processing Instruction.
4106 */
4107 else if ((CUR == '<') && (NXT(1) == '?')) {
4108 htmlParsePI(ctxt);
4109 }
4110
4111 /*
4112 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004113 */
4114 else if (CUR == '<') {
4115 htmlParseElement(ctxt);
4116 }
4117
4118 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004119 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004120 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004121 */
4122 else if (CUR == '&') {
4123 htmlParseReference(ctxt);
4124 }
4125
4126 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004127 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004128 */
4129 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004130 htmlAutoCloseOnEnd(ctxt);
4131 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004132 }
4133
4134 /*
4135 * Last case, text. Note that References are handled directly.
4136 */
4137 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004138 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004139 }
4140
4141 if (cons == ctxt->nbChars) {
4142 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004143 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4144 "detected an error in element content\n",
4145 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004146 }
4147 break;
4148 }
4149 }
4150 GROW;
4151 }
4152 if (currentNode != NULL) xmlFree(currentNode);
4153}
4154
4155/**
4156 * htmlParseElement:
4157 * @ctxt: an HTML parser context
4158 *
4159 * parse an HTML element, this is highly recursive
Eugene Pimenov615904f2010-03-15 15:16:02 +01004160 * this is kept for compatibility with previous code versions
Owen Taylor3473f882001-02-23 17:55:21 +00004161 *
4162 * [39] element ::= EmptyElemTag | STag content ETag
4163 *
4164 * [41] Attribute ::= Name Eq AttValue
4165 */
4166
4167void
4168htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004169 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004170 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004171 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004172 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004173 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004174 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004175 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004176
Daniel Veillarda03e3652004-11-02 18:45:30 +00004177 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4178 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004179 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004180 return;
4181 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004182
4183 if (ctxt->instate == XML_PARSER_EOF)
4184 return;
4185
Owen Taylor3473f882001-02-23 17:55:21 +00004186 /* Capture start position */
4187 if (ctxt->record_info) {
4188 node_info.begin_pos = ctxt->input->consumed +
4189 (CUR_PTR - ctxt->input->base);
4190 node_info.begin_line = ctxt->input->line;
4191 }
4192
Daniel Veillard597f1c12005-07-03 23:00:18 +00004193 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004194 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004195 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004196 if (CUR == '>')
4197 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004198 return;
4199 }
Owen Taylor3473f882001-02-23 17:55:21 +00004200
4201 /*
4202 * Lookup the info for that element.
4203 */
4204 info = htmlTagLookup(name);
4205 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004206 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4207 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004208 }
4209
4210 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004211 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004212 */
4213 if ((CUR == '/') && (NXT(1) == '>')) {
4214 SKIP(2);
4215 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4216 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004217 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004218 return;
4219 }
4220
4221 if (CUR == '>') {
4222 NEXT;
4223 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004224 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4225 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004226
4227 /*
4228 * end of parsing of this node.
4229 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004230 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004231 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004232 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004233 }
Owen Taylor3473f882001-02-23 17:55:21 +00004234
4235 /*
4236 * Capture end position and add node
4237 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004238 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004239 node_info.end_pos = ctxt->input->consumed +
4240 (CUR_PTR - ctxt->input->base);
4241 node_info.end_line = ctxt->input->line;
4242 node_info.node = ctxt->node;
4243 xmlParserAddNodeInfo(ctxt, &node_info);
4244 }
4245 return;
4246 }
4247
4248 /*
4249 * Check for an Empty Element from DTD definition
4250 */
4251 if ((info != NULL) && (info->empty)) {
4252 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4253 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004254 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004255 return;
4256 }
4257
4258 /*
4259 * Parse the content of the element:
4260 */
4261 currentNode = xmlStrdup(ctxt->name);
4262 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004263 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004264 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004265 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004266 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004267 if (ctxt->nameNr < depth) break;
4268 }
Owen Taylor3473f882001-02-23 17:55:21 +00004269
Owen Taylor3473f882001-02-23 17:55:21 +00004270 /*
4271 * Capture end position and add node
4272 */
4273 if ( currentNode != NULL && ctxt->record_info ) {
4274 node_info.end_pos = ctxt->input->consumed +
4275 (CUR_PTR - ctxt->input->base);
4276 node_info.end_line = ctxt->input->line;
4277 node_info.node = ctxt->node;
4278 xmlParserAddNodeInfo(ctxt, &node_info);
4279 }
William M. Brack76e95df2003-10-18 16:20:14 +00004280 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004281 htmlAutoCloseOnEnd(ctxt);
4282 }
4283
Owen Taylor3473f882001-02-23 17:55:21 +00004284 if (currentNode != NULL)
4285 xmlFree(currentNode);
4286}
4287
Eugene Pimenov615904f2010-03-15 15:16:02 +01004288static void
4289htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4290 /*
4291 * Capture end position and add node
4292 */
4293 if ( ctxt->node != NULL && ctxt->record_info ) {
4294 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4295 (CUR_PTR - ctxt->input->base);
4296 ctxt->nodeInfo->end_line = ctxt->input->line;
4297 ctxt->nodeInfo->node = ctxt->node;
4298 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4299 htmlNodeInfoPop(ctxt);
4300 }
4301 if (!IS_CHAR_CH(CUR)) {
4302 htmlAutoCloseOnEnd(ctxt);
4303 }
4304}
4305
4306/**
4307 * htmlParseElementInternal:
4308 * @ctxt: an HTML parser context
4309 *
4310 * parse an HTML element, new version, non recursive
4311 *
4312 * [39] element ::= EmptyElemTag | STag content ETag
4313 *
4314 * [41] Attribute ::= Name Eq AttValue
4315 */
4316
4317static void
4318htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4319 const xmlChar *name;
4320 const htmlElemDesc * info;
4321 htmlParserNodeInfo node_info;
4322 int failed;
Eugene Pimenov615904f2010-03-15 15:16:02 +01004323
4324 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4325 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4326 "htmlParseElementInternal: context error\n", NULL, NULL);
4327 return;
4328 }
4329
4330 if (ctxt->instate == XML_PARSER_EOF)
4331 return;
4332
4333 /* Capture start position */
4334 if (ctxt->record_info) {
4335 node_info.begin_pos = ctxt->input->consumed +
4336 (CUR_PTR - ctxt->input->base);
4337 node_info.begin_line = ctxt->input->line;
4338 }
4339
4340 failed = htmlParseStartTag(ctxt);
4341 name = ctxt->name;
4342 if ((failed == -1) || (name == NULL)) {
4343 if (CUR == '>')
4344 NEXT;
4345 return;
4346 }
4347
4348 /*
4349 * Lookup the info for that element.
4350 */
4351 info = htmlTagLookup(name);
4352 if (info == NULL) {
4353 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4354 "Tag %s invalid\n", name, NULL);
4355 }
4356
4357 /*
4358 * Check for an Empty Element labeled the XML/SGML way
4359 */
4360 if ((CUR == '/') && (NXT(1) == '>')) {
4361 SKIP(2);
4362 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4363 ctxt->sax->endElement(ctxt->userData, name);
4364 htmlnamePop(ctxt);
4365 return;
4366 }
4367
4368 if (CUR == '>') {
4369 NEXT;
4370 } else {
4371 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4372 "Couldn't find end of Start Tag %s\n", name, NULL);
4373
4374 /*
4375 * end of parsing of this node.
4376 */
4377 if (xmlStrEqual(name, ctxt->name)) {
4378 nodePop(ctxt);
4379 htmlnamePop(ctxt);
4380 }
4381
4382 if (ctxt->record_info)
4383 htmlNodeInfoPush(ctxt, &node_info);
4384 htmlParserFinishElementParsing(ctxt);
4385 return;
4386 }
4387
4388 /*
4389 * Check for an Empty Element from DTD definition
4390 */
4391 if ((info != NULL) && (info->empty)) {
4392 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4393 ctxt->sax->endElement(ctxt->userData, name);
4394 htmlnamePop(ctxt);
4395 return;
4396 }
4397
4398 if (ctxt->record_info)
4399 htmlNodeInfoPush(ctxt, &node_info);
4400}
4401
4402/**
4403 * htmlParseContentInternal:
4404 * @ctxt: an HTML parser context
4405 *
4406 * Parse a content: comment, sub-element, reference or text.
4407 * New version for non recursive htmlParseElementInternal
4408 */
4409
4410static void
4411htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4412 xmlChar *currentNode;
4413 int depth;
4414 const xmlChar *name;
4415
4416 currentNode = xmlStrdup(ctxt->name);
4417 depth = ctxt->nameNr;
4418 while (1) {
4419 long cons = ctxt->nbChars;
4420
4421 GROW;
4422
4423 if (ctxt->instate == XML_PARSER_EOF)
4424 break;
4425
4426 /*
4427 * Our tag or one of it's parent or children is ending.
4428 */
4429 if ((CUR == '<') && (NXT(1) == '/')) {
4430 if (htmlParseEndTag(ctxt) &&
4431 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4432 if (currentNode != NULL)
4433 xmlFree(currentNode);
4434
4435 currentNode = xmlStrdup(ctxt->name);
4436 depth = ctxt->nameNr;
4437 }
4438 continue; /* while */
4439 }
4440
4441 else if ((CUR == '<') &&
4442 ((IS_ASCII_LETTER(NXT(1))) ||
4443 (NXT(1) == '_') || (NXT(1) == ':'))) {
4444 name = htmlParseHTMLName_nonInvasive(ctxt);
4445 if (name == NULL) {
4446 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4447 "htmlParseStartTag: invalid element name\n",
4448 NULL, NULL);
4449 /* Dump the bogus tag like browsers do */
4450 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4451 NEXT;
4452
4453 htmlParserFinishElementParsing(ctxt);
4454 if (currentNode != NULL)
4455 xmlFree(currentNode);
4456
4457 currentNode = xmlStrdup(ctxt->name);
4458 depth = ctxt->nameNr;
4459 continue;
4460 }
4461
4462 if (ctxt->name != NULL) {
4463 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4464 htmlAutoClose(ctxt, name);
4465 continue;
4466 }
4467 }
4468 }
4469
4470 /*
4471 * Has this node been popped out during parsing of
4472 * the next element
4473 */
4474 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4475 (!xmlStrEqual(currentNode, ctxt->name)))
4476 {
4477 htmlParserFinishElementParsing(ctxt);
4478 if (currentNode != NULL) xmlFree(currentNode);
4479
4480 currentNode = xmlStrdup(ctxt->name);
4481 depth = ctxt->nameNr;
4482 continue;
4483 }
4484
4485 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4486 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4487 /*
4488 * Handle SCRIPT/STYLE separately
4489 */
4490 htmlParseScript(ctxt);
4491 } else {
4492 /*
4493 * Sometimes DOCTYPE arrives in the middle of the document
4494 */
4495 if ((CUR == '<') && (NXT(1) == '!') &&
4496 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4497 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4498 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4499 (UPP(8) == 'E')) {
4500 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4501 "Misplaced DOCTYPE declaration\n",
4502 BAD_CAST "DOCTYPE" , NULL);
4503 htmlParseDocTypeDecl(ctxt);
4504 }
4505
4506 /*
4507 * First case : a comment
4508 */
4509 if ((CUR == '<') && (NXT(1) == '!') &&
4510 (NXT(2) == '-') && (NXT(3) == '-')) {
4511 htmlParseComment(ctxt);
4512 }
4513
4514 /*
4515 * Second case : a Processing Instruction.
4516 */
4517 else if ((CUR == '<') && (NXT(1) == '?')) {
4518 htmlParsePI(ctxt);
4519 }
4520
4521 /*
4522 * Third case : a sub-element.
4523 */
4524 else if (CUR == '<') {
4525 htmlParseElementInternal(ctxt);
4526 if (currentNode != NULL) xmlFree(currentNode);
4527
4528 currentNode = xmlStrdup(ctxt->name);
4529 depth = ctxt->nameNr;
4530 }
4531
4532 /*
4533 * Fourth case : a reference. If if has not been resolved,
4534 * parsing returns it's Name, create the node
4535 */
4536 else if (CUR == '&') {
4537 htmlParseReference(ctxt);
4538 }
4539
4540 /*
4541 * Fifth case : end of the resource
4542 */
4543 else if (CUR == 0) {
4544 htmlAutoCloseOnEnd(ctxt);
4545 break;
4546 }
4547
4548 /*
4549 * Last case, text. Note that References are handled directly.
4550 */
4551 else {
4552 htmlParseCharData(ctxt);
4553 }
4554
4555 if (cons == ctxt->nbChars) {
4556 if (ctxt->node != NULL) {
4557 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4558 "detected an error in element content\n",
4559 NULL, NULL);
4560 }
4561 break;
4562 }
4563 }
4564 GROW;
4565 }
4566 if (currentNode != NULL) xmlFree(currentNode);
4567}
4568
4569/**
4570 * htmlParseContent:
4571 * @ctxt: an HTML parser context
4572 *
4573 * Parse a content: comment, sub-element, reference or text.
4574 * This is the entry point when called from parser.c
4575 */
4576
4577void
4578__htmlParseContent(void *ctxt) {
4579 if (ctxt != NULL)
4580 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4581}
4582
Owen Taylor3473f882001-02-23 17:55:21 +00004583/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004584 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004585 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004586 *
Owen Taylor3473f882001-02-23 17:55:21 +00004587 * parse an HTML document (and build a tree if using the standard SAX
4588 * interface).
4589 *
4590 * Returns 0, -1 in case of error. the parser context is augmented
4591 * as a result of the parsing.
4592 */
4593
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004594int
Owen Taylor3473f882001-02-23 17:55:21 +00004595htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004596 xmlChar start[4];
4597 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004598 xmlDtdPtr dtd;
4599
Daniel Veillardd0463562001-10-13 09:15:48 +00004600 xmlInitParser();
4601
Owen Taylor3473f882001-02-23 17:55:21 +00004602 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004603
Daniel Veillarda03e3652004-11-02 18:45:30 +00004604 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4605 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4606 "htmlParseDocument: context error\n", NULL, NULL);
4607 return(XML_ERR_INTERNAL_ERROR);
4608 }
4609 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004610 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004611 GROW;
4612 /*
4613 * SAX: beginning of the document processing.
4614 */
4615 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4616 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4617
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004618 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4619 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4620 /*
4621 * Get the 4 first bytes and decode the charset
4622 * if enc != XML_CHAR_ENCODING_NONE
4623 * plug some encoding conversion routines.
4624 */
4625 start[0] = RAW;
4626 start[1] = NXT(1);
4627 start[2] = NXT(2);
4628 start[3] = NXT(3);
4629 enc = xmlDetectCharEncoding(&start[0], 4);
4630 if (enc != XML_CHAR_ENCODING_NONE) {
4631 xmlSwitchEncoding(ctxt, enc);
4632 }
4633 }
4634
Owen Taylor3473f882001-02-23 17:55:21 +00004635 /*
4636 * Wipe out everything which is before the first '<'
4637 */
4638 SKIP_BLANKS;
4639 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004640 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004641 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004642 }
4643
4644 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4645 ctxt->sax->startDocument(ctxt->userData);
4646
4647
4648 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004649 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004650 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004651 while (((CUR == '<') && (NXT(1) == '!') &&
4652 (NXT(2) == '-') && (NXT(3) == '-')) ||
4653 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004654 htmlParseComment(ctxt);
4655 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004656 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004657 }
Owen Taylor3473f882001-02-23 17:55:21 +00004658
4659
4660 /*
4661 * Then possibly doc type declaration(s) and more Misc
4662 * (doctypedecl Misc*)?
4663 */
4664 if ((CUR == '<') && (NXT(1) == '!') &&
4665 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4666 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4667 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4668 (UPP(8) == 'E')) {
4669 htmlParseDocTypeDecl(ctxt);
4670 }
4671 SKIP_BLANKS;
4672
4673 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004674 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004675 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004676 while (((CUR == '<') && (NXT(1) == '!') &&
4677 (NXT(2) == '-') && (NXT(3) == '-')) ||
4678 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004679 htmlParseComment(ctxt);
4680 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004681 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004682 }
Owen Taylor3473f882001-02-23 17:55:21 +00004683
4684 /*
4685 * Time to start parsing the tree itself
4686 */
Eugene Pimenov615904f2010-03-15 15:16:02 +01004687 htmlParseContentInternal(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004688
4689 /*
4690 * autoclose
4691 */
4692 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004693 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004694
4695
4696 /*
4697 * SAX: end of the document processing.
4698 */
4699 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4700 ctxt->sax->endDocument(ctxt->userData);
4701
Daniel Veillardf1121c42010-07-26 14:02:42 +02004702 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004703 dtd = xmlGetIntSubset(ctxt->myDoc);
4704 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004705 ctxt->myDoc->intSubset =
4706 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004707 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4708 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4709 }
4710 if (! ctxt->wellFormed) return(-1);
4711 return(0);
4712}
4713
4714
4715/************************************************************************
4716 * *
4717 * Parser contexts handling *
4718 * *
4719 ************************************************************************/
4720
4721/**
William M. Brackedb65a72004-02-06 07:36:04 +00004722 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004723 * @ctxt: an HTML parser context
4724 *
4725 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004726 *
4727 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004728 */
4729
Daniel Veillardf403d292003-10-05 13:51:35 +00004730static int
Owen Taylor3473f882001-02-23 17:55:21 +00004731htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4732{
4733 htmlSAXHandler *sax;
4734
Daniel Veillardf403d292003-10-05 13:51:35 +00004735 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004736 memset(ctxt, 0, sizeof(htmlParserCtxt));
4737
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004738 ctxt->dict = xmlDictCreate();
4739 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004740 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4741 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004742 }
Owen Taylor3473f882001-02-23 17:55:21 +00004743 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4744 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004745 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4746 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004747 }
4748 else
4749 memset(sax, 0, sizeof(htmlSAXHandler));
4750
4751 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004752 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004753 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4754 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004755 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004756 ctxt->inputNr = 0;
4757 ctxt->inputMax = 0;
4758 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004759 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004760 }
4761 ctxt->inputNr = 0;
4762 ctxt->inputMax = 5;
4763 ctxt->input = NULL;
4764 ctxt->version = NULL;
4765 ctxt->encoding = NULL;
4766 ctxt->standalone = -1;
4767 ctxt->instate = XML_PARSER_START;
4768
4769 /* Allocate the Node stack */
4770 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4771 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004772 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004773 ctxt->nodeNr = 0;
4774 ctxt->nodeMax = 0;
4775 ctxt->node = NULL;
4776 ctxt->inputNr = 0;
4777 ctxt->inputMax = 0;
4778 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004779 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004780 }
4781 ctxt->nodeNr = 0;
4782 ctxt->nodeMax = 10;
4783 ctxt->node = NULL;
4784
4785 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004786 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004787 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004788 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004789 ctxt->nameNr = 0;
Eugene Pimenovef9c6362010-03-15 11:37:48 +01004790 ctxt->nameMax = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004791 ctxt->name = NULL;
4792 ctxt->nodeNr = 0;
4793 ctxt->nodeMax = 0;
4794 ctxt->node = NULL;
4795 ctxt->inputNr = 0;
4796 ctxt->inputMax = 0;
4797 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004798 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004799 }
4800 ctxt->nameNr = 0;
4801 ctxt->nameMax = 10;
4802 ctxt->name = NULL;
4803
Eugene Pimenov615904f2010-03-15 15:16:02 +01004804 ctxt->nodeInfoTab = NULL;
4805 ctxt->nodeInfoNr = 0;
4806 ctxt->nodeInfoMax = 0;
4807
Daniel Veillard092643b2003-09-25 14:29:29 +00004808 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004809 else {
4810 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004811 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004812 }
4813 ctxt->userData = ctxt;
4814 ctxt->myDoc = NULL;
4815 ctxt->wellFormed = 1;
4816 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004817 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004818 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004819 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004820 ctxt->vctxt.userData = ctxt;
4821 ctxt->vctxt.error = xmlParserValidityError;
4822 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004823 ctxt->record_info = 0;
4824 ctxt->validate = 0;
4825 ctxt->nbChars = 0;
4826 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004827 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004828 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004829 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004830}
4831
4832/**
4833 * htmlFreeParserCtxt:
4834 * @ctxt: an HTML parser context
4835 *
4836 * Free all the memory used by a parser context. However the parsed
4837 * document in ctxt->myDoc is not freed.
4838 */
4839
4840void
4841htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4842{
4843 xmlFreeParserCtxt(ctxt);
4844}
4845
4846/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004847 * htmlNewParserCtxt:
4848 *
4849 * Allocate and initialize a new parser context.
4850 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004851 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004852 */
4853
Daniel Veillard34c647c2006-09-21 06:53:59 +00004854htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004855htmlNewParserCtxt(void)
4856{
4857 xmlParserCtxtPtr ctxt;
4858
4859 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4860 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004861 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004862 return(NULL);
4863 }
4864 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004865 if (htmlInitParserCtxt(ctxt) < 0) {
4866 htmlFreeParserCtxt(ctxt);
4867 return(NULL);
4868 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004869 return(ctxt);
4870}
4871
4872/**
4873 * htmlCreateMemoryParserCtxt:
4874 * @buffer: a pointer to a char array
4875 * @size: the size of the array
4876 *
4877 * Create a parser context for an HTML in-memory document.
4878 *
4879 * Returns the new parser context or NULL
4880 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00004881htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004882htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4883 xmlParserCtxtPtr ctxt;
4884 xmlParserInputPtr input;
4885 xmlParserInputBufferPtr buf;
4886
4887 if (buffer == NULL)
4888 return(NULL);
4889 if (size <= 0)
4890 return(NULL);
4891
4892 ctxt = htmlNewParserCtxt();
4893 if (ctxt == NULL)
4894 return(NULL);
4895
4896 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4897 if (buf == NULL) return(NULL);
4898
4899 input = xmlNewInputStream(ctxt);
4900 if (input == NULL) {
4901 xmlFreeParserCtxt(ctxt);
4902 return(NULL);
4903 }
4904
4905 input->filename = NULL;
4906 input->buf = buf;
Daniel Veillard61551a12012-07-16 16:28:47 +08004907 xmlBufResetInput(buf->buffer, input);
Daniel Veillard1d995272002-07-22 16:43:32 +00004908
4909 inputPush(ctxt, input);
4910 return(ctxt);
4911}
4912
4913/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004914 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004915 * @cur: a pointer to an array of xmlChar
4916 * @encoding: a free form C string describing the HTML document encoding, or NULL
4917 *
4918 * Create a parser context for an HTML document.
4919 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004920 * TODO: check the need to add encoding handling there
4921 *
Owen Taylor3473f882001-02-23 17:55:21 +00004922 * Returns the new parser context or NULL
4923 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004924static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00004925htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00004926 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004927 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00004928
Daniel Veillard1d995272002-07-22 16:43:32 +00004929 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00004930 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00004931 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004932 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00004933 if (ctxt == NULL)
4934 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004935
4936 if (encoding != NULL) {
4937 xmlCharEncoding enc;
4938 xmlCharEncodingHandlerPtr handler;
4939
4940 if (ctxt->input->encoding != NULL)
4941 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00004942 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004943
4944 enc = xmlParseCharEncoding(encoding);
4945 /*
4946 * registered set of known encodings
4947 */
4948 if (enc != XML_CHAR_ENCODING_ERROR) {
4949 xmlSwitchEncoding(ctxt, enc);
4950 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004951 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02004952 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00004953 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004954 }
4955 } else {
4956 /*
4957 * fallback for unknown encodings
4958 */
4959 handler = xmlFindCharEncodingHandler((const char *) encoding);
4960 if (handler != NULL) {
4961 xmlSwitchToEncoding(ctxt, handler);
4962 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004963 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4964 "Unsupported encoding %s\n",
4965 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00004966 }
4967 }
4968 }
4969 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004970}
4971
Daniel Veillard73b013f2003-09-30 12:36:01 +00004972#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00004973/************************************************************************
4974 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02004975 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00004976 * *
4977 ************************************************************************/
4978
4979/**
4980 * htmlParseLookupSequence:
4981 * @ctxt: an HTML parser context
4982 * @first: the first char to lookup
4983 * @next: the next char to lookup or zero
4984 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00004985 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00004986 *
4987 * Try to find if a sequence (first, next, third) or just (first next) or
4988 * (first) is available in the input stream.
4989 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4990 * to avoid rescanning sequences of bytes, it DOES change the state of the
4991 * parser, do not use liberally.
4992 * This is basically similar to xmlParseLookupSequence()
4993 *
4994 * Returns the index to the current parsing point if the full sequence
4995 * is available, -1 otherwise.
4996 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004997static int
Owen Taylor3473f882001-02-23 17:55:21 +00004998htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02004999 xmlChar next, xmlChar third, int iscomment,
Daniel Veillardeeb99322009-08-25 14:42:16 +02005000 int ignoreattrval)
5001{
Owen Taylor3473f882001-02-23 17:55:21 +00005002 int base, len;
5003 htmlParserInputPtr in;
5004 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00005005 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02005006 int invalue = 0;
5007 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00005008
5009 in = ctxt->input;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005010 if (in == NULL)
5011 return (-1);
5012
Owen Taylor3473f882001-02-23 17:55:21 +00005013 base = in->cur - in->base;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005014 if (base < 0)
5015 return (-1);
5016
Owen Taylor3473f882001-02-23 17:55:21 +00005017 if (ctxt->checkIndex > base)
5018 base = ctxt->checkIndex;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005019
Owen Taylor3473f882001-02-23 17:55:21 +00005020 if (in->buf == NULL) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005021 buf = in->base;
5022 len = in->length;
Owen Taylor3473f882001-02-23 17:55:21 +00005023 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005024 buf = xmlBufContent(in->buf->buffer);
5025 len = xmlBufUse(in->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00005026 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005027
Owen Taylor3473f882001-02-23 17:55:21 +00005028 /* take into account the sequence length */
Daniel Veillardeeb99322009-08-25 14:42:16 +02005029 if (third)
5030 len -= 2;
5031 else if (next)
5032 len--;
5033 for (; base < len; base++) {
5034 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5035 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5036 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5037 incomment = 1;
5038 /* do not increment past <! - some people use <!--> */
5039 base += 2;
5040 }
5041 }
5042 if (ignoreattrval) {
5043 if (buf[base] == '"' || buf[base] == '\'') {
5044 if (invalue) {
5045 if (buf[base] == valdellim) {
5046 invalue = 0;
5047 continue;
5048 }
5049 } else {
5050 valdellim = buf[base];
5051 invalue = 1;
5052 continue;
5053 }
5054 } else if (invalue) {
5055 continue;
5056 }
5057 }
5058 if (incomment) {
5059 if (base + 3 > len)
5060 return (-1);
5061 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5062 (buf[base + 2] == '>')) {
5063 incomment = 0;
5064 base += 2;
5065 }
5066 continue;
5067 }
Owen Taylor3473f882001-02-23 17:55:21 +00005068 if (buf[base] == first) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005069 if (third != 0) {
5070 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5071 continue;
5072 } else if (next != 0) {
5073 if (buf[base + 1] != next)
5074 continue;
5075 }
5076 ctxt->checkIndex = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00005077#ifdef DEBUG_PUSH
Daniel Veillardeeb99322009-08-25 14:42:16 +02005078 if (next == 0)
5079 xmlGenericError(xmlGenericErrorContext,
5080 "HPP: lookup '%c' found at %d\n",
5081 first, base);
5082 else if (third == 0)
5083 xmlGenericError(xmlGenericErrorContext,
5084 "HPP: lookup '%c%c' found at %d\n",
5085 first, next, base);
5086 else
5087 xmlGenericError(xmlGenericErrorContext,
5088 "HPP: lookup '%c%c%c' found at %d\n",
5089 first, next, third, base);
Owen Taylor3473f882001-02-23 17:55:21 +00005090#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005091 return (base - (in->cur - in->base));
5092 }
Owen Taylor3473f882001-02-23 17:55:21 +00005093 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005094 if ((!incomment) && (!invalue))
5095 ctxt->checkIndex = base;
Owen Taylor3473f882001-02-23 17:55:21 +00005096#ifdef DEBUG_PUSH
5097 if (next == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005098 xmlGenericError(xmlGenericErrorContext,
5099 "HPP: lookup '%c' failed\n", first);
Owen Taylor3473f882001-02-23 17:55:21 +00005100 else if (third == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005101 xmlGenericError(xmlGenericErrorContext,
5102 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02005103 else
Daniel Veillardeeb99322009-08-25 14:42:16 +02005104 xmlGenericError(xmlGenericErrorContext,
5105 "HPP: lookup '%c%c%c' failed\n", first, next,
5106 third);
Owen Taylor3473f882001-02-23 17:55:21 +00005107#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005108 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00005109}
5110
5111/**
Markus Kull56a03032009-08-24 19:00:23 +02005112 * htmlParseLookupChars:
5113 * @ctxt: an HTML parser context
5114 * @stop: Array of chars, which stop the lookup.
5115 * @stopLen: Length of stop-Array
5116 *
5117 * Try to find if any char of the stop-Array is available in the input
5118 * stream.
5119 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5120 * to avoid rescanning sequences of bytes, it DOES change the state of the
5121 * parser, do not use liberally.
5122 *
5123 * Returns the index to the current parsing point if a stopChar
5124 * is available, -1 otherwise.
5125 */
5126static int
5127htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5128 int stopLen)
5129{
5130 int base, len;
5131 htmlParserInputPtr in;
5132 const xmlChar *buf;
5133 int incomment = 0;
5134 int i;
5135
5136 in = ctxt->input;
5137 if (in == NULL)
5138 return (-1);
5139
5140 base = in->cur - in->base;
5141 if (base < 0)
5142 return (-1);
5143
5144 if (ctxt->checkIndex > base)
5145 base = ctxt->checkIndex;
5146
5147 if (in->buf == NULL) {
5148 buf = in->base;
5149 len = in->length;
5150 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005151 buf = xmlBufContent(in->buf->buffer);
5152 len = xmlBufUse(in->buf->buffer);
Markus Kull56a03032009-08-24 19:00:23 +02005153 }
5154
5155 for (; base < len; base++) {
5156 if (!incomment && (base + 4 < len)) {
5157 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5158 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5159 incomment = 1;
5160 /* do not increment past <! - some people use <!--> */
5161 base += 2;
5162 }
5163 }
5164 if (incomment) {
5165 if (base + 3 > len)
5166 return (-1);
5167 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5168 (buf[base + 2] == '>')) {
5169 incomment = 0;
5170 base += 2;
5171 }
5172 continue;
5173 }
5174 for (i = 0; i < stopLen; ++i) {
5175 if (buf[base] == stop[i]) {
5176 ctxt->checkIndex = 0;
5177 return (base - (in->cur - in->base));
5178 }
5179 }
5180 }
5181 ctxt->checkIndex = base;
5182 return (-1);
5183}
5184
5185/**
Owen Taylor3473f882001-02-23 17:55:21 +00005186 * htmlParseTryOrFinish:
5187 * @ctxt: an HTML parser context
5188 * @terminate: last chunk indicator
5189 *
5190 * Try to progress on parsing
5191 *
5192 * Returns zero if no parsing was possible
5193 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005194static int
Owen Taylor3473f882001-02-23 17:55:21 +00005195htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5196 int ret = 0;
5197 htmlParserInputPtr in;
5198 int avail = 0;
5199 xmlChar cur, next;
5200
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005201 htmlParserNodeInfo node_info;
5202
Owen Taylor3473f882001-02-23 17:55:21 +00005203#ifdef DEBUG_PUSH
5204 switch (ctxt->instate) {
5205 case XML_PARSER_EOF:
5206 xmlGenericError(xmlGenericErrorContext,
5207 "HPP: try EOF\n"); break;
5208 case XML_PARSER_START:
5209 xmlGenericError(xmlGenericErrorContext,
5210 "HPP: try START\n"); break;
5211 case XML_PARSER_MISC:
5212 xmlGenericError(xmlGenericErrorContext,
5213 "HPP: try MISC\n");break;
5214 case XML_PARSER_COMMENT:
5215 xmlGenericError(xmlGenericErrorContext,
5216 "HPP: try COMMENT\n");break;
5217 case XML_PARSER_PROLOG:
5218 xmlGenericError(xmlGenericErrorContext,
5219 "HPP: try PROLOG\n");break;
5220 case XML_PARSER_START_TAG:
5221 xmlGenericError(xmlGenericErrorContext,
5222 "HPP: try START_TAG\n");break;
5223 case XML_PARSER_CONTENT:
5224 xmlGenericError(xmlGenericErrorContext,
5225 "HPP: try CONTENT\n");break;
5226 case XML_PARSER_CDATA_SECTION:
5227 xmlGenericError(xmlGenericErrorContext,
5228 "HPP: try CDATA_SECTION\n");break;
5229 case XML_PARSER_END_TAG:
5230 xmlGenericError(xmlGenericErrorContext,
5231 "HPP: try END_TAG\n");break;
5232 case XML_PARSER_ENTITY_DECL:
5233 xmlGenericError(xmlGenericErrorContext,
5234 "HPP: try ENTITY_DECL\n");break;
5235 case XML_PARSER_ENTITY_VALUE:
5236 xmlGenericError(xmlGenericErrorContext,
5237 "HPP: try ENTITY_VALUE\n");break;
5238 case XML_PARSER_ATTRIBUTE_VALUE:
5239 xmlGenericError(xmlGenericErrorContext,
5240 "HPP: try ATTRIBUTE_VALUE\n");break;
5241 case XML_PARSER_DTD:
5242 xmlGenericError(xmlGenericErrorContext,
5243 "HPP: try DTD\n");break;
5244 case XML_PARSER_EPILOG:
5245 xmlGenericError(xmlGenericErrorContext,
5246 "HPP: try EPILOG\n");break;
5247 case XML_PARSER_PI:
5248 xmlGenericError(xmlGenericErrorContext,
5249 "HPP: try PI\n");break;
5250 case XML_PARSER_SYSTEM_LITERAL:
5251 xmlGenericError(xmlGenericErrorContext,
5252 "HPP: try SYSTEM_LITERAL\n");break;
5253 }
5254#endif
5255
5256 while (1) {
5257
5258 in = ctxt->input;
5259 if (in == NULL) break;
5260 if (in->buf == NULL)
5261 avail = in->length - (in->cur - in->base);
5262 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005263 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005264 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005265 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005266 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005267 /*
5268 * SAX: end of the document processing.
5269 */
5270 ctxt->instate = XML_PARSER_EOF;
5271 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5272 ctxt->sax->endDocument(ctxt->userData);
5273 }
5274 }
5275 if (avail < 1)
5276 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00005277 cur = in->cur[0];
5278 if (cur == 0) {
5279 SKIP(1);
5280 continue;
5281 }
5282
Owen Taylor3473f882001-02-23 17:55:21 +00005283 switch (ctxt->instate) {
5284 case XML_PARSER_EOF:
5285 /*
5286 * Document parsing is done !
5287 */
5288 goto done;
5289 case XML_PARSER_START:
5290 /*
5291 * Very first chars read from the document flow.
5292 */
5293 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005294 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005295 SKIP_BLANKS;
5296 if (in->buf == NULL)
5297 avail = in->length - (in->cur - in->base);
5298 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005299 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005300 }
5301 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5302 ctxt->sax->setDocumentLocator(ctxt->userData,
5303 &xmlDefaultSAXLocator);
5304 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5305 (!ctxt->disableSAX))
5306 ctxt->sax->startDocument(ctxt->userData);
5307
5308 cur = in->cur[0];
5309 next = in->cur[1];
5310 if ((cur == '<') && (next == '!') &&
5311 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5312 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5313 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5314 (UPP(8) == 'E')) {
5315 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005316 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005317 goto done;
5318#ifdef DEBUG_PUSH
5319 xmlGenericError(xmlGenericErrorContext,
5320 "HPP: Parsing internal subset\n");
5321#endif
5322 htmlParseDocTypeDecl(ctxt);
5323 ctxt->instate = XML_PARSER_PROLOG;
5324#ifdef DEBUG_PUSH
5325 xmlGenericError(xmlGenericErrorContext,
5326 "HPP: entering PROLOG\n");
5327#endif
5328 } else {
5329 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005330#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00005331 xmlGenericError(xmlGenericErrorContext,
5332 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005333#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00005334 }
Owen Taylor3473f882001-02-23 17:55:21 +00005335 break;
5336 case XML_PARSER_MISC:
5337 SKIP_BLANKS;
5338 if (in->buf == NULL)
5339 avail = in->length - (in->cur - in->base);
5340 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005341 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Denis Paukfdf990c2012-05-10 20:40:49 +08005342 /*
5343 * no chars in buffer
5344 */
5345 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005346 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005347 /*
5348 * not enouth chars in buffer
5349 */
5350 if (avail < 2) {
5351 if (!terminate)
5352 goto done;
5353 else
5354 next = ' ';
5355 } else {
5356 next = in->cur[1];
5357 }
Owen Taylor3473f882001-02-23 17:55:21 +00005358 cur = in->cur[0];
Owen Taylor3473f882001-02-23 17:55:21 +00005359 if ((cur == '<') && (next == '!') &&
5360 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5361 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005362 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005363 goto done;
5364#ifdef DEBUG_PUSH
5365 xmlGenericError(xmlGenericErrorContext,
5366 "HPP: Parsing Comment\n");
5367#endif
5368 htmlParseComment(ctxt);
5369 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005370 } else if ((cur == '<') && (next == '?')) {
5371 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005372 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005373 goto done;
5374#ifdef DEBUG_PUSH
5375 xmlGenericError(xmlGenericErrorContext,
5376 "HPP: Parsing PI\n");
5377#endif
5378 htmlParsePI(ctxt);
5379 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005380 } else if ((cur == '<') && (next == '!') &&
5381 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5382 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5383 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5384 (UPP(8) == 'E')) {
5385 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005386 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005387 goto done;
5388#ifdef DEBUG_PUSH
5389 xmlGenericError(xmlGenericErrorContext,
5390 "HPP: Parsing internal subset\n");
5391#endif
5392 htmlParseDocTypeDecl(ctxt);
5393 ctxt->instate = XML_PARSER_PROLOG;
5394#ifdef DEBUG_PUSH
5395 xmlGenericError(xmlGenericErrorContext,
5396 "HPP: entering PROLOG\n");
5397#endif
5398 } else if ((cur == '<') && (next == '!') &&
5399 (avail < 9)) {
5400 goto done;
5401 } else {
5402 ctxt->instate = XML_PARSER_START_TAG;
5403#ifdef DEBUG_PUSH
5404 xmlGenericError(xmlGenericErrorContext,
5405 "HPP: entering START_TAG\n");
5406#endif
5407 }
5408 break;
5409 case XML_PARSER_PROLOG:
5410 SKIP_BLANKS;
5411 if (in->buf == NULL)
5412 avail = in->length - (in->cur - in->base);
5413 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005414 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005415 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005416 goto done;
5417 cur = in->cur[0];
5418 next = in->cur[1];
5419 if ((cur == '<') && (next == '!') &&
5420 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5421 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005422 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005423 goto done;
5424#ifdef DEBUG_PUSH
5425 xmlGenericError(xmlGenericErrorContext,
5426 "HPP: Parsing Comment\n");
5427#endif
5428 htmlParseComment(ctxt);
5429 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005430 } else if ((cur == '<') && (next == '?')) {
5431 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005432 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005433 goto done;
5434#ifdef DEBUG_PUSH
5435 xmlGenericError(xmlGenericErrorContext,
5436 "HPP: Parsing PI\n");
5437#endif
5438 htmlParsePI(ctxt);
5439 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005440 } else if ((cur == '<') && (next == '!') &&
5441 (avail < 4)) {
5442 goto done;
5443 } else {
5444 ctxt->instate = XML_PARSER_START_TAG;
5445#ifdef DEBUG_PUSH
5446 xmlGenericError(xmlGenericErrorContext,
5447 "HPP: entering START_TAG\n");
5448#endif
5449 }
5450 break;
5451 case XML_PARSER_EPILOG:
5452 if (in->buf == NULL)
5453 avail = in->length - (in->cur - in->base);
5454 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005455 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005456 if (avail < 1)
5457 goto done;
5458 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005459 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005460 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005461 goto done;
5462 }
5463 if (avail < 2)
5464 goto done;
5465 next = in->cur[1];
5466 if ((cur == '<') && (next == '!') &&
5467 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5468 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005469 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005470 goto done;
5471#ifdef DEBUG_PUSH
5472 xmlGenericError(xmlGenericErrorContext,
5473 "HPP: Parsing Comment\n");
5474#endif
5475 htmlParseComment(ctxt);
5476 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005477 } else if ((cur == '<') && (next == '?')) {
5478 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005479 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005480 goto done;
5481#ifdef DEBUG_PUSH
5482 xmlGenericError(xmlGenericErrorContext,
5483 "HPP: Parsing PI\n");
5484#endif
5485 htmlParsePI(ctxt);
5486 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005487 } else if ((cur == '<') && (next == '!') &&
5488 (avail < 4)) {
5489 goto done;
5490 } else {
5491 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005492 ctxt->wellFormed = 0;
5493 ctxt->instate = XML_PARSER_EOF;
5494#ifdef DEBUG_PUSH
5495 xmlGenericError(xmlGenericErrorContext,
5496 "HPP: entering EOF\n");
5497#endif
5498 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5499 ctxt->sax->endDocument(ctxt->userData);
5500 goto done;
5501 }
5502 break;
5503 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005504 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005505 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005506 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005507
Denis Paukfdf990c2012-05-10 20:40:49 +08005508 /*
5509 * no chars in buffer
5510 */
5511 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005512 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005513 /*
5514 * not enough chars in buffer
5515 */
5516 if (avail < 2) {
5517 if (!terminate)
5518 goto done;
5519 else
5520 next = ' ';
5521 } else {
5522 next = in->cur[1];
5523 }
Owen Taylor3473f882001-02-23 17:55:21 +00005524 cur = in->cur[0];
5525 if (cur != '<') {
5526 ctxt->instate = XML_PARSER_CONTENT;
5527#ifdef DEBUG_PUSH
5528 xmlGenericError(xmlGenericErrorContext,
5529 "HPP: entering CONTENT\n");
5530#endif
5531 break;
5532 }
Denis Paukfdf990c2012-05-10 20:40:49 +08005533 if (next == '/') {
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005534 ctxt->instate = XML_PARSER_END_TAG;
5535 ctxt->checkIndex = 0;
5536#ifdef DEBUG_PUSH
5537 xmlGenericError(xmlGenericErrorContext,
5538 "HPP: entering END_TAG\n");
5539#endif
5540 break;
5541 }
Owen Taylor3473f882001-02-23 17:55:21 +00005542 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005543 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005544 goto done;
5545
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005546 /* Capture start position */
5547 if (ctxt->record_info) {
5548 node_info.begin_pos = ctxt->input->consumed +
5549 (CUR_PTR - ctxt->input->base);
5550 node_info.begin_line = ctxt->input->line;
5551 }
5552
5553
Daniel Veillard597f1c12005-07-03 23:00:18 +00005554 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005555 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005556 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005557 (name == NULL)) {
5558 if (CUR == '>')
5559 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005560 break;
5561 }
Owen Taylor3473f882001-02-23 17:55:21 +00005562
5563 /*
5564 * Lookup the info for that element.
5565 */
5566 info = htmlTagLookup(name);
5567 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005568 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5569 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005570 }
5571
5572 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005573 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005574 */
5575 if ((CUR == '/') && (NXT(1) == '>')) {
5576 SKIP(2);
5577 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5578 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005579 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005580 ctxt->instate = XML_PARSER_CONTENT;
5581#ifdef DEBUG_PUSH
5582 xmlGenericError(xmlGenericErrorContext,
5583 "HPP: entering CONTENT\n");
5584#endif
5585 break;
5586 }
5587
5588 if (CUR == '>') {
5589 NEXT;
5590 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005591 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5592 "Couldn't find end of Start Tag %s\n",
5593 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005594
5595 /*
5596 * end of parsing of this node.
5597 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005598 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005599 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005600 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005601 }
Owen Taylor3473f882001-02-23 17:55:21 +00005602
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005603 if (ctxt->record_info)
5604 htmlNodeInfoPush(ctxt, &node_info);
5605
Owen Taylor3473f882001-02-23 17:55:21 +00005606 ctxt->instate = XML_PARSER_CONTENT;
5607#ifdef DEBUG_PUSH
5608 xmlGenericError(xmlGenericErrorContext,
5609 "HPP: entering CONTENT\n");
5610#endif
5611 break;
5612 }
5613
5614 /*
5615 * Check for an Empty Element from DTD definition
5616 */
5617 if ((info != NULL) && (info->empty)) {
5618 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5619 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005620 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005621 }
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005622
5623 if (ctxt->record_info)
5624 htmlNodeInfoPush(ctxt, &node_info);
5625
Owen Taylor3473f882001-02-23 17:55:21 +00005626 ctxt->instate = XML_PARSER_CONTENT;
5627#ifdef DEBUG_PUSH
5628 xmlGenericError(xmlGenericErrorContext,
5629 "HPP: entering CONTENT\n");
5630#endif
5631 break;
5632 }
5633 case XML_PARSER_CONTENT: {
5634 long cons;
5635 /*
5636 * Handle preparsed entities and charRef
5637 */
5638 if (ctxt->token != 0) {
5639 xmlChar chr[2] = { 0 , 0 } ;
5640
5641 chr[0] = (xmlChar) ctxt->token;
5642 htmlCheckParagraph(ctxt);
5643 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5644 ctxt->sax->characters(ctxt->userData, chr, 1);
5645 ctxt->token = 0;
5646 ctxt->checkIndex = 0;
5647 }
5648 if ((avail == 1) && (terminate)) {
5649 cur = in->cur[0];
5650 if ((cur != '<') && (cur != '&')) {
5651 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005652 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005653 if (ctxt->sax->ignorableWhitespace != NULL)
5654 ctxt->sax->ignorableWhitespace(
5655 ctxt->userData, &cur, 1);
5656 } else {
5657 htmlCheckParagraph(ctxt);
5658 if (ctxt->sax->characters != NULL)
5659 ctxt->sax->characters(
5660 ctxt->userData, &cur, 1);
5661 }
5662 }
5663 ctxt->token = 0;
5664 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005665 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005666 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005667 }
Owen Taylor3473f882001-02-23 17:55:21 +00005668 }
5669 if (avail < 2)
5670 goto done;
5671 cur = in->cur[0];
5672 next = in->cur[1];
5673 cons = ctxt->nbChars;
5674 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5675 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5676 /*
5677 * Handle SCRIPT/STYLE separately
5678 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005679 if (!terminate) {
5680 int idx;
5681 xmlChar val;
5682
Denis Pauk91d239c2010-11-04 12:39:18 +01005683 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
Daniel Veillard68716a72006-10-16 09:32:17 +00005684 if (idx < 0)
5685 goto done;
5686 val = in->cur[idx + 2];
5687 if (val == 0) /* bad cut of input */
5688 goto done;
5689 }
Owen Taylor3473f882001-02-23 17:55:21 +00005690 htmlParseScript(ctxt);
5691 if ((cur == '<') && (next == '/')) {
5692 ctxt->instate = XML_PARSER_END_TAG;
5693 ctxt->checkIndex = 0;
5694#ifdef DEBUG_PUSH
5695 xmlGenericError(xmlGenericErrorContext,
5696 "HPP: entering END_TAG\n");
5697#endif
5698 break;
5699 }
5700 } else {
5701 /*
5702 * Sometimes DOCTYPE arrives in the middle of the document
5703 */
5704 if ((cur == '<') && (next == '!') &&
5705 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5706 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5707 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5708 (UPP(8) == 'E')) {
5709 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005710 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005711 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005712 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5713 "Misplaced DOCTYPE declaration\n",
5714 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005715 htmlParseDocTypeDecl(ctxt);
5716 } else if ((cur == '<') && (next == '!') &&
5717 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5718 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005719 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005720 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005721 goto done;
5722#ifdef DEBUG_PUSH
5723 xmlGenericError(xmlGenericErrorContext,
5724 "HPP: Parsing Comment\n");
5725#endif
5726 htmlParseComment(ctxt);
5727 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005728 } else if ((cur == '<') && (next == '?')) {
5729 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005730 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005731 goto done;
5732#ifdef DEBUG_PUSH
5733 xmlGenericError(xmlGenericErrorContext,
5734 "HPP: Parsing PI\n");
5735#endif
5736 htmlParsePI(ctxt);
5737 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005738 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5739 goto done;
5740 } else if ((cur == '<') && (next == '/')) {
5741 ctxt->instate = XML_PARSER_END_TAG;
5742 ctxt->checkIndex = 0;
5743#ifdef DEBUG_PUSH
5744 xmlGenericError(xmlGenericErrorContext,
5745 "HPP: entering END_TAG\n");
5746#endif
5747 break;
5748 } else if (cur == '<') {
5749 ctxt->instate = XML_PARSER_START_TAG;
5750 ctxt->checkIndex = 0;
5751#ifdef DEBUG_PUSH
5752 xmlGenericError(xmlGenericErrorContext,
5753 "HPP: entering START_TAG\n");
5754#endif
5755 break;
5756 } else if (cur == '&') {
5757 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005758 (htmlParseLookupChars(ctxt,
5759 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005760 goto done;
5761#ifdef DEBUG_PUSH
5762 xmlGenericError(xmlGenericErrorContext,
5763 "HPP: Parsing Reference\n");
5764#endif
5765 /* TODO: check generation of subtrees if noent !!! */
5766 htmlParseReference(ctxt);
5767 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005768 /*
5769 * check that the text sequence is complete
5770 * before handing out the data to the parser
5771 * to avoid problems with erroneous end of
5772 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005773 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005774 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005775 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005776 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005777 ctxt->checkIndex = 0;
5778#ifdef DEBUG_PUSH
5779 xmlGenericError(xmlGenericErrorContext,
5780 "HPP: Parsing char data\n");
5781#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005782 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005783 }
5784 }
5785 if (cons == ctxt->nbChars) {
5786 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005787 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5788 "detected an error in element content\n",
5789 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005790 }
5791 NEXT;
5792 break;
5793 }
5794
5795 break;
5796 }
5797 case XML_PARSER_END_TAG:
5798 if (avail < 2)
5799 goto done;
5800 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005801 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005802 goto done;
5803 htmlParseEndTag(ctxt);
5804 if (ctxt->nameNr == 0) {
5805 ctxt->instate = XML_PARSER_EPILOG;
5806 } else {
5807 ctxt->instate = XML_PARSER_CONTENT;
5808 }
5809 ctxt->checkIndex = 0;
5810#ifdef DEBUG_PUSH
5811 xmlGenericError(xmlGenericErrorContext,
5812 "HPP: entering CONTENT\n");
5813#endif
5814 break;
5815 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005816 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5817 "HPP: internal error, state == CDATA\n",
5818 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005819 ctxt->instate = XML_PARSER_CONTENT;
5820 ctxt->checkIndex = 0;
5821#ifdef DEBUG_PUSH
5822 xmlGenericError(xmlGenericErrorContext,
5823 "HPP: entering CONTENT\n");
5824#endif
5825 break;
5826 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005827 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5828 "HPP: internal error, state == DTD\n",
5829 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005830 ctxt->instate = XML_PARSER_CONTENT;
5831 ctxt->checkIndex = 0;
5832#ifdef DEBUG_PUSH
5833 xmlGenericError(xmlGenericErrorContext,
5834 "HPP: entering CONTENT\n");
5835#endif
5836 break;
5837 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005838 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5839 "HPP: internal error, state == COMMENT\n",
5840 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005841 ctxt->instate = XML_PARSER_CONTENT;
5842 ctxt->checkIndex = 0;
5843#ifdef DEBUG_PUSH
5844 xmlGenericError(xmlGenericErrorContext,
5845 "HPP: entering CONTENT\n");
5846#endif
5847 break;
5848 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005849 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5850 "HPP: internal error, state == PI\n",
5851 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005852 ctxt->instate = XML_PARSER_CONTENT;
5853 ctxt->checkIndex = 0;
5854#ifdef DEBUG_PUSH
5855 xmlGenericError(xmlGenericErrorContext,
5856 "HPP: entering CONTENT\n");
5857#endif
5858 break;
5859 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005860 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5861 "HPP: internal error, state == ENTITY_DECL\n",
5862 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005863 ctxt->instate = XML_PARSER_CONTENT;
5864 ctxt->checkIndex = 0;
5865#ifdef DEBUG_PUSH
5866 xmlGenericError(xmlGenericErrorContext,
5867 "HPP: entering CONTENT\n");
5868#endif
5869 break;
5870 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005871 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5872 "HPP: internal error, state == ENTITY_VALUE\n",
5873 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005874 ctxt->instate = XML_PARSER_CONTENT;
5875 ctxt->checkIndex = 0;
5876#ifdef DEBUG_PUSH
5877 xmlGenericError(xmlGenericErrorContext,
5878 "HPP: entering DTD\n");
5879#endif
5880 break;
5881 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005882 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5883 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5884 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005885 ctxt->instate = XML_PARSER_START_TAG;
5886 ctxt->checkIndex = 0;
5887#ifdef DEBUG_PUSH
5888 xmlGenericError(xmlGenericErrorContext,
5889 "HPP: entering START_TAG\n");
5890#endif
5891 break;
5892 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005893 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5894 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5895 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005896 ctxt->instate = XML_PARSER_CONTENT;
5897 ctxt->checkIndex = 0;
5898#ifdef DEBUG_PUSH
5899 xmlGenericError(xmlGenericErrorContext,
5900 "HPP: entering CONTENT\n");
5901#endif
5902 break;
5903 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00005904 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5905 "HPP: internal error, state == XML_PARSER_IGNORE\n",
5906 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005907 ctxt->instate = XML_PARSER_CONTENT;
5908 ctxt->checkIndex = 0;
5909#ifdef DEBUG_PUSH
5910 xmlGenericError(xmlGenericErrorContext,
5911 "HPP: entering CONTENT\n");
5912#endif
5913 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005914 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005915 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5916 "HPP: internal error, state == XML_PARSER_LITERAL\n",
5917 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00005918 ctxt->instate = XML_PARSER_CONTENT;
5919 ctxt->checkIndex = 0;
5920#ifdef DEBUG_PUSH
5921 xmlGenericError(xmlGenericErrorContext,
5922 "HPP: entering CONTENT\n");
5923#endif
5924 break;
5925
Owen Taylor3473f882001-02-23 17:55:21 +00005926 }
5927 }
Daniel Veillarde77db162009-08-22 11:32:38 +02005928done:
Owen Taylor3473f882001-02-23 17:55:21 +00005929 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005930 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005931 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005932 /*
5933 * SAX: end of the document processing.
5934 */
5935 ctxt->instate = XML_PARSER_EOF;
5936 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5937 ctxt->sax->endDocument(ctxt->userData);
5938 }
5939 }
5940 if ((ctxt->myDoc != NULL) &&
5941 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5942 (ctxt->instate == XML_PARSER_EPILOG))) {
5943 xmlDtdPtr dtd;
5944 dtd = xmlGetIntSubset(ctxt->myDoc);
5945 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02005946 ctxt->myDoc->intSubset =
5947 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00005948 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5949 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5950 }
5951#ifdef DEBUG_PUSH
5952 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5953#endif
5954 return(ret);
5955}
5956
5957/**
Owen Taylor3473f882001-02-23 17:55:21 +00005958 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00005959 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00005960 * @chunk: an char array
5961 * @size: the size in byte of the chunk
5962 * @terminate: last chunk indicator
5963 *
5964 * Parse a Chunk of memory
5965 *
5966 * Returns zero if no error, the xmlParserErrors otherwise.
5967 */
5968int
5969htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5970 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00005971 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5972 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5973 "htmlParseChunk: context error\n", NULL, NULL);
5974 return(XML_ERR_INTERNAL_ERROR);
5975 }
Owen Taylor3473f882001-02-23 17:55:21 +00005976 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5977 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
Daniel Veillard00ac0d32012-07-16 18:03:01 +08005978 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
5979 size_t cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00005980 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02005981
5982 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00005983 if (res < 0) {
5984 ctxt->errNo = XML_PARSER_EOF;
5985 ctxt->disableSAX = 1;
5986 return (XML_PARSER_EOF);
5987 }
Daniel Veillard00ac0d32012-07-16 18:03:01 +08005988 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Owen Taylor3473f882001-02-23 17:55:21 +00005989#ifdef DEBUG_PUSH
5990 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5991#endif
5992
Daniel Veillard14f752c2003-08-09 11:44:50 +00005993#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00005994 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5995 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00005996#endif
Owen Taylor3473f882001-02-23 17:55:21 +00005997 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005998 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5999 xmlParserInputBufferPtr in = ctxt->input->buf;
6000 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6001 (in->raw != NULL)) {
6002 int nbchars;
Daniel Veillarde77db162009-08-22 11:32:38 +02006003
Daniel Veillarda78d8032012-07-16 14:56:50 +08006004 nbchars = xmlCharEncInput(in);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006005 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006006 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6007 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006008 return(XML_ERR_INVALID_ENCODING);
6009 }
6010 }
6011 }
Owen Taylor3473f882001-02-23 17:55:21 +00006012 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00006013 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00006014 if (terminate) {
6015 if ((ctxt->instate != XML_PARSER_EOF) &&
6016 (ctxt->instate != XML_PARSER_EPILOG) &&
6017 (ctxt->instate != XML_PARSER_MISC)) {
6018 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00006019 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02006020 }
Owen Taylor3473f882001-02-23 17:55:21 +00006021 if (ctxt->instate != XML_PARSER_EOF) {
6022 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6023 ctxt->sax->endDocument(ctxt->userData);
6024 }
6025 ctxt->instate = XML_PARSER_EOF;
6026 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006027 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00006028}
6029
6030/************************************************************************
6031 * *
6032 * User entry points *
6033 * *
6034 ************************************************************************/
6035
6036/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006037 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006038 * @sax: a SAX handler
6039 * @user_data: The user data returned on SAX callbacks
6040 * @chunk: a pointer to an array of chars
6041 * @size: number of chars in the array
6042 * @filename: an optional file name or URI
6043 * @enc: an optional encoding
6044 *
6045 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00006046 * The value of @filename is used for fetching external entities
6047 * and error/warning reports.
6048 *
6049 * Returns the new parser context or NULL
6050 */
6051htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006052htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00006053 const char *chunk, int size, const char *filename,
6054 xmlCharEncoding enc) {
6055 htmlParserCtxtPtr ctxt;
6056 htmlParserInputPtr inputStream;
6057 xmlParserInputBufferPtr buf;
6058
Daniel Veillardd0463562001-10-13 09:15:48 +00006059 xmlInitParser();
6060
Owen Taylor3473f882001-02-23 17:55:21 +00006061 buf = xmlAllocParserInputBuffer(enc);
6062 if (buf == NULL) return(NULL);
6063
Daniel Veillardf403d292003-10-05 13:51:35 +00006064 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006065 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006066 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006067 return(NULL);
6068 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00006069 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6070 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00006071 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00006072 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00006073 xmlFree(ctxt->sax);
6074 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6075 if (ctxt->sax == NULL) {
6076 xmlFree(buf);
6077 xmlFree(ctxt);
6078 return(NULL);
6079 }
6080 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6081 if (user_data != NULL)
6082 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02006083 }
Owen Taylor3473f882001-02-23 17:55:21 +00006084 if (filename == NULL) {
6085 ctxt->directory = NULL;
6086 } else {
6087 ctxt->directory = xmlParserGetDirectory(filename);
6088 }
6089
6090 inputStream = htmlNewInputStream(ctxt);
6091 if (inputStream == NULL) {
6092 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00006093 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006094 return(NULL);
6095 }
6096
6097 if (filename == NULL)
6098 inputStream->filename = NULL;
6099 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00006100 inputStream->filename = (char *)
6101 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00006102 inputStream->buf = buf;
Daniel Veillard61551a12012-07-16 16:28:47 +08006103 xmlBufResetInput(buf->buffer, inputStream);
Owen Taylor3473f882001-02-23 17:55:21 +00006104
6105 inputPush(ctxt, inputStream);
6106
6107 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02006108 (ctxt->input->buf != NULL)) {
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006109 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6110 size_t cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillard5f704af2003-03-05 10:01:43 +00006111
Daniel Veillarde77db162009-08-22 11:32:38 +02006112 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00006113
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006114 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Owen Taylor3473f882001-02-23 17:55:21 +00006115#ifdef DEBUG_PUSH
6116 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6117#endif
6118 }
Daniel Veillard68716a72006-10-16 09:32:17 +00006119 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00006120
6121 return(ctxt);
6122}
William M. Brack21e4ef22005-01-02 09:53:13 +00006123#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00006124
6125/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006126 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006127 * @cur: a pointer to an array of xmlChar
6128 * @encoding: a free form C string describing the HTML document encoding, or NULL
6129 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006130 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006131 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006132 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6133 * to handle parse events. If sax is NULL, fallback to the default DOM
6134 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006135 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006136 * Returns the resulting document tree unless SAX is NULL or the document is
6137 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006138 */
6139
6140htmlDocPtr
6141htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6142 htmlDocPtr ret;
6143 htmlParserCtxtPtr ctxt;
6144
Daniel Veillardd0463562001-10-13 09:15:48 +00006145 xmlInitParser();
6146
Owen Taylor3473f882001-02-23 17:55:21 +00006147 if (cur == NULL) return(NULL);
6148
6149
6150 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6151 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02006152 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00006153 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00006154 ctxt->sax = sax;
6155 ctxt->userData = userData;
6156 }
6157
6158 htmlParseDocument(ctxt);
6159 ret = ctxt->myDoc;
6160 if (sax != NULL) {
6161 ctxt->sax = NULL;
6162 ctxt->userData = NULL;
6163 }
6164 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006165
Owen Taylor3473f882001-02-23 17:55:21 +00006166 return(ret);
6167}
6168
6169/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006170 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006171 * @cur: a pointer to an array of xmlChar
6172 * @encoding: a free form C string describing the HTML document encoding, or NULL
6173 *
6174 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006175 *
Owen Taylor3473f882001-02-23 17:55:21 +00006176 * Returns the resulting document tree
6177 */
6178
6179htmlDocPtr
6180htmlParseDoc(xmlChar *cur, const char *encoding) {
6181 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6182}
6183
6184
6185/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006186 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006187 * @filename: the filename
6188 * @encoding: a free form C string describing the HTML document encoding, or NULL
6189 *
Daniel Veillarde77db162009-08-22 11:32:38 +02006190 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00006191 * Automatic support for ZLIB/Compress compressed document is provided
6192 * by default if found at compile-time.
6193 *
6194 * Returns the new parser context or NULL
6195 */
6196htmlParserCtxtPtr
6197htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6198{
6199 htmlParserCtxtPtr ctxt;
6200 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006201 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00006202 /* htmlCharEncoding enc; */
6203 xmlChar *content, *content_line = (xmlChar *) "charset=";
6204
Daniel Veillarda03e3652004-11-02 18:45:30 +00006205 if (filename == NULL)
6206 return(NULL);
6207
Daniel Veillardf403d292003-10-05 13:51:35 +00006208 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006209 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00006210 return(NULL);
6211 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006212 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6213 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00006214#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006215 if (xmlDefaultSAXHandler.error != NULL) {
6216 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6217 }
Daniel Veillard87247e82004-01-13 20:42:02 +00006218#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00006219 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00006220 return(NULL);
6221 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006222
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006223 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6224 xmlFree(canonicFilename);
6225 if (inputStream == NULL) {
6226 xmlFreeParserCtxt(ctxt);
6227 return(NULL);
6228 }
Owen Taylor3473f882001-02-23 17:55:21 +00006229
6230 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006231
Owen Taylor3473f882001-02-23 17:55:21 +00006232 /* set encoding */
6233 if (encoding) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00006234 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
Daniel Veillarde77db162009-08-22 11:32:38 +02006235 if (content) {
Owen Taylor3473f882001-02-23 17:55:21 +00006236 strcpy ((char *)content, (char *)content_line);
6237 strcat ((char *)content, (char *)encoding);
6238 htmlCheckEncoding (ctxt, content);
6239 xmlFree (content);
6240 }
6241 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006242
Owen Taylor3473f882001-02-23 17:55:21 +00006243 return(ctxt);
6244}
6245
6246/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006247 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006248 * @filename: the filename
6249 * @encoding: a free form C string describing the HTML document encoding, or NULL
6250 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006251 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006252 *
6253 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6254 * compressed document is provided by default if found at compile-time.
6255 * It use the given SAX function block to handle the parsing callback.
6256 * If sax is NULL, fallback to the default DOM tree building routines.
6257 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006258 * Returns the resulting document tree unless SAX is NULL or the document is
6259 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006260 */
6261
6262htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006263htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00006264 void *userData) {
6265 htmlDocPtr ret;
6266 htmlParserCtxtPtr ctxt;
6267 htmlSAXHandlerPtr oldsax = NULL;
6268
Daniel Veillardd0463562001-10-13 09:15:48 +00006269 xmlInitParser();
6270
Owen Taylor3473f882001-02-23 17:55:21 +00006271 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6272 if (ctxt == NULL) return(NULL);
6273 if (sax != NULL) {
6274 oldsax = ctxt->sax;
6275 ctxt->sax = sax;
6276 ctxt->userData = userData;
6277 }
6278
6279 htmlParseDocument(ctxt);
6280
6281 ret = ctxt->myDoc;
6282 if (sax != NULL) {
6283 ctxt->sax = oldsax;
6284 ctxt->userData = NULL;
6285 }
6286 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006287
Owen Taylor3473f882001-02-23 17:55:21 +00006288 return(ret);
6289}
6290
6291/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006292 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006293 * @filename: the filename
6294 * @encoding: a free form C string describing the HTML document encoding, or NULL
6295 *
6296 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6297 * compressed document is provided by default if found at compile-time.
6298 *
6299 * Returns the resulting document tree
6300 */
6301
6302htmlDocPtr
6303htmlParseFile(const char *filename, const char *encoding) {
6304 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6305}
6306
6307/**
6308 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02006309 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00006310 *
6311 * Set and return the previous value for handling HTML omitted tags.
6312 *
6313 * Returns the last value for 0 for no handling, 1 for auto insertion.
6314 */
6315
6316int
6317htmlHandleOmittedElem(int val) {
6318 int old = htmlOmittedDefaultValue;
6319
6320 htmlOmittedDefaultValue = val;
6321 return(old);
6322}
6323
Daniel Veillard930dfb62003-02-05 10:17:38 +00006324/**
6325 * htmlElementAllowedHere:
6326 * @parent: HTML parent element
6327 * @elt: HTML element
6328 *
6329 * Checks whether an HTML element may be a direct child of a parent element.
6330 * Note - doesn't check for deprecated elements
6331 *
6332 * Returns 1 if allowed; 0 otherwise.
6333 */
6334int
6335htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6336 const char** p ;
6337
6338 if ( ! elt || ! parent || ! parent->subelts )
6339 return 0 ;
6340
6341 for ( p = parent->subelts; *p; ++p )
6342 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6343 return 1 ;
6344
6345 return 0 ;
6346}
6347/**
6348 * htmlElementStatusHere:
6349 * @parent: HTML parent element
6350 * @elt: HTML element
6351 *
6352 * Checks whether an HTML element may be a direct child of a parent element.
6353 * and if so whether it is valid or deprecated.
6354 *
6355 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6356 */
6357htmlStatus
6358htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6359 if ( ! parent || ! elt )
6360 return HTML_INVALID ;
6361 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6362 return HTML_INVALID ;
6363
6364 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6365}
6366/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006367 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00006368 * @elt: HTML element
6369 * @attr: HTML attribute
6370 * @legacy: whether to allow deprecated attributes
6371 *
6372 * Checks whether an attribute is valid for an element
6373 * Has full knowledge of Required and Deprecated attributes
6374 *
6375 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6376 */
6377htmlStatus
6378htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6379 const char** p ;
6380
6381 if ( !elt || ! attr )
6382 return HTML_INVALID ;
6383
6384 if ( elt->attrs_req )
6385 for ( p = elt->attrs_req; *p; ++p)
6386 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6387 return HTML_REQUIRED ;
6388
6389 if ( elt->attrs_opt )
6390 for ( p = elt->attrs_opt; *p; ++p)
6391 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6392 return HTML_VALID ;
6393
6394 if ( legacy && elt->attrs_depr )
6395 for ( p = elt->attrs_depr; *p; ++p)
6396 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6397 return HTML_DEPRECATED ;
6398
6399 return HTML_INVALID ;
6400}
6401/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006402 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006403 * @node: an htmlNodePtr in a tree
6404 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006405 * for Element nodes)
6406 *
6407 * Checks whether the tree node is valid. Experimental (the author
6408 * only uses the HTML enhancements in a SAX parser)
6409 *
6410 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6411 * legacy allowed) or htmlElementStatusHere (otherwise).
6412 * for Attribute nodes, a return from htmlAttrAllowed
6413 * for other nodes, HTML_NA (no checks performed)
6414 */
6415htmlStatus
6416htmlNodeStatus(const htmlNodePtr node, int legacy) {
6417 if ( ! node )
6418 return HTML_INVALID ;
6419
6420 switch ( node->type ) {
6421 case XML_ELEMENT_NODE:
6422 return legacy
6423 ? ( htmlElementAllowedHere (
6424 htmlTagLookup(node->parent->name) , node->name
6425 ) ? HTML_VALID : HTML_INVALID )
6426 : htmlElementStatusHere(
6427 htmlTagLookup(node->parent->name) ,
6428 htmlTagLookup(node->name) )
6429 ;
6430 case XML_ATTRIBUTE_NODE:
6431 return htmlAttrAllowed(
6432 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6433 default: return HTML_NA ;
6434 }
6435}
Daniel Veillard9475a352003-09-26 12:47:50 +00006436/************************************************************************
6437 * *
6438 * New set (2.6.0) of simpler and more flexible APIs *
6439 * *
6440 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used. The macro expands to a single statement, so it is written
 * with a trailing semicolon at the point of use and is safe inside
 * unbraced if/else bodies.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6452
6453/**
6454 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006455 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006456 *
6457 * Reset a parser context
6458 */
6459void
6460htmlCtxtReset(htmlParserCtxtPtr ctxt)
6461{
6462 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006463 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006464
Daniel Veillarda03e3652004-11-02 18:45:30 +00006465 if (ctxt == NULL)
6466 return;
6467
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006468 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006469 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006470
6471 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6472 xmlFreeInputStream(input);
6473 }
6474 ctxt->inputNr = 0;
6475 ctxt->input = NULL;
6476
6477 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006478 if (ctxt->spaceTab != NULL) {
6479 ctxt->spaceTab[0] = -1;
6480 ctxt->space = &ctxt->spaceTab[0];
6481 } else {
6482 ctxt->space = NULL;
6483 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006484
6485
6486 ctxt->nodeNr = 0;
6487 ctxt->node = NULL;
6488
6489 ctxt->nameNr = 0;
6490 ctxt->name = NULL;
6491
6492 DICT_FREE(ctxt->version);
6493 ctxt->version = NULL;
6494 DICT_FREE(ctxt->encoding);
6495 ctxt->encoding = NULL;
6496 DICT_FREE(ctxt->directory);
6497 ctxt->directory = NULL;
6498 DICT_FREE(ctxt->extSubURI);
6499 ctxt->extSubURI = NULL;
6500 DICT_FREE(ctxt->extSubSystem);
6501 ctxt->extSubSystem = NULL;
6502 if (ctxt->myDoc != NULL)
6503 xmlFreeDoc(ctxt->myDoc);
6504 ctxt->myDoc = NULL;
6505
6506 ctxt->standalone = -1;
6507 ctxt->hasExternalSubset = 0;
6508 ctxt->hasPErefs = 0;
6509 ctxt->html = 1;
6510 ctxt->external = 0;
6511 ctxt->instate = XML_PARSER_START;
6512 ctxt->token = 0;
6513
6514 ctxt->wellFormed = 1;
6515 ctxt->nsWellFormed = 1;
Daniel Veillard8ad29302010-10-28 11:51:22 +02006516 ctxt->disableSAX = 0;
Daniel Veillard9475a352003-09-26 12:47:50 +00006517 ctxt->valid = 1;
6518 ctxt->vctxt.userData = ctxt;
6519 ctxt->vctxt.error = xmlParserValidityError;
6520 ctxt->vctxt.warning = xmlParserValidityWarning;
6521 ctxt->record_info = 0;
6522 ctxt->nbChars = 0;
6523 ctxt->checkIndex = 0;
6524 ctxt->inSubset = 0;
6525 ctxt->errNo = XML_ERR_OK;
6526 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006527 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006528 ctxt->catalogs = NULL;
6529 xmlInitNodeInfoSeq(&ctxt->node_seq);
6530
6531 if (ctxt->attsDefault != NULL) {
6532 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6533 ctxt->attsDefault = NULL;
6534 }
6535 if (ctxt->attsSpecial != NULL) {
6536 xmlHashFree(ctxt->attsSpecial, NULL);
6537 ctxt->attsSpecial = NULL;
6538 }
6539}
6540
6541/**
6542 * htmlCtxtUseOptions:
6543 * @ctxt: an HTML parser context
6544 * @options: a combination of htmlParserOption(s)
6545 *
6546 * Applies the options to the parser context
6547 *
6548 * Returns 0 in case of success, the set of unknown or unimplemented options
6549 * in case of error.
6550 */
6551int
6552htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6553{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006554 if (ctxt == NULL)
6555 return(-1);
6556
Daniel Veillard9475a352003-09-26 12:47:50 +00006557 if (options & HTML_PARSE_NOWARNING) {
6558 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006559 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006560 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006561 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006562 }
6563 if (options & HTML_PARSE_NOERROR) {
6564 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006565 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006566 ctxt->sax->fatalError = NULL;
6567 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006568 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006569 }
6570 if (options & HTML_PARSE_PEDANTIC) {
6571 ctxt->pedantic = 1;
6572 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006573 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006574 } else
6575 ctxt->pedantic = 0;
6576 if (options & XML_PARSE_NOBLANKS) {
6577 ctxt->keepBlanks = 0;
6578 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6579 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006580 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006581 } else
6582 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006583 if (options & HTML_PARSE_RECOVER) {
6584 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006585 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006586 } else
6587 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006588 if (options & HTML_PARSE_COMPACT) {
6589 ctxt->options |= HTML_PARSE_COMPACT;
6590 options -= HTML_PARSE_COMPACT;
6591 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006592 if (options & XML_PARSE_HUGE) {
6593 ctxt->options |= XML_PARSE_HUGE;
6594 options -= XML_PARSE_HUGE;
6595 }
Daniel Veillardf1121c42010-07-26 14:02:42 +02006596 if (options & HTML_PARSE_NODEFDTD) {
6597 ctxt->options |= HTML_PARSE_NODEFDTD;
6598 options -= HTML_PARSE_NODEFDTD;
6599 }
Daniel Veillardc62efc82011-05-16 16:03:50 +08006600 if (options & HTML_PARSE_IGNORE_ENC) {
6601 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6602 options -= HTML_PARSE_IGNORE_ENC;
6603 }
Martin Schröderb91111b2012-05-10 18:52:37 +08006604 if (options & HTML_PARSE_NOIMPLIED) {
6605 ctxt->options |= HTML_PARSE_NOIMPLIED;
6606 options -= HTML_PARSE_NOIMPLIED;
6607 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006608 ctxt->dictNames = 0;
6609 return (options);
6610}
6611
6612/**
6613 * htmlDoRead:
6614 * @ctxt: an HTML parser context
6615 * @URL: the base URL to use for the document
6616 * @encoding: the document encoding, or NULL
6617 * @options: a combination of htmlParserOption(s)
6618 * @reuse: keep the context for reuse
6619 *
6620 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006621 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006622 * Returns the resulting document tree or NULL
6623 */
6624static htmlDocPtr
6625htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6626 int options, int reuse)
6627{
6628 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006629
Daniel Veillard9475a352003-09-26 12:47:50 +00006630 htmlCtxtUseOptions(ctxt, options);
6631 ctxt->html = 1;
6632 if (encoding != NULL) {
6633 xmlCharEncodingHandlerPtr hdlr;
6634
6635 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006636 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006637 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006638 if (ctxt->input->encoding != NULL)
6639 xmlFree((xmlChar *) ctxt->input->encoding);
6640 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6641 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006642 }
6643 if ((URL != NULL) && (ctxt->input != NULL) &&
6644 (ctxt->input->filename == NULL))
6645 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6646 htmlParseDocument(ctxt);
6647 ret = ctxt->myDoc;
6648 ctxt->myDoc = NULL;
6649 if (!reuse) {
6650 if ((ctxt->dictNames) &&
6651 (ret != NULL) &&
6652 (ret->dict == ctxt->dict))
6653 ctxt->dict = NULL;
6654 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006655 }
6656 return (ret);
6657}
6658
6659/**
6660 * htmlReadDoc:
6661 * @cur: a pointer to a zero terminated string
6662 * @URL: the base URL to use for the document
6663 * @encoding: the document encoding, or NULL
6664 * @options: a combination of htmlParserOption(s)
6665 *
6666 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006667 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006668 * Returns the resulting document tree
6669 */
6670htmlDocPtr
6671htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6672{
6673 htmlParserCtxtPtr ctxt;
6674
6675 if (cur == NULL)
6676 return (NULL);
6677
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006678 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006679 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006680 if (ctxt == NULL)
6681 return (NULL);
6682 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6683}
6684
6685/**
6686 * htmlReadFile:
6687 * @filename: a file or URL
6688 * @encoding: the document encoding, or NULL
6689 * @options: a combination of htmlParserOption(s)
6690 *
6691 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006692 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006693 * Returns the resulting document tree
6694 */
6695htmlDocPtr
6696htmlReadFile(const char *filename, const char *encoding, int options)
6697{
6698 htmlParserCtxtPtr ctxt;
6699
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006700 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006701 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6702 if (ctxt == NULL)
6703 return (NULL);
6704 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6705}
6706
6707/**
6708 * htmlReadMemory:
6709 * @buffer: a pointer to a char array
6710 * @size: the size of the array
6711 * @URL: the base URL to use for the document
6712 * @encoding: the document encoding, or NULL
6713 * @options: a combination of htmlParserOption(s)
6714 *
6715 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006716 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006717 * Returns the resulting document tree
6718 */
6719htmlDocPtr
6720htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6721{
6722 htmlParserCtxtPtr ctxt;
6723
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006724 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006725 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6726 if (ctxt == NULL)
6727 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006728 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006729 if (ctxt->sax != NULL)
6730 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006731 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6732}
6733
6734/**
6735 * htmlReadFd:
6736 * @fd: an open file descriptor
6737 * @URL: the base URL to use for the document
6738 * @encoding: the document encoding, or NULL
6739 * @options: a combination of htmlParserOption(s)
6740 *
6741 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006742 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006743 * Returns the resulting document tree
6744 */
6745htmlDocPtr
6746htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6747{
6748 htmlParserCtxtPtr ctxt;
6749 xmlParserInputBufferPtr input;
6750 xmlParserInputPtr stream;
6751
6752 if (fd < 0)
6753 return (NULL);
6754
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006755 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006756 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6757 if (input == NULL)
6758 return (NULL);
6759 ctxt = xmlNewParserCtxt();
6760 if (ctxt == NULL) {
6761 xmlFreeParserInputBuffer(input);
6762 return (NULL);
6763 }
6764 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6765 if (stream == NULL) {
6766 xmlFreeParserInputBuffer(input);
6767 xmlFreeParserCtxt(ctxt);
6768 return (NULL);
6769 }
6770 inputPush(ctxt, stream);
6771 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6772}
6773
6774/**
6775 * htmlReadIO:
6776 * @ioread: an I/O read function
6777 * @ioclose: an I/O close function
6778 * @ioctx: an I/O handler
6779 * @URL: the base URL to use for the document
6780 * @encoding: the document encoding, or NULL
6781 * @options: a combination of htmlParserOption(s)
6782 *
6783 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006784 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006785 * Returns the resulting document tree
6786 */
6787htmlDocPtr
6788htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6789 void *ioctx, const char *URL, const char *encoding, int options)
6790{
6791 htmlParserCtxtPtr ctxt;
6792 xmlParserInputBufferPtr input;
6793 xmlParserInputPtr stream;
6794
6795 if (ioread == NULL)
6796 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006797 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006798
6799 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6800 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006801 if (input == NULL) {
6802 if (ioclose != NULL)
6803 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00006804 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006805 }
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006806 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006807 if (ctxt == NULL) {
6808 xmlFreeParserInputBuffer(input);
6809 return (NULL);
6810 }
6811 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6812 if (stream == NULL) {
6813 xmlFreeParserInputBuffer(input);
6814 xmlFreeParserCtxt(ctxt);
6815 return (NULL);
6816 }
6817 inputPush(ctxt, stream);
6818 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6819}
6820
6821/**
6822 * htmlCtxtReadDoc:
6823 * @ctxt: an HTML parser context
6824 * @cur: a pointer to a zero terminated string
6825 * @URL: the base URL to use for the document
6826 * @encoding: the document encoding, or NULL
6827 * @options: a combination of htmlParserOption(s)
6828 *
6829 * parse an XML in-memory document and build a tree.
6830 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006831 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006832 * Returns the resulting document tree
6833 */
6834htmlDocPtr
6835htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6836 const char *URL, const char *encoding, int options)
6837{
6838 xmlParserInputPtr stream;
6839
6840 if (cur == NULL)
6841 return (NULL);
6842 if (ctxt == NULL)
6843 return (NULL);
6844
6845 htmlCtxtReset(ctxt);
6846
6847 stream = xmlNewStringInputStream(ctxt, cur);
6848 if (stream == NULL) {
6849 return (NULL);
6850 }
6851 inputPush(ctxt, stream);
6852 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6853}
6854
6855/**
6856 * htmlCtxtReadFile:
6857 * @ctxt: an HTML parser context
6858 * @filename: a file or URL
6859 * @encoding: the document encoding, or NULL
6860 * @options: a combination of htmlParserOption(s)
6861 *
6862 * parse an XML file from the filesystem or the network.
6863 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006864 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006865 * Returns the resulting document tree
6866 */
6867htmlDocPtr
6868htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6869 const char *encoding, int options)
6870{
6871 xmlParserInputPtr stream;
6872
6873 if (filename == NULL)
6874 return (NULL);
6875 if (ctxt == NULL)
6876 return (NULL);
6877
6878 htmlCtxtReset(ctxt);
6879
Daniel Veillard29614c72004-11-26 10:47:26 +00006880 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006881 if (stream == NULL) {
6882 return (NULL);
6883 }
6884 inputPush(ctxt, stream);
6885 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6886}
6887
6888/**
6889 * htmlCtxtReadMemory:
6890 * @ctxt: an HTML parser context
6891 * @buffer: a pointer to a char array
6892 * @size: the size of the array
6893 * @URL: the base URL to use for the document
6894 * @encoding: the document encoding, or NULL
6895 * @options: a combination of htmlParserOption(s)
6896 *
6897 * parse an XML in-memory document and build a tree.
6898 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006899 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006900 * Returns the resulting document tree
6901 */
6902htmlDocPtr
6903htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6904 const char *URL, const char *encoding, int options)
6905{
6906 xmlParserInputBufferPtr input;
6907 xmlParserInputPtr stream;
6908
6909 if (ctxt == NULL)
6910 return (NULL);
6911 if (buffer == NULL)
6912 return (NULL);
6913
6914 htmlCtxtReset(ctxt);
6915
6916 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6917 if (input == NULL) {
6918 return(NULL);
6919 }
6920
6921 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6922 if (stream == NULL) {
6923 xmlFreeParserInputBuffer(input);
6924 return(NULL);
6925 }
6926
6927 inputPush(ctxt, stream);
6928 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6929}
6930
6931/**
6932 * htmlCtxtReadFd:
6933 * @ctxt: an HTML parser context
6934 * @fd: an open file descriptor
6935 * @URL: the base URL to use for the document
6936 * @encoding: the document encoding, or NULL
6937 * @options: a combination of htmlParserOption(s)
6938 *
6939 * parse an XML from a file descriptor and build a tree.
6940 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006941 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006942 * Returns the resulting document tree
6943 */
6944htmlDocPtr
6945htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6946 const char *URL, const char *encoding, int options)
6947{
6948 xmlParserInputBufferPtr input;
6949 xmlParserInputPtr stream;
6950
6951 if (fd < 0)
6952 return (NULL);
6953 if (ctxt == NULL)
6954 return (NULL);
6955
6956 htmlCtxtReset(ctxt);
6957
6958
6959 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6960 if (input == NULL)
6961 return (NULL);
6962 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6963 if (stream == NULL) {
6964 xmlFreeParserInputBuffer(input);
6965 return (NULL);
6966 }
6967 inputPush(ctxt, stream);
6968 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6969}
6970
6971/**
6972 * htmlCtxtReadIO:
6973 * @ctxt: an HTML parser context
6974 * @ioread: an I/O read function
6975 * @ioclose: an I/O close function
6976 * @ioctx: an I/O handler
6977 * @URL: the base URL to use for the document
6978 * @encoding: the document encoding, or NULL
6979 * @options: a combination of htmlParserOption(s)
6980 *
6981 * parse an HTML document from I/O functions and source and build a tree.
6982 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006983 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006984 * Returns the resulting document tree
6985 */
6986htmlDocPtr
6987htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6988 xmlInputCloseCallback ioclose, void *ioctx,
6989 const char *URL,
6990 const char *encoding, int options)
6991{
6992 xmlParserInputBufferPtr input;
6993 xmlParserInputPtr stream;
6994
6995 if (ioread == NULL)
6996 return (NULL);
6997 if (ctxt == NULL)
6998 return (NULL);
6999
7000 htmlCtxtReset(ctxt);
7001
7002 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7003 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007004 if (input == NULL) {
7005 if (ioclose != NULL)
7006 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00007007 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007008 }
Daniel Veillard9475a352003-09-26 12:47:50 +00007009 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7010 if (stream == NULL) {
7011 xmlFreeParserInputBuffer(input);
7012 return (NULL);
7013 }
7014 inputPush(ctxt, stream);
7015 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7016}
7017
Daniel Veillard5d4644e2005-04-01 13:11:58 +00007018#define bottom_HTMLparser
7019#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00007020#endif /* LIBXML_HTML_ENABLED */