blob: e4f816e5e7bfee05f0c6dcae6d1a84e6236de4a1 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
Daniel Veillarda78d8032012-07-16 14:56:50 +080047#include "buf.h"
48#include "enc.h"
49
/*
 * Numeric limits. Judging by their names they bound element-name length and
 * the parser's working buffers; their uses are outside this part of the
 * file -- TODO confirm against the callers.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* #define DEBUG */
/* #define DEBUG_PUSH */

/* File-local flag, initialised to 1. Its readers are not visible here. */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations for routines whose definitions are not visible here. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000062
63/************************************************************************
64 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020065 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000066 * *
67 ************************************************************************/
68
69/**
William M. Brackedb65a72004-02-06 07:36:04 +000070 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000071 * @ctxt: an HTML parser context
72 * @extra: extra informations
73 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
Daniel Veillard157fee02003-10-31 10:36:03 +000079 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000082 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000088 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000089 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000093 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000094 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
David Kilzer4472c3a2016-05-13 15:13:17 +0800108static void LIBXML_ATTR_FORMAT(3,0)
Daniel Veillardf403d292003-10-05 13:51:35 +0000109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
Daniel Veillard157fee02003-10-31 10:36:03 +0000112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000115 if (ctxt != NULL)
116 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
David Kilzer4472c3a2016-05-13 15:13:17 +0800135static void LIBXML_ATTR_FORMAT(3,0)
Daniel Veillardf403d292003-10-05 13:51:35 +0000136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
Daniel Veillard157fee02003-10-31 10:36:03 +0000139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000142 if (ctxt != NULL)
143 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000149}
150
151/************************************************************************
152 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200153 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000154 * *
155 ************************************************************************/
156
Daniel Veillard1c732d22002-11-30 11:22:59 +0000157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000165 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000168{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000175 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000176 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000180 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000199 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000200
Daniel Veillard1c732d22002-11-30 11:22:59 +0000201 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000205 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000211 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000212 return (ret);
213}
Owen Taylor3473f882001-02-23 17:55:21 +0000214
Eugene Pimenov615904f2010-03-15 15:16:02 +0100215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
Owen Taylor3473f882001-02-23 17:55:21 +0000268/*
269 * Macros for accessing the content. Those should be used only by the parser,
270 * and not exported.
271 *
272 * Dirty macros, i.e. one need to make assumption on the context to use them
273 *
274 * CUR_PTR return the current pointer to the xmlChar to be parsed.
275 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
276 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
277 * in UNICODE mode. This should be used internally by the parser
278 * only to compare to ASCII values otherwise it would break when
279 * running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
282 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
283 * it should be used only to compare on ASCII based substring.
284 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
Daniel Veillard77a90a72003-03-22 00:04:05 +0000285 * strings without newlines within the parser.
Owen Taylor3473f882001-02-23 17:55:21 +0000286 *
287 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
288 *
289 * CURRENT Returns the current char value, with the full decoding of
290 * UTF-8 if we are using this mode. It returns an int.
291 * NEXT Skip to the next character, this does the proper decoding
292 * in UTF-8 mode. It also pop-up unfinished entities on the fly.
Daniel Veillard77a90a72003-03-22 00:04:05 +0000293 * NEXTL(l) Skip the current unicode character of l xmlChars long.
Owen Taylor3473f882001-02-23 17:55:21 +0000294 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
295 */
296
/*
 * All macros below expect a variable named `ctxt` (the parser context) to
 * be in scope at the point of use.
 */

/* Uppercased value of the byte at the current input position. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input by val bytes, keeping the character and column
 * counters in step. Written as one parenthesised expression so it cannot
 * be split by operator precedence at the point of use.
 */
#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

/* The byte val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(val), uppercased. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * SHRINK and GROW are wrapped in do { } while (0) so that they behave as
 * single statements: the bare `if` they used to expand to would capture a
 * following `else` at the call site.
 */
#define SHRINK do {                                                       \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&       \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))          \
        xmlParserInputShrink(ctxt->input);                                \
  } while (0)

#define GROW do {                                                         \
    if ((ctxt->progressive == 0) &&                                       \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))              \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                     \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Step over the current character, which is l bytes long: update the
 * line/column position from the first byte, clear any pending token and
 * count one character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append the character v, whose encoded length is l bytes, to buffer b at
 * index i and advance i. Wrapped in do { } while (0) because the if/else
 * it expands to is otherwise unsafe next to a caller's own if/else.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
348
349/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363static xmlChar *
364htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399}
400
401/**
Owen Taylor3473f882001-02-23 17:55:21 +0000402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000406 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000412 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000413 */
414
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000415static int
Owen Taylor3473f882001-02-23 17:55:21 +0000416htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417 if (ctxt->instate == XML_PARSER_EOF)
418 return(0);
419
420 if (ctxt->token != 0) {
421 *len = 0;
422 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200423 }
Owen Taylor3473f882001-02-23 17:55:21 +0000424 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
425 /*
426 * We are supposed to handle UTF8, check it's valid
427 * From rfc2044: encoding of the Unicode values on UTF-8:
428 *
429 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
430 * 0000 0000-0000 007F 0xxxxxxx
431 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200432 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000433 *
434 * Check for the 0x110000 limit too
435 */
436 const unsigned char *cur = ctxt->input->cur;
437 unsigned char c;
438 unsigned int val;
439
440 c = *cur;
441 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200442 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000443 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200444 cur = ctxt->input->cur;
445 }
Owen Taylor3473f882001-02-23 17:55:21 +0000446 if ((cur[1] & 0xc0) != 0x80)
447 goto encoding_error;
448 if ((c & 0xe0) == 0xe0) {
449
Adiel Mittmann8a103792009-08-25 11:27:13 +0200450 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000451 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200452 cur = ctxt->input->cur;
453 }
Owen Taylor3473f882001-02-23 17:55:21 +0000454 if ((cur[2] & 0xc0) != 0x80)
455 goto encoding_error;
456 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200457 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000458 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200459 cur = ctxt->input->cur;
460 }
Owen Taylor3473f882001-02-23 17:55:21 +0000461 if (((c & 0xf8) != 0xf0) ||
462 ((cur[3] & 0xc0) != 0x80))
463 goto encoding_error;
464 /* 4-byte code */
465 *len = 4;
466 val = (cur[0] & 0x7) << 18;
467 val |= (cur[1] & 0x3f) << 12;
468 val |= (cur[2] & 0x3f) << 6;
469 val |= cur[3] & 0x3f;
470 } else {
471 /* 3-byte code */
472 *len = 3;
473 val = (cur[0] & 0xf) << 12;
474 val |= (cur[1] & 0x3f) << 6;
475 val |= cur[2] & 0x3f;
476 }
477 } else {
478 /* 2-byte code */
479 *len = 2;
480 val = (cur[0] & 0x1f) << 6;
481 val |= cur[1] & 0x3f;
482 }
483 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000484 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
485 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200486 }
Owen Taylor3473f882001-02-23 17:55:21 +0000487 return(val);
488 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200489 if ((*ctxt->input->cur == 0) &&
490 (ctxt->input->cur < ctxt->input->end)) {
491 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
492 "Char 0x%X out of allowed range\n", 0);
493 *len = 1;
494 return(' ');
495 }
Owen Taylor3473f882001-02-23 17:55:21 +0000496 /* 1-byte code */
497 *len = 1;
498 return((int) *ctxt->input->cur);
499 }
500 }
501 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000502 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000503 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000504 * XML constructs only use < 128 chars
505 */
506 *len = 1;
507 if ((int) *ctxt->input->cur < 0x80)
508 return((int) *ctxt->input->cur);
509
510 /*
511 * Humm this is bad, do an automatic flow conversion
512 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200513 {
514 xmlChar * guess;
515 xmlCharEncodingHandlerPtr handler;
516
517 guess = htmlFindEncoding(ctxt);
518 if (guess == NULL) {
519 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
520 } else {
521 if (ctxt->input->encoding != NULL)
522 xmlFree((xmlChar *) ctxt->input->encoding);
523 ctxt->input->encoding = guess;
524 handler = xmlFindCharEncodingHandler((const char *) guess);
525 if (handler != NULL) {
526 xmlSwitchToEncoding(ctxt, handler);
527 } else {
528 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
529 "Unsupported encoding %s", guess, NULL);
530 }
531 }
532 ctxt->charset = XML_CHAR_ENCODING_UTF8;
533 }
534
Owen Taylor3473f882001-02-23 17:55:21 +0000535 return(xmlCurrentChar(ctxt, len));
536
537encoding_error:
538 /*
539 * If we detect an UTF8 error that probably mean that the
540 * input encoding didn't get properly advertized in the
541 * declaration header. Report the error and switch the encoding
542 * to ISO-Latin-1 (if you don't like this policy, just declare the
543 * encoding !)
544 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000545 {
546 char buffer[150];
547
Daniel Veillard861101d2007-06-12 08:38:57 +0000548 if (ctxt->input->end - ctxt->input->cur >= 4) {
549 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
550 ctxt->input->cur[0], ctxt->input->cur[1],
551 ctxt->input->cur[2], ctxt->input->cur[3]);
552 } else {
553 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
554 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000555 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
556 "Input is not proper UTF-8, indicate encoding !\n",
557 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000558 }
559
Daniel Veillarde77db162009-08-22 11:32:38 +0200560 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000561 *len = 1;
562 return((int) *ctxt->input->cur);
563}
564
565/**
Owen Taylor3473f882001-02-23 17:55:21 +0000566 * htmlSkipBlankChars:
567 * @ctxt: the HTML parser context
568 *
569 * skip all blanks character found at that point in the input streams.
570 *
571 * Returns the number of space chars skipped
572 */
573
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000574static int
Owen Taylor3473f882001-02-23 17:55:21 +0000575htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
576 int res = 0;
577
William M. Brack76e95df2003-10-18 16:20:14 +0000578 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000579 if ((*ctxt->input->cur == 0) &&
580 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
581 xmlPopInput(ctxt);
582 } else {
583 if (*(ctxt->input->cur) == '\n') {
584 ctxt->input->line++; ctxt->input->col = 1;
585 } else ctxt->input->col++;
586 ctxt->input->cur++;
587 ctxt->nbChars++;
588 if (*ctxt->input->cur == 0)
589 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
590 }
591 res++;
592 }
593 return(res);
594}
595
596
597
598/************************************************************************
599 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200600 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000601 * *
602 ************************************************************************/
603
604/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
607 * 2 means it's forbidden (empty elements)
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000608 * 3 means the tag is stylistic and should be closed easily
Owen Taylor3473f882001-02-23 17:55:21 +0000609 * Depr: this element is deprecated
610 * DTD: 1 means that this element is valid only in the Loose DTD
611 * 2 means that this element is valid only in the Frameset DTD
612 *
Daniel Veillard02bb1702001-06-13 21:11:59 +0000613 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
Daniel Veillard930dfb62003-02-05 10:17:38 +0000614 , subElements , impliedsubelt , Attributes, userdata
Owen Taylor3473f882001-02-23 17:55:21 +0000615 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000616
617/* Definitions and a couple of vars for HTML Elements */
618
/*
 * Element groups. Each group macro expands to a comma-separated list of
 * element names and has a matching NB_* macro giving the number of names.
 * Composite NB_* values are parenthesised so they can be used safely
 * inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* Every table below is a NULL-terminated list of names. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Was spelled NB_NB_COREATTRS, an undefined name. */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = {
    ATTRS, "charset", "type", "name",
    "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
    "tabindex", "onfocus", "onblur", NULL
} ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = {
    COREATTRS , "codebase",
    "archive", "alt", "name", "height", "width", "align",
    "hspace", "vspace", NULL
} ;
static const char* const area_attrs[] = {
    "shape", "coords", "href", "nohref",
    "tabindex", "accesskey", "onfocus", "onblur", NULL
} ;
static const char* const basefont_attrs[] =
    { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = {
    "background", "bgcolor", "text",
    "link", "vlink", "alink", NULL
} ;
static const char* const button_attrs[] = {
    ATTRS, "name", "value", "type",
    "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL
} ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* The terminating NULL was missing here, unlike in every other table. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = {
    HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript",
    "noframes", "blockquote", "isindex", "hr", "table", "fieldset",
    "address", NULL
} ;
static const char* const form_attrs[] = {
    ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset",
    "accept-charset", NULL
} ;
static const char* const frame_attrs[] = {
    COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth",
    "marginheight", "noresize", "scrolling" , NULL
} ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = {
    "title", "isindex", "base", "script", "style", "meta", "link",
    "object", NULL
} ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = {
    COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth",
    "marginheight", "scrolling", "align", "height", "width", NULL
} ;
static const char* const img_attrs[] = {
    ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL
} ;
static const char* const embed_attrs[] = {
    COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder",
    "height", "hidden", "hspace", "name", "palette", "pluginspace",
    "pluginurl", "src", "type", "units", "vspace", "width", NULL
} ;
static const char* const input_attrs[] = {
    ATTRS, "type", "name", "value", "checked", "disabled", "readonly",
    "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex",
    "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept",
    NULL
} ;
725static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
726static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
727static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
728static const char* const align_attr[] = { "align", NULL } ;
729static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
730static const char* const map_contents[] = { BLOCK, "area", NULL } ;
731static const char* const name_attr[] = { "name", NULL } ;
732static const char* const action_attr[] = { "action", NULL } ;
733static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
Denis Pauk868d92d2012-05-10 15:34:57 +0800734static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000735static const char* const content_attr[] = { "content", NULL } ;
736static const char* const type_attr[] = { "type", NULL } ;
737static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
738static const char* const object_contents[] = { FLOW, "param", NULL } ;
739static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
740static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
741static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
742static const char* const option_elt[] = { "option", NULL } ;
743static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
744static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
745static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
746static const char* const width_attr[] = { "width", NULL } ;
747static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
748static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
749static const char* const language_attr[] = { "language", NULL } ;
750static const char* const select_content[] = { "optgroup", "option", NULL } ;
751static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
752static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200753static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000754static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
755static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
756static const char* const tr_elt[] = { "tr", NULL } ;
757static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
758static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
759static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
760static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
761static const char* const tr_contents[] = { "th", "td", NULL } ;
762static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
763static const char* const li_elt[] = { "li", NULL } ;
764static const char* const ul_depr[] = { "type", "compact", NULL} ;
765static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000766
767#define DECL (const char**)
768
Daniel Veillard22090732001-07-16 00:06:07 +0000769static const htmlElemDesc
770html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000771{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
772 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
773},
774{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
775 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
776},
777{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
778 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
779},
780{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
781 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
782},
783{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
784 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
785},
786{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
787 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
788},
789{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
790 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
791},
792{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
793 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
794},
795{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
796 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
797},
798{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
799 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
800},
801{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
802 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
803},
804{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
805 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
806},
807{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
808 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
809},
810{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
811 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
812},
813{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
814 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
815},
816{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
817 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
818},
819{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
820 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
821},
822{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
823 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
824},
825{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
826 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
827},
828{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
829 EMPTY , NULL , DECL col_attrs , NULL, NULL
830},
831{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
832 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
833},
834{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
835 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
836},
837{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
838 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
839},
840{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
841 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
842},
843{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
844 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
845},
846{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
847 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
848},
849{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000850 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000851},
852{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
856 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
857},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000858{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000859 EMPTY, NULL, DECL embed_attrs, NULL, NULL
860},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000861{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
862 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
863},
864{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
865 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
866},
867{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
868 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
869},
870{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
871 EMPTY, NULL, NULL, DECL frame_attrs, NULL
872},
873{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
874 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
875},
876{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
877 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
878},
879{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
880 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
881},
882{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
883 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
884},
885{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
886 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
887},
888{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
889 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
890},
891{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
895 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
896},
897{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
898 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
899},
900{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
901 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
902},
903{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
904 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
905},
906{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
907 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
908},
909{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000910 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000911},
912{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
913 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
914},
915{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
916 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
917},
918{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
919 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
920},
921{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
922 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
923},
924{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
925 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
926},
927{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
928 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
929},
930{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
931 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
932},
933{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
934 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
935},
936{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000937 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000938},
939{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
940 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
941},
942{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
943 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
944},
945{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
946 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
947},
948{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
949 DECL html_flow, "div", DECL html_attrs, NULL, NULL
950},
951{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
952 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
953},
954{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
955 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
956},
957{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000958 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000959},
960{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
961 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
962},
963{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
964 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
965},
966{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000967 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000968},
969{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
970 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
971},
972{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
973 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
974},
975{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
976 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
977},
978{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
979 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
980},
981{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
982 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
983},
984{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
985 DECL select_content, NULL, DECL select_attrs, NULL, NULL
986},
987{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
988 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
989},
990{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
991 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
992},
993{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
994 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
995},
996{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
997 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
998},
999{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1000 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1001},
1002{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1003 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1004},
1005{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1006 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1007},
1008{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1009 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1010},
1011{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1012 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1013},
1014{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1015 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1016},
1017{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1018 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1019},
1020{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1021 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1022},
1023{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1024 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1025},
1026{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1027 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1028},
1029{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1030 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1031},
1032{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1033 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1034},
1035{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1036 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1037},
1038{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1039 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1040},
1041{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1042 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1043},
1044{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1045 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1046}
Owen Taylor3473f882001-02-23 17:55:21 +00001047};
1048
1049/*
Owen Taylor3473f882001-02-23 17:55:21 +00001050 * start tags that imply the end of current element
1051 */
Daniel Veillard065abe82006-07-03 08:55:04 +00001052static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001053"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1054 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1055 "listing", "xmp", "head", NULL,
1056"head", "p", NULL,
1057"title", "p", NULL,
1058"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +00001059"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001060"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1061 "pre", "listing", "xmp", "head", "li", NULL,
1062"hr", "p", "head", NULL,
1063"h1", "p", "head", NULL,
1064"h2", "p", "head", NULL,
1065"h3", "p", "head", NULL,
1066"h4", "p", "head", NULL,
1067"h5", "p", "head", NULL,
1068"h6", "p", "head", NULL,
1069"dir", "p", "head", NULL,
1070"address", "p", "head", "ul", NULL,
1071"pre", "p", "head", "ul", NULL,
1072"listing", "p", "head", NULL,
1073"xmp", "p", "head", NULL,
1074"blockquote", "p", "head", NULL,
1075"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1076 "xmp", "head", NULL,
1077"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1078 "head", "dd", NULL,
1079"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1080 "head", "dt", NULL,
1081"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1082 "listing", "xmp", NULL,
1083"ol", "p", "head", "ul", NULL,
1084"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001085"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001086"div", "p", "head", NULL,
Denis Pauka0cd0752012-05-11 19:31:12 +08001087"noscript", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001088"center", "font", "b", "i", "p", "head", NULL,
Conrad Irwinb60061a2012-07-27 15:42:27 -07001089"a", "a", "head", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001090"caption", "p", NULL,
1091"colgroup", "caption", "colgroup", "col", "p", NULL,
1092"col", "caption", "col", "p", NULL,
1093"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1094 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001095"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001096"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001097"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1098"thead", "caption", "col", "colgroup", NULL,
1099"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1100 "tbody", "p", NULL,
1101"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1102 "tfoot", "tbody", "p", NULL,
1103"optgroup", "option", NULL,
1104"option", "option", NULL,
1105"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1106 "pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1108"tt", "head", NULL,
1109"i", "head", NULL,
1110"b", "head", NULL,
1111"u", "head", NULL,
1112"s", "head", NULL,
1113"strike", "head", NULL,
1114"big", "head", NULL,
1115"small", "head", NULL,
1116
1117"em", "head", NULL,
1118"strong", "head", NULL,
1119"dfn", "head", NULL,
1120"code", "head", NULL,
1121"samp", "head", NULL,
1122"kbd", "head", NULL,
1123"var", "head", NULL,
1124"cite", "head", NULL,
1125"abbr", "head", NULL,
1126"acronym", "head", NULL,
1127
1128/* "a" */
1129"img", "head", NULL,
1130/* "applet" */
1131/* "embed" */
1132/* "object" */
1133"font", "head", NULL,
1134/* "basefont" */
1135"br", "head", NULL,
1136/* "script" */
1137"map", "head", NULL,
1138"q", "head", NULL,
1139"sub", "head", NULL,
1140"sup", "head", NULL,
1141"span", "head", NULL,
1142"bdo", "head", NULL,
1143"iframe", "head", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001144NULL
1145};
1146
/*
 * Names of the HTML elements that are not supposed to hold CDATA
 * content directly and for which a p element will be implied.
 * The list ends with a NULL sentinel.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1159
/*
 * HTML attributes whose content is of type %Script;
 * This array has no NULL sentinel; its length is fixed by the initializer.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1185
/*
 * Priorities used by the HTML parser to cope with broken pages.
 * Each element name is given a priority; when an unexpected end tag
 * shows up, it may only close open elements whose priority is lower
 * than or equal to its own.
 */

/* One row of the priority table: an element name and its priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/* The row with a NULL name ends the table and carries the default priority. */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001213
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001214static const char** htmlStartCloseIndex[100];
Owen Taylor3473f882001-02-23 17:55:21 +00001215static int htmlStartCloseIndexinitialized = 0;
1216
1217/************************************************************************
1218 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001219 * functions to handle HTML specific data *
Owen Taylor3473f882001-02-23 17:55:21 +00001220 * *
1221 ************************************************************************/
1222
1223/**
1224 * htmlInitAutoClose:
1225 *
1226 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1227 * This is not reentrant. Call xmlInitParser() once before processing in
1228 * case of use in multithreaded programs.
1229 */
1230void
1231htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001232 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001233
1234 if (htmlStartCloseIndexinitialized) return;
1235
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001236 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1237 indx = 0;
1238 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001239 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001240 while (htmlStartClose[i] != NULL) i++;
1241 i++;
1242 }
1243 htmlStartCloseIndexinitialized = 1;
1244}
1245
1246/**
1247 * htmlTagLookup:
1248 * @tag: The tag name in lowercase
1249 *
1250 * Lookup the HTML tag in the ElementTable
1251 *
1252 * Returns the related htmlElemDescPtr or NULL if not found.
1253 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001254const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001255htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001256 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001257
1258 for (i = 0; i < (sizeof(html40ElementTable) /
1259 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001260 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001261 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001262 }
1263 return(NULL);
1264}
1265
1266/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001267 * htmlGetEndPriority:
1268 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001269 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001270 * Return value: The "endtag" priority.
1271 **/
1272static int
1273htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001274 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001275
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001276 while ((htmlEndPriority[i].name != NULL) &&
1277 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1278 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001279
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001280 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001281}
1282
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001283
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001284/**
Owen Taylor3473f882001-02-23 17:55:21 +00001285 * htmlCheckAutoClose:
1286 * @newtag: The new tag name
1287 * @oldtag: The old tag name
1288 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001289 * Checks whether the new tag is one of the registered valid tags for
1290 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001291 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1292 *
1293 * Returns 0 if no, 1 if yes.
1294 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001295static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001296htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1297{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001298 int i, indx;
1299 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001300
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001301 if (htmlStartCloseIndexinitialized == 0)
1302 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001303
1304 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001305 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001306 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001307 if (closed == NULL)
1308 return (0);
1309 if (xmlStrEqual(BAD_CAST * closed, newtag))
1310 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001311 }
1312
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001313 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001314 i++;
1315 while (htmlStartClose[i] != NULL) {
1316 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001317 return (1);
1318 }
1319 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001320 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001321 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001322}
1323
1324/**
1325 * htmlAutoCloseOnClose:
1326 * @ctxt: an HTML parser context
1327 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001328 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001329 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001330 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001331 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001332static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001333htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1334{
1335 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001336 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001337
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001338 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001339
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001340 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001341
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001342 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1343 break;
1344 /*
1345 * A missplaced endtag can only close elements with lower
1346 * or equal priority, so if we find an element with higher
1347 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001348 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001349 */
1350 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1351 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001352 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001353 if (i < 0)
1354 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001355
1356 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001357 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001358 if ((info != NULL) && (info->endTag == 3)) {
1359 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1360 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001361 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001362 }
1363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1364 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001365 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001366 }
1367}
1368
1369/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001370 * htmlAutoCloseOnEnd:
1371 * @ctxt: an HTML parser context
1372 *
1373 * Close all remaining tags at the end of the stream
1374 */
1375static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001376htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1377{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001378 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001379
William M. Brack899e64a2003-09-26 18:03:42 +00001380 if (ctxt->nameNr == 0)
1381 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001382 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1384 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001385 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001386 }
1387}
1388
1389/**
Owen Taylor3473f882001-02-23 17:55:21 +00001390 * htmlAutoClose:
1391 * @ctxt: an HTML parser context
1392 * @newtag: The new tag name or NULL
1393 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001394 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001395 * The list is kept in htmlStartClose array. This function is
1396 * called when a new tag has been detected and generates the
1397 * appropriates closes if possible/needed.
1398 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001399 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001400 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001401static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001402htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1403{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001404 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001405 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001406 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1407 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001408 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001409 }
1410 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001411 htmlAutoCloseOnEnd(ctxt);
1412 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001413 }
1414 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001415 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1416 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1417 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001418 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001420 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001421 }
Owen Taylor3473f882001-02-23 17:55:21 +00001422}
1423
1424/**
1425 * htmlAutoCloseTag:
1426 * @doc: the HTML document
1427 * @name: The tag name
1428 * @elem: the HTML element
1429 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001430 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001431 * The list is kept in htmlStartClose array. This function checks
1432 * if the element or one of it's children would autoclose the
1433 * given tag.
1434 *
1435 * Returns 1 if autoclose, 0 otherwise
1436 */
1437int
1438htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1439 htmlNodePtr child;
1440
1441 if (elem == NULL) return(1);
1442 if (xmlStrEqual(name, elem->name)) return(0);
1443 if (htmlCheckAutoClose(elem->name, name)) return(1);
1444 child = elem->children;
1445 while (child != NULL) {
1446 if (htmlAutoCloseTag(doc, name, child)) return(1);
1447 child = child->next;
1448 }
1449 return(0);
1450}
1451
1452/**
1453 * htmlIsAutoClosed:
1454 * @doc: the HTML document
1455 * @elem: the HTML element
1456 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001457 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001458 * The list is kept in htmlStartClose array. This function checks
1459 * if a tag is autoclosed by one of it's child
1460 *
1461 * Returns 1 if autoclosed, 0 otherwise
1462 */
1463int
1464htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1465 htmlNodePtr child;
1466
1467 if (elem == NULL) return(1);
1468 child = elem->children;
1469 while (child != NULL) {
1470 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471 child = child->next;
1472 }
1473 return(0);
1474}
1475
1476/**
1477 * htmlCheckImplied:
1478 * @ctxt: an HTML parser context
1479 * @newtag: The new tag name
1480 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001481 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001482 * called when a new tag has been detected and generates the
1483 * appropriates implicit tags if missing
1484 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001485static void
Owen Taylor3473f882001-02-23 17:55:21 +00001486htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001487 int i;
1488
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001489 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1490 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001491 if (!htmlOmittedDefaultValue)
1492 return;
1493 if (xmlStrEqual(newtag, BAD_CAST"html"))
1494 return;
1495 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001496 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001497 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1498 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1499 }
1500 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1501 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001502 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001503 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1504 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1505 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1506 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1507 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1508 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001509 if (ctxt->html >= 3) {
1510 /* we already saw or generated an <head> before */
1511 return;
1512 }
1513 /*
1514 * dropped OBJECT ... i you put it first BODY will be
1515 * assumed !
1516 */
1517 htmlnamePush(ctxt, BAD_CAST"head");
1518 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1519 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001520 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1521 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1522 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001523 if (ctxt->html >= 10) {
1524 /* we already saw or generated a <body> before */
1525 return;
1526 }
Owen Taylor3473f882001-02-23 17:55:21 +00001527 for (i = 0;i < ctxt->nameNr;i++) {
1528 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1529 return;
1530 }
1531 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1532 return;
1533 }
1534 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001535
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001536 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001537 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1538 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1539 }
1540}
1541
1542/**
1543 * htmlCheckParagraph
1544 * @ctxt: an HTML parser context
1545 *
1546 * Check whether a p element need to be implied before inserting
1547 * characters in the current element.
1548 *
1549 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1550 * in case of error.
1551 */
1552
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001553static int
Owen Taylor3473f882001-02-23 17:55:21 +00001554htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1555 const xmlChar *tag;
1556 int i;
1557
1558 if (ctxt == NULL)
1559 return(-1);
1560 tag = ctxt->name;
1561 if (tag == NULL) {
1562 htmlAutoClose(ctxt, BAD_CAST"p");
1563 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001564 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1567 return(1);
1568 }
1569 if (!htmlOmittedDefaultValue)
1570 return(0);
1571 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1572 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001573 htmlAutoClose(ctxt, BAD_CAST"p");
1574 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001575 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001576 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1577 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1578 return(1);
1579 }
1580 }
1581 return(0);
1582}
1583
1584/**
1585 * htmlIsScriptAttribute:
1586 * @name: an attribute name
1587 *
1588 * Check if an attribute is of content type Script
1589 *
1590 * Returns 1 is the attribute is a script 0 otherwise
1591 */
1592int
1593htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001594 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001595
1596 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001597 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001598 /*
1599 * all script attributes start with 'on'
1600 */
1601 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001602 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001603 for (i = 0;
1604 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1605 i++) {
1606 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1607 return(1);
1608 }
1609 return(0);
1610}
1611
1612/************************************************************************
1613 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001614 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001615 * *
1616 ************************************************************************/
1617
1618
/*
 * html40EntitiesTable:
 * Table of the predefined HTML 4.0 character entities. Each row gives
 * the numeric value of the character, the entity name and a textual
 * description.
 * The rows are listed in increasing order of value: htmlEntityValueLookup()
 * stops scanning at the first row whose value is greater than the one
 * requested, so any new row must be inserted at its sorted position.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001619static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001620/*
1621 * the 4 absolute ones, plus apostrophe.
1622 */
1623{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1624{ 38, "amp", "ampersand, U+0026 ISOnum" },
1625{ 39, "apos", "single quote" },
1626{ 60, "lt", "less-than sign, U+003C ISOnum" },
1627{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1628
1629/*
1630 * A bunch still in the 128-255 range
1631 * Replacing them depend really on the charset used.
1632 */
1633{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1634{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1635{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1636{ 163, "pound","pound sign, U+00A3 ISOnum" },
1637{ 164, "curren","currency sign, U+00A4 ISOnum" },
1638{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1639{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1640{ 167, "sect", "section sign, U+00A7 ISOnum" },
1641{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1642{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1643{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1644{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1645{ 172, "not", "not sign, U+00AC ISOnum" },
1646{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1647{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1648{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1649{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1650{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1651{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1652{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1653{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1654{ 181, "micro","micro sign, U+00B5 ISOnum" },
1655{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1656{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1657{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1658{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1659{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1660{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1661{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1662{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1663{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1664{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1665{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1666{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1667{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1668{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1669{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1670{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1671{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1672{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1673{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1674{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1675{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1676{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1677{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1678{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1679{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1680{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1681{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1682{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1683{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1684{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1685{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1686{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1687{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1688{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1689{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1690{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1691{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1692{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1693{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1694{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1695{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1696{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1697{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1698{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1699{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1700{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1701{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1702{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1703{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1704{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1705{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1706{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1707{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1708{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1709{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1710{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1711{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1712{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1713{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1714{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1715{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1716{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1717{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1718{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1719{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1720{ 247, "divide","division sign, U+00F7 ISOnum" },
1721{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1722{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1723{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1724{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1725{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1726{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1727{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1728{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1729
1730{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1731{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1732{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1733{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1734{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1735
1736/*
1737 * Anything below should really be kept as entities references
1738 */
1739{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1740
1741{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1742{ 732, "tilde","small tilde, U+02DC ISOdia" },
1743
1744{ 913, "Alpha","greek capital letter alpha, U+0391" },
1745{ 914, "Beta", "greek capital letter beta, U+0392" },
1746{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1747{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1748{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1749{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1750{ 919, "Eta", "greek capital letter eta, U+0397" },
1751{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1752{ 921, "Iota", "greek capital letter iota, U+0399" },
1753{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001754{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001755{ 924, "Mu", "greek capital letter mu, U+039C" },
1756{ 925, "Nu", "greek capital letter nu, U+039D" },
1757{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1758{ 927, "Omicron","greek capital letter omicron, U+039F" },
1759{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1760{ 929, "Rho", "greek capital letter rho, U+03A1" },
1761{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1762{ 932, "Tau", "greek capital letter tau, U+03A4" },
1763{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1764{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1765{ 935, "Chi", "greek capital letter chi, U+03A7" },
1766{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1767{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1768
1769{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1770{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1771{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1772{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1773{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1774{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1775{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1776{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1777{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1778{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1779{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1780{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1781{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1782{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1783{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1784{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1785{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1786{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1787{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1788{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1789{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1790{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1791{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1792{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1793{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1794{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1795{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1796{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1797
1798{ 8194, "ensp", "en space, U+2002 ISOpub" },
1799{ 8195, "emsp", "em space, U+2003 ISOpub" },
1800{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1801{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1802{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1803{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1804{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1805{ 8211, "ndash","en dash, U+2013 ISOpub" },
1806{ 8212, "mdash","em dash, U+2014 ISOpub" },
1807{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1808{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1809{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1810{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1811{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1812{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1813{ 8224, "dagger","dagger, U+2020 ISOpub" },
1814{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1815
1816{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1817{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1818
1819{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1820
1821{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1822{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1823
1824{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1825{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1826
1827{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1828{ 8260, "frasl","fraction slash, U+2044 NEW" },
1829
1830{ 8364, "euro", "euro sign, U+20AC NEW" },
1831
1832{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1833{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1834{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1835{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1836{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1837{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1838{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1839{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1840{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1841{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1842{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1843{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1844{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1845{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1846{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1847{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1848
1849{ 8704, "forall","for all, U+2200 ISOtech" },
1850{ 8706, "part", "partial differential, U+2202 ISOtech" },
1851{ 8707, "exist","there exists, U+2203 ISOtech" },
1852{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1853{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1854{ 8712, "isin", "element of, U+2208 ISOtech" },
1855{ 8713, "notin","not an element of, U+2209 ISOtech" },
1856{ 8715, "ni", "contains as member, U+220B ISOtech" },
1857{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001858{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001859{ 8722, "minus","minus sign, U+2212 ISOtech" },
1860{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1861{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1862{ 8733, "prop", "proportional to, U+221D ISOtech" },
1863{ 8734, "infin","infinity, U+221E ISOtech" },
1864{ 8736, "ang", "angle, U+2220 ISOamso" },
1865{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1866{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1867{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1868{ 8746, "cup", "union = cup, U+222A ISOtech" },
1869{ 8747, "int", "integral, U+222B ISOtech" },
1870{ 8756, "there4","therefore, U+2234 ISOtech" },
1871{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1872{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1873{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1874{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1875{ 8801, "equiv","identical to, U+2261 ISOtech" },
1876{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1877{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1878{ 8834, "sub", "subset of, U+2282 ISOtech" },
1879{ 8835, "sup", "superset of, U+2283 ISOtech" },
1880{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1881{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1882{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1883{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1884{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1885{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1886{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1887{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1888{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1889{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1890{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1891{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1892{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1893{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1894
1895{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1896{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1897{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1898{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1899
1900};
1901
1902/************************************************************************
1903 * *
1904 * Commodity functions to handle entities *
1905 * *
1906 ************************************************************************/
1907
/*
 * growBuffer:
 * Double buffer##_size and reallocate @buffer to that new size.
 * The expansion site must have a `ctxt` variable in scope and must be
 * inside a function returning a pointer: when the reallocation fails
 * the error is reported through htmlErrMemory(), the old buffer is
 * freed and the enclosing function returns NULL.
 */
#define growBuffer(buffer) { \
    xmlChar *tmp; \
    buffer##_size *= 2; \
    tmp = (xmlChar *) xmlRealloc(buffer, \
                                 buffer##_size * sizeof(xmlChar)); \
    if (!tmp) { \
        htmlErrMemory(ctxt, "growing buffer\n"); \
        xmlFree(buffer); \
        return NULL; \
    } \
    buffer = tmp; \
}
1922
1923/**
1924 * htmlEntityLookup:
1925 * @name: the entity name
1926 *
1927 * Lookup the given entity in EntitiesTable
1928 *
1929 * TODO: the linear scan is really ugly, an hash table is really needed.
1930 *
1931 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1932 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001933const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001934htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001935 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001936
1937 for (i = 0;i < (sizeof(html40EntitiesTable)/
1938 sizeof(html40EntitiesTable[0]));i++) {
1939 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001940 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001941 }
1942 }
1943 return(NULL);
1944}
1945
1946/**
1947 * htmlEntityValueLookup:
1948 * @value: the entity's unicode value
1949 *
1950 * Lookup the given entity in EntitiesTable
1951 *
1952 * TODO: the linear scan is really ugly, an hash table is really needed.
1953 *
1954 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1955 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001956const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001957htmlEntityValueLookup(unsigned int value) {
1958 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001959
1960 for (i = 0;i < (sizeof(html40EntitiesTable)/
1961 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001962 if (html40EntitiesTable[i].value >= value) {
1963 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001964 break;
William M. Brack78637da2003-07-31 14:47:38 +00001965 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001966 }
Owen Taylor3473f882001-02-23 17:55:21 +00001967 }
1968 return(NULL);
1969}
1970
1971/**
1972 * UTF8ToHtml:
1973 * @out: a pointer to an array of bytes to store the result
1974 * @outlen: the length of @out
1975 * @in: a pointer to an array of UTF-8 chars
1976 * @inlen: the length of @in
1977 *
1978 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1979 * plus HTML entities block of chars out.
1980 *
1981 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1982 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001983 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001984 * The value of @outlen after return is the number of octets consumed.
1985 */
1986int
1987UTF8ToHtml(unsigned char* out, int *outlen,
1988 const unsigned char* in, int *inlen) {
1989 const unsigned char* processed = in;
1990 const unsigned char* outend;
1991 const unsigned char* outstart = out;
1992 const unsigned char* instart = in;
1993 const unsigned char* inend;
1994 unsigned int c, d;
1995 int trailing;
1996
Daniel Veillardce682bc2004-11-05 17:22:25 +00001997 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001998 if (in == NULL) {
1999 /*
2000 * initialization nothing to do
2001 */
2002 *outlen = 0;
2003 *inlen = 0;
2004 return(0);
2005 }
2006 inend = in + (*inlen);
2007 outend = out + (*outlen);
2008 while (in < inend) {
2009 d = *in++;
2010 if (d < 0x80) { c= d; trailing= 0; }
2011 else if (d < 0xC0) {
2012 /* trailing byte in leading position */
2013 *outlen = out - outstart;
2014 *inlen = processed - instart;
2015 return(-2);
2016 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2017 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2018 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2019 else {
2020 /* no chance for this in Ascii */
2021 *outlen = out - outstart;
2022 *inlen = processed - instart;
2023 return(-2);
2024 }
2025
2026 if (inend - in < trailing) {
2027 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02002028 }
Owen Taylor3473f882001-02-23 17:55:21 +00002029
2030 for ( ; trailing; trailing--) {
2031 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2032 break;
2033 c <<= 6;
2034 c |= d & 0x3F;
2035 }
2036
2037 /* assertion: c is a single UTF-4 value */
2038 if (c < 0x80) {
2039 if (out + 1 >= outend)
2040 break;
2041 *out++ = c;
2042 } else {
2043 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00002044 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00002045 const char *cp;
2046 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00002047
2048 /*
2049 * Try to lookup a predefined HTML entity for it
2050 */
2051
2052 ent = htmlEntityValueLookup(c);
2053 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00002054 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2055 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00002056 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00002057 else
2058 cp = ent->name;
2059 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00002060 if (out + 2 + len >= outend)
2061 break;
2062 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00002063 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00002064 out += len;
2065 *out++ = ';';
2066 }
2067 processed = in;
2068 }
2069 *outlen = out - outstart;
2070 *inlen = processed - instart;
2071 return(0);
2072}
2073
2074/**
2075 * htmlEncodeEntities:
2076 * @out: a pointer to an array of bytes to store the result
2077 * @outlen: the length of @out
2078 * @in: a pointer to an array of UTF-8 chars
2079 * @inlen: the length of @in
2080 * @quoteChar: the quote character to escape (' or ") or zero.
2081 *
2082 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2083 * plus HTML entities block of chars out.
2084 *
2085 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2086 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002087 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00002088 * The value of @outlen after return is the number of octets consumed.
2089 */
2090int
2091htmlEncodeEntities(unsigned char* out, int *outlen,
2092 const unsigned char* in, int *inlen, int quoteChar) {
2093 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002094 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002095 const unsigned char* outstart = out;
2096 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002097 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002098 unsigned int c, d;
2099 int trailing;
2100
Daniel Veillardce682bc2004-11-05 17:22:25 +00002101 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2102 return(-1);
2103 outend = out + (*outlen);
2104 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002105 while (in < inend) {
2106 d = *in++;
2107 if (d < 0x80) { c= d; trailing= 0; }
2108 else if (d < 0xC0) {
2109 /* trailing byte in leading position */
2110 *outlen = out - outstart;
2111 *inlen = processed - instart;
2112 return(-2);
2113 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2114 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2115 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2116 else {
2117 /* no chance for this in Ascii */
2118 *outlen = out - outstart;
2119 *inlen = processed - instart;
2120 return(-2);
2121 }
2122
2123 if (inend - in < trailing)
2124 break;
2125
2126 while (trailing--) {
2127 if (((d= *in++) & 0xC0) != 0x80) {
2128 *outlen = out - outstart;
2129 *inlen = processed - instart;
2130 return(-2);
2131 }
2132 c <<= 6;
2133 c |= d & 0x3F;
2134 }
2135
2136 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002137 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2138 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002139 if (out >= outend)
2140 break;
2141 *out++ = c;
2142 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002143 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002144 const char *cp;
2145 char nbuf[16];
2146 int len;
2147
2148 /*
2149 * Try to lookup a predefined HTML entity for it
2150 */
2151 ent = htmlEntityValueLookup(c);
2152 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002153 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002154 cp = nbuf;
2155 }
2156 else
2157 cp = ent->name;
2158 len = strlen(cp);
2159 if (out + 2 + len > outend)
2160 break;
2161 *out++ = '&';
2162 memcpy(out, cp, len);
2163 out += len;
2164 *out++ = ';';
2165 }
2166 processed = in;
2167 }
2168 *outlen = out - outstart;
2169 *inlen = processed - instart;
2170 return(0);
2171}
2172
Owen Taylor3473f882001-02-23 17:55:21 +00002173/************************************************************************
2174 * *
2175 * Commodity functions to handle streams *
2176 * *
2177 ************************************************************************/
2178
2179/**
Owen Taylor3473f882001-02-23 17:55:21 +00002180 * htmlNewInputStream:
2181 * @ctxt: an HTML parser context
2182 *
2183 * Create a new input stream structure
2184 * Returns the new input stream or NULL
2185 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002186static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002187htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2188 htmlParserInputPtr input;
2189
2190 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2191 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002192 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002193 return(NULL);
2194 }
2195 memset(input, 0, sizeof(htmlParserInput));
2196 input->filename = NULL;
2197 input->directory = NULL;
2198 input->base = NULL;
2199 input->cur = NULL;
2200 input->buf = NULL;
2201 input->line = 1;
2202 input->col = 1;
2203 input->buf = NULL;
2204 input->free = NULL;
2205 input->version = NULL;
2206 input->consumed = 0;
2207 input->length = 0;
2208 return(input);
2209}
2210
2211
2212/************************************************************************
2213 * *
2214 * Commodity functions, cleanup needed ? *
2215 * *
2216 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002231
2232/**
2233 * areBlanks:
2234 * @ctxt: an HTML parser context
2235 * @str: a xmlChar *
2236 * @len: the size of @str
2237 *
2238 * Is this a sequence of blank chars that one can ignore ?
2239 *
2240 * Returns 1 if ignorable 0 otherwise.
2241 */
2242
2243static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002244 unsigned int i;
2245 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002246 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002247 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002248
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002249 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002250 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002251
2252 if (CUR == 0) return(1);
2253 if (CUR != '<') return(0);
2254 if (ctxt->name == NULL)
2255 return(1);
2256 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2257 return(1);
2258 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2259 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002260
2261 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2262 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2263 dtd = xmlGetIntSubset(ctxt->myDoc);
2264 if (dtd != NULL && dtd->ExternalID != NULL) {
2265 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2266 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2267 return(1);
2268 }
2269 }
2270
Owen Taylor3473f882001-02-23 17:55:21 +00002271 if (ctxt->node == NULL) return(0);
2272 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002273 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2274 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002275 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002276 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2277 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002278 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002279 for all tags "b" allowing PCDATA */
2280 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2281 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2282 return(0);
2283 }
2284 }
Owen Taylor3473f882001-02-23 17:55:21 +00002285 } else if (xmlNodeIsText(lastChild)) {
2286 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002287 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002288 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002289 for all tags "p" allowing PCDATA */
2290 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2291 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2292 return(0);
2293 }
2294 }
Owen Taylor3473f882001-02-23 17:55:21 +00002295 }
2296 return(1);
2297}
2298
2299/**
Owen Taylor3473f882001-02-23 17:55:21 +00002300 * htmlNewDocNoDtD:
2301 * @URI: URI for the dtd, or NULL
2302 * @ExternalID: the external ID of the DTD, or NULL
2303 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002304 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2305 * are NULL
2306 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002307 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002308 */
2309htmlDocPtr
2310htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2311 xmlDocPtr cur;
2312
2313 /*
2314 * Allocate a new document and fill the fields.
2315 */
2316 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2317 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002318 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002319 return(NULL);
2320 }
2321 memset(cur, 0, sizeof(xmlDoc));
2322
2323 cur->type = XML_HTML_DOCUMENT_NODE;
2324 cur->version = NULL;
2325 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002326 cur->doc = cur;
2327 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002328 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002329 cur->extSubset = NULL;
2330 cur->oldNs = NULL;
2331 cur->encoding = NULL;
2332 cur->standalone = 1;
2333 cur->compression = 0;
2334 cur->ids = NULL;
2335 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002336 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002337 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002338 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002339 if ((ExternalID != NULL) ||
2340 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002341 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002342 return(cur);
2343}
2344
2345/**
2346 * htmlNewDoc:
2347 * @URI: URI for the dtd, or NULL
2348 * @ExternalID: the external ID of the DTD, or NULL
2349 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002350 * Creates a new HTML document
2351 *
Owen Taylor3473f882001-02-23 17:55:21 +00002352 * Returns a new document
2353 */
2354htmlDocPtr
2355htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2356 if ((URI == NULL) && (ExternalID == NULL))
2357 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002358 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2359 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002360
2361 return(htmlNewDocNoDtD(URI, ExternalID));
2362}
2363
2364
2365/************************************************************************
2366 * *
2367 * The parser itself *
2368 * Relates to http://www.w3.org/TR/html40 *
2369 * *
2370 ************************************************************************/
2371
2372/************************************************************************
2373 * *
2374 * The parser itself *
2375 * *
2376 ************************************************************************/
2377
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002378static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002379
Owen Taylor3473f882001-02-23 17:55:21 +00002380/**
2381 * htmlParseHTMLName:
2382 * @ctxt: an HTML parser context
2383 *
2384 * parse an HTML tag or attribute name, note that we convert it to lowercase
2385 * since HTML names are not case-sensitive.
2386 *
2387 * Returns the Tag Name parsed or NULL
2388 */
2389
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002390static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002391htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002392 int i = 0;
2393 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2394
William M. Brackd1757ab2004-10-02 22:07:48 +00002395 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002396 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002397
2398 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002399 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002400 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2401 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002402 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2403 else loc[i] = CUR;
2404 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002405
Owen Taylor3473f882001-02-23 17:55:21 +00002406 NEXT;
2407 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002408
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002409 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002410}
2411
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002412
2413/**
2414 * htmlParseHTMLName_nonInvasive:
2415 * @ctxt: an HTML parser context
2416 *
2417 * parse an HTML tag or attribute name, note that we convert it to lowercase
2418 * since HTML names are not case-sensitive, this doesn't consume the data
2419 * from the stream, it's a look-ahead
2420 *
2421 * Returns the Tag Name parsed or NULL
2422 */
2423
2424static const xmlChar *
2425htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2426 int i = 0;
2427 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2428
2429 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2430 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002431
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002432 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2433 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2434 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2435 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2436 else loc[i] = NXT(1+i);
2437 i++;
2438 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002439
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002440 return(xmlDictLookup(ctxt->dict, loc, i));
2441}
2442
2443
Owen Taylor3473f882001-02-23 17:55:21 +00002444/**
2445 * htmlParseName:
2446 * @ctxt: an HTML parser context
2447 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002448 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002449 *
2450 * Returns the Name parsed or NULL
2451 */
2452
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002453static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002454htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002455 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002456 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002457 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002458
2459 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002460
2461 /*
2462 * Accelerator for simple ASCII names
2463 */
2464 in = ctxt->input->cur;
2465 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2466 ((*in >= 0x41) && (*in <= 0x5A)) ||
2467 (*in == '_') || (*in == ':')) {
2468 in++;
2469 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2470 ((*in >= 0x41) && (*in <= 0x5A)) ||
2471 ((*in >= 0x30) && (*in <= 0x39)) ||
2472 (*in == '_') || (*in == '-') ||
2473 (*in == ':') || (*in == '.'))
2474 in++;
Pranjal Jumdea820dbe2016-03-01 11:34:04 -08002475
2476 if (in == ctxt->input->end)
2477 return(NULL);
2478
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002479 if ((*in > 0) && (*in < 0x80)) {
2480 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002481 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002482 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002483 ctxt->nbChars += count;
2484 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002485 return(ret);
2486 }
2487 }
2488 return(htmlParseNameComplex(ctxt));
2489}
2490
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002491static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002492htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002493 int len = 0, l;
2494 int c;
2495 int count = 0;
Hugh Davenportbeca86e2016-05-04 11:23:49 +08002496 const xmlChar *base = ctxt->input->base;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002497
2498 /*
2499 * Handler for more complex cases
2500 */
2501 GROW;
2502 c = CUR_CHAR(l);
2503 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2504 (!IS_LETTER(c) && (c != '_') &&
2505 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002506 return(NULL);
2507 }
2508
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002509 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2510 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2511 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002512 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002513 (IS_COMBINING(c)) ||
2514 (IS_EXTENDER(c)))) {
2515 if (count++ > 100) {
2516 count = 0;
2517 GROW;
2518 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002519 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002520 NEXTL(l);
2521 c = CUR_CHAR(l);
Hugh Davenportbeca86e2016-05-04 11:23:49 +08002522 if (ctxt->input->base != base) {
2523 /*
2524 * We changed encoding from an unknown encoding
2525 * Input buffer changed location, so we better start again
2526 */
2527 return(htmlParseNameComplex(ctxt));
2528 }
Owen Taylor3473f882001-02-23 17:55:21 +00002529 }
Pranjal Jumdea820dbe2016-03-01 11:34:04 -08002530
Nick Wellnhoferf39e3be2017-06-11 12:35:59 +02002531 if (ctxt->input->cur - ctxt->input->base < len) {
2532 /* Sanity check */
2533 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2534 "unexpected change of input buffer", NULL, NULL);
2535 return (NULL);
2536 }
Pranjal Jumdea820dbe2016-03-01 11:34:04 -08002537
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002538 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002539}
2540
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002541
Owen Taylor3473f882001-02-23 17:55:21 +00002542/**
2543 * htmlParseHTMLAttribute:
2544 * @ctxt: an HTML parser context
2545 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002546 *
Owen Taylor3473f882001-02-23 17:55:21 +00002547 * parse an HTML attribute value till the stop (quote), if
2548 * stop is 0 then it stops at the first space
2549 *
2550 * Returns the attribute parsed or NULL
2551 */
2552
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002553static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002554htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2555 xmlChar *buffer = NULL;
2556 int buffer_size = 0;
2557 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002558 const xmlChar *name = NULL;
2559 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002560 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002561
2562 /*
2563 * allocate a translation buffer.
2564 */
2565 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002566 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002567 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002568 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002569 return(NULL);
2570 }
2571 out = buffer;
2572
2573 /*
2574 * Ok loop until we reach one of the ending chars
2575 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002576 while ((CUR != 0) && (CUR != stop)) {
2577 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002578 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002579 if (CUR == '&') {
2580 if (NXT(1) == '#') {
2581 unsigned int c;
2582 int bits;
2583
2584 c = htmlParseCharRef(ctxt);
2585 if (c < 0x80)
2586 { *out++ = c; bits= -6; }
2587 else if (c < 0x800)
2588 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2589 else if (c < 0x10000)
2590 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002591 else
Owen Taylor3473f882001-02-23 17:55:21 +00002592 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002593
Owen Taylor3473f882001-02-23 17:55:21 +00002594 for ( ; bits >= 0; bits-= 6) {
2595 *out++ = ((c >> bits) & 0x3F) | 0x80;
2596 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002597
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002598 if (out - buffer > buffer_size - 100) {
2599 int indx = out - buffer;
2600
2601 growBuffer(buffer);
2602 out = &buffer[indx];
2603 }
Owen Taylor3473f882001-02-23 17:55:21 +00002604 } else {
2605 ent = htmlParseEntityRef(ctxt, &name);
2606 if (name == NULL) {
2607 *out++ = '&';
2608 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002609 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002610
2611 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002612 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002613 }
2614 } else if (ent == NULL) {
2615 *out++ = '&';
2616 cur = name;
2617 while (*cur != 0) {
2618 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002619 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002620
2621 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002622 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002623 }
2624 *out++ = *cur++;
2625 }
Owen Taylor3473f882001-02-23 17:55:21 +00002626 } else {
2627 unsigned int c;
2628 int bits;
2629
2630 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002631 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002632
2633 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002634 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002635 }
Daniel Veillard48519092006-10-17 15:56:35 +00002636 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002637 if (c < 0x80)
2638 { *out++ = c; bits= -6; }
2639 else if (c < 0x800)
2640 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2641 else if (c < 0x10000)
2642 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002643 else
Owen Taylor3473f882001-02-23 17:55:21 +00002644 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002645
Owen Taylor3473f882001-02-23 17:55:21 +00002646 for ( ; bits >= 0; bits-= 6) {
2647 *out++ = ((c >> bits) & 0x3F) | 0x80;
2648 }
Owen Taylor3473f882001-02-23 17:55:21 +00002649 }
2650 }
2651 } else {
2652 unsigned int c;
2653 int bits, l;
2654
2655 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002656 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002657
2658 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002659 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002660 }
2661 c = CUR_CHAR(l);
2662 if (c < 0x80)
2663 { *out++ = c; bits= -6; }
2664 else if (c < 0x800)
2665 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2666 else if (c < 0x10000)
2667 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002668 else
Owen Taylor3473f882001-02-23 17:55:21 +00002669 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002670
Owen Taylor3473f882001-02-23 17:55:21 +00002671 for ( ; bits >= 0; bits-= 6) {
2672 *out++ = ((c >> bits) & 0x3F) | 0x80;
2673 }
2674 NEXT;
2675 }
2676 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002677 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002678 return(buffer);
2679}
2680
2681/**
Owen Taylor3473f882001-02-23 17:55:21 +00002682 * htmlParseEntityRef:
2683 * @ctxt: an HTML parser context
2684 * @str: location to store the entity name
2685 *
2686 * parse an HTML ENTITY references
2687 *
2688 * [68] EntityRef ::= '&' Name ';'
2689 *
2690 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2691 * if non-NULL *str will have to be freed by the caller.
2692 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002693const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002694htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2695 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002696 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002697
2698 if (str != NULL) *str = NULL;
2699 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002700
2701 if (CUR == '&') {
2702 NEXT;
2703 name = htmlParseName(ctxt);
2704 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002705 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2706 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002707 } else {
2708 GROW;
2709 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002710 if (str != NULL)
2711 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002712
2713 /*
2714 * Lookup the entity in the table.
2715 */
2716 ent = htmlEntityLookup(name);
2717 if (ent != NULL) /* OK that's ugly !!! */
2718 NEXT;
2719 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002720 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2721 "htmlParseEntityRef: expecting ';'\n",
2722 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002723 if (str != NULL)
2724 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002725 }
2726 }
2727 }
2728 return(ent);
2729}
2730
2731/**
2732 * htmlParseAttValue:
2733 * @ctxt: an HTML parser context
2734 *
2735 * parse a value for an attribute
2736 * Note: the parser won't do substitution of entities here, this
2737 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002738 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002739 *
2740 * Returns the AttValue parsed or NULL.
2741 */
2742
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002743static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002744htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2745 xmlChar *ret = NULL;
2746
2747 if (CUR == '"') {
2748 NEXT;
2749 ret = htmlParseHTMLAttribute(ctxt, '"');
2750 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002751 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2752 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002753 } else
2754 NEXT;
2755 } else if (CUR == '\'') {
2756 NEXT;
2757 ret = htmlParseHTMLAttribute(ctxt, '\'');
2758 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002759 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2760 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002761 } else
2762 NEXT;
2763 } else {
2764 /*
2765 * That's an HTMLism, the attribute value may not be quoted
2766 */
2767 ret = htmlParseHTMLAttribute(ctxt, 0);
2768 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002769 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2770 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002771 }
2772 }
2773 return(ret);
2774}
2775
2776/**
2777 * htmlParseSystemLiteral:
2778 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002779 *
Owen Taylor3473f882001-02-23 17:55:21 +00002780 * parse an HTML Literal
2781 *
2782 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2783 *
2784 * Returns the SystemLiteral parsed or NULL
2785 */
2786
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002787static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002788htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002789 size_t len = 0, startPosition = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002790 xmlChar *ret = NULL;
2791
2792 if (CUR == '"') {
2793 NEXT;
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002794
2795 if (CUR_PTR < BASE_PTR)
2796 return(ret);
2797 startPosition = CUR_PTR - BASE_PTR;
2798
2799 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002800 NEXT;
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002801 len++;
2802 }
William M. Brack76e95df2003-10-18 16:20:14 +00002803 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002804 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2805 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002806 } else {
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002807 ret = xmlStrndup((BASE_PTR+startPosition), len);
Owen Taylor3473f882001-02-23 17:55:21 +00002808 NEXT;
2809 }
2810 } else if (CUR == '\'') {
2811 NEXT;
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002812
2813 if (CUR_PTR < BASE_PTR)
2814 return(ret);
2815 startPosition = CUR_PTR - BASE_PTR;
2816
2817 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002818 NEXT;
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002819 len++;
2820 }
William M. Brack76e95df2003-10-18 16:20:14 +00002821 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002822 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2823 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002824 } else {
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002825 ret = xmlStrndup((BASE_PTR+startPosition), len);
Owen Taylor3473f882001-02-23 17:55:21 +00002826 NEXT;
2827 }
2828 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002829 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2830 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002831 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002832
Owen Taylor3473f882001-02-23 17:55:21 +00002833 return(ret);
2834}
2835
2836/**
2837 * htmlParsePubidLiteral:
2838 * @ctxt: an HTML parser context
2839 *
2840 * parse an HTML public literal
2841 *
2842 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2843 *
2844 * Returns the PubidLiteral parsed or NULL.
2845 */
2846
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002847static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002848htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002849 size_t len = 0, startPosition = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002850 xmlChar *ret = NULL;
2851 /*
2852 * Name ::= (Letter | '_') (NameChar)*
2853 */
2854 if (CUR == '"') {
2855 NEXT;
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002856
2857 if (CUR_PTR < BASE_PTR)
2858 return(ret);
2859 startPosition = CUR_PTR - BASE_PTR;
2860
2861 while (IS_PUBIDCHAR_CH(CUR)) {
2862 len++;
2863 NEXT;
2864 }
2865
Owen Taylor3473f882001-02-23 17:55:21 +00002866 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002867 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2868 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002869 } else {
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002870 ret = xmlStrndup((BASE_PTR + startPosition), len);
Owen Taylor3473f882001-02-23 17:55:21 +00002871 NEXT;
2872 }
2873 } else if (CUR == '\'') {
2874 NEXT;
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002875
2876 if (CUR_PTR < BASE_PTR)
2877 return(ret);
2878 startPosition = CUR_PTR - BASE_PTR;
2879
2880 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2881 len++;
2882 NEXT;
2883 }
2884
Daniel Veillard6560a422003-03-27 21:25:38 +00002885 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002886 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2887 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002888 } else {
Pranjal Jumde11ed4a72016-03-02 15:52:24 -08002889 ret = xmlStrndup((BASE_PTR + startPosition), len);
Owen Taylor3473f882001-02-23 17:55:21 +00002890 NEXT;
2891 }
2892 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002893 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2894 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002895 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002896
Owen Taylor3473f882001-02-23 17:55:21 +00002897 return(ret);
2898}
2899
2900/**
2901 * htmlParseScript:
2902 * @ctxt: an HTML parser context
2903 *
2904 * parse the content of an HTML SCRIPT or STYLE element
2905 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2906 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2907 * http://www.w3.org/TR/html4/types.html#type-script
2908 * http://www.w3.org/TR/html4/types.html#h-6.15
2909 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2910 *
2911 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2912 * element and the value of intrinsic event attributes. User agents must
2913 * not evaluate script data as HTML markup but instead must pass it on as
2914 * data to a script engine.
2915 * NOTES:
2916 * - The content is passed like CDATA
2917 * - the attributes for style and scripting "onXXX" are also described
2918 * as CDATA but SGML allows entities references in attributes so their
2919 * processing is identical as other attributes
2920 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002921static void
Owen Taylor3473f882001-02-23 17:55:21 +00002922htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002923 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002924 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002925 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002926
2927 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002928 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002929 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002930 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002931 /*
2932 * One should break here, the specification is clear:
2933 * Authors should therefore escape "</" within the content.
2934 * Escape mechanisms are specific to each scripting or
2935 * style sheet language.
2936 *
2937 * In recovery mode, only break if end tag match the
2938 * current tag, effectively ignoring all tags inside the
2939 * script/style block and treating the entire block as
2940 * CDATA.
2941 */
2942 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002943 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2944 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002945 {
2946 break; /* while */
2947 } else {
2948 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002949 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002950 ctxt->name, NULL);
2951 }
2952 } else {
2953 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002954 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002955 {
2956 break; /* while */
2957 }
2958 }
Owen Taylor3473f882001-02-23 17:55:21 +00002959 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002960 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002961 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2962 if (ctxt->sax->cdataBlock!= NULL) {
2963 /*
2964 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2965 */
2966 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002967 } else if (ctxt->sax->characters != NULL) {
2968 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002969 }
2970 nbchar = 0;
2971 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002972 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002973 NEXTL(l);
2974 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002975 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002976
Daniel Veillard68716a72006-10-16 09:32:17 +00002977 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Pierre Belziled4b54472010-11-04 10:18:17 +01002978 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2979 "Invalid char in CDATA 0x%X\n", cur);
2980 if (ctxt->input->cur < ctxt->input->end) {
2981 NEXT;
2982 }
Owen Taylor3473f882001-02-23 17:55:21 +00002983 }
2984
2985 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2986 if (ctxt->sax->cdataBlock!= NULL) {
2987 /*
2988 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2989 */
2990 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002991 } else if (ctxt->sax->characters != NULL) {
2992 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002993 }
2994 }
2995}
2996
2997
2998/**
Daniel Veillard140c2512015-06-30 11:36:28 +08002999 * htmlParseCharDataInternal:
Owen Taylor3473f882001-02-23 17:55:21 +00003000 * @ctxt: an HTML parser context
Daniel Veillard140c2512015-06-30 11:36:28 +08003001 * @readahead: optional read ahead character in ascii range
Owen Taylor3473f882001-02-23 17:55:21 +00003002 *
3003 * parse a CharData section.
3004 * if we are within a CDATA section ']]>' marks an end of section.
3005 *
3006 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3007 */
3008
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003009static void
Daniel Veillard140c2512015-06-30 11:36:28 +08003010htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3011 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
Owen Taylor3473f882001-02-23 17:55:21 +00003012 int nbchar = 0;
3013 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00003014 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003015
Daniel Veillard140c2512015-06-30 11:36:28 +08003016 if (readahead)
3017 buf[nbchar++] = readahead;
3018
Owen Taylor3473f882001-02-23 17:55:21 +00003019 SHRINK;
3020 cur = CUR_CHAR(l);
3021 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003022 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00003023 (cur != 0)) {
3024 if (!(IS_CHAR(cur))) {
3025 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3026 "Invalid char in CDATA 0x%X\n", cur);
3027 } else {
3028 COPY_BUF(l,buf,nbchar,cur);
3029 }
Owen Taylor3473f882001-02-23 17:55:21 +00003030 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3031 /*
3032 * Ok the segment is to be consumed as chars.
3033 */
3034 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3035 if (areBlanks(ctxt, buf, nbchar)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08003036 if (ctxt->keepBlanks) {
3037 if (ctxt->sax->characters != NULL)
3038 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3039 } else {
3040 if (ctxt->sax->ignorableWhitespace != NULL)
3041 ctxt->sax->ignorableWhitespace(ctxt->userData,
3042 buf, nbchar);
3043 }
Owen Taylor3473f882001-02-23 17:55:21 +00003044 } else {
3045 htmlCheckParagraph(ctxt);
3046 if (ctxt->sax->characters != NULL)
3047 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3048 }
3049 }
3050 nbchar = 0;
3051 }
3052 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00003053 chunk++;
3054 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3055 chunk = 0;
3056 SHRINK;
3057 GROW;
3058 }
Owen Taylor3473f882001-02-23 17:55:21 +00003059 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00003060 if (cur == 0) {
3061 SHRINK;
3062 GROW;
3063 cur = CUR_CHAR(l);
3064 }
Owen Taylor3473f882001-02-23 17:55:21 +00003065 }
3066 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00003067 buf[nbchar] = 0;
3068
Owen Taylor3473f882001-02-23 17:55:21 +00003069 /*
3070 * Ok the segment is to be consumed as chars.
3071 */
3072 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3073 if (areBlanks(ctxt, buf, nbchar)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08003074 if (ctxt->keepBlanks) {
3075 if (ctxt->sax->characters != NULL)
3076 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3077 } else {
3078 if (ctxt->sax->ignorableWhitespace != NULL)
3079 ctxt->sax->ignorableWhitespace(ctxt->userData,
3080 buf, nbchar);
3081 }
Owen Taylor3473f882001-02-23 17:55:21 +00003082 } else {
3083 htmlCheckParagraph(ctxt);
3084 if (ctxt->sax->characters != NULL)
3085 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3086 }
3087 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00003088 } else {
3089 /*
3090 * Loop detection
3091 */
3092 if (cur == 0)
3093 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00003094 }
3095}
3096
3097/**
Daniel Veillard140c2512015-06-30 11:36:28 +08003098 * htmlParseCharData:
3099 * @ctxt: an HTML parser context
3100 *
3101 * parse a CharData section.
3102 * if we are within a CDATA section ']]>' marks an end of section.
3103 *
3104 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3105 */
3106
3107static void
3108htmlParseCharData(htmlParserCtxtPtr ctxt) {
3109 htmlParseCharDataInternal(ctxt, 0);
3110}
3111
3112/**
Owen Taylor3473f882001-02-23 17:55:21 +00003113 * htmlParseExternalID:
3114 * @ctxt: an HTML parser context
3115 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00003116 *
3117 * Parse an External ID or a Public ID
3118 *
Owen Taylor3473f882001-02-23 17:55:21 +00003119 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3120 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3121 *
3122 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3123 *
3124 * Returns the function returns SystemLiteral and in the second
3125 * case publicID receives PubidLiteral, is strict is off
3126 * it is possible to return NULL and have publicID set.
3127 */
3128
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003129static xmlChar *
3130htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00003131 xmlChar *URI = NULL;
3132
3133 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3134 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3135 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3136 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003137 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003138 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3139 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003140 }
3141 SKIP_BLANKS;
3142 URI = htmlParseSystemLiteral(ctxt);
3143 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003144 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3145 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003146 }
3147 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3148 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3149 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3150 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003151 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003152 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3153 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003154 }
3155 SKIP_BLANKS;
3156 *publicID = htmlParsePubidLiteral(ctxt);
3157 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003158 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3159 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3160 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003161 }
3162 SKIP_BLANKS;
3163 if ((CUR == '"') || (CUR == '\'')) {
3164 URI = htmlParseSystemLiteral(ctxt);
3165 }
3166 }
3167 return(URI);
3168}
3169
3170/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003171 * xmlParsePI:
3172 * @ctxt: an XML parser context
3173 *
3174 * parse an XML Processing Instruction.
3175 *
3176 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3177 */
3178static void
3179htmlParsePI(htmlParserCtxtPtr ctxt) {
3180 xmlChar *buf = NULL;
3181 int len = 0;
3182 int size = HTML_PARSER_BUFFER_SIZE;
3183 int cur, l;
3184 const xmlChar *target;
3185 xmlParserInputState state;
3186 int count = 0;
3187
3188 if ((RAW == '<') && (NXT(1) == '?')) {
3189 state = ctxt->instate;
3190 ctxt->instate = XML_PARSER_PI;
3191 /*
3192 * this is a Processing Instruction.
3193 */
3194 SKIP(2);
3195 SHRINK;
3196
3197 /*
3198 * Parse the target name and check for special support like
3199 * namespace.
3200 */
3201 target = htmlParseName(ctxt);
3202 if (target != NULL) {
3203 if (RAW == '>') {
3204 SKIP(1);
3205
3206 /*
3207 * SAX: PI detected.
3208 */
3209 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3210 (ctxt->sax->processingInstruction != NULL))
3211 ctxt->sax->processingInstruction(ctxt->userData,
3212 target, NULL);
3213 ctxt->instate = state;
3214 return;
3215 }
3216 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3217 if (buf == NULL) {
3218 htmlErrMemory(ctxt, NULL);
3219 ctxt->instate = state;
3220 return;
3221 }
3222 cur = CUR;
3223 if (!IS_BLANK(cur)) {
3224 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3225 "ParsePI: PI %s space expected\n", target, NULL);
3226 }
3227 SKIP_BLANKS;
3228 cur = CUR_CHAR(l);
3229 while (IS_CHAR(cur) && (cur != '>')) {
3230 if (len + 5 >= size) {
3231 xmlChar *tmp;
3232
3233 size *= 2;
3234 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3235 if (tmp == NULL) {
3236 htmlErrMemory(ctxt, NULL);
3237 xmlFree(buf);
3238 ctxt->instate = state;
3239 return;
3240 }
3241 buf = tmp;
3242 }
3243 count++;
3244 if (count > 50) {
3245 GROW;
3246 count = 0;
3247 }
3248 COPY_BUF(l,buf,len,cur);
3249 NEXTL(l);
3250 cur = CUR_CHAR(l);
3251 if (cur == 0) {
3252 SHRINK;
3253 GROW;
3254 cur = CUR_CHAR(l);
3255 }
3256 }
3257 buf[len] = 0;
3258 if (cur != '>') {
3259 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3260 "ParsePI: PI %s never end ...\n", target, NULL);
3261 } else {
3262 SKIP(1);
3263
3264 /*
3265 * SAX: PI detected.
3266 */
3267 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3268 (ctxt->sax->processingInstruction != NULL))
3269 ctxt->sax->processingInstruction(ctxt->userData,
3270 target, buf);
3271 }
3272 xmlFree(buf);
3273 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003274 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003275 "PI is not started correctly", NULL, NULL);
3276 }
3277 ctxt->instate = state;
3278 }
3279}
3280
3281/**
Owen Taylor3473f882001-02-23 17:55:21 +00003282 * htmlParseComment:
3283 * @ctxt: an HTML parser context
3284 *
3285 * Parse an XML (SGML) comment <!-- .... -->
3286 *
3287 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3288 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003289static void
Owen Taylor3473f882001-02-23 17:55:21 +00003290htmlParseComment(htmlParserCtxtPtr ctxt) {
3291 xmlChar *buf = NULL;
3292 int len;
3293 int size = HTML_PARSER_BUFFER_SIZE;
3294 int q, ql;
3295 int r, rl;
3296 int cur, l;
3297 xmlParserInputState state;
3298
3299 /*
3300 * Check that there is a comment right here.
3301 */
3302 if ((RAW != '<') || (NXT(1) != '!') ||
3303 (NXT(2) != '-') || (NXT(3) != '-')) return;
3304
3305 state = ctxt->instate;
3306 ctxt->instate = XML_PARSER_COMMENT;
3307 SHRINK;
3308 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003309 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003310 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003311 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003312 ctxt->instate = state;
3313 return;
3314 }
Daniel Veillarde7248792015-10-30 21:14:55 +08003315 len = 0;
3316 buf[len] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003317 q = CUR_CHAR(ql);
Daniel Veillarde7248792015-10-30 21:14:55 +08003318 if (!IS_CHAR(q))
3319 goto unfinished;
Owen Taylor3473f882001-02-23 17:55:21 +00003320 NEXTL(ql);
3321 r = CUR_CHAR(rl);
Daniel Veillarde7248792015-10-30 21:14:55 +08003322 if (!IS_CHAR(r))
3323 goto unfinished;
Owen Taylor3473f882001-02-23 17:55:21 +00003324 NEXTL(rl);
3325 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00003326 while (IS_CHAR(cur) &&
3327 ((cur != '>') ||
3328 (r != '-') || (q != '-'))) {
3329 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003330 xmlChar *tmp;
3331
Owen Taylor3473f882001-02-23 17:55:21 +00003332 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003333 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3334 if (tmp == NULL) {
3335 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003336 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003337 ctxt->instate = state;
3338 return;
3339 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003340 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003341 }
3342 COPY_BUF(ql,buf,len,q);
3343 q = r;
3344 ql = rl;
3345 r = cur;
3346 rl = l;
3347 NEXTL(l);
3348 cur = CUR_CHAR(l);
3349 if (cur == 0) {
3350 SHRINK;
3351 GROW;
3352 cur = CUR_CHAR(l);
3353 }
3354 }
3355 buf[len] = 0;
Daniel Veillarde7248792015-10-30 21:14:55 +08003356 if (IS_CHAR(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003357 NEXT;
3358 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3359 (!ctxt->disableSAX))
3360 ctxt->sax->comment(ctxt->userData, buf);
3361 xmlFree(buf);
Daniel Veillarde7248792015-10-30 21:14:55 +08003362 ctxt->instate = state;
3363 return;
Owen Taylor3473f882001-02-23 17:55:21 +00003364 }
Daniel Veillarde7248792015-10-30 21:14:55 +08003365
3366unfinished:
3367 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3368 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3369 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00003370}
3371
3372/**
3373 * htmlParseCharRef:
3374 * @ctxt: an HTML parser context
3375 *
3376 * parse Reference declarations
3377 *
3378 * [66] CharRef ::= '&#' [0-9]+ ';' |
3379 * '&#x' [0-9a-fA-F]+ ';'
3380 *
3381 * Returns the value parsed (as an int)
3382 */
3383int
3384htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3385 int val = 0;
3386
Daniel Veillarda03e3652004-11-02 18:45:30 +00003387 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3388 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3389 "htmlParseCharRef: context error\n",
3390 NULL, NULL);
3391 return(0);
3392 }
Owen Taylor3473f882001-02-23 17:55:21 +00003393 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003394 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003395 SKIP(3);
3396 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003397 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003398 val = val * 16 + (CUR - '0');
3399 else if ((CUR >= 'a') && (CUR <= 'f'))
3400 val = val * 16 + (CUR - 'a') + 10;
3401 else if ((CUR >= 'A') && (CUR <= 'F'))
3402 val = val * 16 + (CUR - 'A') + 10;
3403 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003404 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003405 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003406 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003407 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003408 }
3409 NEXT;
3410 }
3411 if (CUR == ';')
3412 NEXT;
3413 } else if ((CUR == '&') && (NXT(1) == '#')) {
3414 SKIP(2);
3415 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003416 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003417 val = val * 10 + (CUR - '0');
3418 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003419 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003420 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003421 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003422 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003423 }
3424 NEXT;
3425 }
3426 if (CUR == ';')
3427 NEXT;
3428 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003429 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3430 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003431 }
3432 /*
3433 * Check the value IS_CHAR ...
3434 */
3435 if (IS_CHAR(val)) {
3436 return(val);
3437 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003438 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3439 "htmlParseCharRef: invalid xmlChar value %d\n",
3440 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003441 }
3442 return(0);
3443}
3444
3445
3446/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003447 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003448 * @ctxt: an HTML parser context
3449 *
3450 * parse a DOCTYPE declaration
3451 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003452 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003453 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3454 */
3455
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003456static void
Owen Taylor3473f882001-02-23 17:55:21 +00003457htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003458 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003459 xmlChar *ExternalID = NULL;
3460 xmlChar *URI = NULL;
3461
3462 /*
3463 * We know that '<!DOCTYPE' has been detected.
3464 */
3465 SKIP(9);
3466
3467 SKIP_BLANKS;
3468
3469 /*
3470 * Parse the DOCTYPE name.
3471 */
3472 name = htmlParseName(ctxt);
3473 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003474 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3475 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3476 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003477 }
3478 /*
3479 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3480 */
3481
3482 SKIP_BLANKS;
3483
3484 /*
3485 * Check for SystemID and ExternalID
3486 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003487 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003488 SKIP_BLANKS;
3489
3490 /*
3491 * We should be at the end of the DOCTYPE declaration.
3492 */
3493 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003494 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3495 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003496 /* We shouldn't try to resynchronize ... */
3497 }
3498 NEXT;
3499
3500 /*
3501 * Create or update the document accordingly to the DOCTYPE
3502 */
3503 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3504 (!ctxt->disableSAX))
3505 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3506
3507 /*
3508 * Cleanup, since we don't use all those identifiers
3509 */
3510 if (URI != NULL) xmlFree(URI);
3511 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003512}
3513
3514/**
3515 * htmlParseAttribute:
3516 * @ctxt: an HTML parser context
3517 * @value: a xmlChar ** used to store the value of the attribute
3518 *
3519 * parse an attribute
3520 *
3521 * [41] Attribute ::= Name Eq AttValue
3522 *
3523 * [25] Eq ::= S? '=' S?
3524 *
3525 * With namespace:
3526 *
3527 * [NS 11] Attribute ::= QName Eq AttValue
3528 *
3529 * Also the case QName == xmlns:??? is handled independently as a namespace
3530 * definition.
3531 *
3532 * Returns the attribute name, and the value in *value.
3533 */
3534
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003535static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003536htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003537 const xmlChar *name;
3538 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003539
3540 *value = NULL;
3541 name = htmlParseHTMLName(ctxt);
3542 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003543 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3544 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003545 return(NULL);
3546 }
3547
3548 /*
3549 * read the value
3550 */
3551 SKIP_BLANKS;
3552 if (CUR == '=') {
3553 NEXT;
3554 SKIP_BLANKS;
3555 val = htmlParseAttValue(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003556 }
3557
3558 *value = val;
3559 return(name);
3560}
3561
3562/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003563 * htmlCheckEncodingDirect:
Owen Taylor3473f882001-02-23 17:55:21 +00003564 * @ctxt: an HTML parser context
3565 * @attvalue: the attribute value
3566 *
Denis Pauk868d92d2012-05-10 15:34:57 +08003567 * Checks an attribute value to detect
Owen Taylor3473f882001-02-23 17:55:21 +00003568 * the encoding
3569 * If a new encoding is detected the parser is switched to decode
3570 * it and pass UTF8
3571 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003572static void
Denis Pauk868d92d2012-05-10 15:34:57 +08003573htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +00003574
Denis Pauk868d92d2012-05-10 15:34:57 +08003575 if ((ctxt == NULL) || (encoding == NULL) ||
Daniel Veillardc62efc82011-05-16 16:03:50 +08003576 (ctxt->options & HTML_PARSE_IGNORE_ENC))
Owen Taylor3473f882001-02-23 17:55:21 +00003577 return;
3578
Daniel Veillarde77db162009-08-22 11:32:38 +02003579 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003580 if (ctxt->input->encoding != NULL)
3581 return;
3582
Owen Taylor3473f882001-02-23 17:55:21 +00003583 if (encoding != NULL) {
3584 xmlCharEncoding enc;
3585 xmlCharEncodingHandlerPtr handler;
3586
3587 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3588
3589 if (ctxt->input->encoding != NULL)
3590 xmlFree((xmlChar *) ctxt->input->encoding);
3591 ctxt->input->encoding = xmlStrdup(encoding);
3592
3593 enc = xmlParseCharEncoding((const char *) encoding);
3594 /*
3595 * registered set of known encodings
3596 */
3597 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003598 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003599 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3600 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3601 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3602 (ctxt->input->buf != NULL) &&
3603 (ctxt->input->buf->encoder == NULL)) {
3604 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3605 "htmlCheckEncoding: wrong encoding meta\n",
3606 NULL, NULL);
3607 } else {
3608 xmlSwitchEncoding(ctxt, enc);
3609 }
Owen Taylor3473f882001-02-23 17:55:21 +00003610 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3611 } else {
3612 /*
3613 * fallback for unknown encodings
3614 */
3615 handler = xmlFindCharEncodingHandler((const char *) encoding);
3616 if (handler != NULL) {
3617 xmlSwitchToEncoding(ctxt, handler);
3618 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3619 } else {
Daniel Veillardc62efc82011-05-16 16:03:50 +08003620 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3621 "htmlCheckEncoding: unknown encoding %s\n",
3622 encoding, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003623 }
3624 }
3625
3626 if ((ctxt->input->buf != NULL) &&
3627 (ctxt->input->buf->encoder != NULL) &&
3628 (ctxt->input->buf->raw != NULL) &&
3629 (ctxt->input->buf->buffer != NULL)) {
3630 int nbchars;
3631 int processed;
3632
3633 /*
3634 * convert as much as possible to the parser reading buffer.
3635 */
3636 processed = ctxt->input->cur - ctxt->input->base;
Daniel Veillarda78d8032012-07-16 14:56:50 +08003637 xmlBufShrink(ctxt->input->buf->buffer, processed);
Daniel Veillardbf058dc2013-02-13 18:19:42 +08003638 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
Owen Taylor3473f882001-02-23 17:55:21 +00003639 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003640 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3641 "htmlCheckEncoding: encoder error\n",
3642 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003643 }
Daniel Veillard61551a12012-07-16 16:28:47 +08003644 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
Owen Taylor3473f882001-02-23 17:55:21 +00003645 }
3646 }
3647}
3648
3649/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003650 * htmlCheckEncoding:
3651 * @ctxt: an HTML parser context
3652 * @attvalue: the attribute value
3653 *
3654 * Checks an http-equiv attribute from a Meta tag to detect
3655 * the encoding
3656 * If a new encoding is detected the parser is switched to decode
3657 * it and pass UTF8
3658 */
3659static void
3660htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3661 const xmlChar *encoding;
3662
3663 if (!attvalue)
3664 return;
3665
3666 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3667 if (encoding != NULL) {
3668 encoding += 7;
3669 }
3670 /*
3671 * skip blank
3672 */
3673 if (encoding && IS_BLANK_CH(*encoding))
3674 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3675 if (encoding && *encoding == '=') {
3676 encoding ++;
3677 htmlCheckEncodingDirect(ctxt, encoding);
3678 }
3679}
3680
3681/**
Owen Taylor3473f882001-02-23 17:55:21 +00003682 * htmlCheckMeta:
3683 * @ctxt: an HTML parser context
3684 * @atts: the attributes values
3685 *
3686 * Checks an attributes from a Meta tag
3687 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003688static void
Owen Taylor3473f882001-02-23 17:55:21 +00003689htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3690 int i;
3691 const xmlChar *att, *value;
3692 int http = 0;
3693 const xmlChar *content = NULL;
3694
3695 if ((ctxt == NULL) || (atts == NULL))
3696 return;
3697
3698 i = 0;
3699 att = atts[i++];
3700 while (att != NULL) {
3701 value = atts[i++];
3702 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3703 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3704 http = 1;
Denis Pauk868d92d2012-05-10 15:34:57 +08003705 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3706 htmlCheckEncodingDirect(ctxt, value);
Owen Taylor3473f882001-02-23 17:55:21 +00003707 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3708 content = value;
3709 att = atts[i++];
3710 }
3711 if ((http) && (content != NULL))
3712 htmlCheckEncoding(ctxt, content);
3713
3714}
3715
3716/**
3717 * htmlParseStartTag:
3718 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003719 *
Owen Taylor3473f882001-02-23 17:55:21 +00003720 * parse a start of tag either for rule element or
3721 * EmptyElement. In both case we don't parse the tag closing chars.
3722 *
3723 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3724 *
3725 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3726 *
3727 * With namespace:
3728 *
3729 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3730 *
3731 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3732 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003733 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003734 */
3735
Daniel Veillard597f1c12005-07-03 23:00:18 +00003736static int
Owen Taylor3473f882001-02-23 17:55:21 +00003737htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003738 const xmlChar *name;
3739 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003740 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003741 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003742 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003743 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003744 int meta = 0;
3745 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003746 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003747
Daniel Veillarda03e3652004-11-02 18:45:30 +00003748 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3749 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3750 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003751 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003752 }
Gaurav3e0eec42014-06-13 14:45:20 +08003753 if (ctxt->instate == XML_PARSER_EOF)
3754 return(-1);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003755 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003756 NEXT;
3757
Daniel Veillard30e76072006-03-09 14:13:55 +00003758 atts = ctxt->atts;
3759 maxatts = ctxt->maxatts;
3760
Owen Taylor3473f882001-02-23 17:55:21 +00003761 GROW;
3762 name = htmlParseHTMLName(ctxt);
3763 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003764 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3765 "htmlParseStartTag: invalid element name\n",
3766 NULL, NULL);
Daniel Veillard140c2512015-06-30 11:36:28 +08003767 /* if recover preserve text on classic misconstructs */
3768 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3769 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3770 htmlParseCharDataInternal(ctxt, '<');
3771 return(-1);
3772 }
3773
3774
Owen Taylor3473f882001-02-23 17:55:21 +00003775 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003776 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3777 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003778 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003779 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003780 }
3781 if (xmlStrEqual(name, BAD_CAST"meta"))
3782 meta = 1;
3783
3784 /*
3785 * Check for auto-closure of HTML elements.
3786 */
3787 htmlAutoClose(ctxt, name);
3788
3789 /*
3790 * Check for implied HTML elements.
3791 */
3792 htmlCheckImplied(ctxt, name);
3793
3794 /*
3795 * Avoid html at any level > 0, head at any level != 1
3796 * or any attempt to recurse body
3797 */
3798 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003799 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3800 "htmlParseStartTag: misplaced <html> tag\n",
3801 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003802 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003803 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003804 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003805 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003806 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003807 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3808 "htmlParseStartTag: misplaced <head> tag\n",
3809 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003810 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003811 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003812 }
3813 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003814 int indx;
3815 for (indx = 0;indx < ctxt->nameNr;indx++) {
3816 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003817 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3818 "htmlParseStartTag: misplaced <body> tag\n",
3819 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003820 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003821 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003822 }
3823 }
3824 }
3825
3826 /*
3827 * Now parse the attributes, it ends up with the ending
3828 *
3829 * (S Attribute)* S?
3830 */
3831 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003832 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003833 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003834 ((CUR != '/') || (NXT(1) != '>'))) {
3835 long cons = ctxt->nbChars;
3836
3837 GROW;
3838 attname = htmlParseAttribute(ctxt, &attvalue);
3839 if (attname != NULL) {
3840
3841 /*
3842 * Well formedness requires at most one declaration of an attribute
3843 */
3844 for (i = 0; i < nbatts;i += 2) {
3845 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003846 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3847 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003848 if (attvalue != NULL)
3849 xmlFree(attvalue);
3850 goto failed;
3851 }
3852 }
3853
3854 /*
3855 * Add the pair to atts
3856 */
3857 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003858 maxatts = 22; /* allow for 10 attrs by default */
3859 atts = (const xmlChar **)
3860 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003861 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003862 htmlErrMemory(ctxt, NULL);
3863 if (attvalue != NULL)
3864 xmlFree(attvalue);
3865 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003866 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003867 ctxt->atts = atts;
3868 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003869 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003870 const xmlChar **n;
3871
Owen Taylor3473f882001-02-23 17:55:21 +00003872 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003873 n = (const xmlChar **) xmlRealloc((void *) atts,
3874 maxatts * sizeof(const xmlChar *));
3875 if (n == NULL) {
3876 htmlErrMemory(ctxt, NULL);
3877 if (attvalue != NULL)
3878 xmlFree(attvalue);
3879 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003880 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003881 atts = n;
3882 ctxt->atts = atts;
3883 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003884 }
3885 atts[nbatts++] = attname;
3886 atts[nbatts++] = attvalue;
3887 atts[nbatts] = NULL;
3888 atts[nbatts + 1] = NULL;
3889 }
3890 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003891 if (attvalue != NULL)
3892 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003893 /* Dump the bogus attribute string up to the next blank or
3894 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003895 while ((IS_CHAR_CH(CUR)) &&
3896 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003897 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003898 NEXT;
3899 }
3900
3901failed:
3902 SKIP_BLANKS;
3903 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003904 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3905 "htmlParseStartTag: problem parsing attributes\n",
3906 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003907 break;
3908 }
3909 }
3910
3911 /*
3912 * Handle specific association to the META tag
3913 */
William M. Bracke978ae22007-03-21 06:16:02 +00003914 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003915 htmlCheckMeta(ctxt, atts);
3916
3917 /*
3918 * SAX: Start of Element !
3919 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003920 if (!discardtag) {
3921 htmlnamePush(ctxt, name);
3922 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3923 if (nbatts != 0)
3924 ctxt->sax->startElement(ctxt->userData, name, atts);
3925 else
3926 ctxt->sax->startElement(ctxt->userData, name, NULL);
3927 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003928 }
Owen Taylor3473f882001-02-23 17:55:21 +00003929
3930 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003931 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003932 if (atts[i] != NULL)
3933 xmlFree((xmlChar *) atts[i]);
3934 }
Owen Taylor3473f882001-02-23 17:55:21 +00003935 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003936
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003937 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003938}
3939
3940/**
3941 * htmlParseEndTag:
3942 * @ctxt: an HTML parser context
3943 *
3944 * parse an end of tag
3945 *
3946 * [42] ETag ::= '</' Name S? '>'
3947 *
3948 * With namespace
3949 *
3950 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003951 *
3952 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003953 */
3954
Daniel Veillardf420ac52001-07-04 16:04:09 +00003955static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003956htmlParseEndTag(htmlParserCtxtPtr ctxt)
3957{
3958 const xmlChar *name;
3959 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003960 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003961
3962 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003963 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3964 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003965 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003966 }
3967 SKIP(2);
3968
3969 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003970 if (name == NULL)
3971 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003972 /*
3973 * We should definitely be at the ending "S? '>'" part
3974 */
3975 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003976 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003977 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3978 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003979 if (ctxt->recovery) {
3980 /*
3981 * We're not at the ending > !!
3982 * Error, unless in recover mode where we search forwards
3983 * until we find a >
3984 */
3985 while (CUR != '\0' && CUR != '>') NEXT;
3986 NEXT;
3987 }
Owen Taylor3473f882001-02-23 17:55:21 +00003988 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003989 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003990
3991 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003992 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3993 * out now.
3994 */
3995 if ((ctxt->depth > 0) &&
3996 (xmlStrEqual(name, BAD_CAST "html") ||
3997 xmlStrEqual(name, BAD_CAST "body") ||
3998 xmlStrEqual(name, BAD_CAST "head"))) {
3999 ctxt->depth--;
4000 return (0);
4001 }
4002
4003 /*
Owen Taylor3473f882001-02-23 17:55:21 +00004004 * If the name read is not one of the element in the parsing stack
4005 * then return, it's just an error.
4006 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004007 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4008 if (xmlStrEqual(name, ctxt->nameTab[i]))
4009 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004010 }
4011 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004012 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4013 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004014 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00004015 }
4016
4017
4018 /*
4019 * Check for auto-closure of HTML elements.
4020 */
4021
4022 htmlAutoCloseOnClose(ctxt, name);
4023
4024 /*
4025 * Well formedness constraints, opening and closing must match.
4026 * With the exception that the autoclose may have popped stuff out
4027 * of the stack.
4028 */
4029 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004030 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004031 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4032 "Opening and ending tag mismatch: %s and %s\n",
4033 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00004034 }
4035 }
4036
4037 /*
4038 * SAX: End of Tag
4039 */
4040 oldname = ctxt->name;
4041 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004042 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4043 ctxt->sax->endElement(ctxt->userData, name);
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08004044 htmlNodeInfoPop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004045 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004046 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00004047 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004048 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004049 }
4050
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004051 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00004052}
4053
4054
4055/**
4056 * htmlParseReference:
4057 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004058 *
Owen Taylor3473f882001-02-23 17:55:21 +00004059 * parse and handle entity references in content,
4060 * this will end-up in a call to character() since this is either a
4061 * CharRef, or a predefined entity.
4062 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004063static void
Owen Taylor3473f882001-02-23 17:55:21 +00004064htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00004065 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00004066 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004067 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004068 if (CUR != '&') return;
4069
4070 if (NXT(1) == '#') {
4071 unsigned int c;
4072 int bits, i = 0;
4073
4074 c = htmlParseCharRef(ctxt);
4075 if (c == 0)
4076 return;
4077
4078 if (c < 0x80) { out[i++]= c; bits= -6; }
4079 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4080 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4081 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004082
Owen Taylor3473f882001-02-23 17:55:21 +00004083 for ( ; bits >= 0; bits-= 6) {
4084 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4085 }
4086 out[i] = 0;
4087
4088 htmlCheckParagraph(ctxt);
4089 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4090 ctxt->sax->characters(ctxt->userData, out, i);
4091 } else {
4092 ent = htmlParseEntityRef(ctxt, &name);
4093 if (name == NULL) {
4094 htmlCheckParagraph(ctxt);
4095 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4096 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4097 return;
4098 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00004099 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004100 htmlCheckParagraph(ctxt);
4101 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4102 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4103 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4104 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4105 }
4106 } else {
4107 unsigned int c;
4108 int bits, i = 0;
4109
4110 c = ent->value;
4111 if (c < 0x80)
4112 { out[i++]= c; bits= -6; }
4113 else if (c < 0x800)
4114 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4115 else if (c < 0x10000)
4116 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004117 else
Owen Taylor3473f882001-02-23 17:55:21 +00004118 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004119
Owen Taylor3473f882001-02-23 17:55:21 +00004120 for ( ; bits >= 0; bits-= 6) {
4121 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4122 }
4123 out[i] = 0;
4124
4125 htmlCheckParagraph(ctxt);
4126 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4127 ctxt->sax->characters(ctxt->userData, out, i);
4128 }
Owen Taylor3473f882001-02-23 17:55:21 +00004129 }
4130}
4131
4132/**
4133 * htmlParseContent:
4134 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004135 *
4136 * Parse a content: comment, sub-element, reference or text.
Eugene Pimenov615904f2010-03-15 15:16:02 +01004137 * Kept for compatibility with old code
Owen Taylor3473f882001-02-23 17:55:21 +00004138 */
4139
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004140static void
Owen Taylor3473f882001-02-23 17:55:21 +00004141htmlParseContent(htmlParserCtxtPtr ctxt) {
4142 xmlChar *currentNode;
4143 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004144 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004145
4146 currentNode = xmlStrdup(ctxt->name);
4147 depth = ctxt->nameNr;
4148 while (1) {
4149 long cons = ctxt->nbChars;
4150
4151 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02004152
4153 if (ctxt->instate == XML_PARSER_EOF)
4154 break;
4155
Owen Taylor3473f882001-02-23 17:55:21 +00004156 /*
4157 * Our tag or one of it's parent or children is ending.
4158 */
4159 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00004160 if (htmlParseEndTag(ctxt) &&
4161 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4162 if (currentNode != NULL)
4163 xmlFree(currentNode);
4164 return;
4165 }
4166 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00004167 }
4168
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004169 else if ((CUR == '<') &&
4170 ((IS_ASCII_LETTER(NXT(1))) ||
4171 (NXT(1) == '_') || (NXT(1) == ':'))) {
4172 name = htmlParseHTMLName_nonInvasive(ctxt);
4173 if (name == NULL) {
4174 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4175 "htmlParseStartTag: invalid element name\n",
4176 NULL, NULL);
4177 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02004178 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004179 NEXT;
4180
4181 if (currentNode != NULL)
4182 xmlFree(currentNode);
4183 return;
4184 }
4185
4186 if (ctxt->name != NULL) {
4187 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4188 htmlAutoClose(ctxt, name);
4189 continue;
4190 }
Daniel Veillarde77db162009-08-22 11:32:38 +02004191 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004192 }
4193
Owen Taylor3473f882001-02-23 17:55:21 +00004194 /*
4195 * Has this node been popped out during parsing of
4196 * the next element
4197 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00004198 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4199 (!xmlStrEqual(currentNode, ctxt->name)))
4200 {
Owen Taylor3473f882001-02-23 17:55:21 +00004201 if (currentNode != NULL) xmlFree(currentNode);
4202 return;
4203 }
4204
Daniel Veillardf9533d12001-03-03 10:04:57 +00004205 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4206 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004207 /*
4208 * Handle SCRIPT/STYLE separately
4209 */
4210 htmlParseScript(ctxt);
4211 } else {
4212 /*
4213 * Sometimes DOCTYPE arrives in the middle of the document
4214 */
4215 if ((CUR == '<') && (NXT(1) == '!') &&
4216 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4217 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4218 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4219 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004220 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4221 "Misplaced DOCTYPE declaration\n",
4222 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004223 htmlParseDocTypeDecl(ctxt);
4224 }
4225
4226 /*
4227 * First case : a comment
4228 */
4229 if ((CUR == '<') && (NXT(1) == '!') &&
4230 (NXT(2) == '-') && (NXT(3) == '-')) {
4231 htmlParseComment(ctxt);
4232 }
4233
4234 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004235 * Second case : a Processing Instruction.
4236 */
4237 else if ((CUR == '<') && (NXT(1) == '?')) {
4238 htmlParsePI(ctxt);
4239 }
4240
4241 /*
4242 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004243 */
4244 else if (CUR == '<') {
4245 htmlParseElement(ctxt);
4246 }
4247
4248 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004249 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004250 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004251 */
4252 else if (CUR == '&') {
4253 htmlParseReference(ctxt);
4254 }
4255
4256 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004257 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004258 */
4259 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004260 htmlAutoCloseOnEnd(ctxt);
4261 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004262 }
4263
4264 /*
4265 * Last case, text. Note that References are handled directly.
4266 */
4267 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004268 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004269 }
4270
4271 if (cons == ctxt->nbChars) {
4272 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004273 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4274 "detected an error in element content\n",
4275 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004276 }
4277 break;
4278 }
4279 }
4280 GROW;
4281 }
4282 if (currentNode != NULL) xmlFree(currentNode);
4283}
4284
4285/**
4286 * htmlParseElement:
4287 * @ctxt: an HTML parser context
4288 *
4289 * parse an HTML element, this is highly recursive
Eugene Pimenov615904f2010-03-15 15:16:02 +01004290 * this is kept for compatibility with previous code versions
Owen Taylor3473f882001-02-23 17:55:21 +00004291 *
4292 * [39] element ::= EmptyElemTag | STag content ETag
4293 *
4294 * [41] Attribute ::= Name Eq AttValue
4295 */
4296
4297void
4298htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004299 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004300 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004301 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004302 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004303 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004304 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004305 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004306
Daniel Veillarda03e3652004-11-02 18:45:30 +00004307 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4308 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004309 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004310 return;
4311 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004312
4313 if (ctxt->instate == XML_PARSER_EOF)
4314 return;
4315
Owen Taylor3473f882001-02-23 17:55:21 +00004316 /* Capture start position */
4317 if (ctxt->record_info) {
4318 node_info.begin_pos = ctxt->input->consumed +
4319 (CUR_PTR - ctxt->input->base);
4320 node_info.begin_line = ctxt->input->line;
4321 }
4322
Daniel Veillard597f1c12005-07-03 23:00:18 +00004323 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004324 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004325 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004326 if (CUR == '>')
4327 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004328 return;
4329 }
Owen Taylor3473f882001-02-23 17:55:21 +00004330
4331 /*
4332 * Lookup the info for that element.
4333 */
4334 info = htmlTagLookup(name);
4335 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004336 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4337 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004338 }
4339
4340 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004341 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004342 */
4343 if ((CUR == '/') && (NXT(1) == '>')) {
4344 SKIP(2);
4345 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4346 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004347 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004348 return;
4349 }
4350
4351 if (CUR == '>') {
4352 NEXT;
4353 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004354 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4355 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004356
4357 /*
4358 * end of parsing of this node.
4359 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004360 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004361 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004362 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004363 }
Owen Taylor3473f882001-02-23 17:55:21 +00004364
4365 /*
4366 * Capture end position and add node
4367 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004368 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004369 node_info.end_pos = ctxt->input->consumed +
4370 (CUR_PTR - ctxt->input->base);
4371 node_info.end_line = ctxt->input->line;
4372 node_info.node = ctxt->node;
4373 xmlParserAddNodeInfo(ctxt, &node_info);
4374 }
4375 return;
4376 }
4377
4378 /*
4379 * Check for an Empty Element from DTD definition
4380 */
4381 if ((info != NULL) && (info->empty)) {
4382 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4383 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004384 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004385 return;
4386 }
4387
4388 /*
4389 * Parse the content of the element:
4390 */
4391 currentNode = xmlStrdup(ctxt->name);
4392 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004393 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004394 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004395 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004396 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004397 if (ctxt->nameNr < depth) break;
4398 }
Owen Taylor3473f882001-02-23 17:55:21 +00004399
Owen Taylor3473f882001-02-23 17:55:21 +00004400 /*
4401 * Capture end position and add node
4402 */
4403 if ( currentNode != NULL && ctxt->record_info ) {
4404 node_info.end_pos = ctxt->input->consumed +
4405 (CUR_PTR - ctxt->input->base);
4406 node_info.end_line = ctxt->input->line;
4407 node_info.node = ctxt->node;
4408 xmlParserAddNodeInfo(ctxt, &node_info);
4409 }
William M. Brack76e95df2003-10-18 16:20:14 +00004410 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004411 htmlAutoCloseOnEnd(ctxt);
4412 }
4413
Owen Taylor3473f882001-02-23 17:55:21 +00004414 if (currentNode != NULL)
4415 xmlFree(currentNode);
4416}
4417
Eugene Pimenov615904f2010-03-15 15:16:02 +01004418static void
4419htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4420 /*
4421 * Capture end position and add node
4422 */
4423 if ( ctxt->node != NULL && ctxt->record_info ) {
4424 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4425 (CUR_PTR - ctxt->input->base);
4426 ctxt->nodeInfo->end_line = ctxt->input->line;
4427 ctxt->nodeInfo->node = ctxt->node;
4428 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4429 htmlNodeInfoPop(ctxt);
4430 }
4431 if (!IS_CHAR_CH(CUR)) {
4432 htmlAutoCloseOnEnd(ctxt);
4433 }
4434}
4435
4436/**
4437 * htmlParseElementInternal:
4438 * @ctxt: an HTML parser context
4439 *
4440 * parse an HTML element, new version, non recursive
4441 *
4442 * [39] element ::= EmptyElemTag | STag content ETag
4443 *
4444 * [41] Attribute ::= Name Eq AttValue
4445 */
4446
4447static void
4448htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4449 const xmlChar *name;
4450 const htmlElemDesc * info;
Nick Wellnhofer9a366a32017-06-11 12:40:01 +02004451 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
Eugene Pimenov615904f2010-03-15 15:16:02 +01004452 int failed;
Eugene Pimenov615904f2010-03-15 15:16:02 +01004453
4454 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4455 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4456 "htmlParseElementInternal: context error\n", NULL, NULL);
4457 return;
4458 }
4459
4460 if (ctxt->instate == XML_PARSER_EOF)
4461 return;
4462
4463 /* Capture start position */
4464 if (ctxt->record_info) {
4465 node_info.begin_pos = ctxt->input->consumed +
4466 (CUR_PTR - ctxt->input->base);
4467 node_info.begin_line = ctxt->input->line;
4468 }
4469
4470 failed = htmlParseStartTag(ctxt);
4471 name = ctxt->name;
4472 if ((failed == -1) || (name == NULL)) {
4473 if (CUR == '>')
4474 NEXT;
4475 return;
4476 }
4477
4478 /*
4479 * Lookup the info for that element.
4480 */
4481 info = htmlTagLookup(name);
4482 if (info == NULL) {
4483 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4484 "Tag %s invalid\n", name, NULL);
4485 }
4486
4487 /*
4488 * Check for an Empty Element labeled the XML/SGML way
4489 */
4490 if ((CUR == '/') && (NXT(1) == '>')) {
4491 SKIP(2);
4492 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4493 ctxt->sax->endElement(ctxt->userData, name);
4494 htmlnamePop(ctxt);
4495 return;
4496 }
4497
4498 if (CUR == '>') {
4499 NEXT;
4500 } else {
4501 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4502 "Couldn't find end of Start Tag %s\n", name, NULL);
4503
4504 /*
4505 * end of parsing of this node.
4506 */
4507 if (xmlStrEqual(name, ctxt->name)) {
4508 nodePop(ctxt);
4509 htmlnamePop(ctxt);
4510 }
4511
4512 if (ctxt->record_info)
4513 htmlNodeInfoPush(ctxt, &node_info);
4514 htmlParserFinishElementParsing(ctxt);
4515 return;
4516 }
4517
4518 /*
4519 * Check for an Empty Element from DTD definition
4520 */
4521 if ((info != NULL) && (info->empty)) {
4522 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4523 ctxt->sax->endElement(ctxt->userData, name);
4524 htmlnamePop(ctxt);
4525 return;
4526 }
4527
4528 if (ctxt->record_info)
4529 htmlNodeInfoPush(ctxt, &node_info);
4530}
4531
4532/**
4533 * htmlParseContentInternal:
4534 * @ctxt: an HTML parser context
4535 *
4536 * Parse a content: comment, sub-element, reference or text.
4537 * New version for non recursive htmlParseElementInternal
4538 */
4539
4540static void
4541htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4542 xmlChar *currentNode;
4543 int depth;
4544 const xmlChar *name;
4545
4546 currentNode = xmlStrdup(ctxt->name);
4547 depth = ctxt->nameNr;
4548 while (1) {
4549 long cons = ctxt->nbChars;
4550
4551 GROW;
4552
4553 if (ctxt->instate == XML_PARSER_EOF)
4554 break;
4555
4556 /*
4557 * Our tag or one of it's parent or children is ending.
4558 */
4559 if ((CUR == '<') && (NXT(1) == '/')) {
4560 if (htmlParseEndTag(ctxt) &&
4561 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4562 if (currentNode != NULL)
4563 xmlFree(currentNode);
4564
4565 currentNode = xmlStrdup(ctxt->name);
4566 depth = ctxt->nameNr;
4567 }
4568 continue; /* while */
4569 }
4570
4571 else if ((CUR == '<') &&
4572 ((IS_ASCII_LETTER(NXT(1))) ||
4573 (NXT(1) == '_') || (NXT(1) == ':'))) {
4574 name = htmlParseHTMLName_nonInvasive(ctxt);
4575 if (name == NULL) {
4576 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4577 "htmlParseStartTag: invalid element name\n",
4578 NULL, NULL);
4579 /* Dump the bogus tag like browsers do */
4580 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4581 NEXT;
4582
4583 htmlParserFinishElementParsing(ctxt);
4584 if (currentNode != NULL)
4585 xmlFree(currentNode);
4586
4587 currentNode = xmlStrdup(ctxt->name);
4588 depth = ctxt->nameNr;
4589 continue;
4590 }
4591
4592 if (ctxt->name != NULL) {
4593 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4594 htmlAutoClose(ctxt, name);
4595 continue;
4596 }
4597 }
4598 }
4599
4600 /*
4601 * Has this node been popped out during parsing of
4602 * the next element
4603 */
4604 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4605 (!xmlStrEqual(currentNode, ctxt->name)))
4606 {
4607 htmlParserFinishElementParsing(ctxt);
4608 if (currentNode != NULL) xmlFree(currentNode);
4609
4610 currentNode = xmlStrdup(ctxt->name);
4611 depth = ctxt->nameNr;
4612 continue;
4613 }
4614
4615 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4616 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4617 /*
4618 * Handle SCRIPT/STYLE separately
4619 */
4620 htmlParseScript(ctxt);
4621 } else {
4622 /*
4623 * Sometimes DOCTYPE arrives in the middle of the document
4624 */
4625 if ((CUR == '<') && (NXT(1) == '!') &&
4626 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4627 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4628 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4629 (UPP(8) == 'E')) {
4630 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4631 "Misplaced DOCTYPE declaration\n",
4632 BAD_CAST "DOCTYPE" , NULL);
4633 htmlParseDocTypeDecl(ctxt);
4634 }
4635
4636 /*
4637 * First case : a comment
4638 */
4639 if ((CUR == '<') && (NXT(1) == '!') &&
4640 (NXT(2) == '-') && (NXT(3) == '-')) {
4641 htmlParseComment(ctxt);
4642 }
4643
4644 /*
4645 * Second case : a Processing Instruction.
4646 */
4647 else if ((CUR == '<') && (NXT(1) == '?')) {
4648 htmlParsePI(ctxt);
4649 }
4650
4651 /*
4652 * Third case : a sub-element.
4653 */
4654 else if (CUR == '<') {
4655 htmlParseElementInternal(ctxt);
4656 if (currentNode != NULL) xmlFree(currentNode);
4657
4658 currentNode = xmlStrdup(ctxt->name);
4659 depth = ctxt->nameNr;
4660 }
4661
4662 /*
4663 * Fourth case : a reference. If if has not been resolved,
4664 * parsing returns it's Name, create the node
4665 */
4666 else if (CUR == '&') {
4667 htmlParseReference(ctxt);
4668 }
4669
4670 /*
4671 * Fifth case : end of the resource
4672 */
4673 else if (CUR == 0) {
4674 htmlAutoCloseOnEnd(ctxt);
4675 break;
4676 }
4677
4678 /*
4679 * Last case, text. Note that References are handled directly.
4680 */
4681 else {
4682 htmlParseCharData(ctxt);
4683 }
4684
4685 if (cons == ctxt->nbChars) {
4686 if (ctxt->node != NULL) {
4687 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4688 "detected an error in element content\n",
4689 NULL, NULL);
4690 }
4691 break;
4692 }
4693 }
4694 GROW;
4695 }
4696 if (currentNode != NULL) xmlFree(currentNode);
4697}
4698
4699/**
4700 * htmlParseContent:
4701 * @ctxt: an HTML parser context
4702 *
4703 * Parse a content: comment, sub-element, reference or text.
4704 * This is the entry point when called from parser.c
4705 */
4706
4707void
4708__htmlParseContent(void *ctxt) {
4709 if (ctxt != NULL)
4710 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4711}
4712
Owen Taylor3473f882001-02-23 17:55:21 +00004713/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004714 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004715 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004716 *
Owen Taylor3473f882001-02-23 17:55:21 +00004717 * parse an HTML document (and build a tree if using the standard SAX
4718 * interface).
4719 *
4720 * Returns 0, -1 in case of error. the parser context is augmented
4721 * as a result of the parsing.
4722 */
4723
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004724int
Owen Taylor3473f882001-02-23 17:55:21 +00004725htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004726 xmlChar start[4];
4727 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004728 xmlDtdPtr dtd;
4729
Daniel Veillardd0463562001-10-13 09:15:48 +00004730 xmlInitParser();
4731
Owen Taylor3473f882001-02-23 17:55:21 +00004732 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004733
Daniel Veillarda03e3652004-11-02 18:45:30 +00004734 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4735 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4736 "htmlParseDocument: context error\n", NULL, NULL);
4737 return(XML_ERR_INTERNAL_ERROR);
4738 }
4739 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004740 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004741 GROW;
4742 /*
4743 * SAX: beginning of the document processing.
4744 */
4745 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4746 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4747
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004748 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4749 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4750 /*
4751 * Get the 4 first bytes and decode the charset
4752 * if enc != XML_CHAR_ENCODING_NONE
4753 * plug some encoding conversion routines.
4754 */
4755 start[0] = RAW;
4756 start[1] = NXT(1);
4757 start[2] = NXT(2);
4758 start[3] = NXT(3);
4759 enc = xmlDetectCharEncoding(&start[0], 4);
4760 if (enc != XML_CHAR_ENCODING_NONE) {
4761 xmlSwitchEncoding(ctxt, enc);
4762 }
4763 }
4764
Owen Taylor3473f882001-02-23 17:55:21 +00004765 /*
4766 * Wipe out everything which is before the first '<'
4767 */
4768 SKIP_BLANKS;
4769 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004770 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004771 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004772 }
4773
4774 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4775 ctxt->sax->startDocument(ctxt->userData);
4776
4777
4778 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004779 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004780 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004781 while (((CUR == '<') && (NXT(1) == '!') &&
4782 (NXT(2) == '-') && (NXT(3) == '-')) ||
4783 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004784 htmlParseComment(ctxt);
4785 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004786 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004787 }
Owen Taylor3473f882001-02-23 17:55:21 +00004788
4789
4790 /*
4791 * Then possibly doc type declaration(s) and more Misc
4792 * (doctypedecl Misc*)?
4793 */
4794 if ((CUR == '<') && (NXT(1) == '!') &&
4795 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4796 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4797 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4798 (UPP(8) == 'E')) {
4799 htmlParseDocTypeDecl(ctxt);
4800 }
4801 SKIP_BLANKS;
4802
4803 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004804 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004805 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004806 while (((CUR == '<') && (NXT(1) == '!') &&
4807 (NXT(2) == '-') && (NXT(3) == '-')) ||
4808 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004809 htmlParseComment(ctxt);
4810 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004811 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004812 }
Owen Taylor3473f882001-02-23 17:55:21 +00004813
4814 /*
4815 * Time to start parsing the tree itself
4816 */
Eugene Pimenov615904f2010-03-15 15:16:02 +01004817 htmlParseContentInternal(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004818
4819 /*
4820 * autoclose
4821 */
4822 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004823 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004824
4825
4826 /*
4827 * SAX: end of the document processing.
4828 */
4829 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4830 ctxt->sax->endDocument(ctxt->userData);
4831
Daniel Veillardf1121c42010-07-26 14:02:42 +02004832 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004833 dtd = xmlGetIntSubset(ctxt->myDoc);
4834 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004835 ctxt->myDoc->intSubset =
4836 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004837 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4838 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4839 }
4840 if (! ctxt->wellFormed) return(-1);
4841 return(0);
4842}
4843
4844
4845/************************************************************************
4846 * *
4847 * Parser contexts handling *
4848 * *
4849 ************************************************************************/
4850
4851/**
William M. Brackedb65a72004-02-06 07:36:04 +00004852 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004853 * @ctxt: an HTML parser context
4854 *
4855 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004856 *
4857 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004858 */
4859
Daniel Veillardf403d292003-10-05 13:51:35 +00004860static int
Owen Taylor3473f882001-02-23 17:55:21 +00004861htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4862{
4863 htmlSAXHandler *sax;
4864
Daniel Veillardf403d292003-10-05 13:51:35 +00004865 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004866 memset(ctxt, 0, sizeof(htmlParserCtxt));
4867
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004868 ctxt->dict = xmlDictCreate();
4869 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004870 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4871 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004872 }
Owen Taylor3473f882001-02-23 17:55:21 +00004873 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4874 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004875 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4876 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004877 }
4878 else
4879 memset(sax, 0, sizeof(htmlSAXHandler));
4880
4881 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004882 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004883 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4884 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004885 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004886 ctxt->inputNr = 0;
4887 ctxt->inputMax = 0;
4888 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004889 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004890 }
4891 ctxt->inputNr = 0;
4892 ctxt->inputMax = 5;
4893 ctxt->input = NULL;
4894 ctxt->version = NULL;
4895 ctxt->encoding = NULL;
4896 ctxt->standalone = -1;
4897 ctxt->instate = XML_PARSER_START;
4898
4899 /* Allocate the Node stack */
4900 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4901 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004902 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004903 ctxt->nodeNr = 0;
4904 ctxt->nodeMax = 0;
4905 ctxt->node = NULL;
4906 ctxt->inputNr = 0;
4907 ctxt->inputMax = 0;
4908 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004909 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004910 }
4911 ctxt->nodeNr = 0;
4912 ctxt->nodeMax = 10;
4913 ctxt->node = NULL;
4914
4915 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004916 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004917 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004918 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004919 ctxt->nameNr = 0;
Eugene Pimenovef9c6362010-03-15 11:37:48 +01004920 ctxt->nameMax = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004921 ctxt->name = NULL;
4922 ctxt->nodeNr = 0;
4923 ctxt->nodeMax = 0;
4924 ctxt->node = NULL;
4925 ctxt->inputNr = 0;
4926 ctxt->inputMax = 0;
4927 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004928 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004929 }
4930 ctxt->nameNr = 0;
4931 ctxt->nameMax = 10;
4932 ctxt->name = NULL;
4933
Eugene Pimenov615904f2010-03-15 15:16:02 +01004934 ctxt->nodeInfoTab = NULL;
4935 ctxt->nodeInfoNr = 0;
4936 ctxt->nodeInfoMax = 0;
4937
Daniel Veillard092643b2003-09-25 14:29:29 +00004938 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004939 else {
4940 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004941 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004942 }
4943 ctxt->userData = ctxt;
4944 ctxt->myDoc = NULL;
4945 ctxt->wellFormed = 1;
4946 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004947 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Nick Wellnhofer0b2d5c42017-06-12 19:10:04 +02004948 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004949 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004950 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004951 ctxt->vctxt.userData = ctxt;
4952 ctxt->vctxt.error = xmlParserValidityError;
4953 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004954 ctxt->record_info = 0;
4955 ctxt->validate = 0;
4956 ctxt->nbChars = 0;
4957 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004958 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004959 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004960 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004961}
4962
4963/**
4964 * htmlFreeParserCtxt:
4965 * @ctxt: an HTML parser context
4966 *
4967 * Free all the memory used by a parser context. However the parsed
4968 * document in ctxt->myDoc is not freed.
4969 */
4970
4971void
4972htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4973{
4974 xmlFreeParserCtxt(ctxt);
4975}
4976
4977/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004978 * htmlNewParserCtxt:
4979 *
4980 * Allocate and initialize a new parser context.
4981 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004982 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004983 */
4984
Daniel Veillard34c647c2006-09-21 06:53:59 +00004985htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004986htmlNewParserCtxt(void)
4987{
4988 xmlParserCtxtPtr ctxt;
4989
4990 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4991 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004992 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004993 return(NULL);
4994 }
4995 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004996 if (htmlInitParserCtxt(ctxt) < 0) {
4997 htmlFreeParserCtxt(ctxt);
4998 return(NULL);
4999 }
Daniel Veillard1d995272002-07-22 16:43:32 +00005000 return(ctxt);
5001}
5002
5003/**
5004 * htmlCreateMemoryParserCtxt:
5005 * @buffer: a pointer to a char array
5006 * @size: the size of the array
5007 *
5008 * Create a parser context for an HTML in-memory document.
5009 *
5010 * Returns the new parser context or NULL
5011 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00005012htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00005013htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5014 xmlParserCtxtPtr ctxt;
5015 xmlParserInputPtr input;
5016 xmlParserInputBufferPtr buf;
5017
5018 if (buffer == NULL)
5019 return(NULL);
5020 if (size <= 0)
5021 return(NULL);
5022
5023 ctxt = htmlNewParserCtxt();
5024 if (ctxt == NULL)
5025 return(NULL);
5026
5027 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5028 if (buf == NULL) return(NULL);
5029
5030 input = xmlNewInputStream(ctxt);
5031 if (input == NULL) {
5032 xmlFreeParserCtxt(ctxt);
5033 return(NULL);
5034 }
5035
5036 input->filename = NULL;
5037 input->buf = buf;
Daniel Veillard61551a12012-07-16 16:28:47 +08005038 xmlBufResetInput(buf->buffer, input);
Daniel Veillard1d995272002-07-22 16:43:32 +00005039
5040 inputPush(ctxt, input);
5041 return(ctxt);
5042}
5043
5044/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005045 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005046 * @cur: a pointer to an array of xmlChar
5047 * @encoding: a free form C string describing the HTML document encoding, or NULL
5048 *
5049 * Create a parser context for an HTML document.
5050 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005051 * TODO: check the need to add encoding handling there
5052 *
Owen Taylor3473f882001-02-23 17:55:21 +00005053 * Returns the new parser context or NULL
5054 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005055static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00005056htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00005057 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005058 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00005059
Daniel Veillard1d995272002-07-22 16:43:32 +00005060 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00005061 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00005062 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005063 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00005064 if (ctxt == NULL)
5065 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005066
5067 if (encoding != NULL) {
5068 xmlCharEncoding enc;
5069 xmlCharEncodingHandlerPtr handler;
5070
5071 if (ctxt->input->encoding != NULL)
5072 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00005073 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005074
5075 enc = xmlParseCharEncoding(encoding);
5076 /*
5077 * registered set of known encodings
5078 */
5079 if (enc != XML_CHAR_ENCODING_ERROR) {
5080 xmlSwitchEncoding(ctxt, enc);
5081 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005082 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02005083 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00005084 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005085 }
5086 } else {
5087 /*
5088 * fallback for unknown encodings
5089 */
5090 handler = xmlFindCharEncodingHandler((const char *) encoding);
5091 if (handler != NULL) {
5092 xmlSwitchToEncoding(ctxt, handler);
5093 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005094 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5095 "Unsupported encoding %s\n",
5096 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005097 }
5098 }
5099 }
5100 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005101}
5102
Daniel Veillard73b013f2003-09-30 12:36:01 +00005103#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00005104/************************************************************************
5105 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02005106 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00005107 * *
5108 ************************************************************************/
5109
5110/**
5111 * htmlParseLookupSequence:
5112 * @ctxt: an HTML parser context
5113 * @first: the first char to lookup
5114 * @next: the next char to lookup or zero
5115 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00005116 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00005117 *
5118 * Try to find if a sequence (first, next, third) or just (first next) or
5119 * (first) is available in the input stream.
5120 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5121 * to avoid rescanning sequences of bytes, it DOES change the state of the
5122 * parser, do not use liberally.
5123 * This is basically similar to xmlParseLookupSequence()
5124 *
5125 * Returns the index to the current parsing point if the full sequence
5126 * is available, -1 otherwise.
5127 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005128static int
Owen Taylor3473f882001-02-23 17:55:21 +00005129htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02005130 xmlChar next, xmlChar third, int iscomment,
Daniel Veillardeeb99322009-08-25 14:42:16 +02005131 int ignoreattrval)
5132{
Owen Taylor3473f882001-02-23 17:55:21 +00005133 int base, len;
5134 htmlParserInputPtr in;
5135 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00005136 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02005137 int invalue = 0;
5138 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00005139
5140 in = ctxt->input;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005141 if (in == NULL)
5142 return (-1);
5143
Owen Taylor3473f882001-02-23 17:55:21 +00005144 base = in->cur - in->base;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005145 if (base < 0)
5146 return (-1);
5147
Owen Taylor3473f882001-02-23 17:55:21 +00005148 if (ctxt->checkIndex > base)
5149 base = ctxt->checkIndex;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005150
Owen Taylor3473f882001-02-23 17:55:21 +00005151 if (in->buf == NULL) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005152 buf = in->base;
5153 len = in->length;
Owen Taylor3473f882001-02-23 17:55:21 +00005154 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005155 buf = xmlBufContent(in->buf->buffer);
5156 len = xmlBufUse(in->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00005157 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005158
Owen Taylor3473f882001-02-23 17:55:21 +00005159 /* take into account the sequence length */
Daniel Veillardeeb99322009-08-25 14:42:16 +02005160 if (third)
5161 len -= 2;
5162 else if (next)
5163 len--;
5164 for (; base < len; base++) {
5165 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5166 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5167 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5168 incomment = 1;
5169 /* do not increment past <! - some people use <!--> */
5170 base += 2;
5171 }
5172 }
5173 if (ignoreattrval) {
5174 if (buf[base] == '"' || buf[base] == '\'') {
5175 if (invalue) {
5176 if (buf[base] == valdellim) {
5177 invalue = 0;
5178 continue;
5179 }
5180 } else {
5181 valdellim = buf[base];
5182 invalue = 1;
5183 continue;
5184 }
5185 } else if (invalue) {
5186 continue;
5187 }
5188 }
5189 if (incomment) {
5190 if (base + 3 > len)
5191 return (-1);
5192 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5193 (buf[base + 2] == '>')) {
5194 incomment = 0;
5195 base += 2;
5196 }
5197 continue;
5198 }
Owen Taylor3473f882001-02-23 17:55:21 +00005199 if (buf[base] == first) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005200 if (third != 0) {
5201 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5202 continue;
5203 } else if (next != 0) {
5204 if (buf[base + 1] != next)
5205 continue;
5206 }
5207 ctxt->checkIndex = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00005208#ifdef DEBUG_PUSH
Daniel Veillardeeb99322009-08-25 14:42:16 +02005209 if (next == 0)
5210 xmlGenericError(xmlGenericErrorContext,
5211 "HPP: lookup '%c' found at %d\n",
5212 first, base);
5213 else if (third == 0)
5214 xmlGenericError(xmlGenericErrorContext,
5215 "HPP: lookup '%c%c' found at %d\n",
5216 first, next, base);
5217 else
5218 xmlGenericError(xmlGenericErrorContext,
5219 "HPP: lookup '%c%c%c' found at %d\n",
5220 first, next, third, base);
Owen Taylor3473f882001-02-23 17:55:21 +00005221#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005222 return (base - (in->cur - in->base));
5223 }
Owen Taylor3473f882001-02-23 17:55:21 +00005224 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005225 if ((!incomment) && (!invalue))
5226 ctxt->checkIndex = base;
Owen Taylor3473f882001-02-23 17:55:21 +00005227#ifdef DEBUG_PUSH
5228 if (next == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005229 xmlGenericError(xmlGenericErrorContext,
5230 "HPP: lookup '%c' failed\n", first);
Owen Taylor3473f882001-02-23 17:55:21 +00005231 else if (third == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005232 xmlGenericError(xmlGenericErrorContext,
5233 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02005234 else
Daniel Veillardeeb99322009-08-25 14:42:16 +02005235 xmlGenericError(xmlGenericErrorContext,
5236 "HPP: lookup '%c%c%c' failed\n", first, next,
5237 third);
Owen Taylor3473f882001-02-23 17:55:21 +00005238#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005239 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00005240}
5241
5242/**
Markus Kull56a03032009-08-24 19:00:23 +02005243 * htmlParseLookupChars:
5244 * @ctxt: an HTML parser context
5245 * @stop: Array of chars, which stop the lookup.
5246 * @stopLen: Length of stop-Array
5247 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005248 * Try to find if any char of the stop-Array is available in the input
Markus Kull56a03032009-08-24 19:00:23 +02005249 * stream.
5250 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5251 * to avoid rescanning sequences of bytes, it DOES change the state of the
5252 * parser, do not use liberally.
5253 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005254 * Returns the index to the current parsing point if a stopChar
Markus Kull56a03032009-08-24 19:00:23 +02005255 * is available, -1 otherwise.
5256 */
5257static int
5258htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5259 int stopLen)
5260{
5261 int base, len;
5262 htmlParserInputPtr in;
5263 const xmlChar *buf;
5264 int incomment = 0;
5265 int i;
5266
5267 in = ctxt->input;
5268 if (in == NULL)
5269 return (-1);
5270
5271 base = in->cur - in->base;
5272 if (base < 0)
5273 return (-1);
5274
5275 if (ctxt->checkIndex > base)
5276 base = ctxt->checkIndex;
5277
5278 if (in->buf == NULL) {
5279 buf = in->base;
5280 len = in->length;
5281 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005282 buf = xmlBufContent(in->buf->buffer);
5283 len = xmlBufUse(in->buf->buffer);
Markus Kull56a03032009-08-24 19:00:23 +02005284 }
5285
5286 for (; base < len; base++) {
5287 if (!incomment && (base + 4 < len)) {
5288 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5289 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5290 incomment = 1;
5291 /* do not increment past <! - some people use <!--> */
5292 base += 2;
5293 }
5294 }
5295 if (incomment) {
5296 if (base + 3 > len)
5297 return (-1);
5298 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5299 (buf[base + 2] == '>')) {
5300 incomment = 0;
5301 base += 2;
5302 }
5303 continue;
5304 }
5305 for (i = 0; i < stopLen; ++i) {
5306 if (buf[base] == stop[i]) {
5307 ctxt->checkIndex = 0;
5308 return (base - (in->cur - in->base));
5309 }
5310 }
5311 }
5312 ctxt->checkIndex = base;
5313 return (-1);
5314}
5315
5316/**
Owen Taylor3473f882001-02-23 17:55:21 +00005317 * htmlParseTryOrFinish:
5318 * @ctxt: an HTML parser context
5319 * @terminate: last chunk indicator
5320 *
5321 * Try to progress on parsing
5322 *
5323 * Returns zero if no parsing was possible
5324 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005325static int
Owen Taylor3473f882001-02-23 17:55:21 +00005326htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5327 int ret = 0;
5328 htmlParserInputPtr in;
5329 int avail = 0;
5330 xmlChar cur, next;
5331
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005332 htmlParserNodeInfo node_info;
5333
Owen Taylor3473f882001-02-23 17:55:21 +00005334#ifdef DEBUG_PUSH
5335 switch (ctxt->instate) {
5336 case XML_PARSER_EOF:
5337 xmlGenericError(xmlGenericErrorContext,
5338 "HPP: try EOF\n"); break;
5339 case XML_PARSER_START:
5340 xmlGenericError(xmlGenericErrorContext,
5341 "HPP: try START\n"); break;
5342 case XML_PARSER_MISC:
5343 xmlGenericError(xmlGenericErrorContext,
5344 "HPP: try MISC\n");break;
5345 case XML_PARSER_COMMENT:
5346 xmlGenericError(xmlGenericErrorContext,
5347 "HPP: try COMMENT\n");break;
5348 case XML_PARSER_PROLOG:
5349 xmlGenericError(xmlGenericErrorContext,
5350 "HPP: try PROLOG\n");break;
5351 case XML_PARSER_START_TAG:
5352 xmlGenericError(xmlGenericErrorContext,
5353 "HPP: try START_TAG\n");break;
5354 case XML_PARSER_CONTENT:
5355 xmlGenericError(xmlGenericErrorContext,
5356 "HPP: try CONTENT\n");break;
5357 case XML_PARSER_CDATA_SECTION:
5358 xmlGenericError(xmlGenericErrorContext,
5359 "HPP: try CDATA_SECTION\n");break;
5360 case XML_PARSER_END_TAG:
5361 xmlGenericError(xmlGenericErrorContext,
5362 "HPP: try END_TAG\n");break;
5363 case XML_PARSER_ENTITY_DECL:
5364 xmlGenericError(xmlGenericErrorContext,
5365 "HPP: try ENTITY_DECL\n");break;
5366 case XML_PARSER_ENTITY_VALUE:
5367 xmlGenericError(xmlGenericErrorContext,
5368 "HPP: try ENTITY_VALUE\n");break;
5369 case XML_PARSER_ATTRIBUTE_VALUE:
5370 xmlGenericError(xmlGenericErrorContext,
5371 "HPP: try ATTRIBUTE_VALUE\n");break;
5372 case XML_PARSER_DTD:
5373 xmlGenericError(xmlGenericErrorContext,
5374 "HPP: try DTD\n");break;
5375 case XML_PARSER_EPILOG:
5376 xmlGenericError(xmlGenericErrorContext,
5377 "HPP: try EPILOG\n");break;
5378 case XML_PARSER_PI:
5379 xmlGenericError(xmlGenericErrorContext,
5380 "HPP: try PI\n");break;
5381 case XML_PARSER_SYSTEM_LITERAL:
5382 xmlGenericError(xmlGenericErrorContext,
5383 "HPP: try SYSTEM_LITERAL\n");break;
5384 }
5385#endif
5386
5387 while (1) {
5388
5389 in = ctxt->input;
5390 if (in == NULL) break;
5391 if (in->buf == NULL)
5392 avail = in->length - (in->cur - in->base);
5393 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005394 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005395 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005396 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005397 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005398 /*
5399 * SAX: end of the document processing.
5400 */
5401 ctxt->instate = XML_PARSER_EOF;
5402 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5403 ctxt->sax->endDocument(ctxt->userData);
5404 }
5405 }
5406 if (avail < 1)
5407 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00005408 cur = in->cur[0];
5409 if (cur == 0) {
5410 SKIP(1);
5411 continue;
5412 }
5413
Owen Taylor3473f882001-02-23 17:55:21 +00005414 switch (ctxt->instate) {
5415 case XML_PARSER_EOF:
5416 /*
5417 * Document parsing is done !
5418 */
5419 goto done;
5420 case XML_PARSER_START:
5421 /*
5422 * Very first chars read from the document flow.
5423 */
5424 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005425 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005426 SKIP_BLANKS;
5427 if (in->buf == NULL)
5428 avail = in->length - (in->cur - in->base);
5429 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005430 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005431 }
5432 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5433 ctxt->sax->setDocumentLocator(ctxt->userData,
5434 &xmlDefaultSAXLocator);
5435 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5436 (!ctxt->disableSAX))
5437 ctxt->sax->startDocument(ctxt->userData);
5438
5439 cur = in->cur[0];
5440 next = in->cur[1];
5441 if ((cur == '<') && (next == '!') &&
5442 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5443 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5444 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5445 (UPP(8) == 'E')) {
5446 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005447 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005448 goto done;
5449#ifdef DEBUG_PUSH
5450 xmlGenericError(xmlGenericErrorContext,
5451 "HPP: Parsing internal subset\n");
5452#endif
5453 htmlParseDocTypeDecl(ctxt);
5454 ctxt->instate = XML_PARSER_PROLOG;
5455#ifdef DEBUG_PUSH
5456 xmlGenericError(xmlGenericErrorContext,
5457 "HPP: entering PROLOG\n");
5458#endif
5459 } else {
5460 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005461#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00005462 xmlGenericError(xmlGenericErrorContext,
5463 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005464#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00005465 }
Owen Taylor3473f882001-02-23 17:55:21 +00005466 break;
5467 case XML_PARSER_MISC:
5468 SKIP_BLANKS;
5469 if (in->buf == NULL)
5470 avail = in->length - (in->cur - in->base);
5471 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005472 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Denis Paukfdf990c2012-05-10 20:40:49 +08005473 /*
5474 * no chars in buffer
5475 */
5476 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005477 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005478 /*
5479 * not enouth chars in buffer
5480 */
5481 if (avail < 2) {
5482 if (!terminate)
5483 goto done;
5484 else
5485 next = ' ';
5486 } else {
5487 next = in->cur[1];
5488 }
Owen Taylor3473f882001-02-23 17:55:21 +00005489 cur = in->cur[0];
Owen Taylor3473f882001-02-23 17:55:21 +00005490 if ((cur == '<') && (next == '!') &&
5491 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5492 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005493 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005494 goto done;
5495#ifdef DEBUG_PUSH
5496 xmlGenericError(xmlGenericErrorContext,
5497 "HPP: Parsing Comment\n");
5498#endif
5499 htmlParseComment(ctxt);
5500 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005501 } else if ((cur == '<') && (next == '?')) {
5502 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005503 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005504 goto done;
5505#ifdef DEBUG_PUSH
5506 xmlGenericError(xmlGenericErrorContext,
5507 "HPP: Parsing PI\n");
5508#endif
5509 htmlParsePI(ctxt);
5510 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005511 } else if ((cur == '<') && (next == '!') &&
5512 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5513 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5514 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5515 (UPP(8) == 'E')) {
5516 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005517 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005518 goto done;
5519#ifdef DEBUG_PUSH
5520 xmlGenericError(xmlGenericErrorContext,
5521 "HPP: Parsing internal subset\n");
5522#endif
5523 htmlParseDocTypeDecl(ctxt);
5524 ctxt->instate = XML_PARSER_PROLOG;
5525#ifdef DEBUG_PUSH
5526 xmlGenericError(xmlGenericErrorContext,
5527 "HPP: entering PROLOG\n");
5528#endif
5529 } else if ((cur == '<') && (next == '!') &&
5530 (avail < 9)) {
5531 goto done;
5532 } else {
5533 ctxt->instate = XML_PARSER_START_TAG;
5534#ifdef DEBUG_PUSH
5535 xmlGenericError(xmlGenericErrorContext,
5536 "HPP: entering START_TAG\n");
5537#endif
5538 }
5539 break;
5540 case XML_PARSER_PROLOG:
5541 SKIP_BLANKS;
5542 if (in->buf == NULL)
5543 avail = in->length - (in->cur - in->base);
5544 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005545 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005546 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005547 goto done;
5548 cur = in->cur[0];
5549 next = in->cur[1];
5550 if ((cur == '<') && (next == '!') &&
5551 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5552 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005553 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005554 goto done;
5555#ifdef DEBUG_PUSH
5556 xmlGenericError(xmlGenericErrorContext,
5557 "HPP: Parsing Comment\n");
5558#endif
5559 htmlParseComment(ctxt);
5560 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005561 } else if ((cur == '<') && (next == '?')) {
5562 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005563 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005564 goto done;
5565#ifdef DEBUG_PUSH
5566 xmlGenericError(xmlGenericErrorContext,
5567 "HPP: Parsing PI\n");
5568#endif
5569 htmlParsePI(ctxt);
5570 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005571 } else if ((cur == '<') && (next == '!') &&
5572 (avail < 4)) {
5573 goto done;
5574 } else {
5575 ctxt->instate = XML_PARSER_START_TAG;
5576#ifdef DEBUG_PUSH
5577 xmlGenericError(xmlGenericErrorContext,
5578 "HPP: entering START_TAG\n");
5579#endif
5580 }
5581 break;
5582 case XML_PARSER_EPILOG:
5583 if (in->buf == NULL)
5584 avail = in->length - (in->cur - in->base);
5585 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005586 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005587 if (avail < 1)
5588 goto done;
5589 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005590 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005591 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005592 goto done;
5593 }
5594 if (avail < 2)
5595 goto done;
5596 next = in->cur[1];
5597 if ((cur == '<') && (next == '!') &&
5598 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5599 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005600 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005601 goto done;
5602#ifdef DEBUG_PUSH
5603 xmlGenericError(xmlGenericErrorContext,
5604 "HPP: Parsing Comment\n");
5605#endif
5606 htmlParseComment(ctxt);
5607 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005608 } else if ((cur == '<') && (next == '?')) {
5609 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005610 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005611 goto done;
5612#ifdef DEBUG_PUSH
5613 xmlGenericError(xmlGenericErrorContext,
5614 "HPP: Parsing PI\n");
5615#endif
5616 htmlParsePI(ctxt);
5617 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005618 } else if ((cur == '<') && (next == '!') &&
5619 (avail < 4)) {
5620 goto done;
5621 } else {
5622 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005623 ctxt->wellFormed = 0;
5624 ctxt->instate = XML_PARSER_EOF;
5625#ifdef DEBUG_PUSH
5626 xmlGenericError(xmlGenericErrorContext,
5627 "HPP: entering EOF\n");
5628#endif
5629 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5630 ctxt->sax->endDocument(ctxt->userData);
5631 goto done;
5632 }
5633 break;
5634 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005635 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005636 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005637 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005638
Denis Paukfdf990c2012-05-10 20:40:49 +08005639 /*
5640 * no chars in buffer
5641 */
5642 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005643 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005644 /*
5645 * not enouth chars in buffer
5646 */
5647 if (avail < 2) {
5648 if (!terminate)
5649 goto done;
5650 else
5651 next = ' ';
5652 } else {
5653 next = in->cur[1];
5654 }
Owen Taylor3473f882001-02-23 17:55:21 +00005655 cur = in->cur[0];
5656 if (cur != '<') {
5657 ctxt->instate = XML_PARSER_CONTENT;
5658#ifdef DEBUG_PUSH
5659 xmlGenericError(xmlGenericErrorContext,
5660 "HPP: entering CONTENT\n");
5661#endif
5662 break;
5663 }
Denis Paukfdf990c2012-05-10 20:40:49 +08005664 if (next == '/') {
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005665 ctxt->instate = XML_PARSER_END_TAG;
5666 ctxt->checkIndex = 0;
5667#ifdef DEBUG_PUSH
5668 xmlGenericError(xmlGenericErrorContext,
5669 "HPP: entering END_TAG\n");
5670#endif
5671 break;
5672 }
Owen Taylor3473f882001-02-23 17:55:21 +00005673 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005674 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005675 goto done;
5676
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005677 /* Capture start position */
5678 if (ctxt->record_info) {
5679 node_info.begin_pos = ctxt->input->consumed +
5680 (CUR_PTR - ctxt->input->base);
5681 node_info.begin_line = ctxt->input->line;
5682 }
5683
5684
Daniel Veillard597f1c12005-07-03 23:00:18 +00005685 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005686 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005687 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005688 (name == NULL)) {
5689 if (CUR == '>')
5690 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005691 break;
5692 }
Owen Taylor3473f882001-02-23 17:55:21 +00005693
5694 /*
5695 * Lookup the info for that element.
5696 */
5697 info = htmlTagLookup(name);
5698 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005699 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5700 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005701 }
5702
5703 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005704 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005705 */
5706 if ((CUR == '/') && (NXT(1) == '>')) {
5707 SKIP(2);
5708 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5709 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005710 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005711 ctxt->instate = XML_PARSER_CONTENT;
5712#ifdef DEBUG_PUSH
5713 xmlGenericError(xmlGenericErrorContext,
5714 "HPP: entering CONTENT\n");
5715#endif
5716 break;
5717 }
5718
5719 if (CUR == '>') {
5720 NEXT;
5721 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005722 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5723 "Couldn't find end of Start Tag %s\n",
5724 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005725
5726 /*
5727 * end of parsing of this node.
5728 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005729 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005730 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005731 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005732 }
Owen Taylor3473f882001-02-23 17:55:21 +00005733
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005734 if (ctxt->record_info)
5735 htmlNodeInfoPush(ctxt, &node_info);
5736
Owen Taylor3473f882001-02-23 17:55:21 +00005737 ctxt->instate = XML_PARSER_CONTENT;
5738#ifdef DEBUG_PUSH
5739 xmlGenericError(xmlGenericErrorContext,
5740 "HPP: entering CONTENT\n");
5741#endif
5742 break;
5743 }
5744
5745 /*
5746 * Check for an Empty Element from DTD definition
5747 */
5748 if ((info != NULL) && (info->empty)) {
5749 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5750 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005751 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005752 }
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005753
5754 if (ctxt->record_info)
5755 htmlNodeInfoPush(ctxt, &node_info);
5756
Owen Taylor3473f882001-02-23 17:55:21 +00005757 ctxt->instate = XML_PARSER_CONTENT;
5758#ifdef DEBUG_PUSH
5759 xmlGenericError(xmlGenericErrorContext,
5760 "HPP: entering CONTENT\n");
5761#endif
5762 break;
5763 }
5764 case XML_PARSER_CONTENT: {
5765 long cons;
5766 /*
5767 * Handle preparsed entities and charRef
5768 */
5769 if (ctxt->token != 0) {
5770 xmlChar chr[2] = { 0 , 0 } ;
5771
5772 chr[0] = (xmlChar) ctxt->token;
5773 htmlCheckParagraph(ctxt);
5774 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5775 ctxt->sax->characters(ctxt->userData, chr, 1);
5776 ctxt->token = 0;
5777 ctxt->checkIndex = 0;
5778 }
5779 if ((avail == 1) && (terminate)) {
5780 cur = in->cur[0];
5781 if ((cur != '<') && (cur != '&')) {
5782 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005783 if (IS_BLANK_CH(cur)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08005784 if (ctxt->keepBlanks) {
5785 if (ctxt->sax->characters != NULL)
5786 ctxt->sax->characters(
Hugh Davenport8fb4a772015-11-20 17:16:06 +08005787 ctxt->userData, &in->cur[0], 1);
Daniel Veillardf933c892012-09-07 19:32:12 +08005788 } else {
5789 if (ctxt->sax->ignorableWhitespace != NULL)
5790 ctxt->sax->ignorableWhitespace(
Hugh Davenport8fb4a772015-11-20 17:16:06 +08005791 ctxt->userData, &in->cur[0], 1);
Daniel Veillardf933c892012-09-07 19:32:12 +08005792 }
Owen Taylor3473f882001-02-23 17:55:21 +00005793 } else {
5794 htmlCheckParagraph(ctxt);
5795 if (ctxt->sax->characters != NULL)
5796 ctxt->sax->characters(
Hugh Davenport8fb4a772015-11-20 17:16:06 +08005797 ctxt->userData, &in->cur[0], 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005798 }
5799 }
5800 ctxt->token = 0;
5801 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005802 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005803 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005804 }
Owen Taylor3473f882001-02-23 17:55:21 +00005805 }
5806 if (avail < 2)
5807 goto done;
5808 cur = in->cur[0];
5809 next = in->cur[1];
5810 cons = ctxt->nbChars;
5811 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5812 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5813 /*
5814 * Handle SCRIPT/STYLE separately
5815 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005816 if (!terminate) {
5817 int idx;
5818 xmlChar val;
5819
Denis Pauk91d239c2010-11-04 12:39:18 +01005820 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
Daniel Veillard68716a72006-10-16 09:32:17 +00005821 if (idx < 0)
5822 goto done;
5823 val = in->cur[idx + 2];
5824 if (val == 0) /* bad cut of input */
5825 goto done;
5826 }
Owen Taylor3473f882001-02-23 17:55:21 +00005827 htmlParseScript(ctxt);
5828 if ((cur == '<') && (next == '/')) {
5829 ctxt->instate = XML_PARSER_END_TAG;
5830 ctxt->checkIndex = 0;
5831#ifdef DEBUG_PUSH
5832 xmlGenericError(xmlGenericErrorContext,
5833 "HPP: entering END_TAG\n");
5834#endif
5835 break;
5836 }
5837 } else {
5838 /*
5839 * Sometimes DOCTYPE arrives in the middle of the document
5840 */
5841 if ((cur == '<') && (next == '!') &&
5842 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5843 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5844 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5845 (UPP(8) == 'E')) {
5846 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005847 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005848 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005849 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5850 "Misplaced DOCTYPE declaration\n",
5851 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005852 htmlParseDocTypeDecl(ctxt);
5853 } else if ((cur == '<') && (next == '!') &&
5854 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5855 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005856 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005857 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005858 goto done;
5859#ifdef DEBUG_PUSH
5860 xmlGenericError(xmlGenericErrorContext,
5861 "HPP: Parsing Comment\n");
5862#endif
5863 htmlParseComment(ctxt);
5864 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005865 } else if ((cur == '<') && (next == '?')) {
5866 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005867 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005868 goto done;
5869#ifdef DEBUG_PUSH
5870 xmlGenericError(xmlGenericErrorContext,
5871 "HPP: Parsing PI\n");
5872#endif
5873 htmlParsePI(ctxt);
5874 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005875 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5876 goto done;
5877 } else if ((cur == '<') && (next == '/')) {
5878 ctxt->instate = XML_PARSER_END_TAG;
5879 ctxt->checkIndex = 0;
5880#ifdef DEBUG_PUSH
5881 xmlGenericError(xmlGenericErrorContext,
5882 "HPP: entering END_TAG\n");
5883#endif
5884 break;
5885 } else if (cur == '<') {
5886 ctxt->instate = XML_PARSER_START_TAG;
5887 ctxt->checkIndex = 0;
5888#ifdef DEBUG_PUSH
5889 xmlGenericError(xmlGenericErrorContext,
5890 "HPP: entering START_TAG\n");
5891#endif
5892 break;
5893 } else if (cur == '&') {
5894 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005895 (htmlParseLookupChars(ctxt,
5896 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005897 goto done;
5898#ifdef DEBUG_PUSH
5899 xmlGenericError(xmlGenericErrorContext,
5900 "HPP: Parsing Reference\n");
5901#endif
5902 /* TODO: check generation of subtrees if noent !!! */
5903 htmlParseReference(ctxt);
5904 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005905 /*
5906 * check that the text sequence is complete
5907 * before handing out the data to the parser
5908 * to avoid problems with erroneous end of
5909 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005910 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005911 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005912 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005913 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005914 ctxt->checkIndex = 0;
5915#ifdef DEBUG_PUSH
5916 xmlGenericError(xmlGenericErrorContext,
5917 "HPP: Parsing char data\n");
5918#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005919 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005920 }
5921 }
5922 if (cons == ctxt->nbChars) {
5923 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005924 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5925 "detected an error in element content\n",
5926 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005927 }
5928 NEXT;
5929 break;
5930 }
5931
5932 break;
5933 }
5934 case XML_PARSER_END_TAG:
5935 if (avail < 2)
5936 goto done;
5937 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005938 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005939 goto done;
5940 htmlParseEndTag(ctxt);
5941 if (ctxt->nameNr == 0) {
5942 ctxt->instate = XML_PARSER_EPILOG;
5943 } else {
5944 ctxt->instate = XML_PARSER_CONTENT;
5945 }
5946 ctxt->checkIndex = 0;
5947#ifdef DEBUG_PUSH
5948 xmlGenericError(xmlGenericErrorContext,
5949 "HPP: entering CONTENT\n");
5950#endif
5951 break;
5952 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005953 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5954 "HPP: internal error, state == CDATA\n",
5955 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005956 ctxt->instate = XML_PARSER_CONTENT;
5957 ctxt->checkIndex = 0;
5958#ifdef DEBUG_PUSH
5959 xmlGenericError(xmlGenericErrorContext,
5960 "HPP: entering CONTENT\n");
5961#endif
5962 break;
5963 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005964 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5965 "HPP: internal error, state == DTD\n",
5966 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005967 ctxt->instate = XML_PARSER_CONTENT;
5968 ctxt->checkIndex = 0;
5969#ifdef DEBUG_PUSH
5970 xmlGenericError(xmlGenericErrorContext,
5971 "HPP: entering CONTENT\n");
5972#endif
5973 break;
5974 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005975 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5976 "HPP: internal error, state == COMMENT\n",
5977 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005978 ctxt->instate = XML_PARSER_CONTENT;
5979 ctxt->checkIndex = 0;
5980#ifdef DEBUG_PUSH
5981 xmlGenericError(xmlGenericErrorContext,
5982 "HPP: entering CONTENT\n");
5983#endif
5984 break;
5985 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005986 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5987 "HPP: internal error, state == PI\n",
5988 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005989 ctxt->instate = XML_PARSER_CONTENT;
5990 ctxt->checkIndex = 0;
5991#ifdef DEBUG_PUSH
5992 xmlGenericError(xmlGenericErrorContext,
5993 "HPP: entering CONTENT\n");
5994#endif
5995 break;
5996 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005997 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5998 "HPP: internal error, state == ENTITY_DECL\n",
5999 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006000 ctxt->instate = XML_PARSER_CONTENT;
6001 ctxt->checkIndex = 0;
6002#ifdef DEBUG_PUSH
6003 xmlGenericError(xmlGenericErrorContext,
6004 "HPP: entering CONTENT\n");
6005#endif
6006 break;
6007 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00006008 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6009 "HPP: internal error, state == ENTITY_VALUE\n",
6010 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006011 ctxt->instate = XML_PARSER_CONTENT;
6012 ctxt->checkIndex = 0;
6013#ifdef DEBUG_PUSH
6014 xmlGenericError(xmlGenericErrorContext,
6015 "HPP: entering DTD\n");
6016#endif
6017 break;
6018 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00006019 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6020 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6021 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006022 ctxt->instate = XML_PARSER_START_TAG;
6023 ctxt->checkIndex = 0;
6024#ifdef DEBUG_PUSH
6025 xmlGenericError(xmlGenericErrorContext,
6026 "HPP: entering START_TAG\n");
6027#endif
6028 break;
6029 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00006030 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6031 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6032 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006033 ctxt->instate = XML_PARSER_CONTENT;
6034 ctxt->checkIndex = 0;
6035#ifdef DEBUG_PUSH
6036 xmlGenericError(xmlGenericErrorContext,
6037 "HPP: entering CONTENT\n");
6038#endif
6039 break;
6040 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00006041 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6042 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6043 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006044 ctxt->instate = XML_PARSER_CONTENT;
6045 ctxt->checkIndex = 0;
6046#ifdef DEBUG_PUSH
6047 xmlGenericError(xmlGenericErrorContext,
6048 "HPP: entering CONTENT\n");
6049#endif
6050 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00006051 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00006052 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6053 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6054 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00006055 ctxt->instate = XML_PARSER_CONTENT;
6056 ctxt->checkIndex = 0;
6057#ifdef DEBUG_PUSH
6058 xmlGenericError(xmlGenericErrorContext,
6059 "HPP: entering CONTENT\n");
6060#endif
6061 break;
6062
Owen Taylor3473f882001-02-23 17:55:21 +00006063 }
6064 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006065done:
Owen Taylor3473f882001-02-23 17:55:21 +00006066 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00006067 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006068 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00006069 /*
6070 * SAX: end of the document processing.
6071 */
6072 ctxt->instate = XML_PARSER_EOF;
6073 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6074 ctxt->sax->endDocument(ctxt->userData);
6075 }
6076 }
Arnold Hendriks826bc322013-11-29 14:12:12 +08006077 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00006078 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6079 (ctxt->instate == XML_PARSER_EPILOG))) {
6080 xmlDtdPtr dtd;
6081 dtd = xmlGetIntSubset(ctxt->myDoc);
6082 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02006083 ctxt->myDoc->intSubset =
6084 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00006085 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6086 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6087 }
6088#ifdef DEBUG_PUSH
6089 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6090#endif
6091 return(ret);
6092}
6093
6094/**
Owen Taylor3473f882001-02-23 17:55:21 +00006095 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00006096 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00006097 * @chunk: an char array
6098 * @size: the size in byte of the chunk
6099 * @terminate: last chunk indicator
6100 *
6101 * Parse a Chunk of memory
6102 *
6103 * Returns zero if no error, the xmlParserErrors otherwise.
6104 */
6105int
6106htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6107 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00006108 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6109 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6110 "htmlParseChunk: context error\n", NULL, NULL);
6111 return(XML_ERR_INTERNAL_ERROR);
6112 }
Owen Taylor3473f882001-02-23 17:55:21 +00006113 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6114 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006115 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6116 size_t cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00006117 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02006118
6119 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00006120 if (res < 0) {
6121 ctxt->errNo = XML_PARSER_EOF;
6122 ctxt->disableSAX = 1;
6123 return (XML_PARSER_EOF);
6124 }
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006125 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Owen Taylor3473f882001-02-23 17:55:21 +00006126#ifdef DEBUG_PUSH
6127 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6128#endif
6129
Daniel Veillard14f752c2003-08-09 11:44:50 +00006130#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00006131 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6132 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006133#endif
Owen Taylor3473f882001-02-23 17:55:21 +00006134 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00006135 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6136 xmlParserInputBufferPtr in = ctxt->input->buf;
6137 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6138 (in->raw != NULL)) {
6139 int nbchars;
Daniel Veillardde0cc202013-02-12 16:55:34 +08006140 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6141 size_t current = ctxt->input->cur - ctxt->input->base;
Daniel Veillarde77db162009-08-22 11:32:38 +02006142
Daniel Veillardbf058dc2013-02-13 18:19:42 +08006143 nbchars = xmlCharEncInput(in, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006144 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006145 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6146 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006147 return(XML_ERR_INVALID_ENCODING);
6148 }
Daniel Veillardde0cc202013-02-12 16:55:34 +08006149 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006150 }
6151 }
Owen Taylor3473f882001-02-23 17:55:21 +00006152 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00006153 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00006154 if (terminate) {
6155 if ((ctxt->instate != XML_PARSER_EOF) &&
6156 (ctxt->instate != XML_PARSER_EPILOG) &&
6157 (ctxt->instate != XML_PARSER_MISC)) {
6158 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00006159 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02006160 }
Owen Taylor3473f882001-02-23 17:55:21 +00006161 if (ctxt->instate != XML_PARSER_EOF) {
6162 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6163 ctxt->sax->endDocument(ctxt->userData);
6164 }
6165 ctxt->instate = XML_PARSER_EOF;
6166 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006167 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00006168}
6169
6170/************************************************************************
6171 * *
6172 * User entry points *
6173 * *
6174 ************************************************************************/
6175
6176/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006177 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006178 * @sax: a SAX handler
6179 * @user_data: The user data returned on SAX callbacks
6180 * @chunk: a pointer to an array of chars
6181 * @size: number of chars in the array
6182 * @filename: an optional file name or URI
6183 * @enc: an optional encoding
6184 *
6185 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00006186 * The value of @filename is used for fetching external entities
6187 * and error/warning reports.
6188 *
6189 * Returns the new parser context or NULL
6190 */
6191htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006192htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00006193 const char *chunk, int size, const char *filename,
6194 xmlCharEncoding enc) {
6195 htmlParserCtxtPtr ctxt;
6196 htmlParserInputPtr inputStream;
6197 xmlParserInputBufferPtr buf;
6198
Daniel Veillardd0463562001-10-13 09:15:48 +00006199 xmlInitParser();
6200
Owen Taylor3473f882001-02-23 17:55:21 +00006201 buf = xmlAllocParserInputBuffer(enc);
6202 if (buf == NULL) return(NULL);
6203
Daniel Veillardf403d292003-10-05 13:51:35 +00006204 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006205 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006206 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006207 return(NULL);
6208 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00006209 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6210 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00006211 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00006212 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00006213 xmlFree(ctxt->sax);
6214 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6215 if (ctxt->sax == NULL) {
6216 xmlFree(buf);
6217 xmlFree(ctxt);
6218 return(NULL);
6219 }
6220 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6221 if (user_data != NULL)
6222 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02006223 }
Owen Taylor3473f882001-02-23 17:55:21 +00006224 if (filename == NULL) {
6225 ctxt->directory = NULL;
6226 } else {
6227 ctxt->directory = xmlParserGetDirectory(filename);
6228 }
6229
6230 inputStream = htmlNewInputStream(ctxt);
6231 if (inputStream == NULL) {
6232 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00006233 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006234 return(NULL);
6235 }
6236
6237 if (filename == NULL)
6238 inputStream->filename = NULL;
6239 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00006240 inputStream->filename = (char *)
6241 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00006242 inputStream->buf = buf;
Daniel Veillard61551a12012-07-16 16:28:47 +08006243 xmlBufResetInput(buf->buffer, inputStream);
Owen Taylor3473f882001-02-23 17:55:21 +00006244
6245 inputPush(ctxt, inputStream);
6246
6247 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02006248 (ctxt->input->buf != NULL)) {
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006249 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6250 size_t cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillard5f704af2003-03-05 10:01:43 +00006251
Daniel Veillarde77db162009-08-22 11:32:38 +02006252 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00006253
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006254 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Owen Taylor3473f882001-02-23 17:55:21 +00006255#ifdef DEBUG_PUSH
6256 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6257#endif
6258 }
Daniel Veillard68716a72006-10-16 09:32:17 +00006259 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00006260
6261 return(ctxt);
6262}
William M. Brack21e4ef22005-01-02 09:53:13 +00006263#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00006264
6265/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006266 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006267 * @cur: a pointer to an array of xmlChar
6268 * @encoding: a free form C string describing the HTML document encoding, or NULL
6269 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006270 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006271 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006272 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6273 * to handle parse events. If sax is NULL, fallback to the default DOM
6274 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006275 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006276 * Returns the resulting document tree unless SAX is NULL or the document is
6277 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006278 */
6279
6280htmlDocPtr
Nick Wellnhofer576912f2017-06-17 15:59:13 +02006281htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6282 htmlSAXHandlerPtr sax, void *userData) {
Owen Taylor3473f882001-02-23 17:55:21 +00006283 htmlDocPtr ret;
6284 htmlParserCtxtPtr ctxt;
6285
Daniel Veillardd0463562001-10-13 09:15:48 +00006286 xmlInitParser();
6287
Owen Taylor3473f882001-02-23 17:55:21 +00006288 if (cur == NULL) return(NULL);
6289
6290
6291 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6292 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02006293 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00006294 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00006295 ctxt->sax = sax;
6296 ctxt->userData = userData;
6297 }
6298
6299 htmlParseDocument(ctxt);
6300 ret = ctxt->myDoc;
6301 if (sax != NULL) {
6302 ctxt->sax = NULL;
6303 ctxt->userData = NULL;
6304 }
6305 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006306
Owen Taylor3473f882001-02-23 17:55:21 +00006307 return(ret);
6308}
6309
6310/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006311 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006312 * @cur: a pointer to an array of xmlChar
6313 * @encoding: a free form C string describing the HTML document encoding, or NULL
6314 *
6315 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006316 *
Owen Taylor3473f882001-02-23 17:55:21 +00006317 * Returns the resulting document tree
6318 */
6319
6320htmlDocPtr
Nick Wellnhofer576912f2017-06-17 15:59:13 +02006321htmlParseDoc(const xmlChar *cur, const char *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +00006322 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6323}
6324
6325
6326/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006327 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006328 * @filename: the filename
6329 * @encoding: a free form C string describing the HTML document encoding, or NULL
6330 *
Daniel Veillarde77db162009-08-22 11:32:38 +02006331 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00006332 * Automatic support for ZLIB/Compress compressed document is provided
6333 * by default if found at compile-time.
6334 *
6335 * Returns the new parser context or NULL
6336 */
6337htmlParserCtxtPtr
6338htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6339{
6340 htmlParserCtxtPtr ctxt;
6341 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006342 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00006343 /* htmlCharEncoding enc; */
6344 xmlChar *content, *content_line = (xmlChar *) "charset=";
6345
Daniel Veillarda03e3652004-11-02 18:45:30 +00006346 if (filename == NULL)
6347 return(NULL);
6348
Daniel Veillardf403d292003-10-05 13:51:35 +00006349 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006350 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00006351 return(NULL);
6352 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006353 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6354 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00006355#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006356 if (xmlDefaultSAXHandler.error != NULL) {
6357 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6358 }
Daniel Veillard87247e82004-01-13 20:42:02 +00006359#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00006360 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00006361 return(NULL);
6362 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006363
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006364 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6365 xmlFree(canonicFilename);
6366 if (inputStream == NULL) {
6367 xmlFreeParserCtxt(ctxt);
6368 return(NULL);
6369 }
Owen Taylor3473f882001-02-23 17:55:21 +00006370
6371 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006372
Owen Taylor3473f882001-02-23 17:55:21 +00006373 /* set encoding */
6374 if (encoding) {
Daniel Veillard292a9f22014-10-06 18:51:04 +08006375 size_t l = strlen(encoding);
6376
6377 if (l < 1000) {
6378 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6379 if (content) {
6380 strcpy ((char *)content, (char *)content_line);
6381 strcat ((char *)content, (char *)encoding);
6382 htmlCheckEncoding (ctxt, content);
6383 xmlFree (content);
6384 }
Owen Taylor3473f882001-02-23 17:55:21 +00006385 }
6386 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006387
Owen Taylor3473f882001-02-23 17:55:21 +00006388 return(ctxt);
6389}
6390
6391/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006392 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006393 * @filename: the filename
6394 * @encoding: a free form C string describing the HTML document encoding, or NULL
6395 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006396 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006397 *
6398 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6399 * compressed document is provided by default if found at compile-time.
6400 * It use the given SAX function block to handle the parsing callback.
6401 * If sax is NULL, fallback to the default DOM tree building routines.
6402 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006403 * Returns the resulting document tree unless SAX is NULL or the document is
6404 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006405 */
6406
6407htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006408htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00006409 void *userData) {
6410 htmlDocPtr ret;
6411 htmlParserCtxtPtr ctxt;
6412 htmlSAXHandlerPtr oldsax = NULL;
6413
Daniel Veillardd0463562001-10-13 09:15:48 +00006414 xmlInitParser();
6415
Owen Taylor3473f882001-02-23 17:55:21 +00006416 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6417 if (ctxt == NULL) return(NULL);
6418 if (sax != NULL) {
6419 oldsax = ctxt->sax;
6420 ctxt->sax = sax;
6421 ctxt->userData = userData;
6422 }
6423
6424 htmlParseDocument(ctxt);
6425
6426 ret = ctxt->myDoc;
6427 if (sax != NULL) {
6428 ctxt->sax = oldsax;
6429 ctxt->userData = NULL;
6430 }
6431 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006432
Owen Taylor3473f882001-02-23 17:55:21 +00006433 return(ret);
6434}
6435
6436/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006437 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006438 * @filename: the filename
6439 * @encoding: a free form C string describing the HTML document encoding, or NULL
6440 *
6441 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6442 * compressed document is provided by default if found at compile-time.
6443 *
6444 * Returns the resulting document tree
6445 */
6446
6447htmlDocPtr
6448htmlParseFile(const char *filename, const char *encoding) {
6449 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6450}
6451
6452/**
6453 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02006454 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00006455 *
6456 * Set and return the previous value for handling HTML omitted tags.
6457 *
6458 * Returns the last value for 0 for no handling, 1 for auto insertion.
6459 */
6460
6461int
6462htmlHandleOmittedElem(int val) {
6463 int old = htmlOmittedDefaultValue;
6464
6465 htmlOmittedDefaultValue = val;
6466 return(old);
6467}
6468
Daniel Veillard930dfb62003-02-05 10:17:38 +00006469/**
6470 * htmlElementAllowedHere:
6471 * @parent: HTML parent element
6472 * @elt: HTML element
6473 *
6474 * Checks whether an HTML element may be a direct child of a parent element.
6475 * Note - doesn't check for deprecated elements
6476 *
6477 * Returns 1 if allowed; 0 otherwise.
6478 */
6479int
6480htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6481 const char** p ;
6482
6483 if ( ! elt || ! parent || ! parent->subelts )
6484 return 0 ;
6485
6486 for ( p = parent->subelts; *p; ++p )
6487 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6488 return 1 ;
6489
6490 return 0 ;
6491}
6492/**
6493 * htmlElementStatusHere:
6494 * @parent: HTML parent element
6495 * @elt: HTML element
6496 *
6497 * Checks whether an HTML element may be a direct child of a parent element.
6498 * and if so whether it is valid or deprecated.
6499 *
6500 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6501 */
6502htmlStatus
6503htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6504 if ( ! parent || ! elt )
6505 return HTML_INVALID ;
6506 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6507 return HTML_INVALID ;
6508
6509 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6510}
6511/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006512 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00006513 * @elt: HTML element
6514 * @attr: HTML attribute
6515 * @legacy: whether to allow deprecated attributes
6516 *
6517 * Checks whether an attribute is valid for an element
6518 * Has full knowledge of Required and Deprecated attributes
6519 *
6520 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6521 */
6522htmlStatus
6523htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6524 const char** p ;
6525
6526 if ( !elt || ! attr )
6527 return HTML_INVALID ;
6528
6529 if ( elt->attrs_req )
6530 for ( p = elt->attrs_req; *p; ++p)
6531 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6532 return HTML_REQUIRED ;
6533
6534 if ( elt->attrs_opt )
6535 for ( p = elt->attrs_opt; *p; ++p)
6536 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6537 return HTML_VALID ;
6538
6539 if ( legacy && elt->attrs_depr )
6540 for ( p = elt->attrs_depr; *p; ++p)
6541 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6542 return HTML_DEPRECATED ;
6543
6544 return HTML_INVALID ;
6545}
6546/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006547 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006548 * @node: an htmlNodePtr in a tree
6549 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006550 * for Element nodes)
6551 *
6552 * Checks whether the tree node is valid. Experimental (the author
6553 * only uses the HTML enhancements in a SAX parser)
6554 *
6555 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6556 * legacy allowed) or htmlElementStatusHere (otherwise).
6557 * for Attribute nodes, a return from htmlAttrAllowed
6558 * for other nodes, HTML_NA (no checks performed)
6559 */
6560htmlStatus
6561htmlNodeStatus(const htmlNodePtr node, int legacy) {
6562 if ( ! node )
6563 return HTML_INVALID ;
6564
6565 switch ( node->type ) {
6566 case XML_ELEMENT_NODE:
6567 return legacy
6568 ? ( htmlElementAllowedHere (
6569 htmlTagLookup(node->parent->name) , node->name
6570 ) ? HTML_VALID : HTML_INVALID )
6571 : htmlElementStatusHere(
6572 htmlTagLookup(node->parent->name) ,
6573 htmlTagLookup(node->name) )
6574 ;
6575 case XML_ATTRIBUTE_NODE:
6576 return htmlAttrAllowed(
6577 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6578 default: return HTML_NA ;
6579 }
6580}
Daniel Veillard9475a352003-09-26 12:47:50 +00006581/************************************************************************
6582 * *
6583 * New set (2.6.0) of simpler and more flexible APIs *
6584 * *
6585 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL dict means the string is always freed.
 *
 * NOTE: this expands to a bare if statement that already ends with a
 * semicolon, so it must not be used as the unbraced body of an if/else.
 */
#define DICT_FREE(str)                                              \
    if (((str) != NULL) &&                                          \
        ((dict == NULL) ||                                          \
         (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))         \
        xmlFree((char *)(str));
6597
6598/**
6599 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006600 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006601 *
6602 * Reset a parser context
6603 */
6604void
6605htmlCtxtReset(htmlParserCtxtPtr ctxt)
6606{
6607 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006608 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006609
Daniel Veillarda03e3652004-11-02 18:45:30 +00006610 if (ctxt == NULL)
6611 return;
6612
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006613 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006614 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006615
6616 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6617 xmlFreeInputStream(input);
6618 }
6619 ctxt->inputNr = 0;
6620 ctxt->input = NULL;
6621
6622 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006623 if (ctxt->spaceTab != NULL) {
6624 ctxt->spaceTab[0] = -1;
6625 ctxt->space = &ctxt->spaceTab[0];
6626 } else {
6627 ctxt->space = NULL;
6628 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006629
6630
6631 ctxt->nodeNr = 0;
6632 ctxt->node = NULL;
6633
6634 ctxt->nameNr = 0;
6635 ctxt->name = NULL;
6636
6637 DICT_FREE(ctxt->version);
6638 ctxt->version = NULL;
6639 DICT_FREE(ctxt->encoding);
6640 ctxt->encoding = NULL;
6641 DICT_FREE(ctxt->directory);
6642 ctxt->directory = NULL;
6643 DICT_FREE(ctxt->extSubURI);
6644 ctxt->extSubURI = NULL;
6645 DICT_FREE(ctxt->extSubSystem);
6646 ctxt->extSubSystem = NULL;
6647 if (ctxt->myDoc != NULL)
6648 xmlFreeDoc(ctxt->myDoc);
6649 ctxt->myDoc = NULL;
6650
6651 ctxt->standalone = -1;
6652 ctxt->hasExternalSubset = 0;
6653 ctxt->hasPErefs = 0;
6654 ctxt->html = 1;
6655 ctxt->external = 0;
6656 ctxt->instate = XML_PARSER_START;
6657 ctxt->token = 0;
6658
6659 ctxt->wellFormed = 1;
6660 ctxt->nsWellFormed = 1;
Daniel Veillard8ad29302010-10-28 11:51:22 +02006661 ctxt->disableSAX = 0;
Daniel Veillard9475a352003-09-26 12:47:50 +00006662 ctxt->valid = 1;
6663 ctxt->vctxt.userData = ctxt;
6664 ctxt->vctxt.error = xmlParserValidityError;
6665 ctxt->vctxt.warning = xmlParserValidityWarning;
6666 ctxt->record_info = 0;
6667 ctxt->nbChars = 0;
6668 ctxt->checkIndex = 0;
6669 ctxt->inSubset = 0;
6670 ctxt->errNo = XML_ERR_OK;
6671 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006672 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006673 ctxt->catalogs = NULL;
6674 xmlInitNodeInfoSeq(&ctxt->node_seq);
6675
6676 if (ctxt->attsDefault != NULL) {
6677 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6678 ctxt->attsDefault = NULL;
6679 }
6680 if (ctxt->attsSpecial != NULL) {
6681 xmlHashFree(ctxt->attsSpecial, NULL);
6682 ctxt->attsSpecial = NULL;
6683 }
6684}
6685
6686/**
6687 * htmlCtxtUseOptions:
6688 * @ctxt: an HTML parser context
6689 * @options: a combination of htmlParserOption(s)
6690 *
6691 * Applies the options to the parser context
6692 *
6693 * Returns 0 in case of success, the set of unknown or unimplemented options
6694 * in case of error.
6695 */
6696int
6697htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6698{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006699 if (ctxt == NULL)
6700 return(-1);
6701
Daniel Veillard9475a352003-09-26 12:47:50 +00006702 if (options & HTML_PARSE_NOWARNING) {
6703 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006704 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006705 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006706 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006707 }
6708 if (options & HTML_PARSE_NOERROR) {
6709 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006710 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006711 ctxt->sax->fatalError = NULL;
6712 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006713 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006714 }
6715 if (options & HTML_PARSE_PEDANTIC) {
6716 ctxt->pedantic = 1;
6717 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006718 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006719 } else
6720 ctxt->pedantic = 0;
6721 if (options & XML_PARSE_NOBLANKS) {
6722 ctxt->keepBlanks = 0;
6723 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6724 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006725 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006726 } else
6727 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006728 if (options & HTML_PARSE_RECOVER) {
6729 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006730 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006731 } else
6732 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006733 if (options & HTML_PARSE_COMPACT) {
6734 ctxt->options |= HTML_PARSE_COMPACT;
6735 options -= HTML_PARSE_COMPACT;
6736 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006737 if (options & XML_PARSE_HUGE) {
6738 ctxt->options |= XML_PARSE_HUGE;
6739 options -= XML_PARSE_HUGE;
6740 }
Daniel Veillardf1121c42010-07-26 14:02:42 +02006741 if (options & HTML_PARSE_NODEFDTD) {
6742 ctxt->options |= HTML_PARSE_NODEFDTD;
6743 options -= HTML_PARSE_NODEFDTD;
6744 }
Daniel Veillardc62efc82011-05-16 16:03:50 +08006745 if (options & HTML_PARSE_IGNORE_ENC) {
6746 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6747 options -= HTML_PARSE_IGNORE_ENC;
6748 }
Martin Schröderb91111b2012-05-10 18:52:37 +08006749 if (options & HTML_PARSE_NOIMPLIED) {
6750 ctxt->options |= HTML_PARSE_NOIMPLIED;
6751 options -= HTML_PARSE_NOIMPLIED;
6752 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006753 ctxt->dictNames = 0;
6754 return (options);
6755}
6756
6757/**
6758 * htmlDoRead:
6759 * @ctxt: an HTML parser context
6760 * @URL: the base URL to use for the document
6761 * @encoding: the document encoding, or NULL
6762 * @options: a combination of htmlParserOption(s)
6763 * @reuse: keep the context for reuse
6764 *
6765 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006766 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006767 * Returns the resulting document tree or NULL
6768 */
6769static htmlDocPtr
6770htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6771 int options, int reuse)
6772{
6773 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006774
Daniel Veillard9475a352003-09-26 12:47:50 +00006775 htmlCtxtUseOptions(ctxt, options);
6776 ctxt->html = 1;
6777 if (encoding != NULL) {
6778 xmlCharEncodingHandlerPtr hdlr;
6779
6780 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006781 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006782 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006783 if (ctxt->input->encoding != NULL)
6784 xmlFree((xmlChar *) ctxt->input->encoding);
6785 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6786 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006787 }
6788 if ((URL != NULL) && (ctxt->input != NULL) &&
6789 (ctxt->input->filename == NULL))
6790 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6791 htmlParseDocument(ctxt);
6792 ret = ctxt->myDoc;
6793 ctxt->myDoc = NULL;
6794 if (!reuse) {
6795 if ((ctxt->dictNames) &&
6796 (ret != NULL) &&
6797 (ret->dict == ctxt->dict))
6798 ctxt->dict = NULL;
6799 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006800 }
6801 return (ret);
6802}
6803
6804/**
6805 * htmlReadDoc:
6806 * @cur: a pointer to a zero terminated string
6807 * @URL: the base URL to use for the document
6808 * @encoding: the document encoding, or NULL
6809 * @options: a combination of htmlParserOption(s)
6810 *
6811 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006812 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006813 * Returns the resulting document tree
6814 */
6815htmlDocPtr
6816htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6817{
6818 htmlParserCtxtPtr ctxt;
6819
6820 if (cur == NULL)
6821 return (NULL);
6822
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006823 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006824 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006825 if (ctxt == NULL)
6826 return (NULL);
6827 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6828}
6829
6830/**
6831 * htmlReadFile:
6832 * @filename: a file or URL
6833 * @encoding: the document encoding, or NULL
6834 * @options: a combination of htmlParserOption(s)
6835 *
6836 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006837 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006838 * Returns the resulting document tree
6839 */
6840htmlDocPtr
6841htmlReadFile(const char *filename, const char *encoding, int options)
6842{
6843 htmlParserCtxtPtr ctxt;
6844
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006845 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006846 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6847 if (ctxt == NULL)
6848 return (NULL);
6849 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6850}
6851
6852/**
6853 * htmlReadMemory:
6854 * @buffer: a pointer to a char array
6855 * @size: the size of the array
6856 * @URL: the base URL to use for the document
6857 * @encoding: the document encoding, or NULL
6858 * @options: a combination of htmlParserOption(s)
6859 *
6860 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006861 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006862 * Returns the resulting document tree
6863 */
6864htmlDocPtr
6865htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6866{
6867 htmlParserCtxtPtr ctxt;
6868
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006869 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006870 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6871 if (ctxt == NULL)
6872 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006873 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006874 if (ctxt->sax != NULL)
6875 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006876 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6877}
6878
6879/**
6880 * htmlReadFd:
6881 * @fd: an open file descriptor
6882 * @URL: the base URL to use for the document
6883 * @encoding: the document encoding, or NULL
6884 * @options: a combination of htmlParserOption(s)
6885 *
6886 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006887 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006888 * Returns the resulting document tree
6889 */
6890htmlDocPtr
6891htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6892{
6893 htmlParserCtxtPtr ctxt;
6894 xmlParserInputBufferPtr input;
6895 xmlParserInputPtr stream;
6896
6897 if (fd < 0)
6898 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08006899 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006900
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006901 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006902 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6903 if (input == NULL)
6904 return (NULL);
6905 ctxt = xmlNewParserCtxt();
6906 if (ctxt == NULL) {
6907 xmlFreeParserInputBuffer(input);
6908 return (NULL);
6909 }
6910 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6911 if (stream == NULL) {
6912 xmlFreeParserInputBuffer(input);
6913 xmlFreeParserCtxt(ctxt);
6914 return (NULL);
6915 }
6916 inputPush(ctxt, stream);
6917 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6918}
6919
6920/**
6921 * htmlReadIO:
6922 * @ioread: an I/O read function
6923 * @ioclose: an I/O close function
6924 * @ioctx: an I/O handler
6925 * @URL: the base URL to use for the document
6926 * @encoding: the document encoding, or NULL
6927 * @options: a combination of htmlParserOption(s)
6928 *
6929 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006930 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006931 * Returns the resulting document tree
6932 */
6933htmlDocPtr
6934htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6935 void *ioctx, const char *URL, const char *encoding, int options)
6936{
6937 htmlParserCtxtPtr ctxt;
6938 xmlParserInputBufferPtr input;
6939 xmlParserInputPtr stream;
6940
6941 if (ioread == NULL)
6942 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006943 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006944
6945 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6946 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006947 if (input == NULL) {
6948 if (ioclose != NULL)
6949 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00006950 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006951 }
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006952 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006953 if (ctxt == NULL) {
6954 xmlFreeParserInputBuffer(input);
6955 return (NULL);
6956 }
6957 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6958 if (stream == NULL) {
6959 xmlFreeParserInputBuffer(input);
6960 xmlFreeParserCtxt(ctxt);
6961 return (NULL);
6962 }
6963 inputPush(ctxt, stream);
6964 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6965}
6966
6967/**
6968 * htmlCtxtReadDoc:
6969 * @ctxt: an HTML parser context
6970 * @cur: a pointer to a zero terminated string
6971 * @URL: the base URL to use for the document
6972 * @encoding: the document encoding, or NULL
6973 * @options: a combination of htmlParserOption(s)
6974 *
6975 * parse an XML in-memory document and build a tree.
6976 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006977 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006978 * Returns the resulting document tree
6979 */
6980htmlDocPtr
6981htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6982 const char *URL, const char *encoding, int options)
6983{
6984 xmlParserInputPtr stream;
6985
6986 if (cur == NULL)
6987 return (NULL);
6988 if (ctxt == NULL)
6989 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08006990 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006991
6992 htmlCtxtReset(ctxt);
6993
6994 stream = xmlNewStringInputStream(ctxt, cur);
6995 if (stream == NULL) {
6996 return (NULL);
6997 }
6998 inputPush(ctxt, stream);
6999 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7000}
7001
7002/**
7003 * htmlCtxtReadFile:
7004 * @ctxt: an HTML parser context
7005 * @filename: a file or URL
7006 * @encoding: the document encoding, or NULL
7007 * @options: a combination of htmlParserOption(s)
7008 *
7009 * parse an XML file from the filesystem or the network.
7010 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007011 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007012 * Returns the resulting document tree
7013 */
7014htmlDocPtr
7015htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7016 const char *encoding, int options)
7017{
7018 xmlParserInputPtr stream;
7019
7020 if (filename == NULL)
7021 return (NULL);
7022 if (ctxt == NULL)
7023 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007024 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007025
7026 htmlCtxtReset(ctxt);
7027
Daniel Veillard29614c72004-11-26 10:47:26 +00007028 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00007029 if (stream == NULL) {
7030 return (NULL);
7031 }
7032 inputPush(ctxt, stream);
7033 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7034}
7035
7036/**
7037 * htmlCtxtReadMemory:
7038 * @ctxt: an HTML parser context
7039 * @buffer: a pointer to a char array
7040 * @size: the size of the array
7041 * @URL: the base URL to use for the document
7042 * @encoding: the document encoding, or NULL
7043 * @options: a combination of htmlParserOption(s)
7044 *
7045 * parse an XML in-memory document and build a tree.
7046 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007047 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007048 * Returns the resulting document tree
7049 */
7050htmlDocPtr
7051htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7052 const char *URL, const char *encoding, int options)
7053{
7054 xmlParserInputBufferPtr input;
7055 xmlParserInputPtr stream;
7056
7057 if (ctxt == NULL)
7058 return (NULL);
7059 if (buffer == NULL)
7060 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007061 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007062
7063 htmlCtxtReset(ctxt);
7064
7065 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7066 if (input == NULL) {
7067 return(NULL);
7068 }
7069
7070 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7071 if (stream == NULL) {
7072 xmlFreeParserInputBuffer(input);
7073 return(NULL);
7074 }
7075
7076 inputPush(ctxt, stream);
7077 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7078}
7079
7080/**
7081 * htmlCtxtReadFd:
7082 * @ctxt: an HTML parser context
7083 * @fd: an open file descriptor
7084 * @URL: the base URL to use for the document
7085 * @encoding: the document encoding, or NULL
7086 * @options: a combination of htmlParserOption(s)
7087 *
7088 * parse an XML from a file descriptor and build a tree.
7089 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007090 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007091 * Returns the resulting document tree
7092 */
7093htmlDocPtr
7094htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7095 const char *URL, const char *encoding, int options)
7096{
7097 xmlParserInputBufferPtr input;
7098 xmlParserInputPtr stream;
7099
7100 if (fd < 0)
7101 return (NULL);
7102 if (ctxt == NULL)
7103 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007104 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007105
7106 htmlCtxtReset(ctxt);
7107
7108
7109 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7110 if (input == NULL)
7111 return (NULL);
7112 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7113 if (stream == NULL) {
7114 xmlFreeParserInputBuffer(input);
7115 return (NULL);
7116 }
7117 inputPush(ctxt, stream);
7118 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7119}
7120
7121/**
7122 * htmlCtxtReadIO:
7123 * @ctxt: an HTML parser context
7124 * @ioread: an I/O read function
7125 * @ioclose: an I/O close function
7126 * @ioctx: an I/O handler
7127 * @URL: the base URL to use for the document
7128 * @encoding: the document encoding, or NULL
7129 * @options: a combination of htmlParserOption(s)
7130 *
7131 * parse an HTML document from I/O functions and source and build a tree.
7132 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007133 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007134 * Returns the resulting document tree
7135 */
7136htmlDocPtr
7137htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7138 xmlInputCloseCallback ioclose, void *ioctx,
7139 const char *URL,
7140 const char *encoding, int options)
7141{
7142 xmlParserInputBufferPtr input;
7143 xmlParserInputPtr stream;
7144
7145 if (ioread == NULL)
7146 return (NULL);
7147 if (ctxt == NULL)
7148 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007149 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007150
7151 htmlCtxtReset(ctxt);
7152
7153 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7154 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007155 if (input == NULL) {
7156 if (ioclose != NULL)
7157 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00007158 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007159 }
Daniel Veillard9475a352003-09-26 12:47:50 +00007160 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7161 if (stream == NULL) {
7162 xmlFreeParserInputBuffer(input);
7163 return (NULL);
7164 }
7165 inputPush(ctxt, stream);
7166 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7167}
7168
Daniel Veillard5d4644e2005-04-01 13:11:58 +00007169#define bottom_HTMLparser
7170#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00007171#endif /* LIBXML_HTML_ENABLED */