blob: d1395fa507c7cdcb78e468fe3d793edf5f424536 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
Daniel Veillardc5d64342001-06-24 12:13:24 +00006 * daniel@veillard.com
Owen Taylor3473f882001-02-23 17:55:21 +00007 */
8
Daniel Veillard34ce8be2002-03-18 19:37:11 +00009#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
Bjorn Reese70a9da52001-04-21 16:57:29 +000012
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
Daniel Veillard56a4cb82001-03-24 17:00:36 +000039#include <libxml/HTMLtree.h>
Owen Taylor3473f882001-02-23 17:55:21 +000040#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
Daniel Veillard3c01b1d2001-10-17 15:58:35 +000044#include <libxml/globals.h>
Igor Zlatkovic5f9fada2003-02-19 14:51:00 +000045#include <libxml/uri.h>
Owen Taylor3473f882001-02-23 17:55:21 +000046
Daniel Veillarda78d8032012-07-16 14:56:50 +080047#include "buf.h"
48#include "enc.h"
49
Owen Taylor3473f882001-02-23 17:55:21 +000050#define HTML_MAX_NAMELEN 1000
51#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52#define HTML_PARSER_BUFFER_SIZE 100
53
54/* #define DEBUG */
55/* #define DEBUG_PUSH */
56
Daniel Veillard22090732001-07-16 00:06:07 +000057static int htmlOmittedDefaultValue = 1;
Owen Taylor3473f882001-02-23 17:55:21 +000058
Daniel Veillard56a4cb82001-03-24 17:00:36 +000059xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 xmlChar end, xmlChar end2, xmlChar end3);
Daniel Veillardc1f78342001-11-10 11:43:05 +000061static void htmlParseComment(htmlParserCtxtPtr ctxt);
Daniel Veillard56a4cb82001-03-24 17:00:36 +000062
63/************************************************************************
64 * *
Daniel Veillarde77db162009-08-22 11:32:38 +020065 * Some factorized error routines *
Daniel Veillardf403d292003-10-05 13:51:35 +000066 * *
67 ************************************************************************/
68
69/**
William M. Brackedb65a72004-02-06 07:36:04 +000070 * htmlErrMemory:
Daniel Veillardf403d292003-10-05 13:51:35 +000071 * @ctxt: an HTML parser context
72 * @extra: extra informations
73 *
74 * Handle a redefinition of attribute error
75 */
76static void
77htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78{
Daniel Veillard157fee02003-10-31 10:36:03 +000079 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80 (ctxt->instate == XML_PARSER_EOF))
81 return;
Daniel Veillardf403d292003-10-05 13:51:35 +000082 if (ctxt != NULL) {
83 ctxt->errNo = XML_ERR_NO_MEMORY;
84 ctxt->instate = XML_PARSER_EOF;
85 ctxt->disableSAX = 1;
86 }
87 if (extra)
Daniel Veillard659e71e2003-10-10 14:10:40 +000088 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000089 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90 NULL, NULL, 0, 0,
91 "Memory allocation failed : %s\n", extra);
92 else
Daniel Veillard659e71e2003-10-10 14:10:40 +000093 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
Daniel Veillardf403d292003-10-05 13:51:35 +000094 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95 NULL, NULL, 0, 0, "Memory allocation failed\n");
96}
97
98/**
99 * htmlParseErr:
100 * @ctxt: an HTML parser context
101 * @error: the error number
102 * @msg: the error message
103 * @str1: string infor
104 * @str2: string infor
105 *
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107 */
Xin Li28c53d32017-03-07 00:33:02 +0000108static void LIBXML_ATTR_FORMAT(3,0)
Daniel Veillardf403d292003-10-05 13:51:35 +0000109htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110 const char *msg, const xmlChar *str1, const xmlChar *str2)
111{
Daniel Veillard157fee02003-10-31 10:36:03 +0000112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113 (ctxt->instate == XML_PARSER_EOF))
114 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000115 if (ctxt != NULL)
116 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000118 XML_ERR_ERROR, NULL, 0,
119 (const char *) str1, (const char *) str2,
120 NULL, 0, 0,
121 msg, str1, str2);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000122 if (ctxt != NULL)
123 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000124}
125
126/**
127 * htmlParseErrInt:
128 * @ctxt: an HTML parser context
129 * @error: the error number
130 * @msg: the error message
131 * @val: integer info
132 *
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134 */
Xin Li28c53d32017-03-07 00:33:02 +0000135static void LIBXML_ATTR_FORMAT(3,0)
Daniel Veillardf403d292003-10-05 13:51:35 +0000136htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137 const char *msg, int val)
138{
Daniel Veillard157fee02003-10-31 10:36:03 +0000139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140 (ctxt->instate == XML_PARSER_EOF))
141 return;
Daniel Veillarda03e3652004-11-02 18:45:30 +0000142 if (ctxt != NULL)
143 ctxt->errNo = error;
Daniel Veillard659e71e2003-10-10 14:10:40 +0000144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
Daniel Veillardf403d292003-10-05 13:51:35 +0000145 XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 NULL, val, 0, msg, val);
Daniel Veillarda03e3652004-11-02 18:45:30 +0000147 if (ctxt != NULL)
148 ctxt->wellFormed = 0;
Daniel Veillardf403d292003-10-05 13:51:35 +0000149}
150
151/************************************************************************
152 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200153 * Parser stacks related functions and macros *
Owen Taylor3473f882001-02-23 17:55:21 +0000154 * *
155 ************************************************************************/
156
Daniel Veillard1c732d22002-11-30 11:22:59 +0000157/**
158 * htmlnamePush:
159 * @ctxt: an HTML parser context
160 * @value: the element name
161 *
162 * Pushes a new element name on top of the name stack
163 *
164 * Returns 0 in case of error, the index in the stack otherwise
Owen Taylor3473f882001-02-23 17:55:21 +0000165 */
Daniel Veillard1c732d22002-11-30 11:22:59 +0000166static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000167htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
Daniel Veillard1c732d22002-11-30 11:22:59 +0000168{
Daniel Veillard029a04d2009-08-24 12:50:23 +0200169 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170 ctxt->html = 3;
171 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172 ctxt->html = 10;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000173 if (ctxt->nameNr >= ctxt->nameMax) {
174 ctxt->nameMax *= 2;
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000175 ctxt->nameTab = (const xmlChar * *)
Igor Zlatkovicd37c1392003-08-28 10:34:33 +0000176 xmlRealloc((xmlChar * *)ctxt->nameTab,
Daniel Veillard1c732d22002-11-30 11:22:59 +0000177 ctxt->nameMax *
178 sizeof(ctxt->nameTab[0]));
179 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000180 htmlErrMemory(ctxt, NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000181 return (0);
182 }
183 }
184 ctxt->nameTab[ctxt->nameNr] = value;
185 ctxt->name = value;
186 return (ctxt->nameNr++);
187}
188/**
189 * htmlnamePop:
190 * @ctxt: an HTML parser context
191 *
192 * Pops the top element name from the name stack
193 *
194 * Returns the name just removed
195 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000196static const xmlChar *
Daniel Veillard1c732d22002-11-30 11:22:59 +0000197htmlnamePop(htmlParserCtxtPtr ctxt)
198{
Daniel Veillard2fdbd322003-08-18 12:15:38 +0000199 const xmlChar *ret;
Owen Taylor3473f882001-02-23 17:55:21 +0000200
Daniel Veillard1c732d22002-11-30 11:22:59 +0000201 if (ctxt->nameNr <= 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000202 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000203 ctxt->nameNr--;
204 if (ctxt->nameNr < 0)
Daniel Veillard24505b02005-07-28 23:49:35 +0000205 return (NULL);
Daniel Veillard1c732d22002-11-30 11:22:59 +0000206 if (ctxt->nameNr > 0)
207 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208 else
209 ctxt->name = NULL;
210 ret = ctxt->nameTab[ctxt->nameNr];
Daniel Veillard24505b02005-07-28 23:49:35 +0000211 ctxt->nameTab[ctxt->nameNr] = NULL;
Daniel Veillard1c732d22002-11-30 11:22:59 +0000212 return (ret);
213}
Owen Taylor3473f882001-02-23 17:55:21 +0000214
Eugene Pimenov615904f2010-03-15 15:16:02 +0100215/**
216 * htmlNodeInfoPush:
217 * @ctxt: an HTML parser context
218 * @value: the node info
219 *
220 * Pushes a new element name on top of the node info stack
221 *
222 * Returns 0 in case of error, the index in the stack otherwise
223 */
224static int
225htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226{
227 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228 if (ctxt->nodeInfoMax == 0)
229 ctxt->nodeInfoMax = 5;
230 ctxt->nodeInfoMax *= 2;
231 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233 ctxt->nodeInfoMax *
234 sizeof(ctxt->nodeInfoTab[0]));
235 if (ctxt->nodeInfoTab == NULL) {
236 htmlErrMemory(ctxt, NULL);
237 return (0);
238 }
239 }
240 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242 return (ctxt->nodeInfoNr++);
243}
244
245/**
246 * htmlNodeInfoPop:
247 * @ctxt: an HTML parser context
248 *
249 * Pops the top element name from the node info stack
250 *
251 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252 */
253static htmlParserNodeInfo *
254htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255{
256 if (ctxt->nodeInfoNr <= 0)
257 return (NULL);
258 ctxt->nodeInfoNr--;
259 if (ctxt->nodeInfoNr < 0)
260 return (NULL);
261 if (ctxt->nodeInfoNr > 0)
262 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263 else
264 ctxt->nodeInfo = NULL;
265 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266}
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  Every macro below expects a parser context variable
 * named "ctxt" to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they work on raw bytes and should only be compared against ASCII:
 *
 *   CUR_PTR   the current input pointer (ctxt->input->cur).
 *   BASE_PTR  the base of the current input buffer (ctxt->input->base).
 *   CUR       the byte at the current position, as an int.
 *   CURRENT   same value as CUR.
 *   RAW       -1 when ctxt->token is set, the current byte otherwise.
 *   NXT(n)    the byte n positions after the current one.
 *   UPPER     the current byte converted with toupper().
 *   UPP(n)    NXT(n) converted with toupper().
 *   SKIP(n)   advance the input pointer, the character count and the column
 *             by n.  The line number is not touched, so it must only be
 *             used to skip ASCII strings without newlines.  The argument
 *             is evaluated three times.
 *
 * Movement and buffer management:
 *
 *   NEXT      advance by one character through xmlNextChar().
 *   NEXTL(l)  skip the current character of l bytes: updates line/column
 *             (a '\n' starts a new line), clears ctxt->token and counts
 *             one character.
 *   SHRINK    call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *             bytes were consumed and fewer than 2 * INPUT_CHUNK remain.
 *   GROW      in non-progressive mode, call xmlParserInputGrow() when
 *             fewer than INPUT_CHUNK bytes remain.
 *   SKIP_BLANKS  skip blanks through htmlSkipBlankChars().
 *
 * Character level helpers:
 *
 *   CUR_CHAR(l)         htmlCurrentChar(), storing the length in l.
 *   CUR_SCHAR(s, l)     xmlStringCurrentChar() on s, storing the length in l.
 *   COPY_BUF(l,b,i,v)   append character v of encoded length l to buffer b
 *                       at index i, advancing i.
 *
 * SHRINK, GROW, NEXTL and COPY_BUF expand to a single do { } while (0)
 * statement so they are safe inside unbraced if/else.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), \
     ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)
#define BASE_PTR (ctxt->input->base)

#define SHRINK do { \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input); \
  } while (0)

#define GROW do { \
    if ((ctxt->progressive == 0) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++; \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do { \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
  } while (0)
348
349/**
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200350 * htmlFindEncoding:
351 * @the HTML parser context
352 *
353 * Ty to find and encoding in the current data available in the input
354 * buffer this is needed to try to switch to the proper encoding when
355 * one face a character error.
356 * That's an heuristic, since it's operating outside of parsing it could
357 * try to use a meta which had been commented out, that's the reason it
358 * should only be used in case of error, not as a default.
359 *
360 * Returns an encoding string or NULL if not found, the string need to
361 * be freed
362 */
363static xmlChar *
364htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365 const xmlChar *start, *cur, *end;
366
367 if ((ctxt == NULL) || (ctxt->input == NULL) ||
368 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369 (ctxt->input->buf->encoder != NULL))
370 return(NULL);
371 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372 return(NULL);
373
374 start = ctxt->input->cur;
375 end = ctxt->input->end;
376 /* we also expect the input buffer to be zero terminated */
377 if (*end != 0)
378 return(NULL);
379
380 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381 if (cur == NULL)
382 return(NULL);
383 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
384 if (cur == NULL)
385 return(NULL);
386 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
387 if (cur == NULL)
388 return(NULL);
389 cur += 8;
390 start = cur;
391 while (((*cur >= 'A') && (*cur <= 'Z')) ||
392 ((*cur >= 'a') && (*cur <= 'z')) ||
393 ((*cur >= '0') && (*cur <= '9')) ||
394 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395 cur++;
396 if (cur == start)
397 return(NULL);
398 return(xmlStrndup(start, cur - start));
399}
400
401/**
Owen Taylor3473f882001-02-23 17:55:21 +0000402 * htmlCurrentChar:
403 * @ctxt: the HTML parser context
404 * @len: pointer to the length of the char read
405 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000406 * The current char value, if using UTF-8 this may actually span multiple
Owen Taylor3473f882001-02-23 17:55:21 +0000407 * bytes in the input buffer. Implement the end of line normalization:
408 * 2.11 End-of-Line Handling
409 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410 * char, then the encoding converter is plugged in automatically.
411 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000412 * Returns the current char value and its length
Owen Taylor3473f882001-02-23 17:55:21 +0000413 */
414
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000415static int
Owen Taylor3473f882001-02-23 17:55:21 +0000416htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417 if (ctxt->instate == XML_PARSER_EOF)
418 return(0);
419
420 if (ctxt->token != 0) {
421 *len = 0;
422 return(ctxt->token);
Daniel Veillarde77db162009-08-22 11:32:38 +0200423 }
Owen Taylor3473f882001-02-23 17:55:21 +0000424 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
425 /*
426 * We are supposed to handle UTF8, check it's valid
427 * From rfc2044: encoding of the Unicode values on UTF-8:
428 *
429 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
430 * 0000 0000-0000 007F 0xxxxxxx
431 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
Daniel Veillarde77db162009-08-22 11:32:38 +0200432 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
Owen Taylor3473f882001-02-23 17:55:21 +0000433 *
434 * Check for the 0x110000 limit too
435 */
436 const unsigned char *cur = ctxt->input->cur;
437 unsigned char c;
438 unsigned int val;
439
440 c = *cur;
441 if (c & 0x80) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200442 if (cur[1] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000443 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200444 cur = ctxt->input->cur;
445 }
Owen Taylor3473f882001-02-23 17:55:21 +0000446 if ((cur[1] & 0xc0) != 0x80)
447 goto encoding_error;
448 if ((c & 0xe0) == 0xe0) {
449
Adiel Mittmann8a103792009-08-25 11:27:13 +0200450 if (cur[2] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000451 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200452 cur = ctxt->input->cur;
453 }
Owen Taylor3473f882001-02-23 17:55:21 +0000454 if ((cur[2] & 0xc0) != 0x80)
455 goto encoding_error;
456 if ((c & 0xf0) == 0xf0) {
Adiel Mittmann8a103792009-08-25 11:27:13 +0200457 if (cur[3] == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000458 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
Adiel Mittmann8a103792009-08-25 11:27:13 +0200459 cur = ctxt->input->cur;
460 }
Owen Taylor3473f882001-02-23 17:55:21 +0000461 if (((c & 0xf8) != 0xf0) ||
462 ((cur[3] & 0xc0) != 0x80))
463 goto encoding_error;
464 /* 4-byte code */
465 *len = 4;
466 val = (cur[0] & 0x7) << 18;
467 val |= (cur[1] & 0x3f) << 12;
468 val |= (cur[2] & 0x3f) << 6;
469 val |= cur[3] & 0x3f;
470 } else {
471 /* 3-byte code */
472 *len = 3;
473 val = (cur[0] & 0xf) << 12;
474 val |= (cur[1] & 0x3f) << 6;
475 val |= cur[2] & 0x3f;
476 }
477 } else {
478 /* 2-byte code */
479 *len = 2;
480 val = (cur[0] & 0x1f) << 6;
481 val |= cur[1] & 0x3f;
482 }
483 if (!IS_CHAR(val)) {
Daniel Veillardf403d292003-10-05 13:51:35 +0000484 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
485 "Char 0x%X out of allowed range\n", val);
Daniel Veillarde77db162009-08-22 11:32:38 +0200486 }
Owen Taylor3473f882001-02-23 17:55:21 +0000487 return(val);
488 } else {
Daniel Veillard856c6682009-08-24 18:16:56 +0200489 if ((*ctxt->input->cur == 0) &&
490 (ctxt->input->cur < ctxt->input->end)) {
491 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
492 "Char 0x%X out of allowed range\n", 0);
493 *len = 1;
494 return(' ');
495 }
Owen Taylor3473f882001-02-23 17:55:21 +0000496 /* 1-byte code */
497 *len = 1;
498 return((int) *ctxt->input->cur);
499 }
500 }
501 /*
Daniel Veillard60087f32001-10-10 09:45:09 +0000502 * Assume it's a fixed length encoding (1) with
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000503 * a compatible encoding for the ASCII set, since
Owen Taylor3473f882001-02-23 17:55:21 +0000504 * XML constructs only use < 128 chars
505 */
506 *len = 1;
507 if ((int) *ctxt->input->cur < 0x80)
508 return((int) *ctxt->input->cur);
509
510 /*
511 * Humm this is bad, do an automatic flow conversion
512 */
Daniel Veillard533ec0e2009-08-12 20:13:38 +0200513 {
514 xmlChar * guess;
515 xmlCharEncodingHandlerPtr handler;
516
517 guess = htmlFindEncoding(ctxt);
518 if (guess == NULL) {
519 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
520 } else {
521 if (ctxt->input->encoding != NULL)
522 xmlFree((xmlChar *) ctxt->input->encoding);
523 ctxt->input->encoding = guess;
524 handler = xmlFindCharEncodingHandler((const char *) guess);
525 if (handler != NULL) {
526 xmlSwitchToEncoding(ctxt, handler);
527 } else {
528 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
529 "Unsupported encoding %s", guess, NULL);
530 }
531 }
532 ctxt->charset = XML_CHAR_ENCODING_UTF8;
533 }
534
Owen Taylor3473f882001-02-23 17:55:21 +0000535 return(xmlCurrentChar(ctxt, len));
536
537encoding_error:
538 /*
539 * If we detect an UTF8 error that probably mean that the
540 * input encoding didn't get properly advertized in the
541 * declaration header. Report the error and switch the encoding
542 * to ISO-Latin-1 (if you don't like this policy, just declare the
543 * encoding !)
544 */
Daniel Veillarda03e3652004-11-02 18:45:30 +0000545 {
546 char buffer[150];
547
Daniel Veillard861101d2007-06-12 08:38:57 +0000548 if (ctxt->input->end - ctxt->input->cur >= 4) {
549 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
550 ctxt->input->cur[0], ctxt->input->cur[1],
551 ctxt->input->cur[2], ctxt->input->cur[3]);
552 } else {
553 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
554 }
Daniel Veillarda03e3652004-11-02 18:45:30 +0000555 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
556 "Input is not proper UTF-8, indicate encoding !\n",
557 BAD_CAST buffer, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +0000558 }
559
Daniel Veillarde77db162009-08-22 11:32:38 +0200560 ctxt->charset = XML_CHAR_ENCODING_8859_1;
Owen Taylor3473f882001-02-23 17:55:21 +0000561 *len = 1;
562 return((int) *ctxt->input->cur);
563}
564
565/**
Owen Taylor3473f882001-02-23 17:55:21 +0000566 * htmlSkipBlankChars:
567 * @ctxt: the HTML parser context
568 *
569 * skip all blanks character found at that point in the input streams.
570 *
571 * Returns the number of space chars skipped
572 */
573
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000574static int
Owen Taylor3473f882001-02-23 17:55:21 +0000575htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
576 int res = 0;
577
William M. Brack76e95df2003-10-18 16:20:14 +0000578 while (IS_BLANK_CH(*(ctxt->input->cur))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000579 if ((*ctxt->input->cur == 0) &&
580 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
581 xmlPopInput(ctxt);
582 } else {
583 if (*(ctxt->input->cur) == '\n') {
584 ctxt->input->line++; ctxt->input->col = 1;
585 } else ctxt->input->col++;
586 ctxt->input->cur++;
587 ctxt->nbChars++;
588 if (*ctxt->input->cur == 0)
589 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
590 }
591 res++;
592 }
593 return(res);
594}
595
596
597
598/************************************************************************
599 * *
Daniel Veillarde77db162009-08-22 11:32:38 +0200600 * The list of HTML elements and their properties *
Owen Taylor3473f882001-02-23 17:55:21 +0000601 * *
602 ************************************************************************/
603
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *     , subElements , impliedsubelt , Attributes, userdata
 */
Daniel Veillard930dfb62003-02-05 10:17:38 +0000616
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each group macro expands to a comma separated list of element names and
 * is paired with an NB_<group> macro giving the number of names in it.
 * The NB_ sums are parenthesized so they can be used inside larger
 * expressions.  PCDATA and MODIFIER are intentionally empty.
 */

#define PCDATA
#define NB_PCDATA 0
#define MODIFIER
#define NB_MODIFIER 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* All the name lists below are NULL terminated. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
	"archive", "alt", "name", "height", "width", "align",
	"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* The NULL terminator was missing here, unlike every other list. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000724static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
725static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
726static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
727static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
728static const char* const align_attr[] = { "align", NULL } ;
729static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
730static const char* const map_contents[] = { BLOCK, "area", NULL } ;
731static const char* const name_attr[] = { "name", NULL } ;
732static const char* const action_attr[] = { "action", NULL } ;
733static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
Denis Pauk868d92d2012-05-10 15:34:57 +0800734static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000735static const char* const content_attr[] = { "content", NULL } ;
736static const char* const type_attr[] = { "type", NULL } ;
737static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
738static const char* const object_contents[] = { FLOW, "param", NULL } ;
739static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
740static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
741static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
742static const char* const option_elt[] = { "option", NULL } ;
743static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
744static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
745static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
746static const char* const width_attr[] = { "width", NULL } ;
747static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
748static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
749static const char* const language_attr[] = { "language", NULL } ;
750static const char* const select_content[] = { "optgroup", "option", NULL } ;
751static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
752static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
Roland Steiner04f8eef2009-05-12 09:16:16 +0200753static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
Daniel Veillard065abe82006-07-03 08:55:04 +0000754static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
755static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
756static const char* const tr_elt[] = { "tr", NULL } ;
757static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
758static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
759static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
760static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
761static const char* const tr_contents[] = { "th", "td", NULL } ;
762static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
763static const char* const li_elt[] = { "li", NULL } ;
764static const char* const ul_depr[] = { "type", "compact", NULL} ;
765static const char* const dir_attr[] = { "dir", NULL} ;
Daniel Veillard930dfb62003-02-05 10:17:38 +0000766
767#define DECL (const char**)
768
Daniel Veillard22090732001-07-16 00:06:07 +0000769static const htmlElemDesc
770html40ElementTable[] = {
Daniel Veillard930dfb62003-02-05 10:17:38 +0000771{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
772 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
773},
774{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
775 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
776},
777{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
778 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
779},
780{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
781 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
782},
783{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
784 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
785},
786{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
787 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
788},
789{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
790 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
791},
792{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
793 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
794},
795{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
796 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
797},
798{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
799 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
800},
801{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
802 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
803},
804{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
805 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
806},
807{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
808 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
809},
810{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
811 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
812},
813{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
814 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
815},
816{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
817 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
818},
819{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
820 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
821},
822{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
823 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
824},
825{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
826 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
827},
828{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
829 EMPTY , NULL , DECL col_attrs , NULL, NULL
830},
831{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
832 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
833},
834{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
835 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
836},
837{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
838 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
839},
840{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
841 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
842},
843{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
844 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
845},
846{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
847 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
848},
849{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
William M. Bracke978ae22007-03-21 06:16:02 +0000850 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
Daniel Veillard930dfb62003-02-05 10:17:38 +0000851},
852{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
856 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
857},
Daniel Veillard640f89e2008-01-11 06:24:09 +0000858{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
Daniel Veillard491e58e2007-05-02 16:15:18 +0000859 EMPTY, NULL, DECL embed_attrs, NULL, NULL
860},
Daniel Veillard930dfb62003-02-05 10:17:38 +0000861{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
862 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
863},
864{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
865 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
866},
867{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
868 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
869},
870{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
871 EMPTY, NULL, NULL, DECL frame_attrs, NULL
872},
873{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
874 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
875},
876{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
877 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
878},
879{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
880 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
881},
882{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
883 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
884},
885{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
886 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
887},
888{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
889 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
890},
891{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
892 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893},
894{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
895 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
896},
897{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
898 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
899},
900{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
901 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
902},
903{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
904 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
905},
906{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
907 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
908},
909{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
William M. Bracke978ae22007-03-21 06:16:02 +0000910 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
Daniel Veillard930dfb62003-02-05 10:17:38 +0000911},
912{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
913 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
914},
915{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
916 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
917},
918{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
919 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
920},
921{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
922 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
923},
924{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
925 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
926},
927{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
928 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
929},
930{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
931 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
932},
933{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
934 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
935},
936{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
William M. Bracke978ae22007-03-21 06:16:02 +0000937 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000938},
939{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
940 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
941},
942{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
943 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
944},
945{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
946 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
947},
948{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
949 DECL html_flow, "div", DECL html_attrs, NULL, NULL
950},
951{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
952 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
953},
954{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
955 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
956},
957{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
William M. Bracke978ae22007-03-21 06:16:02 +0000958 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000959},
960{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
961 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
962},
963{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
964 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
965},
966{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
William M. Bracke978ae22007-03-21 06:16:02 +0000967 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
Daniel Veillard930dfb62003-02-05 10:17:38 +0000968},
969{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
970 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
971},
972{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
973 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
974},
975{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
976 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
977},
978{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
979 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
980},
981{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
982 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
983},
984{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
985 DECL select_content, NULL, DECL select_attrs, NULL, NULL
986},
987{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
988 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
989},
990{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
991 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
992},
993{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
994 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
995},
996{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
997 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
998},
999{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1000 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1001},
1002{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1003 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1004},
1005{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1006 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1007},
1008{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1009 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1010},
1011{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1012 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1013},
1014{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1015 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1016},
1017{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1018 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1019},
1020{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1021 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1022},
1023{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1024 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1025},
1026{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1027 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1028},
1029{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1030 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1031},
1032{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1033 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1034},
1035{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1036 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1037},
1038{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1039 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1040},
1041{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1042 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1043},
1044{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1045 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1046}
Owen Taylor3473f882001-02-23 17:55:21 +00001047};
1048
1049/*
Owen Taylor3473f882001-02-23 17:55:21 +00001050 * start tags that imply the end of current element
1051 */
Daniel Veillard065abe82006-07-03 08:55:04 +00001052static const char * const htmlStartClose[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001053"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1054 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1055 "listing", "xmp", "head", NULL,
1056"head", "p", NULL,
1057"title", "p", NULL,
1058"body", "head", "style", "link", "title", "p", NULL,
Daniel Veillard25d5d9a2004-04-05 07:08:42 +00001059"frameset", "head", "style", "link", "title", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001060"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1061 "pre", "listing", "xmp", "head", "li", NULL,
1062"hr", "p", "head", NULL,
1063"h1", "p", "head", NULL,
1064"h2", "p", "head", NULL,
1065"h3", "p", "head", NULL,
1066"h4", "p", "head", NULL,
1067"h5", "p", "head", NULL,
1068"h6", "p", "head", NULL,
1069"dir", "p", "head", NULL,
1070"address", "p", "head", "ul", NULL,
1071"pre", "p", "head", "ul", NULL,
1072"listing", "p", "head", NULL,
1073"xmp", "p", "head", NULL,
1074"blockquote", "p", "head", NULL,
1075"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1076 "xmp", "head", NULL,
1077"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1078 "head", "dd", NULL,
1079"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1080 "head", "dt", NULL,
1081"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1082 "listing", "xmp", NULL,
1083"ol", "p", "head", "ul", NULL,
1084"menu", "p", "head", "ul", NULL,
Daniel Veillard6339c1a2009-08-24 11:59:51 +02001085"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001086"div", "p", "head", NULL,
Denis Pauka0cd0752012-05-11 19:31:12 +08001087"noscript", "p", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001088"center", "font", "b", "i", "p", "head", NULL,
Conrad Irwinb60061a2012-07-27 15:42:27 -07001089"a", "a", "head", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001090"caption", "p", NULL,
1091"colgroup", "caption", "colgroup", "col", "p", NULL,
1092"col", "caption", "col", "p", NULL,
1093"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1094 "listing", "xmp", "a", NULL,
Daniel Veillard43dadeb2001-04-24 11:23:35 +00001095"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Daniel Veillarde77db162009-08-22 11:32:38 +02001096"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001097"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1098"thead", "caption", "col", "colgroup", NULL,
1099"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1100 "tbody", "p", NULL,
1101"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1102 "tfoot", "tbody", "p", NULL,
1103"optgroup", "option", NULL,
1104"option", "option", NULL,
1105"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1106 "pre", "listing", "xmp", "a", NULL,
Conrad Irwinb60061a2012-07-27 15:42:27 -07001107/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1108"tt", "head", NULL,
1109"i", "head", NULL,
1110"b", "head", NULL,
1111"u", "head", NULL,
1112"s", "head", NULL,
1113"strike", "head", NULL,
1114"big", "head", NULL,
1115"small", "head", NULL,
1116
1117"em", "head", NULL,
1118"strong", "head", NULL,
1119"dfn", "head", NULL,
1120"code", "head", NULL,
1121"samp", "head", NULL,
1122"kbd", "head", NULL,
1123"var", "head", NULL,
1124"cite", "head", NULL,
1125"abbr", "head", NULL,
1126"acronym", "head", NULL,
1127
1128/* "a" */
1129"img", "head", NULL,
1130/* "applet" */
1131/* "embed" */
1132/* "object" */
1133"font", "head", NULL,
1134/* "basefont" */
1135"br", "head", NULL,
1136/* "script" */
1137"map", "head", NULL,
1138"q", "head", NULL,
1139"sub", "head", NULL,
1140"sup", "head", NULL,
1141"span", "head", NULL,
1142"bdo", "head", NULL,
1143"iframe", "head", NULL,
Owen Taylor3473f882001-02-23 17:55:21 +00001144NULL
1145};
1146
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char * const htmlNoContentElements[] = { "html", "head", NULL };
1159
/*
 * The HTML attributes whose content is of type %Script;.
 * This list has no NULL terminator.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char * const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document, focus and form events */
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1185
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end tag priority) pair. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Priorities of the elements that differ from the default. The final
 * entry, whose name is NULL, ends the table and carries the priority of
 * every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
Owen Taylor3473f882001-02-23 17:55:21 +00001213
/* Pointers to the first string of each htmlStartClose record. */
static const char **htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1216
/************************************************************************
 *									*
 *		Functions to handle HTML specific data			*
 *									*
 ************************************************************************/
1222
1223/**
1224 * htmlInitAutoClose:
1225 *
1226 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1227 * This is not reentrant. Call xmlInitParser() once before processing in
1228 * case of use in multithreaded programs.
1229 */
1230void
1231htmlInitAutoClose(void) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001232 int indx, i = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00001233
1234 if (htmlStartCloseIndexinitialized) return;
1235
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001236 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1237 indx = 0;
1238 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
Daniel Veillard28aac0b2006-10-16 08:31:18 +00001239 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
Owen Taylor3473f882001-02-23 17:55:21 +00001240 while (htmlStartClose[i] != NULL) i++;
1241 i++;
1242 }
1243 htmlStartCloseIndexinitialized = 1;
1244}
1245
1246/**
1247 * htmlTagLookup:
1248 * @tag: The tag name in lowercase
1249 *
1250 * Lookup the HTML tag in the ElementTable
1251 *
1252 * Returns the related htmlElemDescPtr or NULL if not found.
1253 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001254const htmlElemDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001255htmlTagLookup(const xmlChar *tag) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001256 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001257
1258 for (i = 0; i < (sizeof(html40ElementTable) /
1259 sizeof(html40ElementTable[0]));i++) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +00001260 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
William M. Brack78637da2003-07-31 14:47:38 +00001261 return((htmlElemDescPtr) &html40ElementTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001262 }
1263 return(NULL);
1264}
1265
1266/**
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001267 * htmlGetEndPriority:
1268 * @name: The name of the element to look up the priority for.
Daniel Veillarde77db162009-08-22 11:32:38 +02001269 *
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001270 * Return value: The "endtag" priority.
1271 **/
1272static int
1273htmlGetEndPriority (const xmlChar *name) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001274 int i = 0;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001275
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001276 while ((htmlEndPriority[i].name != NULL) &&
1277 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1278 i++;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001279
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001280 return(htmlEndPriority[i].priority);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001281}
1282
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001283
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001284/**
Owen Taylor3473f882001-02-23 17:55:21 +00001285 * htmlCheckAutoClose:
1286 * @newtag: The new tag name
1287 * @oldtag: The old tag name
1288 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001289 * Checks whether the new tag is one of the registered valid tags for
1290 * closing old.
Owen Taylor3473f882001-02-23 17:55:21 +00001291 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1292 *
1293 * Returns 0 if no, 1 if yes.
1294 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001295static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001296htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1297{
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001298 int i, indx;
1299 const char **closed = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001300
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001301 if (htmlStartCloseIndexinitialized == 0)
1302 htmlInitAutoClose();
Owen Taylor3473f882001-02-23 17:55:21 +00001303
1304 /* inefficient, but not a big deal */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001305 for (indx = 0; indx < 100; indx++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001306 closed = htmlStartCloseIndex[indx];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001307 if (closed == NULL)
1308 return (0);
1309 if (xmlStrEqual(BAD_CAST * closed, newtag))
1310 break;
Owen Taylor3473f882001-02-23 17:55:21 +00001311 }
1312
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001313 i = closed - htmlStartClose;
Owen Taylor3473f882001-02-23 17:55:21 +00001314 i++;
1315 while (htmlStartClose[i] != NULL) {
1316 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001317 return (1);
1318 }
1319 i++;
Owen Taylor3473f882001-02-23 17:55:21 +00001320 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001321 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00001322}
1323
1324/**
1325 * htmlAutoCloseOnClose:
1326 * @ctxt: an HTML parser context
1327 * @newtag: The new tag name
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001328 * @force: force the tag closure
Owen Taylor3473f882001-02-23 17:55:21 +00001329 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001330 * The HTML DTD allows an ending tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001331 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001332static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001333htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1334{
1335 const htmlElemDesc *info;
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001336 int i, priority;
Owen Taylor3473f882001-02-23 17:55:21 +00001337
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001338 priority = htmlGetEndPriority(newtag);
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001339
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001340 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard0a2a1632001-05-11 14:18:03 +00001341
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001342 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1343 break;
1344 /*
1345 * A missplaced endtag can only close elements with lower
1346 * or equal priority, so if we find an element with higher
1347 * priority before we find an element with
Daniel Veillarde77db162009-08-22 11:32:38 +02001348 * matching name, we just ignore this endtag
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001349 */
1350 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1351 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001352 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001353 if (i < 0)
1354 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001355
1356 while (!xmlStrEqual(newtag, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001357 info = htmlTagLookup(ctxt->name);
Daniel Veillardf403d292003-10-05 13:51:35 +00001358 if ((info != NULL) && (info->endTag == 3)) {
1359 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1360 "Opening and ending tag mismatch: %s and %s\n",
Daniel Veillard05bcb7e2003-10-19 14:26:34 +00001361 newtag, ctxt->name);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001362 }
1363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1364 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001365 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001366 }
1367}
1368
1369/**
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001370 * htmlAutoCloseOnEnd:
1371 * @ctxt: an HTML parser context
1372 *
1373 * Close all remaining tags at the end of the stream
1374 */
1375static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001376htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1377{
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001378 int i;
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001379
William M. Brack899e64a2003-09-26 18:03:42 +00001380 if (ctxt->nameNr == 0)
1381 return;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001382 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001383 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1384 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001385 htmlnamePop(ctxt);
Daniel Veillarda3bfca52001-04-12 15:42:58 +00001386 }
1387}
1388
1389/**
Owen Taylor3473f882001-02-23 17:55:21 +00001390 * htmlAutoClose:
1391 * @ctxt: an HTML parser context
1392 * @newtag: The new tag name or NULL
1393 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001394 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001395 * The list is kept in htmlStartClose array. This function is
1396 * called when a new tag has been detected and generates the
1397 * appropriates closes if possible/needed.
1398 * If newtag is NULL this mean we are at the end of the resource
Daniel Veillarde77db162009-08-22 11:32:38 +02001399 * and we should check
Owen Taylor3473f882001-02-23 17:55:21 +00001400 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001401static void
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001402htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1403{
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001404 while ((newtag != NULL) && (ctxt->name != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001405 (htmlCheckAutoClose(newtag, ctxt->name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001406 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1407 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001408 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00001409 }
1410 if (newtag == NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001411 htmlAutoCloseOnEnd(ctxt);
1412 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001413 }
1414 while ((newtag == NULL) && (ctxt->name != NULL) &&
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001415 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1416 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1417 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001418 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419 ctxt->sax->endElement(ctxt->userData, ctxt->name);
William M. Brack899e64a2003-09-26 18:03:42 +00001420 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001421 }
Owen Taylor3473f882001-02-23 17:55:21 +00001422}
1423
1424/**
1425 * htmlAutoCloseTag:
1426 * @doc: the HTML document
1427 * @name: The tag name
1428 * @elem: the HTML element
1429 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001430 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001431 * The list is kept in htmlStartClose array. This function checks
1432 * if the element or one of it's children would autoclose the
1433 * given tag.
1434 *
1435 * Returns 1 if autoclose, 0 otherwise
1436 */
1437int
1438htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1439 htmlNodePtr child;
1440
1441 if (elem == NULL) return(1);
1442 if (xmlStrEqual(name, elem->name)) return(0);
1443 if (htmlCheckAutoClose(elem->name, name)) return(1);
1444 child = elem->children;
1445 while (child != NULL) {
1446 if (htmlAutoCloseTag(doc, name, child)) return(1);
1447 child = child->next;
1448 }
1449 return(0);
1450}
1451
1452/**
1453 * htmlIsAutoClosed:
1454 * @doc: the HTML document
1455 * @elem: the HTML element
1456 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001457 * The HTML DTD allows a tag to implicitly close other tags.
Owen Taylor3473f882001-02-23 17:55:21 +00001458 * The list is kept in htmlStartClose array. This function checks
1459 * if a tag is autoclosed by one of it's child
1460 *
1461 * Returns 1 if autoclosed, 0 otherwise
1462 */
1463int
1464htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1465 htmlNodePtr child;
1466
1467 if (elem == NULL) return(1);
1468 child = elem->children;
1469 while (child != NULL) {
1470 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471 child = child->next;
1472 }
1473 return(0);
1474}
1475
1476/**
1477 * htmlCheckImplied:
1478 * @ctxt: an HTML parser context
1479 * @newtag: The new tag name
1480 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001481 * The HTML DTD allows a tag to exists only implicitly
Owen Taylor3473f882001-02-23 17:55:21 +00001482 * called when a new tag has been detected and generates the
1483 * appropriates implicit tags if missing
1484 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001485static void
Owen Taylor3473f882001-02-23 17:55:21 +00001486htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
Daniel Veillardb468f742009-08-24 18:45:33 +02001487 int i;
1488
Daniel Veillarde20fb5a2010-01-29 20:47:08 +01001489 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1490 return;
Owen Taylor3473f882001-02-23 17:55:21 +00001491 if (!htmlOmittedDefaultValue)
1492 return;
1493 if (xmlStrEqual(newtag, BAD_CAST"html"))
1494 return;
1495 if (ctxt->nameNr <= 0) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001496 htmlnamePush(ctxt, BAD_CAST"html");
Owen Taylor3473f882001-02-23 17:55:21 +00001497 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1498 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1499 }
1500 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1501 return;
Daniel Veillarde77db162009-08-22 11:32:38 +02001502 if ((ctxt->nameNr <= 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00001503 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1504 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1505 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1506 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1507 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1508 (xmlStrEqual(newtag, BAD_CAST"base")))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001509 if (ctxt->html >= 3) {
1510 /* we already saw or generated an <head> before */
1511 return;
1512 }
1513 /*
1514 * dropped OBJECT ... i you put it first BODY will be
1515 * assumed !
1516 */
1517 htmlnamePush(ctxt, BAD_CAST"head");
1518 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1519 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001520 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1521 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1522 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
Daniel Veillard029a04d2009-08-24 12:50:23 +02001523 if (ctxt->html >= 10) {
1524 /* we already saw or generated a <body> before */
1525 return;
1526 }
Owen Taylor3473f882001-02-23 17:55:21 +00001527 for (i = 0;i < ctxt->nameNr;i++) {
1528 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1529 return;
1530 }
1531 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1532 return;
1533 }
1534 }
Daniel Veillarde77db162009-08-22 11:32:38 +02001535
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001536 htmlnamePush(ctxt, BAD_CAST"body");
Owen Taylor3473f882001-02-23 17:55:21 +00001537 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1538 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1539 }
1540}
1541
1542/**
1543 * htmlCheckParagraph
1544 * @ctxt: an HTML parser context
1545 *
1546 * Check whether a p element need to be implied before inserting
1547 * characters in the current element.
1548 *
1549 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1550 * in case of error.
1551 */
1552
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001553static int
Owen Taylor3473f882001-02-23 17:55:21 +00001554htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1555 const xmlChar *tag;
1556 int i;
1557
1558 if (ctxt == NULL)
1559 return(-1);
1560 tag = ctxt->name;
1561 if (tag == NULL) {
1562 htmlAutoClose(ctxt, BAD_CAST"p");
1563 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001564 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001565 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1567 return(1);
1568 }
1569 if (!htmlOmittedDefaultValue)
1570 return(0);
1571 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1572 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
Owen Taylor3473f882001-02-23 17:55:21 +00001573 htmlAutoClose(ctxt, BAD_CAST"p");
1574 htmlCheckImplied(ctxt, BAD_CAST"p");
Daniel Veillard2fdbd322003-08-18 12:15:38 +00001575 htmlnamePush(ctxt, BAD_CAST"p");
Owen Taylor3473f882001-02-23 17:55:21 +00001576 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1577 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1578 return(1);
1579 }
1580 }
1581 return(0);
1582}
1583
1584/**
1585 * htmlIsScriptAttribute:
1586 * @name: an attribute name
1587 *
1588 * Check if an attribute is of content type Script
1589 *
1590 * Returns 1 is the attribute is a script 0 otherwise
1591 */
1592int
1593htmlIsScriptAttribute(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001594 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001595
1596 if (name == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02001597 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001598 /*
1599 * all script attributes start with 'on'
1600 */
1601 if ((name[0] != 'o') || (name[1] != 'n'))
Daniel Veillarde77db162009-08-22 11:32:38 +02001602 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00001603 for (i = 0;
1604 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1605 i++) {
1606 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1607 return(1);
1608 }
1609 return(0);
1610}
1611
1612/************************************************************************
1613 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02001614 * The list of HTML predefined entities *
Owen Taylor3473f882001-02-23 17:55:21 +00001615 * *
1616 ************************************************************************/
1617
1618
/*
 * html40EntitiesTable: the predefined HTML 4.0 character entities.
 *
 * Each initializer gives, in order, the numeric (Unicode) value of the
 * entity, its name and a descriptive string.
 *
 * Entries are listed in ascending order of value. htmlEntityValueLookup()
 * relies on that ordering: it stops scanning at the first entry whose
 * value is greater than the one searched for, so a new entry must be
 * inserted at its sorted position. htmlEntityLookup() scans the table
 * by name.
 */
Daniel Veillard22090732001-07-16 00:06:07 +00001619static const htmlEntityDesc html40EntitiesTable[] = {
Owen Taylor3473f882001-02-23 17:55:21 +00001620/*
1621 * the 4 absolute ones, plus apostrophe.
1622 */
1623{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1624{ 38, "amp", "ampersand, U+0026 ISOnum" },
1625{ 39, "apos", "single quote" },
1626{ 60, "lt", "less-than sign, U+003C ISOnum" },
1627{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1628
1629/*
1630 * A bunch still in the 128-255 range
1631 * Replacing them depend really on the charset used.
1632 */
1633{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1634{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1635{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1636{ 163, "pound","pound sign, U+00A3 ISOnum" },
1637{ 164, "curren","currency sign, U+00A4 ISOnum" },
1638{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1639{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1640{ 167, "sect", "section sign, U+00A7 ISOnum" },
1641{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1642{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1643{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1644{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1645{ 172, "not", "not sign, U+00AC ISOnum" },
1646{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1647{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1648{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1649{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1650{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1651{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1652{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1653{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1654{ 181, "micro","micro sign, U+00B5 ISOnum" },
1655{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1656{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1657{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1658{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1659{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1660{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1661{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1662{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1663{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1664{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1665{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1666{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1667{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1668{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1669{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1670{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1671{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1672{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1673{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1674{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1675{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1676{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1677{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1678{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1679{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1680{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1681{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1682{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1683{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1684{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1685{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1686{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1687{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1688{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1689{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1690{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1691{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1692{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1693{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1694{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1695{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1696{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1697{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1698{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1699{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1700{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1701{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1702{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1703{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1704{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1705{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1706{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1707{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1708{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1709{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1710{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1711{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1712{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1713{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1714{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1715{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1716{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1717{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1718{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1719{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1720{ 247, "divide","division sign, U+00F7 ISOnum" },
1721{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1722{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1723{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1724{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1725{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1726{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1727{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1728{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1729
1730{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1731{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1732{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1733{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1734{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1735
1736/*
1737 * Anything below should really be kept as entities references
1738 */
1739{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1740
1741{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1742{ 732, "tilde","small tilde, U+02DC ISOdia" },
1743
1744{ 913, "Alpha","greek capital letter alpha, U+0391" },
1745{ 914, "Beta", "greek capital letter beta, U+0392" },
1746{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1747{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1748{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1749{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1750{ 919, "Eta", "greek capital letter eta, U+0397" },
1751{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1752{ 921, "Iota", "greek capital letter iota, U+0399" },
1753{ 922, "Kappa","greek capital letter kappa, U+039A" },
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001754{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
Owen Taylor3473f882001-02-23 17:55:21 +00001755{ 924, "Mu", "greek capital letter mu, U+039C" },
1756{ 925, "Nu", "greek capital letter nu, U+039D" },
1757{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1758{ 927, "Omicron","greek capital letter omicron, U+039F" },
1759{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1760{ 929, "Rho", "greek capital letter rho, U+03A1" },
1761{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1762{ 932, "Tau", "greek capital letter tau, U+03A4" },
1763{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1764{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1765{ 935, "Chi", "greek capital letter chi, U+03A7" },
1766{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1767{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1768
1769{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1770{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1771{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1772{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1773{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1774{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1775{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1776{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1777{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1778{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1779{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1780{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1781{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1782{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1783{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1784{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1785{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1786{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1787{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1788{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1789{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1790{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1791{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1792{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1793{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1794{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1795{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1796{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1797
1798{ 8194, "ensp", "en space, U+2002 ISOpub" },
1799{ 8195, "emsp", "em space, U+2003 ISOpub" },
1800{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1801{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1802{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1803{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1804{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1805{ 8211, "ndash","en dash, U+2013 ISOpub" },
1806{ 8212, "mdash","em dash, U+2014 ISOpub" },
1807{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1808{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1809{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1810{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1811{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1812{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1813{ 8224, "dagger","dagger, U+2020 ISOpub" },
1814{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1815
1816{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1817{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1818
1819{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1820
1821{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1822{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1823
1824{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1825{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1826
1827{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1828{ 8260, "frasl","fraction slash, U+2044 NEW" },
1829
1830{ 8364, "euro", "euro sign, U+20AC NEW" },
1831
1832{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1833{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1834{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1835{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1836{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1837{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1838{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1839{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1840{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1841{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1842{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1843{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1844{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1845{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1846{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1847{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1848
1849{ 8704, "forall","for all, U+2200 ISOtech" },
1850{ 8706, "part", "partial differential, U+2202 ISOtech" },
1851{ 8707, "exist","there exists, U+2203 ISOtech" },
1852{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1853{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1854{ 8712, "isin", "element of, U+2208 ISOtech" },
1855{ 8713, "notin","not an element of, U+2209 ISOtech" },
1856{ 8715, "ni", "contains as member, U+220B ISOtech" },
1857{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001858{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
Owen Taylor3473f882001-02-23 17:55:21 +00001859{ 8722, "minus","minus sign, U+2212 ISOtech" },
1860{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1861{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1862{ 8733, "prop", "proportional to, U+221D ISOtech" },
1863{ 8734, "infin","infinity, U+221E ISOtech" },
1864{ 8736, "ang", "angle, U+2220 ISOamso" },
1865{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1866{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1867{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1868{ 8746, "cup", "union = cup, U+222A ISOtech" },
1869{ 8747, "int", "integral, U+222B ISOtech" },
1870{ 8756, "there4","therefore, U+2234 ISOtech" },
1871{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1872{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1873{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1874{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1875{ 8801, "equiv","identical to, U+2261 ISOtech" },
1876{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1877{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1878{ 8834, "sub", "subset of, U+2282 ISOtech" },
1879{ 8835, "sup", "superset of, U+2283 ISOtech" },
1880{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1881{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1882{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1883{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1884{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1885{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1886{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1887{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1888{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1889{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1890{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1891{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1892{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1893{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1894
1895{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1896{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1897{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1898{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1899
1900};
1901
1902/************************************************************************
1903 * *
1904 * Commodity functions to handle entities *
1905 * *
1906 ************************************************************************/
1907
1908/*
1909 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size, then reallocates @buffer to the new size with
 * xmlRealloc(). If the reallocation fails, the error is reported through
 * htmlErrMemory(), @buffer is freed and the enclosing function returns
 * NULL.
 *
 * The expansion site must have both a `ctxt` variable and a
 * `<buffer>_size` variable in scope, and must be inside a function in
 * which `return(NULL)` is valid.
 *
 * The expansion is a bare brace block, not a do { } while (0) statement,
 * so `growBuffer(x);` must not be used as the unbraced body of an `if`
 * that is followed by an `else`.
1910 */
1911#define growBuffer(buffer) { \
Daniel Veillard079f6a72004-09-23 13:15:03 +00001912 xmlChar *tmp; \
Owen Taylor3473f882001-02-23 17:55:21 +00001913 buffer##_size *= 2; \
Daniel Veillard079f6a72004-09-23 13:15:03 +00001914 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
1915 if (tmp == NULL) { \
Daniel Veillardf403d292003-10-05 13:51:35 +00001916 htmlErrMemory(ctxt, "growing buffer\n"); \
Daniel Veillard079f6a72004-09-23 13:15:03 +00001917 xmlFree(buffer); \
Owen Taylor3473f882001-02-23 17:55:21 +00001918 return(NULL); \
1919 } \
Daniel Veillard079f6a72004-09-23 13:15:03 +00001920 buffer = tmp; \
Owen Taylor3473f882001-02-23 17:55:21 +00001921}
1922
1923/**
1924 * htmlEntityLookup:
1925 * @name: the entity name
1926 *
1927 * Lookup the given entity in EntitiesTable
1928 *
1929 * TODO: the linear scan is really ugly, an hash table is really needed.
1930 *
1931 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1932 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001933const htmlEntityDesc *
Owen Taylor3473f882001-02-23 17:55:21 +00001934htmlEntityLookup(const xmlChar *name) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001935 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001936
1937 for (i = 0;i < (sizeof(html40EntitiesTable)/
1938 sizeof(html40EntitiesTable[0]));i++) {
1939 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
William M. Brack78637da2003-07-31 14:47:38 +00001940 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001941 }
1942 }
1943 return(NULL);
1944}
1945
1946/**
1947 * htmlEntityValueLookup:
1948 * @value: the entity's unicode value
1949 *
1950 * Lookup the given entity in EntitiesTable
1951 *
1952 * TODO: the linear scan is really ugly, an hash table is really needed.
1953 *
1954 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1955 */
Daniel Veillardbb371292001-08-16 23:26:59 +00001956const htmlEntityDesc *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001957htmlEntityValueLookup(unsigned int value) {
1958 unsigned int i;
Owen Taylor3473f882001-02-23 17:55:21 +00001959
1960 for (i = 0;i < (sizeof(html40EntitiesTable)/
1961 sizeof(html40EntitiesTable[0]));i++) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001962 if (html40EntitiesTable[i].value >= value) {
1963 if (html40EntitiesTable[i].value > value)
Owen Taylor3473f882001-02-23 17:55:21 +00001964 break;
William M. Brack78637da2003-07-31 14:47:38 +00001965 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
Owen Taylor3473f882001-02-23 17:55:21 +00001966 }
Owen Taylor3473f882001-02-23 17:55:21 +00001967 }
1968 return(NULL);
1969}
1970
1971/**
1972 * UTF8ToHtml:
1973 * @out: a pointer to an array of bytes to store the result
1974 * @outlen: the length of @out
1975 * @in: a pointer to an array of UTF-8 chars
1976 * @inlen: the length of @in
1977 *
1978 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1979 * plus HTML entities block of chars out.
1980 *
1981 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1982 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001983 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00001984 * The value of @outlen after return is the number of octets consumed.
1985 */
1986int
1987UTF8ToHtml(unsigned char* out, int *outlen,
1988 const unsigned char* in, int *inlen) {
1989 const unsigned char* processed = in;
1990 const unsigned char* outend;
1991 const unsigned char* outstart = out;
1992 const unsigned char* instart = in;
1993 const unsigned char* inend;
1994 unsigned int c, d;
1995 int trailing;
1996
Daniel Veillardce682bc2004-11-05 17:22:25 +00001997 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00001998 if (in == NULL) {
1999 /*
2000 * initialization nothing to do
2001 */
2002 *outlen = 0;
2003 *inlen = 0;
2004 return(0);
2005 }
2006 inend = in + (*inlen);
2007 outend = out + (*outlen);
2008 while (in < inend) {
2009 d = *in++;
2010 if (d < 0x80) { c= d; trailing= 0; }
2011 else if (d < 0xC0) {
2012 /* trailing byte in leading position */
2013 *outlen = out - outstart;
2014 *inlen = processed - instart;
2015 return(-2);
2016 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2017 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2018 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2019 else {
2020 /* no chance for this in Ascii */
2021 *outlen = out - outstart;
2022 *inlen = processed - instart;
2023 return(-2);
2024 }
2025
2026 if (inend - in < trailing) {
2027 break;
Daniel Veillarde77db162009-08-22 11:32:38 +02002028 }
Owen Taylor3473f882001-02-23 17:55:21 +00002029
2030 for ( ; trailing; trailing--) {
2031 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2032 break;
2033 c <<= 6;
2034 c |= d & 0x3F;
2035 }
2036
2037 /* assertion: c is a single UTF-4 value */
2038 if (c < 0x80) {
2039 if (out + 1 >= outend)
2040 break;
2041 *out++ = c;
2042 } else {
2043 int len;
Daniel Veillardbb371292001-08-16 23:26:59 +00002044 const htmlEntityDesc * ent;
Daniel Veillard1032ac42006-11-23 16:18:30 +00002045 const char *cp;
2046 char nbuf[16];
Owen Taylor3473f882001-02-23 17:55:21 +00002047
2048 /*
2049 * Try to lookup a predefined HTML entity for it
2050 */
2051
2052 ent = htmlEntityValueLookup(c);
2053 if (ent == NULL) {
Daniel Veillard1032ac42006-11-23 16:18:30 +00002054 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2055 cp = nbuf;
Owen Taylor3473f882001-02-23 17:55:21 +00002056 }
Daniel Veillard1032ac42006-11-23 16:18:30 +00002057 else
2058 cp = ent->name;
2059 len = strlen(cp);
Owen Taylor3473f882001-02-23 17:55:21 +00002060 if (out + 2 + len >= outend)
2061 break;
2062 *out++ = '&';
Daniel Veillard1032ac42006-11-23 16:18:30 +00002063 memcpy(out, cp, len);
Owen Taylor3473f882001-02-23 17:55:21 +00002064 out += len;
2065 *out++ = ';';
2066 }
2067 processed = in;
2068 }
2069 *outlen = out - outstart;
2070 *inlen = processed - instart;
2071 return(0);
2072}
2073
2074/**
2075 * htmlEncodeEntities:
2076 * @out: a pointer to an array of bytes to store the result
2077 * @outlen: the length of @out
2078 * @in: a pointer to an array of UTF-8 chars
2079 * @inlen: the length of @in
2080 * @quoteChar: the quote character to escape (' or ") or zero.
2081 *
2082 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2083 * plus HTML entities block of chars out.
2084 *
2085 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2086 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002087 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +00002088 * The value of @outlen after return is the number of octets consumed.
2089 */
2090int
2091htmlEncodeEntities(unsigned char* out, int *outlen,
2092 const unsigned char* in, int *inlen, int quoteChar) {
2093 const unsigned char* processed = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002094 const unsigned char* outend;
Owen Taylor3473f882001-02-23 17:55:21 +00002095 const unsigned char* outstart = out;
2096 const unsigned char* instart = in;
Daniel Veillardce682bc2004-11-05 17:22:25 +00002097 const unsigned char* inend;
Owen Taylor3473f882001-02-23 17:55:21 +00002098 unsigned int c, d;
2099 int trailing;
2100
Daniel Veillardce682bc2004-11-05 17:22:25 +00002101 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2102 return(-1);
2103 outend = out + (*outlen);
2104 inend = in + (*inlen);
Owen Taylor3473f882001-02-23 17:55:21 +00002105 while (in < inend) {
2106 d = *in++;
2107 if (d < 0x80) { c= d; trailing= 0; }
2108 else if (d < 0xC0) {
2109 /* trailing byte in leading position */
2110 *outlen = out - outstart;
2111 *inlen = processed - instart;
2112 return(-2);
2113 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2114 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2115 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2116 else {
2117 /* no chance for this in Ascii */
2118 *outlen = out - outstart;
2119 *inlen = processed - instart;
2120 return(-2);
2121 }
2122
2123 if (inend - in < trailing)
2124 break;
2125
2126 while (trailing--) {
2127 if (((d= *in++) & 0xC0) != 0x80) {
2128 *outlen = out - outstart;
2129 *inlen = processed - instart;
2130 return(-2);
2131 }
2132 c <<= 6;
2133 c |= d & 0x3F;
2134 }
2135
2136 /* assertion: c is a single UTF-4 value */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002137 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2138 (c != '&') && (c != '<') && (c != '>')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002139 if (out >= outend)
2140 break;
2141 *out++ = c;
2142 } else {
Daniel Veillardbb371292001-08-16 23:26:59 +00002143 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002144 const char *cp;
2145 char nbuf[16];
2146 int len;
2147
2148 /*
2149 * Try to lookup a predefined HTML entity for it
2150 */
2151 ent = htmlEntityValueLookup(c);
2152 if (ent == NULL) {
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002153 snprintf(nbuf, sizeof(nbuf), "#%u", c);
Owen Taylor3473f882001-02-23 17:55:21 +00002154 cp = nbuf;
2155 }
2156 else
2157 cp = ent->name;
2158 len = strlen(cp);
2159 if (out + 2 + len > outend)
2160 break;
2161 *out++ = '&';
2162 memcpy(out, cp, len);
2163 out += len;
2164 *out++ = ';';
2165 }
2166 processed = in;
2167 }
2168 *outlen = out - outstart;
2169 *inlen = processed - instart;
2170 return(0);
2171}
2172
Owen Taylor3473f882001-02-23 17:55:21 +00002173/************************************************************************
2174 * *
2175 * Commodity functions to handle streams *
2176 * *
2177 ************************************************************************/
2178
2179/**
Owen Taylor3473f882001-02-23 17:55:21 +00002180 * htmlNewInputStream:
2181 * @ctxt: an HTML parser context
2182 *
2183 * Create a new input stream structure
2184 * Returns the new input stream or NULL
2185 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002186static htmlParserInputPtr
Owen Taylor3473f882001-02-23 17:55:21 +00002187htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2188 htmlParserInputPtr input;
2189
2190 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2191 if (input == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002192 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002193 return(NULL);
2194 }
2195 memset(input, 0, sizeof(htmlParserInput));
2196 input->filename = NULL;
2197 input->directory = NULL;
2198 input->base = NULL;
2199 input->cur = NULL;
2200 input->buf = NULL;
2201 input->line = 1;
2202 input->col = 1;
2203 input->buf = NULL;
2204 input->free = NULL;
2205 input->version = NULL;
2206 input->consumed = 0;
2207 input->length = 0;
2208 return(input);
2209}
2210
2211
2212/************************************************************************
2213 * *
2214 * Commodity functions, cleanup needed ? *
2215 * *
2216 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is only ever read, so both the strings and the array of
 * pointers are declared const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
Owen Taylor3473f882001-02-23 17:55:21 +00002231
2232/**
2233 * areBlanks:
2234 * @ctxt: an HTML parser context
2235 * @str: a xmlChar *
2236 * @len: the size of @str
2237 *
2238 * Is this a sequence of blank chars that one can ignore ?
2239 *
2240 * Returns 1 if ignorable 0 otherwise.
2241 */
2242
2243static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002244 unsigned int i;
2245 int j;
Owen Taylor3473f882001-02-23 17:55:21 +00002246 xmlNodePtr lastChild;
Daniel Veillard36d73402005-09-01 09:52:30 +00002247 xmlDtdPtr dtd;
Owen Taylor3473f882001-02-23 17:55:21 +00002248
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002249 for (j = 0;j < len;j++)
William M. Brack76e95df2003-10-18 16:20:14 +00002250 if (!(IS_BLANK_CH(str[j]))) return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00002251
2252 if (CUR == 0) return(1);
2253 if (CUR != '<') return(0);
2254 if (ctxt->name == NULL)
2255 return(1);
2256 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2257 return(1);
2258 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2259 return(1);
Daniel Veillard36d73402005-09-01 09:52:30 +00002260
2261 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2262 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2263 dtd = xmlGetIntSubset(ctxt->myDoc);
2264 if (dtd != NULL && dtd->ExternalID != NULL) {
2265 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2266 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2267 return(1);
2268 }
2269 }
2270
Owen Taylor3473f882001-02-23 17:55:21 +00002271 if (ctxt->node == NULL) return(0);
2272 lastChild = xmlGetLastChild(ctxt->node);
Daniel Veillard18a65092004-05-11 15:57:42 +00002273 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2274 lastChild = lastChild->prev;
Owen Taylor3473f882001-02-23 17:55:21 +00002275 if (lastChild == NULL) {
Daniel Veillard7db37732001-07-12 01:20:08 +00002276 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2277 (ctxt->node->content != NULL)) return(0);
Daniel Veillarde77db162009-08-22 11:32:38 +02002278 /* keep ws in constructs like ...<b> </b>...
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002279 for all tags "b" allowing PCDATA */
2280 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2281 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2282 return(0);
2283 }
2284 }
Owen Taylor3473f882001-02-23 17:55:21 +00002285 } else if (xmlNodeIsText(lastChild)) {
2286 return(0);
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002287 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02002288 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
Daniel Veillard8c9872c2002-07-05 18:17:10 +00002289 for all tags "p" allowing PCDATA */
2290 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2291 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2292 return(0);
2293 }
2294 }
Owen Taylor3473f882001-02-23 17:55:21 +00002295 }
2296 return(1);
2297}
2298
2299/**
Owen Taylor3473f882001-02-23 17:55:21 +00002300 * htmlNewDocNoDtD:
2301 * @URI: URI for the dtd, or NULL
2302 * @ExternalID: the external ID of the DTD, or NULL
2303 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002304 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2305 * are NULL
2306 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002307 * Returns a new document, do not initialize the DTD if not provided
Owen Taylor3473f882001-02-23 17:55:21 +00002308 */
2309htmlDocPtr
2310htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2311 xmlDocPtr cur;
2312
2313 /*
2314 * Allocate a new document and fill the fields.
2315 */
2316 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2317 if (cur == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002318 htmlErrMemory(NULL, "HTML document creation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002319 return(NULL);
2320 }
2321 memset(cur, 0, sizeof(xmlDoc));
2322
2323 cur->type = XML_HTML_DOCUMENT_NODE;
2324 cur->version = NULL;
2325 cur->intSubset = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002326 cur->doc = cur;
2327 cur->name = NULL;
Daniel Veillarde77db162009-08-22 11:32:38 +02002328 cur->children = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002329 cur->extSubset = NULL;
2330 cur->oldNs = NULL;
2331 cur->encoding = NULL;
2332 cur->standalone = 1;
2333 cur->compression = 0;
2334 cur->ids = NULL;
2335 cur->refs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00002336 cur->_private = NULL;
Daniel Veillard7cc23572004-07-29 11:20:30 +00002337 cur->charset = XML_CHAR_ENCODING_UTF8;
Daniel Veillardae0765b2008-07-31 19:54:59 +00002338 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
Daniel Veillardb6b0fd82001-10-22 12:31:11 +00002339 if ((ExternalID != NULL) ||
2340 (URI != NULL))
Daniel Veillard40412cd2003-09-03 13:28:32 +00002341 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
Owen Taylor3473f882001-02-23 17:55:21 +00002342 return(cur);
2343}
2344
2345/**
2346 * htmlNewDoc:
2347 * @URI: URI for the dtd, or NULL
2348 * @ExternalID: the external ID of the DTD, or NULL
2349 *
Daniel Veillard5e2dace2001-07-18 19:30:27 +00002350 * Creates a new HTML document
2351 *
Owen Taylor3473f882001-02-23 17:55:21 +00002352 * Returns a new document
2353 */
2354htmlDocPtr
2355htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2356 if ((URI == NULL) && (ExternalID == NULL))
2357 return(htmlNewDocNoDtD(
Daniel Veillard64269352001-05-04 17:52:34 +00002358 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2359 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
Owen Taylor3473f882001-02-23 17:55:21 +00002360
2361 return(htmlNewDocNoDtD(URI, ExternalID));
2362}
2363
2364
2365/************************************************************************
2366 * *
2367 * The parser itself *
2368 * Relates to http://www.w3.org/TR/html40 *
2369 * *
2370 ************************************************************************/
2371
2372/************************************************************************
2373 * *
2374 * The parser itself *
2375 * *
2376 ************************************************************************/
2377
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002378static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002379
Owen Taylor3473f882001-02-23 17:55:21 +00002380/**
2381 * htmlParseHTMLName:
2382 * @ctxt: an HTML parser context
2383 *
2384 * parse an HTML tag or attribute name, note that we convert it to lowercase
2385 * since HTML names are not case-sensitive.
2386 *
2387 * Returns the Tag Name parsed or NULL
2388 */
2389
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002390static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002391htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
Owen Taylor3473f882001-02-23 17:55:21 +00002392 int i = 0;
2393 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2394
William M. Brackd1757ab2004-10-02 22:07:48 +00002395 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
Daniel Veillard7459c592009-08-13 10:10:29 +02002396 (CUR != ':') && (CUR != '.')) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002397
2398 while ((i < HTML_PARSER_BUFFER_SIZE) &&
William M. Brackd1757ab2004-10-02 22:07:48 +00002399 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
Daniel Veillard7459c592009-08-13 10:10:29 +02002400 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2401 (CUR == '.'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002402 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2403 else loc[i] = CUR;
2404 i++;
Daniel Veillarde77db162009-08-22 11:32:38 +02002405
Owen Taylor3473f882001-02-23 17:55:21 +00002406 NEXT;
2407 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002408
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002409 return(xmlDictLookup(ctxt->dict, loc, i));
Owen Taylor3473f882001-02-23 17:55:21 +00002410}
2411
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002412
2413/**
2414 * htmlParseHTMLName_nonInvasive:
2415 * @ctxt: an HTML parser context
2416 *
2417 * parse an HTML tag or attribute name, note that we convert it to lowercase
2418 * since HTML names are not case-sensitive, this doesn't consume the data
2419 * from the stream, it's a look-ahead
2420 *
2421 * Returns the Tag Name parsed or NULL
2422 */
2423
2424static const xmlChar *
2425htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2426 int i = 0;
2427 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2428
2429 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2430 (NXT(1) != ':')) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02002431
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002432 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2433 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2434 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2435 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2436 else loc[i] = NXT(1+i);
2437 i++;
2438 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002439
Daniel Veillard890fd9f2006-10-27 12:53:28 +00002440 return(xmlDictLookup(ctxt->dict, loc, i));
2441}
2442
2443
Owen Taylor3473f882001-02-23 17:55:21 +00002444/**
2445 * htmlParseName:
2446 * @ctxt: an HTML parser context
2447 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002448 * parse an HTML name, this routine is case sensitive.
Owen Taylor3473f882001-02-23 17:55:21 +00002449 *
2450 * Returns the Name parsed or NULL
2451 */
2452
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002453static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002454htmlParseName(htmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002455 const xmlChar *in;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002456 const xmlChar *ret;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002457 int count = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002458
2459 GROW;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002460
2461 /*
2462 * Accelerator for simple ASCII names
2463 */
2464 in = ctxt->input->cur;
2465 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2466 ((*in >= 0x41) && (*in <= 0x5A)) ||
2467 (*in == '_') || (*in == ':')) {
2468 in++;
2469 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2470 ((*in >= 0x41) && (*in <= 0x5A)) ||
2471 ((*in >= 0x30) && (*in <= 0x39)) ||
2472 (*in == '_') || (*in == '-') ||
2473 (*in == ':') || (*in == '.'))
2474 in++;
Xin Li28c53d32017-03-07 00:33:02 +00002475
2476 if (in == ctxt->input->end)
2477 return(NULL);
2478
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002479 if ((*in > 0) && (*in < 0x80)) {
2480 count = in - ctxt->input->cur;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002481 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002482 ctxt->input->cur = in;
Daniel Veillard77a90a72003-03-22 00:04:05 +00002483 ctxt->nbChars += count;
2484 ctxt->input->col += count;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002485 return(ret);
2486 }
2487 }
2488 return(htmlParseNameComplex(ctxt));
2489}
2490
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002491static const xmlChar *
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002492htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002493 int len = 0, l;
2494 int c;
2495 int count = 0;
Xin Li28c53d32017-03-07 00:33:02 +00002496 const xmlChar *base = ctxt->input->base;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002497
2498 /*
2499 * Handler for more complex cases
2500 */
2501 GROW;
2502 c = CUR_CHAR(l);
2503 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2504 (!IS_LETTER(c) && (c != '_') &&
2505 (c != ':'))) {
Owen Taylor3473f882001-02-23 17:55:21 +00002506 return(NULL);
2507 }
2508
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002509 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2510 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2511 (c == '.') || (c == '-') ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002512 (c == '_') || (c == ':') ||
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002513 (IS_COMBINING(c)) ||
2514 (IS_EXTENDER(c)))) {
2515 if (count++ > 100) {
2516 count = 0;
2517 GROW;
2518 }
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002519 len += l;
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002520 NEXTL(l);
2521 c = CUR_CHAR(l);
Xin Li28c53d32017-03-07 00:33:02 +00002522 if (ctxt->input->base != base) {
2523 /*
2524 * We changed encoding from an unknown encoding
2525 * Input buffer changed location, so we better start again
2526 */
2527 return(htmlParseNameComplex(ctxt));
2528 }
Owen Taylor3473f882001-02-23 17:55:21 +00002529 }
Xin Li28c53d32017-03-07 00:33:02 +00002530
2531 if (ctxt->input->base > ctxt->input->cur - len)
2532 return(NULL);
2533
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002534 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
Owen Taylor3473f882001-02-23 17:55:21 +00002535}
2536
Daniel Veillarde55e8e42003-01-10 12:50:02 +00002537
Owen Taylor3473f882001-02-23 17:55:21 +00002538/**
2539 * htmlParseHTMLAttribute:
2540 * @ctxt: an HTML parser context
2541 * @stop: a char stop value
Daniel Veillarde77db162009-08-22 11:32:38 +02002542 *
Owen Taylor3473f882001-02-23 17:55:21 +00002543 * parse an HTML attribute value till the stop (quote), if
2544 * stop is 0 then it stops at the first space
2545 *
2546 * Returns the attribute parsed or NULL
2547 */
2548
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002549static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002550htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2551 xmlChar *buffer = NULL;
2552 int buffer_size = 0;
2553 xmlChar *out = NULL;
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002554 const xmlChar *name = NULL;
2555 const xmlChar *cur = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00002556 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00002557
2558 /*
2559 * allocate a translation buffer.
2560 */
2561 buffer_size = HTML_PARSER_BUFFER_SIZE;
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002562 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00002563 if (buffer == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002564 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002565 return(NULL);
2566 }
2567 out = buffer;
2568
2569 /*
2570 * Ok loop until we reach one of the ending chars
2571 */
Daniel Veillard957fdcf2001-11-06 22:50:19 +00002572 while ((CUR != 0) && (CUR != stop)) {
2573 if ((stop == 0) && (CUR == '>')) break;
William M. Brack76e95df2003-10-18 16:20:14 +00002574 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
Owen Taylor3473f882001-02-23 17:55:21 +00002575 if (CUR == '&') {
2576 if (NXT(1) == '#') {
2577 unsigned int c;
2578 int bits;
2579
2580 c = htmlParseCharRef(ctxt);
2581 if (c < 0x80)
2582 { *out++ = c; bits= -6; }
2583 else if (c < 0x800)
2584 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2585 else if (c < 0x10000)
2586 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002587 else
Owen Taylor3473f882001-02-23 17:55:21 +00002588 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002589
Owen Taylor3473f882001-02-23 17:55:21 +00002590 for ( ; bits >= 0; bits-= 6) {
2591 *out++ = ((c >> bits) & 0x3F) | 0x80;
2592 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002593
Daniel Veillardce02dbc2002-10-22 19:14:58 +00002594 if (out - buffer > buffer_size - 100) {
2595 int indx = out - buffer;
2596
2597 growBuffer(buffer);
2598 out = &buffer[indx];
2599 }
Owen Taylor3473f882001-02-23 17:55:21 +00002600 } else {
2601 ent = htmlParseEntityRef(ctxt, &name);
2602 if (name == NULL) {
2603 *out++ = '&';
2604 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002605 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002606
2607 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002608 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002609 }
2610 } else if (ent == NULL) {
2611 *out++ = '&';
2612 cur = name;
2613 while (*cur != 0) {
2614 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002615 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002616
2617 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002618 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002619 }
2620 *out++ = *cur++;
2621 }
Owen Taylor3473f882001-02-23 17:55:21 +00002622 } else {
2623 unsigned int c;
2624 int bits;
2625
2626 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002627 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002628
2629 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002630 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002631 }
Daniel Veillard48519092006-10-17 15:56:35 +00002632 c = ent->value;
Owen Taylor3473f882001-02-23 17:55:21 +00002633 if (c < 0x80)
2634 { *out++ = c; bits= -6; }
2635 else if (c < 0x800)
2636 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2637 else if (c < 0x10000)
2638 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002639 else
Owen Taylor3473f882001-02-23 17:55:21 +00002640 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002641
Owen Taylor3473f882001-02-23 17:55:21 +00002642 for ( ; bits >= 0; bits-= 6) {
2643 *out++ = ((c >> bits) & 0x3F) | 0x80;
2644 }
Owen Taylor3473f882001-02-23 17:55:21 +00002645 }
2646 }
2647 } else {
2648 unsigned int c;
2649 int bits, l;
2650
2651 if (out - buffer > buffer_size - 100) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002652 int indx = out - buffer;
Owen Taylor3473f882001-02-23 17:55:21 +00002653
2654 growBuffer(buffer);
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002655 out = &buffer[indx];
Owen Taylor3473f882001-02-23 17:55:21 +00002656 }
2657 c = CUR_CHAR(l);
2658 if (c < 0x80)
2659 { *out++ = c; bits= -6; }
2660 else if (c < 0x800)
2661 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2662 else if (c < 0x10000)
2663 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002664 else
Owen Taylor3473f882001-02-23 17:55:21 +00002665 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02002666
Owen Taylor3473f882001-02-23 17:55:21 +00002667 for ( ; bits >= 0; bits-= 6) {
2668 *out++ = ((c >> bits) & 0x3F) | 0x80;
2669 }
2670 NEXT;
2671 }
2672 }
Daniel Veillard13cee4e2009-09-05 14:52:55 +02002673 *out = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002674 return(buffer);
2675}
2676
2677/**
Owen Taylor3473f882001-02-23 17:55:21 +00002678 * htmlParseEntityRef:
2679 * @ctxt: an HTML parser context
2680 * @str: location to store the entity name
2681 *
2682 * parse an HTML ENTITY references
2683 *
2684 * [68] EntityRef ::= '&' Name ';'
2685 *
2686 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2687 * if non-NULL *str will have to be freed by the caller.
2688 */
Daniel Veillardbb371292001-08-16 23:26:59 +00002689const htmlEntityDesc *
Daniel Veillard2fdbd322003-08-18 12:15:38 +00002690htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2691 const xmlChar *name;
Daniel Veillardbb371292001-08-16 23:26:59 +00002692 const htmlEntityDesc * ent = NULL;
Daniel Veillard42595322004-11-08 10:52:06 +00002693
2694 if (str != NULL) *str = NULL;
2695 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002696
2697 if (CUR == '&') {
2698 NEXT;
2699 name = htmlParseName(ctxt);
2700 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002701 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2702 "htmlParseEntityRef: no name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002703 } else {
2704 GROW;
2705 if (CUR == ';') {
Daniel Veillard42595322004-11-08 10:52:06 +00002706 if (str != NULL)
2707 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002708
2709 /*
2710 * Lookup the entity in the table.
2711 */
2712 ent = htmlEntityLookup(name);
2713 if (ent != NULL) /* OK that's ugly !!! */
2714 NEXT;
2715 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002716 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2717 "htmlParseEntityRef: expecting ';'\n",
2718 NULL, NULL);
Daniel Veillard42595322004-11-08 10:52:06 +00002719 if (str != NULL)
2720 *str = name;
Owen Taylor3473f882001-02-23 17:55:21 +00002721 }
2722 }
2723 }
2724 return(ent);
2725}
2726
2727/**
2728 * htmlParseAttValue:
2729 * @ctxt: an HTML parser context
2730 *
2731 * parse a value for an attribute
2732 * Note: the parser won't do substitution of entities here, this
2733 * will be handled later in xmlStringGetNodeList, unless it was
Daniel Veillarde77db162009-08-22 11:32:38 +02002734 * asked for ctxt->replaceEntities != 0
Owen Taylor3473f882001-02-23 17:55:21 +00002735 *
2736 * Returns the AttValue parsed or NULL.
2737 */
2738
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002739static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002740htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2741 xmlChar *ret = NULL;
2742
2743 if (CUR == '"') {
2744 NEXT;
2745 ret = htmlParseHTMLAttribute(ctxt, '"');
2746 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002747 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2748 "AttValue: \" expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002749 } else
2750 NEXT;
2751 } else if (CUR == '\'') {
2752 NEXT;
2753 ret = htmlParseHTMLAttribute(ctxt, '\'');
2754 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002755 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2756 "AttValue: ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002757 } else
2758 NEXT;
2759 } else {
2760 /*
2761 * That's an HTMLism, the attribute value may not be quoted
2762 */
2763 ret = htmlParseHTMLAttribute(ctxt, 0);
2764 if (ret == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002765 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2766 "AttValue: no value found\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002767 }
2768 }
2769 return(ret);
2770}
2771
2772/**
2773 * htmlParseSystemLiteral:
2774 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02002775 *
Owen Taylor3473f882001-02-23 17:55:21 +00002776 * parse an HTML Literal
2777 *
2778 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2779 *
2780 * Returns the SystemLiteral parsed or NULL
2781 */
2782
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002783static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002784htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
Xin Li28c53d32017-03-07 00:33:02 +00002785 size_t len = 0, startPosition = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002786 xmlChar *ret = NULL;
2787
2788 if (CUR == '"') {
2789 NEXT;
Xin Li28c53d32017-03-07 00:33:02 +00002790
2791 if (CUR_PTR < BASE_PTR)
2792 return(ret);
2793 startPosition = CUR_PTR - BASE_PTR;
2794
2795 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002796 NEXT;
Xin Li28c53d32017-03-07 00:33:02 +00002797 len++;
2798 }
William M. Brack76e95df2003-10-18 16:20:14 +00002799 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002800 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2801 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002802 } else {
Xin Li28c53d32017-03-07 00:33:02 +00002803 ret = xmlStrndup((BASE_PTR+startPosition), len);
Owen Taylor3473f882001-02-23 17:55:21 +00002804 NEXT;
2805 }
2806 } else if (CUR == '\'') {
2807 NEXT;
Xin Li28c53d32017-03-07 00:33:02 +00002808
2809 if (CUR_PTR < BASE_PTR)
2810 return(ret);
2811 startPosition = CUR_PTR - BASE_PTR;
2812
2813 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
Owen Taylor3473f882001-02-23 17:55:21 +00002814 NEXT;
Xin Li28c53d32017-03-07 00:33:02 +00002815 len++;
2816 }
William M. Brack76e95df2003-10-18 16:20:14 +00002817 if (!IS_CHAR_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00002818 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2819 "Unfinished SystemLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002820 } else {
Xin Li28c53d32017-03-07 00:33:02 +00002821 ret = xmlStrndup((BASE_PTR+startPosition), len);
Owen Taylor3473f882001-02-23 17:55:21 +00002822 NEXT;
2823 }
2824 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002825 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2826 " or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002827 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002828
Owen Taylor3473f882001-02-23 17:55:21 +00002829 return(ret);
2830}
2831
2832/**
2833 * htmlParsePubidLiteral:
2834 * @ctxt: an HTML parser context
2835 *
2836 * parse an HTML public literal
2837 *
2838 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2839 *
2840 * Returns the PubidLiteral parsed or NULL.
2841 */
2842
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002843static xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00002844htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
Xin Li28c53d32017-03-07 00:33:02 +00002845 size_t len = 0, startPosition = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002846 xmlChar *ret = NULL;
2847 /*
2848 * Name ::= (Letter | '_') (NameChar)*
2849 */
2850 if (CUR == '"') {
2851 NEXT;
Xin Li28c53d32017-03-07 00:33:02 +00002852
2853 if (CUR_PTR < BASE_PTR)
2854 return(ret);
2855 startPosition = CUR_PTR - BASE_PTR;
2856
2857 while (IS_PUBIDCHAR_CH(CUR)) {
2858 len++;
2859 NEXT;
2860 }
2861
Owen Taylor3473f882001-02-23 17:55:21 +00002862 if (CUR != '"') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002863 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2864 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002865 } else {
Xin Li28c53d32017-03-07 00:33:02 +00002866 ret = xmlStrndup((BASE_PTR + startPosition), len);
Owen Taylor3473f882001-02-23 17:55:21 +00002867 NEXT;
2868 }
2869 } else if (CUR == '\'') {
2870 NEXT;
Xin Li28c53d32017-03-07 00:33:02 +00002871
2872 if (CUR_PTR < BASE_PTR)
2873 return(ret);
2874 startPosition = CUR_PTR - BASE_PTR;
2875
2876 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2877 len++;
2878 NEXT;
2879 }
2880
Daniel Veillard6560a422003-03-27 21:25:38 +00002881 if (CUR != '\'') {
Daniel Veillardf403d292003-10-05 13:51:35 +00002882 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2883 "Unfinished PubidLiteral\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002884 } else {
Xin Li28c53d32017-03-07 00:33:02 +00002885 ret = xmlStrndup((BASE_PTR + startPosition), len);
Owen Taylor3473f882001-02-23 17:55:21 +00002886 NEXT;
2887 }
2888 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00002889 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2890 "PubidLiteral \" or ' expected\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00002891 }
Daniel Veillarde77db162009-08-22 11:32:38 +02002892
Owen Taylor3473f882001-02-23 17:55:21 +00002893 return(ret);
2894}
2895
2896/**
2897 * htmlParseScript:
2898 * @ctxt: an HTML parser context
2899 *
2900 * parse the content of an HTML SCRIPT or STYLE element
2901 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2902 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2903 * http://www.w3.org/TR/html4/types.html#type-script
2904 * http://www.w3.org/TR/html4/types.html#h-6.15
2905 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2906 *
2907 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2908 * element and the value of intrinsic event attributes. User agents must
2909 * not evaluate script data as HTML markup but instead must pass it on as
2910 * data to a script engine.
2911 * NOTES:
2912 * - The content is passed like CDATA
2913 * - the attributes for style and scripting "onXXX" are also described
2914 * as CDATA but SGML allows entities references in attributes so their
2915 * processing is identical as other attributes
2916 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00002917static void
Owen Taylor3473f882001-02-23 17:55:21 +00002918htmlParseScript(htmlParserCtxtPtr ctxt) {
Daniel Veillard7d2b3232005-07-14 08:57:39 +00002919 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
Owen Taylor3473f882001-02-23 17:55:21 +00002920 int nbchar = 0;
Daniel Veillard358fef42005-07-13 16:37:38 +00002921 int cur,l;
Owen Taylor3473f882001-02-23 17:55:21 +00002922
2923 SHRINK;
Daniel Veillard358fef42005-07-13 16:37:38 +00002924 cur = CUR_CHAR(l);
William M. Brack76e95df2003-10-18 16:20:14 +00002925 while (IS_CHAR_CH(cur)) {
Daniel Veillard42720242007-04-16 07:02:31 +00002926 if ((cur == '<') && (NXT(1) == '/')) {
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002927 /*
2928 * One should break here, the specification is clear:
2929 * Authors should therefore escape "</" within the content.
2930 * Escape mechanisms are specific to each scripting or
2931 * style sheet language.
2932 *
2933 * In recovery mode, only break if end tag match the
2934 * current tag, effectively ignoring all tags inside the
2935 * script/style block and treating the entire block as
2936 * CDATA.
2937 */
2938 if (ctxt->recovery) {
Daniel Veillarde77db162009-08-22 11:32:38 +02002939 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2940 xmlStrlen(ctxt->name)) == 0)
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002941 {
2942 break; /* while */
2943 } else {
2944 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
Daniel Veillard2cf36a12005-10-25 12:21:29 +00002945 "Element %s embeds close tag\n",
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002946 ctxt->name, NULL);
2947 }
2948 } else {
2949 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
Daniel Veillarde77db162009-08-22 11:32:38 +02002950 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002951 {
2952 break; /* while */
2953 }
2954 }
Owen Taylor3473f882001-02-23 17:55:21 +00002955 }
Daniel Veillard358fef42005-07-13 16:37:38 +00002956 COPY_BUF(l,buf,nbchar,cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002957 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2958 if (ctxt->sax->cdataBlock!= NULL) {
2959 /*
2960 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2961 */
2962 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002963 } else if (ctxt->sax->characters != NULL) {
2964 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002965 }
2966 nbchar = 0;
2967 }
Daniel Veillardb9900082005-10-25 12:36:29 +00002968 GROW;
Daniel Veillard358fef42005-07-13 16:37:38 +00002969 NEXTL(l);
2970 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00002971 }
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00002972
Daniel Veillard68716a72006-10-16 09:32:17 +00002973 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
Pierre Belziled4b54472010-11-04 10:18:17 +01002974 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2975 "Invalid char in CDATA 0x%X\n", cur);
2976 if (ctxt->input->cur < ctxt->input->end) {
2977 NEXT;
2978 }
Owen Taylor3473f882001-02-23 17:55:21 +00002979 }
2980
2981 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2982 if (ctxt->sax->cdataBlock!= NULL) {
2983 /*
2984 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2985 */
2986 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
Daniel Veillardd9d32ae2003-07-05 20:32:43 +00002987 } else if (ctxt->sax->characters != NULL) {
2988 ctxt->sax->characters(ctxt->userData, buf, nbchar);
Owen Taylor3473f882001-02-23 17:55:21 +00002989 }
2990 }
2991}
2992
2993
2994/**
Xin Li28c53d32017-03-07 00:33:02 +00002995 * htmlParseCharDataInternal:
Owen Taylor3473f882001-02-23 17:55:21 +00002996 * @ctxt: an HTML parser context
Xin Li28c53d32017-03-07 00:33:02 +00002997 * @readahead: optional read ahead character in ascii range
Owen Taylor3473f882001-02-23 17:55:21 +00002998 *
2999 * parse a CharData section.
3000 * if we are within a CDATA section ']]>' marks an end of section.
3001 *
3002 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3003 */
3004
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003005static void
Xin Li28c53d32017-03-07 00:33:02 +00003006htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3007 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
Owen Taylor3473f882001-02-23 17:55:21 +00003008 int nbchar = 0;
3009 int cur, l;
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00003010 int chunk = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003011
Xin Li28c53d32017-03-07 00:33:02 +00003012 if (readahead)
3013 buf[nbchar++] = readahead;
3014
Owen Taylor3473f882001-02-23 17:55:21 +00003015 SHRINK;
3016 cur = CUR_CHAR(l);
3017 while (((cur != '<') || (ctxt->token == '<')) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003018 ((cur != '&') || (ctxt->token == '&')) &&
Daniel Veillardc5b43cc2008-01-11 07:41:39 +00003019 (cur != 0)) {
3020 if (!(IS_CHAR(cur))) {
3021 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3022 "Invalid char in CDATA 0x%X\n", cur);
3023 } else {
3024 COPY_BUF(l,buf,nbchar,cur);
3025 }
Owen Taylor3473f882001-02-23 17:55:21 +00003026 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3027 /*
3028 * Ok the segment is to be consumed as chars.
3029 */
3030 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3031 if (areBlanks(ctxt, buf, nbchar)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08003032 if (ctxt->keepBlanks) {
3033 if (ctxt->sax->characters != NULL)
3034 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3035 } else {
3036 if (ctxt->sax->ignorableWhitespace != NULL)
3037 ctxt->sax->ignorableWhitespace(ctxt->userData,
3038 buf, nbchar);
3039 }
Owen Taylor3473f882001-02-23 17:55:21 +00003040 } else {
3041 htmlCheckParagraph(ctxt);
3042 if (ctxt->sax->characters != NULL)
3043 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3044 }
3045 }
3046 nbchar = 0;
3047 }
3048 NEXTL(l);
Daniel Veillarda57ba4c2008-09-25 16:06:18 +00003049 chunk++;
3050 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3051 chunk = 0;
3052 SHRINK;
3053 GROW;
3054 }
Owen Taylor3473f882001-02-23 17:55:21 +00003055 cur = CUR_CHAR(l);
Daniel Veillard358a9892003-02-04 15:22:32 +00003056 if (cur == 0) {
3057 SHRINK;
3058 GROW;
3059 cur = CUR_CHAR(l);
3060 }
Owen Taylor3473f882001-02-23 17:55:21 +00003061 }
3062 if (nbchar != 0) {
Daniel Veillardd2755a82005-08-07 23:42:39 +00003063 buf[nbchar] = 0;
3064
Owen Taylor3473f882001-02-23 17:55:21 +00003065 /*
3066 * Ok the segment is to be consumed as chars.
3067 */
3068 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3069 if (areBlanks(ctxt, buf, nbchar)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08003070 if (ctxt->keepBlanks) {
3071 if (ctxt->sax->characters != NULL)
3072 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3073 } else {
3074 if (ctxt->sax->ignorableWhitespace != NULL)
3075 ctxt->sax->ignorableWhitespace(ctxt->userData,
3076 buf, nbchar);
3077 }
Owen Taylor3473f882001-02-23 17:55:21 +00003078 } else {
3079 htmlCheckParagraph(ctxt);
3080 if (ctxt->sax->characters != NULL)
3081 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3082 }
3083 }
Daniel Veillard7cc95c02001-10-17 15:45:12 +00003084 } else {
3085 /*
3086 * Loop detection
3087 */
3088 if (cur == 0)
3089 ctxt->instate = XML_PARSER_EOF;
Owen Taylor3473f882001-02-23 17:55:21 +00003090 }
3091}
3092
3093/**
Xin Li28c53d32017-03-07 00:33:02 +00003094 * htmlParseCharData:
3095 * @ctxt: an HTML parser context
3096 *
3097 * parse a CharData section.
3098 * if we are within a CDATA section ']]>' marks an end of section.
3099 *
3100 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3101 */
3102
3103static void
3104htmlParseCharData(htmlParserCtxtPtr ctxt) {
3105 htmlParseCharDataInternal(ctxt, 0);
3106}
3107
3108/**
Owen Taylor3473f882001-02-23 17:55:21 +00003109 * htmlParseExternalID:
3110 * @ctxt: an HTML parser context
3111 * @publicID: a xmlChar** receiving PubidLiteral
Owen Taylor3473f882001-02-23 17:55:21 +00003112 *
3113 * Parse an External ID or a Public ID
3114 *
Owen Taylor3473f882001-02-23 17:55:21 +00003115 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3116 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3117 *
3118 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3119 *
3120 * Returns the function returns SystemLiteral and in the second
3121 * case publicID receives PubidLiteral, is strict is off
3122 * it is possible to return NULL and have publicID set.
3123 */
3124
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003125static xmlChar *
3126htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
Owen Taylor3473f882001-02-23 17:55:21 +00003127 xmlChar *URI = NULL;
3128
3129 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3130 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3131 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3132 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003133 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003134 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3135 "Space required after 'SYSTEM'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003136 }
3137 SKIP_BLANKS;
3138 URI = htmlParseSystemLiteral(ctxt);
3139 if (URI == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003140 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3141 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003142 }
3143 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3144 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3145 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3146 SKIP(6);
William M. Brack76e95df2003-10-18 16:20:14 +00003147 if (!IS_BLANK_CH(CUR)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003148 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3149 "Space required after 'PUBLIC'\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003150 }
3151 SKIP_BLANKS;
3152 *publicID = htmlParsePubidLiteral(ctxt);
3153 if (*publicID == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003154 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3155 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3156 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003157 }
3158 SKIP_BLANKS;
3159 if ((CUR == '"') || (CUR == '\'')) {
3160 URI = htmlParseSystemLiteral(ctxt);
3161 }
3162 }
3163 return(URI);
3164}
3165
3166/**
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003167 * xmlParsePI:
3168 * @ctxt: an XML parser context
3169 *
3170 * parse an XML Processing Instruction.
3171 *
3172 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3173 */
3174static void
3175htmlParsePI(htmlParserCtxtPtr ctxt) {
3176 xmlChar *buf = NULL;
3177 int len = 0;
3178 int size = HTML_PARSER_BUFFER_SIZE;
3179 int cur, l;
3180 const xmlChar *target;
3181 xmlParserInputState state;
3182 int count = 0;
3183
3184 if ((RAW == '<') && (NXT(1) == '?')) {
3185 state = ctxt->instate;
3186 ctxt->instate = XML_PARSER_PI;
3187 /*
3188 * this is a Processing Instruction.
3189 */
3190 SKIP(2);
3191 SHRINK;
3192
3193 /*
3194 * Parse the target name and check for special support like
3195 * namespace.
3196 */
3197 target = htmlParseName(ctxt);
3198 if (target != NULL) {
3199 if (RAW == '>') {
3200 SKIP(1);
3201
3202 /*
3203 * SAX: PI detected.
3204 */
3205 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3206 (ctxt->sax->processingInstruction != NULL))
3207 ctxt->sax->processingInstruction(ctxt->userData,
3208 target, NULL);
3209 ctxt->instate = state;
3210 return;
3211 }
3212 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3213 if (buf == NULL) {
3214 htmlErrMemory(ctxt, NULL);
3215 ctxt->instate = state;
3216 return;
3217 }
3218 cur = CUR;
3219 if (!IS_BLANK(cur)) {
3220 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3221 "ParsePI: PI %s space expected\n", target, NULL);
3222 }
3223 SKIP_BLANKS;
3224 cur = CUR_CHAR(l);
3225 while (IS_CHAR(cur) && (cur != '>')) {
3226 if (len + 5 >= size) {
3227 xmlChar *tmp;
3228
3229 size *= 2;
3230 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3231 if (tmp == NULL) {
3232 htmlErrMemory(ctxt, NULL);
3233 xmlFree(buf);
3234 ctxt->instate = state;
3235 return;
3236 }
3237 buf = tmp;
3238 }
3239 count++;
3240 if (count > 50) {
3241 GROW;
3242 count = 0;
3243 }
3244 COPY_BUF(l,buf,len,cur);
3245 NEXTL(l);
3246 cur = CUR_CHAR(l);
3247 if (cur == 0) {
3248 SHRINK;
3249 GROW;
3250 cur = CUR_CHAR(l);
3251 }
3252 }
3253 buf[len] = 0;
3254 if (cur != '>') {
3255 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3256 "ParsePI: PI %s never end ...\n", target, NULL);
3257 } else {
3258 SKIP(1);
3259
3260 /*
3261 * SAX: PI detected.
3262 */
3263 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3264 (ctxt->sax->processingInstruction != NULL))
3265 ctxt->sax->processingInstruction(ctxt->userData,
3266 target, buf);
3267 }
3268 xmlFree(buf);
3269 } else {
Daniel Veillarde77db162009-08-22 11:32:38 +02003270 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
Daniel Veillardfc484dd2004-10-22 14:34:23 +00003271 "PI is not started correctly", NULL, NULL);
3272 }
3273 ctxt->instate = state;
3274 }
3275}
3276
3277/**
Owen Taylor3473f882001-02-23 17:55:21 +00003278 * htmlParseComment:
3279 * @ctxt: an HTML parser context
3280 *
3281 * Parse an XML (SGML) comment <!-- .... -->
3282 *
3283 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3284 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003285static void
Owen Taylor3473f882001-02-23 17:55:21 +00003286htmlParseComment(htmlParserCtxtPtr ctxt) {
3287 xmlChar *buf = NULL;
3288 int len;
3289 int size = HTML_PARSER_BUFFER_SIZE;
3290 int q, ql;
3291 int r, rl;
3292 int cur, l;
3293 xmlParserInputState state;
3294
3295 /*
3296 * Check that there is a comment right here.
3297 */
3298 if ((RAW != '<') || (NXT(1) != '!') ||
3299 (NXT(2) != '-') || (NXT(3) != '-')) return;
3300
3301 state = ctxt->instate;
3302 ctxt->instate = XML_PARSER_COMMENT;
3303 SHRINK;
3304 SKIP(4);
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003305 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
Owen Taylor3473f882001-02-23 17:55:21 +00003306 if (buf == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003307 htmlErrMemory(ctxt, "buffer allocation failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003308 ctxt->instate = state;
3309 return;
3310 }
Xin Li28c53d32017-03-07 00:33:02 +00003311 len = 0;
3312 buf[len] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003313 q = CUR_CHAR(ql);
Xin Li28c53d32017-03-07 00:33:02 +00003314 if (!IS_CHAR(q))
3315 goto unfinished;
Owen Taylor3473f882001-02-23 17:55:21 +00003316 NEXTL(ql);
3317 r = CUR_CHAR(rl);
Xin Li28c53d32017-03-07 00:33:02 +00003318 if (!IS_CHAR(r))
3319 goto unfinished;
Owen Taylor3473f882001-02-23 17:55:21 +00003320 NEXTL(rl);
3321 cur = CUR_CHAR(l);
Owen Taylor3473f882001-02-23 17:55:21 +00003322 while (IS_CHAR(cur) &&
3323 ((cur != '>') ||
3324 (r != '-') || (q != '-'))) {
3325 if (len + 5 >= size) {
Daniel Veillard079f6a72004-09-23 13:15:03 +00003326 xmlChar *tmp;
3327
Owen Taylor3473f882001-02-23 17:55:21 +00003328 size *= 2;
Daniel Veillard079f6a72004-09-23 13:15:03 +00003329 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3330 if (tmp == NULL) {
3331 xmlFree(buf);
Daniel Veillardf403d292003-10-05 13:51:35 +00003332 htmlErrMemory(ctxt, "growing buffer failed\n");
Owen Taylor3473f882001-02-23 17:55:21 +00003333 ctxt->instate = state;
3334 return;
3335 }
Daniel Veillard079f6a72004-09-23 13:15:03 +00003336 buf = tmp;
Owen Taylor3473f882001-02-23 17:55:21 +00003337 }
3338 COPY_BUF(ql,buf,len,q);
3339 q = r;
3340 ql = rl;
3341 r = cur;
3342 rl = l;
3343 NEXTL(l);
3344 cur = CUR_CHAR(l);
3345 if (cur == 0) {
3346 SHRINK;
3347 GROW;
3348 cur = CUR_CHAR(l);
3349 }
3350 }
3351 buf[len] = 0;
Xin Li28c53d32017-03-07 00:33:02 +00003352 if (IS_CHAR(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00003353 NEXT;
3354 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3355 (!ctxt->disableSAX))
3356 ctxt->sax->comment(ctxt->userData, buf);
3357 xmlFree(buf);
Xin Li28c53d32017-03-07 00:33:02 +00003358 ctxt->instate = state;
3359 return;
Owen Taylor3473f882001-02-23 17:55:21 +00003360 }
Xin Li28c53d32017-03-07 00:33:02 +00003361
3362unfinished:
3363 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3364 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3365 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00003366}
3367
3368/**
3369 * htmlParseCharRef:
3370 * @ctxt: an HTML parser context
3371 *
3372 * parse Reference declarations
3373 *
3374 * [66] CharRef ::= '&#' [0-9]+ ';' |
3375 * '&#x' [0-9a-fA-F]+ ';'
3376 *
3377 * Returns the value parsed (as an int)
3378 */
3379int
3380htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3381 int val = 0;
3382
Daniel Veillarda03e3652004-11-02 18:45:30 +00003383 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3384 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3385 "htmlParseCharRef: context error\n",
3386 NULL, NULL);
3387 return(0);
3388 }
Owen Taylor3473f882001-02-23 17:55:21 +00003389 if ((CUR == '&') && (NXT(1) == '#') &&
Daniel Veillardc59d8262003-11-20 21:59:12 +00003390 ((NXT(2) == 'x') || NXT(2) == 'X')) {
Owen Taylor3473f882001-02-23 17:55:21 +00003391 SKIP(3);
3392 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003393 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003394 val = val * 16 + (CUR - '0');
3395 else if ((CUR >= 'a') && (CUR <= 'f'))
3396 val = val * 16 + (CUR - 'a') + 10;
3397 else if ((CUR >= 'A') && (CUR <= 'F'))
3398 val = val * 16 + (CUR - 'A') + 10;
3399 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003400 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003401 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003402 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003403 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003404 }
3405 NEXT;
3406 }
3407 if (CUR == ';')
3408 NEXT;
3409 } else if ((CUR == '&') && (NXT(1) == '#')) {
3410 SKIP(2);
3411 while (CUR != ';') {
Daniel Veillarde77db162009-08-22 11:32:38 +02003412 if ((CUR >= '0') && (CUR <= '9'))
Owen Taylor3473f882001-02-23 17:55:21 +00003413 val = val * 10 + (CUR - '0');
3414 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003415 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
Michael Dayaf58ee12010-08-02 13:43:28 +02003416 "htmlParseCharRef: missing semicolon\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00003417 NULL, NULL);
Daniel Veillard36de63e2008-04-03 09:05:05 +00003418 break;
Owen Taylor3473f882001-02-23 17:55:21 +00003419 }
3420 NEXT;
3421 }
3422 if (CUR == ';')
3423 NEXT;
3424 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003425 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3426 "htmlParseCharRef: invalid value\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003427 }
3428 /*
3429 * Check the value IS_CHAR ...
3430 */
3431 if (IS_CHAR(val)) {
3432 return(val);
3433 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003434 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3435 "htmlParseCharRef: invalid xmlChar value %d\n",
3436 val);
Owen Taylor3473f882001-02-23 17:55:21 +00003437 }
3438 return(0);
3439}
3440
3441
3442/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003443 * htmlParseDocTypeDecl:
Owen Taylor3473f882001-02-23 17:55:21 +00003444 * @ctxt: an HTML parser context
3445 *
3446 * parse a DOCTYPE declaration
3447 *
Daniel Veillarde77db162009-08-22 11:32:38 +02003448 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
Owen Taylor3473f882001-02-23 17:55:21 +00003449 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3450 */
3451
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003452static void
Owen Taylor3473f882001-02-23 17:55:21 +00003453htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003454 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00003455 xmlChar *ExternalID = NULL;
3456 xmlChar *URI = NULL;
3457
3458 /*
3459 * We know that '<!DOCTYPE' has been detected.
3460 */
3461 SKIP(9);
3462
3463 SKIP_BLANKS;
3464
3465 /*
3466 * Parse the DOCTYPE name.
3467 */
3468 name = htmlParseName(ctxt);
3469 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003470 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3471 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3472 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003473 }
3474 /*
3475 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3476 */
3477
3478 SKIP_BLANKS;
3479
3480 /*
3481 * Check for SystemID and ExternalID
3482 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003483 URI = htmlParseExternalID(ctxt, &ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003484 SKIP_BLANKS;
3485
3486 /*
3487 * We should be at the end of the DOCTYPE declaration.
3488 */
3489 if (CUR != '>') {
Daniel Veillardf403d292003-10-05 13:51:35 +00003490 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3491 "DOCTYPE improperly terminated\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003492 /* We shouldn't try to resynchronize ... */
3493 }
3494 NEXT;
3495
3496 /*
3497 * Create or update the document accordingly to the DOCTYPE
3498 */
3499 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3500 (!ctxt->disableSAX))
3501 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3502
3503 /*
3504 * Cleanup, since we don't use all those identifiers
3505 */
3506 if (URI != NULL) xmlFree(URI);
3507 if (ExternalID != NULL) xmlFree(ExternalID);
Owen Taylor3473f882001-02-23 17:55:21 +00003508}
3509
3510/**
3511 * htmlParseAttribute:
3512 * @ctxt: an HTML parser context
3513 * @value: a xmlChar ** used to store the value of the attribute
3514 *
3515 * parse an attribute
3516 *
3517 * [41] Attribute ::= Name Eq AttValue
3518 *
3519 * [25] Eq ::= S? '=' S?
3520 *
3521 * With namespace:
3522 *
3523 * [NS 11] Attribute ::= QName Eq AttValue
3524 *
3525 * Also the case QName == xmlns:??? is handled independently as a namespace
3526 * definition.
3527 *
3528 * Returns the attribute name, and the value in *value.
3529 */
3530
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003531static const xmlChar *
Owen Taylor3473f882001-02-23 17:55:21 +00003532htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003533 const xmlChar *name;
3534 xmlChar *val = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00003535
3536 *value = NULL;
3537 name = htmlParseHTMLName(ctxt);
3538 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003539 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3540 "error parsing attribute name\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003541 return(NULL);
3542 }
3543
3544 /*
3545 * read the value
3546 */
3547 SKIP_BLANKS;
3548 if (CUR == '=') {
3549 NEXT;
3550 SKIP_BLANKS;
3551 val = htmlParseAttValue(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00003552 }
3553
3554 *value = val;
3555 return(name);
3556}
3557
3558/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003559 * htmlCheckEncodingDirect:
Owen Taylor3473f882001-02-23 17:55:21 +00003560 * @ctxt: an HTML parser context
3561 * @attvalue: the attribute value
3562 *
Denis Pauk868d92d2012-05-10 15:34:57 +08003563 * Checks an attribute value to detect
Owen Taylor3473f882001-02-23 17:55:21 +00003564 * the encoding
3565 * If a new encoding is detected the parser is switched to decode
3566 * it and pass UTF8
3567 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003568static void
Denis Pauk868d92d2012-05-10 15:34:57 +08003569htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +00003570
Denis Pauk868d92d2012-05-10 15:34:57 +08003571 if ((ctxt == NULL) || (encoding == NULL) ||
Daniel Veillardc62efc82011-05-16 16:03:50 +08003572 (ctxt->options & HTML_PARSE_IGNORE_ENC))
Owen Taylor3473f882001-02-23 17:55:21 +00003573 return;
3574
Daniel Veillarde77db162009-08-22 11:32:38 +02003575 /* do not change encoding */
Owen Taylor3473f882001-02-23 17:55:21 +00003576 if (ctxt->input->encoding != NULL)
3577 return;
3578
Owen Taylor3473f882001-02-23 17:55:21 +00003579 if (encoding != NULL) {
3580 xmlCharEncoding enc;
3581 xmlCharEncodingHandlerPtr handler;
3582
3583 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3584
3585 if (ctxt->input->encoding != NULL)
3586 xmlFree((xmlChar *) ctxt->input->encoding);
3587 ctxt->input->encoding = xmlStrdup(encoding);
3588
3589 enc = xmlParseCharEncoding((const char *) encoding);
3590 /*
3591 * registered set of known encodings
3592 */
3593 if (enc != XML_CHAR_ENCODING_ERROR) {
Daniel Veillarde77db162009-08-22 11:32:38 +02003594 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
Daniel Veillard7e303562006-10-16 13:14:55 +00003595 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3596 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3597 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3598 (ctxt->input->buf != NULL) &&
3599 (ctxt->input->buf->encoder == NULL)) {
3600 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3601 "htmlCheckEncoding: wrong encoding meta\n",
3602 NULL, NULL);
3603 } else {
3604 xmlSwitchEncoding(ctxt, enc);
3605 }
Owen Taylor3473f882001-02-23 17:55:21 +00003606 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3607 } else {
3608 /*
3609 * fallback for unknown encodings
3610 */
3611 handler = xmlFindCharEncodingHandler((const char *) encoding);
3612 if (handler != NULL) {
3613 xmlSwitchToEncoding(ctxt, handler);
3614 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3615 } else {
Daniel Veillardc62efc82011-05-16 16:03:50 +08003616 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3617 "htmlCheckEncoding: unknown encoding %s\n",
3618 encoding, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003619 }
3620 }
3621
3622 if ((ctxt->input->buf != NULL) &&
3623 (ctxt->input->buf->encoder != NULL) &&
3624 (ctxt->input->buf->raw != NULL) &&
3625 (ctxt->input->buf->buffer != NULL)) {
3626 int nbchars;
3627 int processed;
3628
3629 /*
3630 * convert as much as possible to the parser reading buffer.
3631 */
3632 processed = ctxt->input->cur - ctxt->input->base;
Daniel Veillarda78d8032012-07-16 14:56:50 +08003633 xmlBufShrink(ctxt->input->buf->buffer, processed);
Daniel Veillardbf058dc2013-02-13 18:19:42 +08003634 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
Owen Taylor3473f882001-02-23 17:55:21 +00003635 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003636 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3637 "htmlCheckEncoding: encoder error\n",
3638 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003639 }
Daniel Veillard61551a12012-07-16 16:28:47 +08003640 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
Owen Taylor3473f882001-02-23 17:55:21 +00003641 }
3642 }
3643}
3644
3645/**
Denis Pauk868d92d2012-05-10 15:34:57 +08003646 * htmlCheckEncoding:
3647 * @ctxt: an HTML parser context
3648 * @attvalue: the attribute value
3649 *
3650 * Checks an http-equiv attribute from a Meta tag to detect
3651 * the encoding
3652 * If a new encoding is detected the parser is switched to decode
3653 * it and pass UTF8
3654 */
3655static void
3656htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3657 const xmlChar *encoding;
3658
3659 if (!attvalue)
3660 return;
3661
3662 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3663 if (encoding != NULL) {
3664 encoding += 7;
3665 }
3666 /*
3667 * skip blank
3668 */
3669 if (encoding && IS_BLANK_CH(*encoding))
3670 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3671 if (encoding && *encoding == '=') {
3672 encoding ++;
3673 htmlCheckEncodingDirect(ctxt, encoding);
3674 }
3675}
3676
3677/**
Owen Taylor3473f882001-02-23 17:55:21 +00003678 * htmlCheckMeta:
3679 * @ctxt: an HTML parser context
3680 * @atts: the attributes values
3681 *
3682 * Checks an attributes from a Meta tag
3683 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003684static void
Owen Taylor3473f882001-02-23 17:55:21 +00003685htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3686 int i;
3687 const xmlChar *att, *value;
3688 int http = 0;
3689 const xmlChar *content = NULL;
3690
3691 if ((ctxt == NULL) || (atts == NULL))
3692 return;
3693
3694 i = 0;
3695 att = atts[i++];
3696 while (att != NULL) {
3697 value = atts[i++];
3698 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3699 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3700 http = 1;
Denis Pauk868d92d2012-05-10 15:34:57 +08003701 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3702 htmlCheckEncodingDirect(ctxt, value);
Owen Taylor3473f882001-02-23 17:55:21 +00003703 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3704 content = value;
3705 att = atts[i++];
3706 }
3707 if ((http) && (content != NULL))
3708 htmlCheckEncoding(ctxt, content);
3709
3710}
3711
3712/**
3713 * htmlParseStartTag:
3714 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02003715 *
Owen Taylor3473f882001-02-23 17:55:21 +00003716 * parse a start of tag either for rule element or
3717 * EmptyElement. In both case we don't parse the tag closing chars.
3718 *
3719 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3720 *
3721 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3722 *
3723 * With namespace:
3724 *
3725 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3726 *
3727 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3728 *
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003729 * Returns 0 in case of success, -1 in case of error and 1 if discarded
Owen Taylor3473f882001-02-23 17:55:21 +00003730 */
3731
Daniel Veillard597f1c12005-07-03 23:00:18 +00003732static int
Owen Taylor3473f882001-02-23 17:55:21 +00003733htmlParseStartTag(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003734 const xmlChar *name;
3735 const xmlChar *attname;
Owen Taylor3473f882001-02-23 17:55:21 +00003736 xmlChar *attvalue;
Daniel Veillard30e76072006-03-09 14:13:55 +00003737 const xmlChar **atts;
Owen Taylor3473f882001-02-23 17:55:21 +00003738 int nbatts = 0;
Daniel Veillard30e76072006-03-09 14:13:55 +00003739 int maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003740 int meta = 0;
3741 int i;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003742 int discardtag = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00003743
Daniel Veillarda03e3652004-11-02 18:45:30 +00003744 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3745 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3746 "htmlParseStartTag: context error\n", NULL, NULL);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003747 return -1;
Daniel Veillarda03e3652004-11-02 18:45:30 +00003748 }
Gaurav3e0eec42014-06-13 14:45:20 +08003749 if (ctxt->instate == XML_PARSER_EOF)
3750 return(-1);
Daniel Veillard597f1c12005-07-03 23:00:18 +00003751 if (CUR != '<') return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003752 NEXT;
3753
Daniel Veillard30e76072006-03-09 14:13:55 +00003754 atts = ctxt->atts;
3755 maxatts = ctxt->maxatts;
3756
Owen Taylor3473f882001-02-23 17:55:21 +00003757 GROW;
3758 name = htmlParseHTMLName(ctxt);
3759 if (name == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003760 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3761 "htmlParseStartTag: invalid element name\n",
3762 NULL, NULL);
Xin Li28c53d32017-03-07 00:33:02 +00003763 /* if recover preserve text on classic misconstructs */
3764 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3765 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3766 htmlParseCharDataInternal(ctxt, '<');
3767 return(-1);
3768 }
3769
3770
Owen Taylor3473f882001-02-23 17:55:21 +00003771 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02003772 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3773 (ctxt->instate != XML_PARSER_EOF))
Owen Taylor3473f882001-02-23 17:55:21 +00003774 NEXT;
Daniel Veillard597f1c12005-07-03 23:00:18 +00003775 return -1;
Owen Taylor3473f882001-02-23 17:55:21 +00003776 }
3777 if (xmlStrEqual(name, BAD_CAST"meta"))
3778 meta = 1;
3779
3780 /*
3781 * Check for auto-closure of HTML elements.
3782 */
3783 htmlAutoClose(ctxt, name);
3784
3785 /*
3786 * Check for implied HTML elements.
3787 */
3788 htmlCheckImplied(ctxt, name);
3789
3790 /*
3791 * Avoid html at any level > 0, head at any level != 1
3792 * or any attempt to recurse body
3793 */
3794 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003795 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3796 "htmlParseStartTag: misplaced <html> tag\n",
3797 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003798 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003799 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003800 }
Daniel Veillarde77db162009-08-22 11:32:38 +02003801 if ((ctxt->nameNr != 1) &&
Owen Taylor3473f882001-02-23 17:55:21 +00003802 (xmlStrEqual(name, BAD_CAST"head"))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003803 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3804 "htmlParseStartTag: misplaced <head> tag\n",
3805 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003806 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003807 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003808 }
3809 if (xmlStrEqual(name, BAD_CAST"body")) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00003810 int indx;
3811 for (indx = 0;indx < ctxt->nameNr;indx++) {
3812 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003813 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3814 "htmlParseStartTag: misplaced <body> tag\n",
3815 name, NULL);
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003816 discardtag = 1;
Daniel Veillarded86dc22008-04-24 11:58:41 +00003817 ctxt->depth++;
Owen Taylor3473f882001-02-23 17:55:21 +00003818 }
3819 }
3820 }
3821
3822 /*
3823 * Now parse the attributes, it ends up with the ending
3824 *
3825 * (S Attribute)* S?
3826 */
3827 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003828 while ((IS_CHAR_CH(CUR)) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02003829 (CUR != '>') &&
Owen Taylor3473f882001-02-23 17:55:21 +00003830 ((CUR != '/') || (NXT(1) != '>'))) {
3831 long cons = ctxt->nbChars;
3832
3833 GROW;
3834 attname = htmlParseAttribute(ctxt, &attvalue);
3835 if (attname != NULL) {
3836
3837 /*
3838 * Well formedness requires at most one declaration of an attribute
3839 */
3840 for (i = 0; i < nbatts;i += 2) {
3841 if (xmlStrEqual(atts[i], attname)) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003842 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3843 "Attribute %s redefined\n", attname, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003844 if (attvalue != NULL)
3845 xmlFree(attvalue);
3846 goto failed;
3847 }
3848 }
3849
3850 /*
3851 * Add the pair to atts
3852 */
3853 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003854 maxatts = 22; /* allow for 10 attrs by default */
3855 atts = (const xmlChar **)
3856 xmlMalloc(maxatts * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00003857 if (atts == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003858 htmlErrMemory(ctxt, NULL);
3859 if (attvalue != NULL)
3860 xmlFree(attvalue);
3861 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003862 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003863 ctxt->atts = atts;
3864 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003865 } else if (nbatts + 4 > maxatts) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003866 const xmlChar **n;
3867
Owen Taylor3473f882001-02-23 17:55:21 +00003868 maxatts *= 2;
Daniel Veillardf403d292003-10-05 13:51:35 +00003869 n = (const xmlChar **) xmlRealloc((void *) atts,
3870 maxatts * sizeof(const xmlChar *));
3871 if (n == NULL) {
3872 htmlErrMemory(ctxt, NULL);
3873 if (attvalue != NULL)
3874 xmlFree(attvalue);
3875 goto failed;
Owen Taylor3473f882001-02-23 17:55:21 +00003876 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003877 atts = n;
3878 ctxt->atts = atts;
3879 ctxt->maxatts = maxatts;
Owen Taylor3473f882001-02-23 17:55:21 +00003880 }
3881 atts[nbatts++] = attname;
3882 atts[nbatts++] = attvalue;
3883 atts[nbatts] = NULL;
3884 atts[nbatts + 1] = NULL;
3885 }
3886 else {
Daniel Veillardf403d292003-10-05 13:51:35 +00003887 if (attvalue != NULL)
3888 xmlFree(attvalue);
Owen Taylor3473f882001-02-23 17:55:21 +00003889 /* Dump the bogus attribute string up to the next blank or
3890 * the end of the tag. */
William M. Brack76e95df2003-10-18 16:20:14 +00003891 while ((IS_CHAR_CH(CUR)) &&
3892 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
Daniel Veillard34ba3872003-07-15 13:34:05 +00003893 ((CUR != '/') || (NXT(1) != '>')))
Owen Taylor3473f882001-02-23 17:55:21 +00003894 NEXT;
3895 }
3896
3897failed:
3898 SKIP_BLANKS;
3899 if (cons == ctxt->nbChars) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003900 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3901 "htmlParseStartTag: problem parsing attributes\n",
3902 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00003903 break;
3904 }
3905 }
3906
3907 /*
3908 * Handle specific association to the META tag
3909 */
William M. Bracke978ae22007-03-21 06:16:02 +00003910 if (meta && (nbatts != 0))
Owen Taylor3473f882001-02-23 17:55:21 +00003911 htmlCheckMeta(ctxt, atts);
3912
3913 /*
3914 * SAX: Start of Element !
3915 */
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003916 if (!discardtag) {
3917 htmlnamePush(ctxt, name);
3918 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3919 if (nbatts != 0)
3920 ctxt->sax->startElement(ctxt->userData, name, atts);
3921 else
3922 ctxt->sax->startElement(ctxt->userData, name, NULL);
3923 }
Daniel Veillardf403d292003-10-05 13:51:35 +00003924 }
Owen Taylor3473f882001-02-23 17:55:21 +00003925
3926 if (atts != NULL) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003927 for (i = 1;i < nbatts;i += 2) {
Owen Taylor3473f882001-02-23 17:55:21 +00003928 if (atts[i] != NULL)
3929 xmlFree((xmlChar *) atts[i]);
3930 }
Owen Taylor3473f882001-02-23 17:55:21 +00003931 }
Daniel Veillard597f1c12005-07-03 23:00:18 +00003932
Daniel Veillard35fcbb82008-03-12 21:43:39 +00003933 return(discardtag);
Owen Taylor3473f882001-02-23 17:55:21 +00003934}
3935
3936/**
3937 * htmlParseEndTag:
3938 * @ctxt: an HTML parser context
3939 *
3940 * parse an end of tag
3941 *
3942 * [42] ETag ::= '</' Name S? '>'
3943 *
3944 * With namespace
3945 *
3946 * [NS 9] ETag ::= '</' QName S? '>'
Daniel Veillardf420ac52001-07-04 16:04:09 +00003947 *
3948 * Returns 1 if the current level should be closed.
Owen Taylor3473f882001-02-23 17:55:21 +00003949 */
3950
Daniel Veillardf420ac52001-07-04 16:04:09 +00003951static int
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003952htmlParseEndTag(htmlParserCtxtPtr ctxt)
3953{
3954 const xmlChar *name;
3955 const xmlChar *oldname;
Daniel Veillardf420ac52001-07-04 16:04:09 +00003956 int i, ret;
Owen Taylor3473f882001-02-23 17:55:21 +00003957
3958 if ((CUR != '<') || (NXT(1) != '/')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003959 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3960 "htmlParseEndTag: '</' not found\n", NULL, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003961 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003962 }
3963 SKIP(2);
3964
3965 name = htmlParseHTMLName(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003966 if (name == NULL)
3967 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00003968 /*
3969 * We should definitely be at the ending "S? '>'" part
3970 */
3971 SKIP_BLANKS;
William M. Brack76e95df2003-10-18 16:20:14 +00003972 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00003973 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3974 "End tag : expected '>'\n", NULL, NULL);
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00003975 if (ctxt->recovery) {
3976 /*
3977 * We're not at the ending > !!
3978 * Error, unless in recover mode where we search forwards
3979 * until we find a >
3980 */
3981 while (CUR != '\0' && CUR != '>') NEXT;
3982 NEXT;
3983 }
Owen Taylor3473f882001-02-23 17:55:21 +00003984 } else
Daniel Veillard2fdbd322003-08-18 12:15:38 +00003985 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00003986
3987 /*
Daniel Veillarded86dc22008-04-24 11:58:41 +00003988 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3989 * out now.
3990 */
3991 if ((ctxt->depth > 0) &&
3992 (xmlStrEqual(name, BAD_CAST "html") ||
3993 xmlStrEqual(name, BAD_CAST "body") ||
3994 xmlStrEqual(name, BAD_CAST "head"))) {
3995 ctxt->depth--;
3996 return (0);
3997 }
3998
3999 /*
Owen Taylor3473f882001-02-23 17:55:21 +00004000 * If the name read is not one of the element in the parsing stack
4001 * then return, it's just an error.
4002 */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004003 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4004 if (xmlStrEqual(name, ctxt->nameTab[i]))
4005 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004006 }
4007 if (i < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004008 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4009 "Unexpected end tag : %s\n", name, NULL);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004010 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00004011 }
4012
4013
4014 /*
4015 * Check for auto-closure of HTML elements.
4016 */
4017
4018 htmlAutoCloseOnClose(ctxt, name);
4019
4020 /*
4021 * Well formedness constraints, opening and closing must match.
4022 * With the exception that the autoclose may have popped stuff out
4023 * of the stack.
4024 */
4025 if (!xmlStrEqual(name, ctxt->name)) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004026 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004027 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4028 "Opening and ending tag mismatch: %s and %s\n",
4029 name, ctxt->name);
Owen Taylor3473f882001-02-23 17:55:21 +00004030 }
4031 }
4032
4033 /*
4034 * SAX: End of Tag
4035 */
4036 oldname = ctxt->name;
4037 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004038 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4039 ctxt->sax->endElement(ctxt->userData, name);
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08004040 htmlNodeInfoPop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004041 htmlnamePop(ctxt);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004042 ret = 1;
Daniel Veillardf420ac52001-07-04 16:04:09 +00004043 } else {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004044 ret = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004045 }
4046
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004047 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00004048}
4049
4050
4051/**
4052 * htmlParseReference:
4053 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004054 *
Owen Taylor3473f882001-02-23 17:55:21 +00004055 * parse and handle entity references in content,
4056 * this will end-up in a call to character() since this is either a
4057 * CharRef, or a predefined entity.
4058 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004059static void
Owen Taylor3473f882001-02-23 17:55:21 +00004060htmlParseReference(htmlParserCtxtPtr ctxt) {
Daniel Veillardbb371292001-08-16 23:26:59 +00004061 const htmlEntityDesc * ent;
Owen Taylor3473f882001-02-23 17:55:21 +00004062 xmlChar out[6];
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004063 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004064 if (CUR != '&') return;
4065
4066 if (NXT(1) == '#') {
4067 unsigned int c;
4068 int bits, i = 0;
4069
4070 c = htmlParseCharRef(ctxt);
4071 if (c == 0)
4072 return;
4073
4074 if (c < 0x80) { out[i++]= c; bits= -6; }
4075 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4076 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4077 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004078
Owen Taylor3473f882001-02-23 17:55:21 +00004079 for ( ; bits >= 0; bits-= 6) {
4080 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4081 }
4082 out[i] = 0;
4083
4084 htmlCheckParagraph(ctxt);
4085 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4086 ctxt->sax->characters(ctxt->userData, out, i);
4087 } else {
4088 ent = htmlParseEntityRef(ctxt, &name);
4089 if (name == NULL) {
4090 htmlCheckParagraph(ctxt);
4091 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4092 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4093 return;
4094 }
Daniel Veillarde645e8c2002-10-22 17:35:37 +00004095 if ((ent == NULL) || !(ent->value > 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004096 htmlCheckParagraph(ctxt);
4097 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4098 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4099 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4100 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4101 }
4102 } else {
4103 unsigned int c;
4104 int bits, i = 0;
4105
4106 c = ent->value;
4107 if (c < 0x80)
4108 { out[i++]= c; bits= -6; }
4109 else if (c < 0x800)
4110 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4111 else if (c < 0x10000)
4112 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004113 else
Owen Taylor3473f882001-02-23 17:55:21 +00004114 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
Daniel Veillarde77db162009-08-22 11:32:38 +02004115
Owen Taylor3473f882001-02-23 17:55:21 +00004116 for ( ; bits >= 0; bits-= 6) {
4117 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4118 }
4119 out[i] = 0;
4120
4121 htmlCheckParagraph(ctxt);
4122 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4123 ctxt->sax->characters(ctxt->userData, out, i);
4124 }
Owen Taylor3473f882001-02-23 17:55:21 +00004125 }
4126}
4127
4128/**
4129 * htmlParseContent:
4130 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00004131 *
4132 * Parse a content: comment, sub-element, reference or text.
Eugene Pimenov615904f2010-03-15 15:16:02 +01004133 * Kept for compatibility with old code
Owen Taylor3473f882001-02-23 17:55:21 +00004134 */
4135
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004136static void
Owen Taylor3473f882001-02-23 17:55:21 +00004137htmlParseContent(htmlParserCtxtPtr ctxt) {
4138 xmlChar *currentNode;
4139 int depth;
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004140 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004141
4142 currentNode = xmlStrdup(ctxt->name);
4143 depth = ctxt->nameNr;
4144 while (1) {
4145 long cons = ctxt->nbChars;
4146
4147 GROW;
Daniel Veillarde77db162009-08-22 11:32:38 +02004148
4149 if (ctxt->instate == XML_PARSER_EOF)
4150 break;
4151
Owen Taylor3473f882001-02-23 17:55:21 +00004152 /*
4153 * Our tag or one of it's parent or children is ending.
4154 */
4155 if ((CUR == '<') && (NXT(1) == '/')) {
Daniel Veillardf420ac52001-07-04 16:04:09 +00004156 if (htmlParseEndTag(ctxt) &&
4157 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4158 if (currentNode != NULL)
4159 xmlFree(currentNode);
4160 return;
4161 }
4162 continue; /* while */
Owen Taylor3473f882001-02-23 17:55:21 +00004163 }
4164
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004165 else if ((CUR == '<') &&
4166 ((IS_ASCII_LETTER(NXT(1))) ||
4167 (NXT(1) == '_') || (NXT(1) == ':'))) {
4168 name = htmlParseHTMLName_nonInvasive(ctxt);
4169 if (name == NULL) {
4170 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4171 "htmlParseStartTag: invalid element name\n",
4172 NULL, NULL);
4173 /* Dump the bogus tag like browsers do */
Daniel Veillarde77db162009-08-22 11:32:38 +02004174 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004175 NEXT;
4176
4177 if (currentNode != NULL)
4178 xmlFree(currentNode);
4179 return;
4180 }
4181
4182 if (ctxt->name != NULL) {
4183 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4184 htmlAutoClose(ctxt, name);
4185 continue;
4186 }
Daniel Veillarde77db162009-08-22 11:32:38 +02004187 }
Daniel Veillard890fd9f2006-10-27 12:53:28 +00004188 }
4189
Owen Taylor3473f882001-02-23 17:55:21 +00004190 /*
4191 * Has this node been popped out during parsing of
4192 * the next element
4193 */
Daniel Veillardf420ac52001-07-04 16:04:09 +00004194 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4195 (!xmlStrEqual(currentNode, ctxt->name)))
4196 {
Owen Taylor3473f882001-02-23 17:55:21 +00004197 if (currentNode != NULL) xmlFree(currentNode);
4198 return;
4199 }
4200
Daniel Veillardf9533d12001-03-03 10:04:57 +00004201 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4202 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
Owen Taylor3473f882001-02-23 17:55:21 +00004203 /*
4204 * Handle SCRIPT/STYLE separately
4205 */
4206 htmlParseScript(ctxt);
4207 } else {
4208 /*
4209 * Sometimes DOCTYPE arrives in the middle of the document
4210 */
4211 if ((CUR == '<') && (NXT(1) == '!') &&
4212 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4213 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4214 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4215 (UPP(8) == 'E')) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004216 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4217 "Misplaced DOCTYPE declaration\n",
4218 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004219 htmlParseDocTypeDecl(ctxt);
4220 }
4221
4222 /*
4223 * First case : a comment
4224 */
4225 if ((CUR == '<') && (NXT(1) == '!') &&
4226 (NXT(2) == '-') && (NXT(3) == '-')) {
4227 htmlParseComment(ctxt);
4228 }
4229
4230 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004231 * Second case : a Processing Instruction.
4232 */
4233 else if ((CUR == '<') && (NXT(1) == '?')) {
4234 htmlParsePI(ctxt);
4235 }
4236
4237 /*
4238 * Third case : a sub-element.
Owen Taylor3473f882001-02-23 17:55:21 +00004239 */
4240 else if (CUR == '<') {
4241 htmlParseElement(ctxt);
4242 }
4243
4244 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004245 * Fourth case : a reference. If if has not been resolved,
Daniel Veillarde77db162009-08-22 11:32:38 +02004246 * parsing returns it's Name, create the node
Owen Taylor3473f882001-02-23 17:55:21 +00004247 */
4248 else if (CUR == '&') {
4249 htmlParseReference(ctxt);
4250 }
4251
4252 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004253 * Fifth case : end of the resource
Owen Taylor3473f882001-02-23 17:55:21 +00004254 */
4255 else if (CUR == 0) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004256 htmlAutoCloseOnEnd(ctxt);
4257 break;
Owen Taylor3473f882001-02-23 17:55:21 +00004258 }
4259
4260 /*
4261 * Last case, text. Note that References are handled directly.
4262 */
4263 else {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00004264 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004265 }
4266
4267 if (cons == ctxt->nbChars) {
4268 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004269 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4270 "detected an error in element content\n",
4271 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004272 }
4273 break;
4274 }
4275 }
4276 GROW;
4277 }
4278 if (currentNode != NULL) xmlFree(currentNode);
4279}
4280
4281/**
4282 * htmlParseElement:
4283 * @ctxt: an HTML parser context
4284 *
4285 * parse an HTML element, this is highly recursive
Eugene Pimenov615904f2010-03-15 15:16:02 +01004286 * this is kept for compatibility with previous code versions
Owen Taylor3473f882001-02-23 17:55:21 +00004287 *
4288 * [39] element ::= EmptyElemTag | STag content ETag
4289 *
4290 * [41] Attribute ::= Name Eq AttValue
4291 */
4292
4293void
4294htmlParseElement(htmlParserCtxtPtr ctxt) {
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004295 const xmlChar *name;
Owen Taylor3473f882001-02-23 17:55:21 +00004296 xmlChar *currentNode = NULL;
Daniel Veillardbb371292001-08-16 23:26:59 +00004297 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00004298 htmlParserNodeInfo node_info;
Daniel Veillard597f1c12005-07-03 23:00:18 +00004299 int failed;
Daniel Veillarda03e3652004-11-02 18:45:30 +00004300 int depth;
Daniel Veillard3fbe8e32001-10-06 13:30:33 +00004301 const xmlChar *oldptr;
Owen Taylor3473f882001-02-23 17:55:21 +00004302
Daniel Veillarda03e3652004-11-02 18:45:30 +00004303 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4304 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
Daniel Veillard597f1c12005-07-03 23:00:18 +00004305 "htmlParseElement: context error\n", NULL, NULL);
Daniel Veillarda03e3652004-11-02 18:45:30 +00004306 return;
4307 }
Daniel Veillarddb4ac222009-08-22 17:58:31 +02004308
4309 if (ctxt->instate == XML_PARSER_EOF)
4310 return;
4311
Owen Taylor3473f882001-02-23 17:55:21 +00004312 /* Capture start position */
4313 if (ctxt->record_info) {
4314 node_info.begin_pos = ctxt->input->consumed +
4315 (CUR_PTR - ctxt->input->base);
4316 node_info.begin_line = ctxt->input->line;
4317 }
4318
Daniel Veillard597f1c12005-07-03 23:00:18 +00004319 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004320 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00004321 if ((failed == -1) || (name == NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004322 if (CUR == '>')
4323 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00004324 return;
4325 }
Owen Taylor3473f882001-02-23 17:55:21 +00004326
4327 /*
4328 * Lookup the info for that element.
4329 */
4330 info = htmlTagLookup(name);
4331 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004332 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4333 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004334 }
4335
4336 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00004337 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00004338 */
4339 if ((CUR == '/') && (NXT(1) == '>')) {
4340 SKIP(2);
4341 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4342 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004343 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004344 return;
4345 }
4346
4347 if (CUR == '>') {
4348 NEXT;
4349 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00004350 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4351 "Couldn't find end of Start Tag %s\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004352
4353 /*
4354 * end of parsing of this node.
4355 */
Daniel Veillarde77db162009-08-22 11:32:38 +02004356 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004357 nodePop(ctxt);
Daniel Veillardf403d292003-10-05 13:51:35 +00004358 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02004359 }
Owen Taylor3473f882001-02-23 17:55:21 +00004360
4361 /*
4362 * Capture end position and add node
4363 */
Daniel Veillard30e76072006-03-09 14:13:55 +00004364 if (ctxt->record_info) {
Owen Taylor3473f882001-02-23 17:55:21 +00004365 node_info.end_pos = ctxt->input->consumed +
4366 (CUR_PTR - ctxt->input->base);
4367 node_info.end_line = ctxt->input->line;
4368 node_info.node = ctxt->node;
4369 xmlParserAddNodeInfo(ctxt, &node_info);
4370 }
4371 return;
4372 }
4373
4374 /*
4375 * Check for an Empty Element from DTD definition
4376 */
4377 if ((info != NULL) && (info->empty)) {
4378 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4379 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillardf403d292003-10-05 13:51:35 +00004380 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004381 return;
4382 }
4383
4384 /*
4385 * Parse the content of the element:
4386 */
4387 currentNode = xmlStrdup(ctxt->name);
4388 depth = ctxt->nameNr;
William M. Brack76e95df2003-10-18 16:20:14 +00004389 while (IS_CHAR_CH(CUR)) {
William M. Brackd28e48a2001-09-23 01:55:08 +00004390 oldptr = ctxt->input->cur;
Owen Taylor3473f882001-02-23 17:55:21 +00004391 htmlParseContent(ctxt);
William M. Brackd28e48a2001-09-23 01:55:08 +00004392 if (oldptr==ctxt->input->cur) break;
Daniel Veillarde77db162009-08-22 11:32:38 +02004393 if (ctxt->nameNr < depth) break;
4394 }
Owen Taylor3473f882001-02-23 17:55:21 +00004395
Owen Taylor3473f882001-02-23 17:55:21 +00004396 /*
4397 * Capture end position and add node
4398 */
4399 if ( currentNode != NULL && ctxt->record_info ) {
4400 node_info.end_pos = ctxt->input->consumed +
4401 (CUR_PTR - ctxt->input->base);
4402 node_info.end_line = ctxt->input->line;
4403 node_info.node = ctxt->node;
4404 xmlParserAddNodeInfo(ctxt, &node_info);
4405 }
William M. Brack76e95df2003-10-18 16:20:14 +00004406 if (!IS_CHAR_CH(CUR)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004407 htmlAutoCloseOnEnd(ctxt);
4408 }
4409
Owen Taylor3473f882001-02-23 17:55:21 +00004410 if (currentNode != NULL)
4411 xmlFree(currentNode);
4412}
4413
Eugene Pimenov615904f2010-03-15 15:16:02 +01004414static void
4415htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4416 /*
4417 * Capture end position and add node
4418 */
4419 if ( ctxt->node != NULL && ctxt->record_info ) {
4420 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4421 (CUR_PTR - ctxt->input->base);
4422 ctxt->nodeInfo->end_line = ctxt->input->line;
4423 ctxt->nodeInfo->node = ctxt->node;
4424 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4425 htmlNodeInfoPop(ctxt);
4426 }
4427 if (!IS_CHAR_CH(CUR)) {
4428 htmlAutoCloseOnEnd(ctxt);
4429 }
4430}
4431
4432/**
4433 * htmlParseElementInternal:
4434 * @ctxt: an HTML parser context
4435 *
4436 * parse an HTML element, new version, non recursive
4437 *
4438 * [39] element ::= EmptyElemTag | STag content ETag
4439 *
4440 * [41] Attribute ::= Name Eq AttValue
4441 */
4442
4443static void
4444htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4445 const xmlChar *name;
4446 const htmlElemDesc * info;
Philip Withnall579ebbc2014-06-20 21:03:42 +01004447 htmlParserNodeInfo node_info = { 0, };
Eugene Pimenov615904f2010-03-15 15:16:02 +01004448 int failed;
Eugene Pimenov615904f2010-03-15 15:16:02 +01004449
4450 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4451 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4452 "htmlParseElementInternal: context error\n", NULL, NULL);
4453 return;
4454 }
4455
4456 if (ctxt->instate == XML_PARSER_EOF)
4457 return;
4458
4459 /* Capture start position */
4460 if (ctxt->record_info) {
4461 node_info.begin_pos = ctxt->input->consumed +
4462 (CUR_PTR - ctxt->input->base);
4463 node_info.begin_line = ctxt->input->line;
4464 }
4465
4466 failed = htmlParseStartTag(ctxt);
4467 name = ctxt->name;
4468 if ((failed == -1) || (name == NULL)) {
4469 if (CUR == '>')
4470 NEXT;
4471 return;
4472 }
4473
4474 /*
4475 * Lookup the info for that element.
4476 */
4477 info = htmlTagLookup(name);
4478 if (info == NULL) {
4479 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4480 "Tag %s invalid\n", name, NULL);
4481 }
4482
4483 /*
4484 * Check for an Empty Element labeled the XML/SGML way
4485 */
4486 if ((CUR == '/') && (NXT(1) == '>')) {
4487 SKIP(2);
4488 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4489 ctxt->sax->endElement(ctxt->userData, name);
4490 htmlnamePop(ctxt);
4491 return;
4492 }
4493
4494 if (CUR == '>') {
4495 NEXT;
4496 } else {
4497 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4498 "Couldn't find end of Start Tag %s\n", name, NULL);
4499
4500 /*
4501 * end of parsing of this node.
4502 */
4503 if (xmlStrEqual(name, ctxt->name)) {
4504 nodePop(ctxt);
4505 htmlnamePop(ctxt);
4506 }
4507
4508 if (ctxt->record_info)
4509 htmlNodeInfoPush(ctxt, &node_info);
4510 htmlParserFinishElementParsing(ctxt);
4511 return;
4512 }
4513
4514 /*
4515 * Check for an Empty Element from DTD definition
4516 */
4517 if ((info != NULL) && (info->empty)) {
4518 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4519 ctxt->sax->endElement(ctxt->userData, name);
4520 htmlnamePop(ctxt);
4521 return;
4522 }
4523
4524 if (ctxt->record_info)
4525 htmlNodeInfoPush(ctxt, &node_info);
4526}
4527
4528/**
4529 * htmlParseContentInternal:
4530 * @ctxt: an HTML parser context
4531 *
4532 * Parse a content: comment, sub-element, reference or text.
4533 * New version for non recursive htmlParseElementInternal
4534 */
4535
4536static void
4537htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4538 xmlChar *currentNode;
4539 int depth;
4540 const xmlChar *name;
4541
4542 currentNode = xmlStrdup(ctxt->name);
4543 depth = ctxt->nameNr;
4544 while (1) {
4545 long cons = ctxt->nbChars;
4546
4547 GROW;
4548
4549 if (ctxt->instate == XML_PARSER_EOF)
4550 break;
4551
4552 /*
4553 * Our tag or one of it's parent or children is ending.
4554 */
4555 if ((CUR == '<') && (NXT(1) == '/')) {
4556 if (htmlParseEndTag(ctxt) &&
4557 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4558 if (currentNode != NULL)
4559 xmlFree(currentNode);
4560
4561 currentNode = xmlStrdup(ctxt->name);
4562 depth = ctxt->nameNr;
4563 }
4564 continue; /* while */
4565 }
4566
4567 else if ((CUR == '<') &&
4568 ((IS_ASCII_LETTER(NXT(1))) ||
4569 (NXT(1) == '_') || (NXT(1) == ':'))) {
4570 name = htmlParseHTMLName_nonInvasive(ctxt);
4571 if (name == NULL) {
4572 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4573 "htmlParseStartTag: invalid element name\n",
4574 NULL, NULL);
4575 /* Dump the bogus tag like browsers do */
4576 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4577 NEXT;
4578
4579 htmlParserFinishElementParsing(ctxt);
4580 if (currentNode != NULL)
4581 xmlFree(currentNode);
4582
4583 currentNode = xmlStrdup(ctxt->name);
4584 depth = ctxt->nameNr;
4585 continue;
4586 }
4587
4588 if (ctxt->name != NULL) {
4589 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4590 htmlAutoClose(ctxt, name);
4591 continue;
4592 }
4593 }
4594 }
4595
4596 /*
4597 * Has this node been popped out during parsing of
4598 * the next element
4599 */
4600 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4601 (!xmlStrEqual(currentNode, ctxt->name)))
4602 {
4603 htmlParserFinishElementParsing(ctxt);
4604 if (currentNode != NULL) xmlFree(currentNode);
4605
4606 currentNode = xmlStrdup(ctxt->name);
4607 depth = ctxt->nameNr;
4608 continue;
4609 }
4610
4611 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4612 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4613 /*
4614 * Handle SCRIPT/STYLE separately
4615 */
4616 htmlParseScript(ctxt);
4617 } else {
4618 /*
4619 * Sometimes DOCTYPE arrives in the middle of the document
4620 */
4621 if ((CUR == '<') && (NXT(1) == '!') &&
4622 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4623 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4624 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4625 (UPP(8) == 'E')) {
4626 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4627 "Misplaced DOCTYPE declaration\n",
4628 BAD_CAST "DOCTYPE" , NULL);
4629 htmlParseDocTypeDecl(ctxt);
4630 }
4631
4632 /*
4633 * First case : a comment
4634 */
4635 if ((CUR == '<') && (NXT(1) == '!') &&
4636 (NXT(2) == '-') && (NXT(3) == '-')) {
4637 htmlParseComment(ctxt);
4638 }
4639
4640 /*
4641 * Second case : a Processing Instruction.
4642 */
4643 else if ((CUR == '<') && (NXT(1) == '?')) {
4644 htmlParsePI(ctxt);
4645 }
4646
4647 /*
4648 * Third case : a sub-element.
4649 */
4650 else if (CUR == '<') {
4651 htmlParseElementInternal(ctxt);
4652 if (currentNode != NULL) xmlFree(currentNode);
4653
4654 currentNode = xmlStrdup(ctxt->name);
4655 depth = ctxt->nameNr;
4656 }
4657
4658 /*
4659 * Fourth case : a reference. If if has not been resolved,
4660 * parsing returns it's Name, create the node
4661 */
4662 else if (CUR == '&') {
4663 htmlParseReference(ctxt);
4664 }
4665
4666 /*
4667 * Fifth case : end of the resource
4668 */
4669 else if (CUR == 0) {
4670 htmlAutoCloseOnEnd(ctxt);
4671 break;
4672 }
4673
4674 /*
4675 * Last case, text. Note that References are handled directly.
4676 */
4677 else {
4678 htmlParseCharData(ctxt);
4679 }
4680
4681 if (cons == ctxt->nbChars) {
4682 if (ctxt->node != NULL) {
4683 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4684 "detected an error in element content\n",
4685 NULL, NULL);
4686 }
4687 break;
4688 }
4689 }
4690 GROW;
4691 }
4692 if (currentNode != NULL) xmlFree(currentNode);
4693}
4694
4695/**
4696 * htmlParseContent:
4697 * @ctxt: an HTML parser context
4698 *
4699 * Parse a content: comment, sub-element, reference or text.
4700 * This is the entry point when called from parser.c
4701 */
4702
4703void
4704__htmlParseContent(void *ctxt) {
4705 if (ctxt != NULL)
4706 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4707}
4708
Owen Taylor3473f882001-02-23 17:55:21 +00004709/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00004710 * htmlParseDocument:
Owen Taylor3473f882001-02-23 17:55:21 +00004711 * @ctxt: an HTML parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02004712 *
Owen Taylor3473f882001-02-23 17:55:21 +00004713 * parse an HTML document (and build a tree if using the standard SAX
4714 * interface).
4715 *
4716 * Returns 0, -1 in case of error. the parser context is augmented
4717 * as a result of the parsing.
4718 */
4719
Daniel Veillard1b31e4a2002-05-27 14:44:50 +00004720int
Owen Taylor3473f882001-02-23 17:55:21 +00004721htmlParseDocument(htmlParserCtxtPtr ctxt) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004722 xmlChar start[4];
4723 xmlCharEncoding enc;
Owen Taylor3473f882001-02-23 17:55:21 +00004724 xmlDtdPtr dtd;
4725
Daniel Veillardd0463562001-10-13 09:15:48 +00004726 xmlInitParser();
4727
Owen Taylor3473f882001-02-23 17:55:21 +00004728 htmlDefaultSAXHandlerInit();
Owen Taylor3473f882001-02-23 17:55:21 +00004729
Daniel Veillarda03e3652004-11-02 18:45:30 +00004730 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4731 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4732 "htmlParseDocument: context error\n", NULL, NULL);
4733 return(XML_ERR_INTERNAL_ERROR);
4734 }
4735 ctxt->html = 1;
Daniel Veillard4d3e2da2009-05-15 17:55:45 +02004736 ctxt->linenumbers = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00004737 GROW;
4738 /*
4739 * SAX: beginning of the document processing.
4740 */
4741 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4742 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4743
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004744 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4745 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4746 /*
4747 * Get the 4 first bytes and decode the charset
4748 * if enc != XML_CHAR_ENCODING_NONE
4749 * plug some encoding conversion routines.
4750 */
4751 start[0] = RAW;
4752 start[1] = NXT(1);
4753 start[2] = NXT(2);
4754 start[3] = NXT(3);
4755 enc = xmlDetectCharEncoding(&start[0], 4);
4756 if (enc != XML_CHAR_ENCODING_NONE) {
4757 xmlSwitchEncoding(ctxt, enc);
4758 }
4759 }
4760
Owen Taylor3473f882001-02-23 17:55:21 +00004761 /*
4762 * Wipe out everything which is before the first '<'
4763 */
4764 SKIP_BLANKS;
4765 if (CUR == 0) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004766 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
Daniel Veillardf403d292003-10-05 13:51:35 +00004767 "Document is empty\n", NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00004768 }
4769
4770 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4771 ctxt->sax->startDocument(ctxt->userData);
4772
4773
4774 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004775 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004776 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004777 while (((CUR == '<') && (NXT(1) == '!') &&
4778 (NXT(2) == '-') && (NXT(3) == '-')) ||
4779 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004780 htmlParseComment(ctxt);
4781 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004782 SKIP_BLANKS;
Daniel Veillard7f4547c2008-10-03 07:58:23 +00004783 }
Owen Taylor3473f882001-02-23 17:55:21 +00004784
4785
4786 /*
4787 * Then possibly doc type declaration(s) and more Misc
4788 * (doctypedecl Misc*)?
4789 */
4790 if ((CUR == '<') && (NXT(1) == '!') &&
4791 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4792 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4793 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4794 (UPP(8) == 'E')) {
4795 htmlParseDocTypeDecl(ctxt);
4796 }
4797 SKIP_BLANKS;
4798
4799 /*
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004800 * Parse possible comments and PIs before any content
Owen Taylor3473f882001-02-23 17:55:21 +00004801 */
Daniel Veillardfc484dd2004-10-22 14:34:23 +00004802 while (((CUR == '<') && (NXT(1) == '!') &&
4803 (NXT(2) == '-') && (NXT(3) == '-')) ||
4804 ((CUR == '<') && (NXT(1) == '?'))) {
Daniel Veillarde77db162009-08-22 11:32:38 +02004805 htmlParseComment(ctxt);
4806 htmlParsePI(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004807 SKIP_BLANKS;
Daniel Veillarde77db162009-08-22 11:32:38 +02004808 }
Owen Taylor3473f882001-02-23 17:55:21 +00004809
4810 /*
4811 * Time to start parsing the tree itself
4812 */
Eugene Pimenov615904f2010-03-15 15:16:02 +01004813 htmlParseContentInternal(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004814
4815 /*
4816 * autoclose
4817 */
4818 if (CUR == 0)
Daniel Veillarda3bfca52001-04-12 15:42:58 +00004819 htmlAutoCloseOnEnd(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00004820
4821
4822 /*
4823 * SAX: end of the document processing.
4824 */
4825 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4826 ctxt->sax->endDocument(ctxt->userData);
4827
Daniel Veillardf1121c42010-07-26 14:02:42 +02004828 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
Owen Taylor3473f882001-02-23 17:55:21 +00004829 dtd = xmlGetIntSubset(ctxt->myDoc);
4830 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02004831 ctxt->myDoc->intSubset =
4832 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00004833 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4834 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4835 }
4836 if (! ctxt->wellFormed) return(-1);
4837 return(0);
4838}
4839
4840
4841/************************************************************************
4842 * *
4843 * Parser contexts handling *
4844 * *
4845 ************************************************************************/
4846
4847/**
William M. Brackedb65a72004-02-06 07:36:04 +00004848 * htmlInitParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00004849 * @ctxt: an HTML parser context
4850 *
4851 * Initialize a parser context
Daniel Veillardf403d292003-10-05 13:51:35 +00004852 *
4853 * Returns 0 in case of success and -1 in case of error
Owen Taylor3473f882001-02-23 17:55:21 +00004854 */
4855
Daniel Veillardf403d292003-10-05 13:51:35 +00004856static int
Owen Taylor3473f882001-02-23 17:55:21 +00004857htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4858{
4859 htmlSAXHandler *sax;
4860
Daniel Veillardf403d292003-10-05 13:51:35 +00004861 if (ctxt == NULL) return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004862 memset(ctxt, 0, sizeof(htmlParserCtxt));
4863
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004864 ctxt->dict = xmlDictCreate();
4865 if (ctxt->dict == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004866 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4867 return(-1);
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004868 }
Owen Taylor3473f882001-02-23 17:55:21 +00004869 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4870 if (sax == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004871 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4872 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004873 }
4874 else
4875 memset(sax, 0, sizeof(htmlSAXHandler));
4876
4877 /* Allocate the Input stack */
Daniel Veillarde77db162009-08-22 11:32:38 +02004878 ctxt->inputTab = (htmlParserInputPtr *)
Owen Taylor3473f882001-02-23 17:55:21 +00004879 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4880 if (ctxt->inputTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004881 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004882 ctxt->inputNr = 0;
4883 ctxt->inputMax = 0;
4884 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004885 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004886 }
4887 ctxt->inputNr = 0;
4888 ctxt->inputMax = 5;
4889 ctxt->input = NULL;
4890 ctxt->version = NULL;
4891 ctxt->encoding = NULL;
4892 ctxt->standalone = -1;
4893 ctxt->instate = XML_PARSER_START;
4894
4895 /* Allocate the Node stack */
4896 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4897 if (ctxt->nodeTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004898 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004899 ctxt->nodeNr = 0;
4900 ctxt->nodeMax = 0;
4901 ctxt->node = NULL;
4902 ctxt->inputNr = 0;
4903 ctxt->inputMax = 0;
4904 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004905 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004906 }
4907 ctxt->nodeNr = 0;
4908 ctxt->nodeMax = 10;
4909 ctxt->node = NULL;
4910
4911 /* Allocate the Name stack */
Daniel Veillard2fdbd322003-08-18 12:15:38 +00004912 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
Owen Taylor3473f882001-02-23 17:55:21 +00004913 if (ctxt->nameTab == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004914 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
Owen Taylor3473f882001-02-23 17:55:21 +00004915 ctxt->nameNr = 0;
Eugene Pimenovef9c6362010-03-15 11:37:48 +01004916 ctxt->nameMax = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00004917 ctxt->name = NULL;
4918 ctxt->nodeNr = 0;
4919 ctxt->nodeMax = 0;
4920 ctxt->node = NULL;
4921 ctxt->inputNr = 0;
4922 ctxt->inputMax = 0;
4923 ctxt->input = NULL;
Daniel Veillardf403d292003-10-05 13:51:35 +00004924 return(-1);
Owen Taylor3473f882001-02-23 17:55:21 +00004925 }
4926 ctxt->nameNr = 0;
4927 ctxt->nameMax = 10;
4928 ctxt->name = NULL;
4929
Eugene Pimenov615904f2010-03-15 15:16:02 +01004930 ctxt->nodeInfoTab = NULL;
4931 ctxt->nodeInfoNr = 0;
4932 ctxt->nodeInfoMax = 0;
4933
Daniel Veillard092643b2003-09-25 14:29:29 +00004934 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
Owen Taylor3473f882001-02-23 17:55:21 +00004935 else {
4936 ctxt->sax = sax;
Daniel Veillard092643b2003-09-25 14:29:29 +00004937 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Owen Taylor3473f882001-02-23 17:55:21 +00004938 }
4939 ctxt->userData = ctxt;
4940 ctxt->myDoc = NULL;
4941 ctxt->wellFormed = 1;
4942 ctxt->replaceEntities = 0;
Daniel Veillard635ef722001-10-29 11:48:19 +00004943 ctxt->linenumbers = xmlLineNumbersDefaultValue;
Owen Taylor3473f882001-02-23 17:55:21 +00004944 ctxt->html = 1;
Daniel Veillardeff45a92004-10-29 12:10:55 +00004945 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
William M. Brackedb65a72004-02-06 07:36:04 +00004946 ctxt->vctxt.userData = ctxt;
4947 ctxt->vctxt.error = xmlParserValidityError;
4948 ctxt->vctxt.warning = xmlParserValidityWarning;
Owen Taylor3473f882001-02-23 17:55:21 +00004949 ctxt->record_info = 0;
4950 ctxt->validate = 0;
4951 ctxt->nbChars = 0;
4952 ctxt->checkIndex = 0;
Daniel Veillarddc2cee22001-08-22 16:30:37 +00004953 ctxt->catalogs = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00004954 xmlInitNodeInfoSeq(&ctxt->node_seq);
Daniel Veillardf403d292003-10-05 13:51:35 +00004955 return(0);
Owen Taylor3473f882001-02-23 17:55:21 +00004956}
4957
4958/**
4959 * htmlFreeParserCtxt:
4960 * @ctxt: an HTML parser context
4961 *
4962 * Free all the memory used by a parser context. However the parsed
4963 * document in ctxt->myDoc is not freed.
4964 */
4965
4966void
4967htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4968{
4969 xmlFreeParserCtxt(ctxt);
4970}
4971
4972/**
Daniel Veillard1d995272002-07-22 16:43:32 +00004973 * htmlNewParserCtxt:
4974 *
4975 * Allocate and initialize a new parser context.
4976 *
Daniel Veillard34c647c2006-09-21 06:53:59 +00004977 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
Daniel Veillard1d995272002-07-22 16:43:32 +00004978 */
4979
Daniel Veillard34c647c2006-09-21 06:53:59 +00004980htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00004981htmlNewParserCtxt(void)
4982{
4983 xmlParserCtxtPtr ctxt;
4984
4985 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4986 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00004987 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
Daniel Veillard1d995272002-07-22 16:43:32 +00004988 return(NULL);
4989 }
4990 memset(ctxt, 0, sizeof(xmlParserCtxt));
Daniel Veillardf403d292003-10-05 13:51:35 +00004991 if (htmlInitParserCtxt(ctxt) < 0) {
4992 htmlFreeParserCtxt(ctxt);
4993 return(NULL);
4994 }
Daniel Veillard1d995272002-07-22 16:43:32 +00004995 return(ctxt);
4996}
4997
4998/**
4999 * htmlCreateMemoryParserCtxt:
5000 * @buffer: a pointer to a char array
5001 * @size: the size of the array
5002 *
5003 * Create a parser context for an HTML in-memory document.
5004 *
5005 * Returns the new parser context or NULL
5006 */
Daniel Veillard02ea1412003-04-09 12:08:47 +00005007htmlParserCtxtPtr
Daniel Veillard1d995272002-07-22 16:43:32 +00005008htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5009 xmlParserCtxtPtr ctxt;
5010 xmlParserInputPtr input;
5011 xmlParserInputBufferPtr buf;
5012
5013 if (buffer == NULL)
5014 return(NULL);
5015 if (size <= 0)
5016 return(NULL);
5017
5018 ctxt = htmlNewParserCtxt();
5019 if (ctxt == NULL)
5020 return(NULL);
5021
5022 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5023 if (buf == NULL) return(NULL);
5024
5025 input = xmlNewInputStream(ctxt);
5026 if (input == NULL) {
5027 xmlFreeParserCtxt(ctxt);
5028 return(NULL);
5029 }
5030
5031 input->filename = NULL;
5032 input->buf = buf;
Daniel Veillard61551a12012-07-16 16:28:47 +08005033 xmlBufResetInput(buf->buffer, input);
Daniel Veillard1d995272002-07-22 16:43:32 +00005034
5035 inputPush(ctxt, input);
5036 return(ctxt);
5037}
5038
5039/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00005040 * htmlCreateDocParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00005041 * @cur: a pointer to an array of xmlChar
5042 * @encoding: a free form C string describing the HTML document encoding, or NULL
5043 *
5044 * Create a parser context for an HTML document.
5045 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005046 * TODO: check the need to add encoding handling there
5047 *
Owen Taylor3473f882001-02-23 17:55:21 +00005048 * Returns the new parser context or NULL
5049 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005050static htmlParserCtxtPtr
Daniel Veillard4d1320f2007-04-26 08:55:33 +00005051htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
Daniel Veillard1d995272002-07-22 16:43:32 +00005052 int len;
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005053 htmlParserCtxtPtr ctxt;
Owen Taylor3473f882001-02-23 17:55:21 +00005054
Daniel Veillard1d995272002-07-22 16:43:32 +00005055 if (cur == NULL)
Owen Taylor3473f882001-02-23 17:55:21 +00005056 return(NULL);
Daniel Veillard1d995272002-07-22 16:43:32 +00005057 len = xmlStrlen(cur);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005058 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
Daniel Veillard739e9d02007-04-27 09:33:58 +00005059 if (ctxt == NULL)
5060 return(NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005061
5062 if (encoding != NULL) {
5063 xmlCharEncoding enc;
5064 xmlCharEncodingHandlerPtr handler;
5065
5066 if (ctxt->input->encoding != NULL)
5067 xmlFree((xmlChar *) ctxt->input->encoding);
Daniel Veillarde8ed6202003-08-14 23:39:01 +00005068 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005069
5070 enc = xmlParseCharEncoding(encoding);
5071 /*
5072 * registered set of known encodings
5073 */
5074 if (enc != XML_CHAR_ENCODING_ERROR) {
5075 xmlSwitchEncoding(ctxt, enc);
5076 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005077 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
Daniel Veillarde77db162009-08-22 11:32:38 +02005078 "Unsupported encoding %s\n",
Daniel Veillardf403d292003-10-05 13:51:35 +00005079 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005080 }
5081 } else {
5082 /*
5083 * fallback for unknown encodings
5084 */
5085 handler = xmlFindCharEncodingHandler((const char *) encoding);
5086 if (handler != NULL) {
5087 xmlSwitchToEncoding(ctxt, handler);
5088 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005089 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5090 "Unsupported encoding %s\n",
5091 (const xmlChar *) encoding, NULL);
Daniel Veillarde5b110b2003-02-04 14:43:39 +00005092 }
5093 }
5094 }
5095 return(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005096}
5097
Daniel Veillard73b013f2003-09-30 12:36:01 +00005098#ifdef LIBXML_PUSH_ENABLED
Owen Taylor3473f882001-02-23 17:55:21 +00005099/************************************************************************
5100 * *
Daniel Veillarde77db162009-08-22 11:32:38 +02005101 * Progressive parsing interfaces *
Owen Taylor3473f882001-02-23 17:55:21 +00005102 * *
5103 ************************************************************************/
5104
5105/**
5106 * htmlParseLookupSequence:
5107 * @ctxt: an HTML parser context
5108 * @first: the first char to lookup
5109 * @next: the next char to lookup or zero
5110 * @third: the next char to lookup or zero
William M. Brack78637da2003-07-31 14:47:38 +00005111 * @comment: flag to force checking inside comments
Owen Taylor3473f882001-02-23 17:55:21 +00005112 *
5113 * Try to find if a sequence (first, next, third) or just (first next) or
5114 * (first) is available in the input stream.
5115 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5116 * to avoid rescanning sequences of bytes, it DOES change the state of the
5117 * parser, do not use liberally.
5118 * This is basically similar to xmlParseLookupSequence()
5119 *
5120 * Returns the index to the current parsing point if the full sequence
5121 * is available, -1 otherwise.
5122 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005123static int
Owen Taylor3473f882001-02-23 17:55:21 +00005124htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
Jiri Netolicky446e1262009-08-07 17:05:36 +02005125 xmlChar next, xmlChar third, int iscomment,
Daniel Veillardeeb99322009-08-25 14:42:16 +02005126 int ignoreattrval)
5127{
Owen Taylor3473f882001-02-23 17:55:21 +00005128 int base, len;
5129 htmlParserInputPtr in;
5130 const xmlChar *buf;
Daniel Veillardc1f78342001-11-10 11:43:05 +00005131 int incomment = 0;
Jiri Netolicky446e1262009-08-07 17:05:36 +02005132 int invalue = 0;
5133 char valdellim = 0x0;
Owen Taylor3473f882001-02-23 17:55:21 +00005134
5135 in = ctxt->input;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005136 if (in == NULL)
5137 return (-1);
5138
Owen Taylor3473f882001-02-23 17:55:21 +00005139 base = in->cur - in->base;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005140 if (base < 0)
5141 return (-1);
5142
Owen Taylor3473f882001-02-23 17:55:21 +00005143 if (ctxt->checkIndex > base)
5144 base = ctxt->checkIndex;
Daniel Veillardeeb99322009-08-25 14:42:16 +02005145
Owen Taylor3473f882001-02-23 17:55:21 +00005146 if (in->buf == NULL) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005147 buf = in->base;
5148 len = in->length;
Owen Taylor3473f882001-02-23 17:55:21 +00005149 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005150 buf = xmlBufContent(in->buf->buffer);
5151 len = xmlBufUse(in->buf->buffer);
Owen Taylor3473f882001-02-23 17:55:21 +00005152 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005153
Owen Taylor3473f882001-02-23 17:55:21 +00005154 /* take into account the sequence length */
Daniel Veillardeeb99322009-08-25 14:42:16 +02005155 if (third)
5156 len -= 2;
5157 else if (next)
5158 len--;
5159 for (; base < len; base++) {
5160 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5161 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5162 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5163 incomment = 1;
5164 /* do not increment past <! - some people use <!--> */
5165 base += 2;
5166 }
5167 }
5168 if (ignoreattrval) {
5169 if (buf[base] == '"' || buf[base] == '\'') {
5170 if (invalue) {
5171 if (buf[base] == valdellim) {
5172 invalue = 0;
5173 continue;
5174 }
5175 } else {
5176 valdellim = buf[base];
5177 invalue = 1;
5178 continue;
5179 }
5180 } else if (invalue) {
5181 continue;
5182 }
5183 }
5184 if (incomment) {
5185 if (base + 3 > len)
5186 return (-1);
5187 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5188 (buf[base + 2] == '>')) {
5189 incomment = 0;
5190 base += 2;
5191 }
5192 continue;
5193 }
Owen Taylor3473f882001-02-23 17:55:21 +00005194 if (buf[base] == first) {
Daniel Veillardeeb99322009-08-25 14:42:16 +02005195 if (third != 0) {
5196 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5197 continue;
5198 } else if (next != 0) {
5199 if (buf[base + 1] != next)
5200 continue;
5201 }
5202 ctxt->checkIndex = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00005203#ifdef DEBUG_PUSH
Daniel Veillardeeb99322009-08-25 14:42:16 +02005204 if (next == 0)
5205 xmlGenericError(xmlGenericErrorContext,
5206 "HPP: lookup '%c' found at %d\n",
5207 first, base);
5208 else if (third == 0)
5209 xmlGenericError(xmlGenericErrorContext,
5210 "HPP: lookup '%c%c' found at %d\n",
5211 first, next, base);
5212 else
5213 xmlGenericError(xmlGenericErrorContext,
5214 "HPP: lookup '%c%c%c' found at %d\n",
5215 first, next, third, base);
Owen Taylor3473f882001-02-23 17:55:21 +00005216#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005217 return (base - (in->cur - in->base));
5218 }
Owen Taylor3473f882001-02-23 17:55:21 +00005219 }
Daniel Veillardeeb99322009-08-25 14:42:16 +02005220 if ((!incomment) && (!invalue))
5221 ctxt->checkIndex = base;
Owen Taylor3473f882001-02-23 17:55:21 +00005222#ifdef DEBUG_PUSH
5223 if (next == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005224 xmlGenericError(xmlGenericErrorContext,
5225 "HPP: lookup '%c' failed\n", first);
Owen Taylor3473f882001-02-23 17:55:21 +00005226 else if (third == 0)
Daniel Veillardeeb99322009-08-25 14:42:16 +02005227 xmlGenericError(xmlGenericErrorContext,
5228 "HPP: lookup '%c%c' failed\n", first, next);
Daniel Veillarde77db162009-08-22 11:32:38 +02005229 else
Daniel Veillardeeb99322009-08-25 14:42:16 +02005230 xmlGenericError(xmlGenericErrorContext,
5231 "HPP: lookup '%c%c%c' failed\n", first, next,
5232 third);
Owen Taylor3473f882001-02-23 17:55:21 +00005233#endif
Daniel Veillardeeb99322009-08-25 14:42:16 +02005234 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00005235}
5236
5237/**
Markus Kull56a03032009-08-24 19:00:23 +02005238 * htmlParseLookupChars:
5239 * @ctxt: an HTML parser context
5240 * @stop: Array of chars, which stop the lookup.
5241 * @stopLen: Length of stop-Array
5242 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005243 * Try to find if any char of the stop-Array is available in the input
Markus Kull56a03032009-08-24 19:00:23 +02005244 * stream.
5245 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5246 * to avoid rescanning sequences of bytes, it DOES change the state of the
5247 * parser, do not use liberally.
5248 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005249 * Returns the index to the current parsing point if a stopChar
Markus Kull56a03032009-08-24 19:00:23 +02005250 * is available, -1 otherwise.
5251 */
5252static int
5253htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5254 int stopLen)
5255{
5256 int base, len;
5257 htmlParserInputPtr in;
5258 const xmlChar *buf;
5259 int incomment = 0;
5260 int i;
5261
5262 in = ctxt->input;
5263 if (in == NULL)
5264 return (-1);
5265
5266 base = in->cur - in->base;
5267 if (base < 0)
5268 return (-1);
5269
5270 if (ctxt->checkIndex > base)
5271 base = ctxt->checkIndex;
5272
5273 if (in->buf == NULL) {
5274 buf = in->base;
5275 len = in->length;
5276 } else {
Daniel Veillarda78d8032012-07-16 14:56:50 +08005277 buf = xmlBufContent(in->buf->buffer);
5278 len = xmlBufUse(in->buf->buffer);
Markus Kull56a03032009-08-24 19:00:23 +02005279 }
5280
5281 for (; base < len; base++) {
5282 if (!incomment && (base + 4 < len)) {
5283 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5284 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5285 incomment = 1;
5286 /* do not increment past <! - some people use <!--> */
5287 base += 2;
5288 }
5289 }
5290 if (incomment) {
5291 if (base + 3 > len)
5292 return (-1);
5293 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5294 (buf[base + 2] == '>')) {
5295 incomment = 0;
5296 base += 2;
5297 }
5298 continue;
5299 }
5300 for (i = 0; i < stopLen; ++i) {
5301 if (buf[base] == stop[i]) {
5302 ctxt->checkIndex = 0;
5303 return (base - (in->cur - in->base));
5304 }
5305 }
5306 }
5307 ctxt->checkIndex = base;
5308 return (-1);
5309}
5310
5311/**
Owen Taylor3473f882001-02-23 17:55:21 +00005312 * htmlParseTryOrFinish:
5313 * @ctxt: an HTML parser context
5314 * @terminate: last chunk indicator
5315 *
5316 * Try to progress on parsing
5317 *
5318 * Returns zero if no parsing was possible
5319 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005320static int
Owen Taylor3473f882001-02-23 17:55:21 +00005321htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5322 int ret = 0;
5323 htmlParserInputPtr in;
5324 int avail = 0;
5325 xmlChar cur, next;
5326
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005327 htmlParserNodeInfo node_info;
5328
Owen Taylor3473f882001-02-23 17:55:21 +00005329#ifdef DEBUG_PUSH
5330 switch (ctxt->instate) {
5331 case XML_PARSER_EOF:
5332 xmlGenericError(xmlGenericErrorContext,
5333 "HPP: try EOF\n"); break;
5334 case XML_PARSER_START:
5335 xmlGenericError(xmlGenericErrorContext,
5336 "HPP: try START\n"); break;
5337 case XML_PARSER_MISC:
5338 xmlGenericError(xmlGenericErrorContext,
5339 "HPP: try MISC\n");break;
5340 case XML_PARSER_COMMENT:
5341 xmlGenericError(xmlGenericErrorContext,
5342 "HPP: try COMMENT\n");break;
5343 case XML_PARSER_PROLOG:
5344 xmlGenericError(xmlGenericErrorContext,
5345 "HPP: try PROLOG\n");break;
5346 case XML_PARSER_START_TAG:
5347 xmlGenericError(xmlGenericErrorContext,
5348 "HPP: try START_TAG\n");break;
5349 case XML_PARSER_CONTENT:
5350 xmlGenericError(xmlGenericErrorContext,
5351 "HPP: try CONTENT\n");break;
5352 case XML_PARSER_CDATA_SECTION:
5353 xmlGenericError(xmlGenericErrorContext,
5354 "HPP: try CDATA_SECTION\n");break;
5355 case XML_PARSER_END_TAG:
5356 xmlGenericError(xmlGenericErrorContext,
5357 "HPP: try END_TAG\n");break;
5358 case XML_PARSER_ENTITY_DECL:
5359 xmlGenericError(xmlGenericErrorContext,
5360 "HPP: try ENTITY_DECL\n");break;
5361 case XML_PARSER_ENTITY_VALUE:
5362 xmlGenericError(xmlGenericErrorContext,
5363 "HPP: try ENTITY_VALUE\n");break;
5364 case XML_PARSER_ATTRIBUTE_VALUE:
5365 xmlGenericError(xmlGenericErrorContext,
5366 "HPP: try ATTRIBUTE_VALUE\n");break;
5367 case XML_PARSER_DTD:
5368 xmlGenericError(xmlGenericErrorContext,
5369 "HPP: try DTD\n");break;
5370 case XML_PARSER_EPILOG:
5371 xmlGenericError(xmlGenericErrorContext,
5372 "HPP: try EPILOG\n");break;
5373 case XML_PARSER_PI:
5374 xmlGenericError(xmlGenericErrorContext,
5375 "HPP: try PI\n");break;
5376 case XML_PARSER_SYSTEM_LITERAL:
5377 xmlGenericError(xmlGenericErrorContext,
5378 "HPP: try SYSTEM_LITERAL\n");break;
5379 }
5380#endif
5381
5382 while (1) {
5383
5384 in = ctxt->input;
5385 if (in == NULL) break;
5386 if (in->buf == NULL)
5387 avail = in->length - (in->cur - in->base);
5388 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005389 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005390 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00005391 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005392 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005393 /*
5394 * SAX: end of the document processing.
5395 */
5396 ctxt->instate = XML_PARSER_EOF;
5397 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5398 ctxt->sax->endDocument(ctxt->userData);
5399 }
5400 }
5401 if (avail < 1)
5402 goto done;
Daniel Veillard45269b82003-04-22 13:21:57 +00005403 cur = in->cur[0];
5404 if (cur == 0) {
5405 SKIP(1);
5406 continue;
5407 }
5408
Owen Taylor3473f882001-02-23 17:55:21 +00005409 switch (ctxt->instate) {
5410 case XML_PARSER_EOF:
5411 /*
5412 * Document parsing is done !
5413 */
5414 goto done;
5415 case XML_PARSER_START:
5416 /*
5417 * Very first chars read from the document flow.
5418 */
5419 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005420 if (IS_BLANK_CH(cur)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005421 SKIP_BLANKS;
5422 if (in->buf == NULL)
5423 avail = in->length - (in->cur - in->base);
5424 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005425 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005426 }
5427 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5428 ctxt->sax->setDocumentLocator(ctxt->userData,
5429 &xmlDefaultSAXLocator);
5430 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5431 (!ctxt->disableSAX))
5432 ctxt->sax->startDocument(ctxt->userData);
5433
5434 cur = in->cur[0];
5435 next = in->cur[1];
5436 if ((cur == '<') && (next == '!') &&
5437 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5438 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5439 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5440 (UPP(8) == 'E')) {
5441 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005442 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005443 goto done;
5444#ifdef DEBUG_PUSH
5445 xmlGenericError(xmlGenericErrorContext,
5446 "HPP: Parsing internal subset\n");
5447#endif
5448 htmlParseDocTypeDecl(ctxt);
5449 ctxt->instate = XML_PARSER_PROLOG;
5450#ifdef DEBUG_PUSH
5451 xmlGenericError(xmlGenericErrorContext,
5452 "HPP: entering PROLOG\n");
5453#endif
5454 } else {
5455 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005456#ifdef DEBUG_PUSH
Daniel Veillard597f1c12005-07-03 23:00:18 +00005457 xmlGenericError(xmlGenericErrorContext,
5458 "HPP: entering MISC\n");
Owen Taylor3473f882001-02-23 17:55:21 +00005459#endif
Daniel Veillard597f1c12005-07-03 23:00:18 +00005460 }
Owen Taylor3473f882001-02-23 17:55:21 +00005461 break;
5462 case XML_PARSER_MISC:
5463 SKIP_BLANKS;
5464 if (in->buf == NULL)
5465 avail = in->length - (in->cur - in->base);
5466 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005467 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Denis Paukfdf990c2012-05-10 20:40:49 +08005468 /*
5469 * no chars in buffer
5470 */
5471 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005472 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005473 /*
5474 * not enough chars in buffer
5475 */
5476 if (avail < 2) {
5477 if (!terminate)
5478 goto done;
5479 else
5480 next = ' ';
5481 } else {
5482 next = in->cur[1];
5483 }
Owen Taylor3473f882001-02-23 17:55:21 +00005484 cur = in->cur[0];
Owen Taylor3473f882001-02-23 17:55:21 +00005485 if ((cur == '<') && (next == '!') &&
5486 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5487 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005488 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005489 goto done;
5490#ifdef DEBUG_PUSH
5491 xmlGenericError(xmlGenericErrorContext,
5492 "HPP: Parsing Comment\n");
5493#endif
5494 htmlParseComment(ctxt);
5495 ctxt->instate = XML_PARSER_MISC;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005496 } else if ((cur == '<') && (next == '?')) {
5497 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005498 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005499 goto done;
5500#ifdef DEBUG_PUSH
5501 xmlGenericError(xmlGenericErrorContext,
5502 "HPP: Parsing PI\n");
5503#endif
5504 htmlParsePI(ctxt);
5505 ctxt->instate = XML_PARSER_MISC;
Owen Taylor3473f882001-02-23 17:55:21 +00005506 } else if ((cur == '<') && (next == '!') &&
5507 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5508 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5509 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5510 (UPP(8) == 'E')) {
5511 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005512 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005513 goto done;
5514#ifdef DEBUG_PUSH
5515 xmlGenericError(xmlGenericErrorContext,
5516 "HPP: Parsing internal subset\n");
5517#endif
5518 htmlParseDocTypeDecl(ctxt);
5519 ctxt->instate = XML_PARSER_PROLOG;
5520#ifdef DEBUG_PUSH
5521 xmlGenericError(xmlGenericErrorContext,
5522 "HPP: entering PROLOG\n");
5523#endif
5524 } else if ((cur == '<') && (next == '!') &&
5525 (avail < 9)) {
5526 goto done;
5527 } else {
5528 ctxt->instate = XML_PARSER_START_TAG;
5529#ifdef DEBUG_PUSH
5530 xmlGenericError(xmlGenericErrorContext,
5531 "HPP: entering START_TAG\n");
5532#endif
5533 }
5534 break;
5535 case XML_PARSER_PROLOG:
5536 SKIP_BLANKS;
5537 if (in->buf == NULL)
5538 avail = in->length - (in->cur - in->base);
5539 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005540 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Daniel Veillarde77db162009-08-22 11:32:38 +02005541 if (avail < 2)
Owen Taylor3473f882001-02-23 17:55:21 +00005542 goto done;
5543 cur = in->cur[0];
5544 next = in->cur[1];
5545 if ((cur == '<') && (next == '!') &&
5546 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5547 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005548 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005549 goto done;
5550#ifdef DEBUG_PUSH
5551 xmlGenericError(xmlGenericErrorContext,
5552 "HPP: Parsing Comment\n");
5553#endif
5554 htmlParseComment(ctxt);
5555 ctxt->instate = XML_PARSER_PROLOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005556 } else if ((cur == '<') && (next == '?')) {
5557 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005558 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005559 goto done;
5560#ifdef DEBUG_PUSH
5561 xmlGenericError(xmlGenericErrorContext,
5562 "HPP: Parsing PI\n");
5563#endif
5564 htmlParsePI(ctxt);
5565 ctxt->instate = XML_PARSER_PROLOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005566 } else if ((cur == '<') && (next == '!') &&
5567 (avail < 4)) {
5568 goto done;
5569 } else {
5570 ctxt->instate = XML_PARSER_START_TAG;
5571#ifdef DEBUG_PUSH
5572 xmlGenericError(xmlGenericErrorContext,
5573 "HPP: entering START_TAG\n");
5574#endif
5575 }
5576 break;
5577 case XML_PARSER_EPILOG:
5578 if (in->buf == NULL)
5579 avail = in->length - (in->cur - in->base);
5580 else
Daniel Veillarda78d8032012-07-16 14:56:50 +08005581 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
Owen Taylor3473f882001-02-23 17:55:21 +00005582 if (avail < 1)
5583 goto done;
5584 cur = in->cur[0];
William M. Brack76e95df2003-10-18 16:20:14 +00005585 if (IS_BLANK_CH(cur)) {
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005586 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005587 goto done;
5588 }
5589 if (avail < 2)
5590 goto done;
5591 next = in->cur[1];
5592 if ((cur == '<') && (next == '!') &&
5593 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5594 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005595 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005596 goto done;
5597#ifdef DEBUG_PUSH
5598 xmlGenericError(xmlGenericErrorContext,
5599 "HPP: Parsing Comment\n");
5600#endif
5601 htmlParseComment(ctxt);
5602 ctxt->instate = XML_PARSER_EPILOG;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005603 } else if ((cur == '<') && (next == '?')) {
5604 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005605 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005606 goto done;
5607#ifdef DEBUG_PUSH
5608 xmlGenericError(xmlGenericErrorContext,
5609 "HPP: Parsing PI\n");
5610#endif
5611 htmlParsePI(ctxt);
5612 ctxt->instate = XML_PARSER_EPILOG;
Owen Taylor3473f882001-02-23 17:55:21 +00005613 } else if ((cur == '<') && (next == '!') &&
5614 (avail < 4)) {
5615 goto done;
5616 } else {
5617 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00005618 ctxt->wellFormed = 0;
5619 ctxt->instate = XML_PARSER_EOF;
5620#ifdef DEBUG_PUSH
5621 xmlGenericError(xmlGenericErrorContext,
5622 "HPP: entering EOF\n");
5623#endif
5624 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5625 ctxt->sax->endDocument(ctxt->userData);
5626 goto done;
5627 }
5628 break;
5629 case XML_PARSER_START_TAG: {
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005630 const xmlChar *name;
Daniel Veillard597f1c12005-07-03 23:00:18 +00005631 int failed;
Daniel Veillardbb371292001-08-16 23:26:59 +00005632 const htmlElemDesc * info;
Owen Taylor3473f882001-02-23 17:55:21 +00005633
Denis Paukfdf990c2012-05-10 20:40:49 +08005634 /*
5635 * no chars in buffer
5636 */
5637 if (avail < 1)
Owen Taylor3473f882001-02-23 17:55:21 +00005638 goto done;
Denis Paukfdf990c2012-05-10 20:40:49 +08005639 /*
5640 * not enough chars in buffer
5641 */
5642 if (avail < 2) {
5643 if (!terminate)
5644 goto done;
5645 else
5646 next = ' ';
5647 } else {
5648 next = in->cur[1];
5649 }
Owen Taylor3473f882001-02-23 17:55:21 +00005650 cur = in->cur[0];
5651 if (cur != '<') {
5652 ctxt->instate = XML_PARSER_CONTENT;
5653#ifdef DEBUG_PUSH
5654 xmlGenericError(xmlGenericErrorContext,
5655 "HPP: entering CONTENT\n");
5656#endif
5657 break;
5658 }
Denis Paukfdf990c2012-05-10 20:40:49 +08005659 if (next == '/') {
Daniel Veillardf69bb4b2001-05-19 13:24:56 +00005660 ctxt->instate = XML_PARSER_END_TAG;
5661 ctxt->checkIndex = 0;
5662#ifdef DEBUG_PUSH
5663 xmlGenericError(xmlGenericErrorContext,
5664 "HPP: entering END_TAG\n");
5665#endif
5666 break;
5667 }
Owen Taylor3473f882001-02-23 17:55:21 +00005668 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005669 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005670 goto done;
5671
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005672 /* Capture start position */
5673 if (ctxt->record_info) {
5674 node_info.begin_pos = ctxt->input->consumed +
5675 (CUR_PTR - ctxt->input->base);
5676 node_info.begin_line = ctxt->input->line;
5677 }
5678
5679
Daniel Veillard597f1c12005-07-03 23:00:18 +00005680 failed = htmlParseStartTag(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005681 name = ctxt->name;
Daniel Veillard35fcbb82008-03-12 21:43:39 +00005682 if ((failed == -1) ||
Owen Taylor3473f882001-02-23 17:55:21 +00005683 (name == NULL)) {
5684 if (CUR == '>')
5685 NEXT;
Owen Taylor3473f882001-02-23 17:55:21 +00005686 break;
5687 }
Owen Taylor3473f882001-02-23 17:55:21 +00005688
5689 /*
5690 * Lookup the info for that element.
5691 */
5692 info = htmlTagLookup(name);
5693 if (info == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005694 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5695 "Tag %s invalid\n", name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005696 }
5697
5698 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00005699 * Check for an Empty Element labeled the XML/SGML way
Owen Taylor3473f882001-02-23 17:55:21 +00005700 */
5701 if ((CUR == '/') && (NXT(1) == '>')) {
5702 SKIP(2);
5703 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5704 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005705 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005706 ctxt->instate = XML_PARSER_CONTENT;
5707#ifdef DEBUG_PUSH
5708 xmlGenericError(xmlGenericErrorContext,
5709 "HPP: entering CONTENT\n");
5710#endif
5711 break;
5712 }
5713
5714 if (CUR == '>') {
5715 NEXT;
5716 } else {
Daniel Veillardf403d292003-10-05 13:51:35 +00005717 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5718 "Couldn't find end of Start Tag %s\n",
5719 name, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005720
5721 /*
5722 * end of parsing of this node.
5723 */
Daniel Veillarde77db162009-08-22 11:32:38 +02005724 if (xmlStrEqual(name, ctxt->name)) {
Owen Taylor3473f882001-02-23 17:55:21 +00005725 nodePop(ctxt);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005726 htmlnamePop(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02005727 }
Owen Taylor3473f882001-02-23 17:55:21 +00005728
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005729 if (ctxt->record_info)
5730 htmlNodeInfoPush(ctxt, &node_info);
5731
Owen Taylor3473f882001-02-23 17:55:21 +00005732 ctxt->instate = XML_PARSER_CONTENT;
5733#ifdef DEBUG_PUSH
5734 xmlGenericError(xmlGenericErrorContext,
5735 "HPP: entering CONTENT\n");
5736#endif
5737 break;
5738 }
5739
5740 /*
5741 * Check for an Empty Element from DTD definition
5742 */
5743 if ((info != NULL) && (info->empty)) {
5744 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5745 ctxt->sax->endElement(ctxt->userData, name);
Daniel Veillard6a0baa02005-12-10 11:11:12 +00005746 htmlnamePop(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005747 }
Pavel Andrejs8ad4da52012-05-08 11:01:12 +08005748
5749 if (ctxt->record_info)
5750 htmlNodeInfoPush(ctxt, &node_info);
5751
Owen Taylor3473f882001-02-23 17:55:21 +00005752 ctxt->instate = XML_PARSER_CONTENT;
5753#ifdef DEBUG_PUSH
5754 xmlGenericError(xmlGenericErrorContext,
5755 "HPP: entering CONTENT\n");
5756#endif
5757 break;
5758 }
5759 case XML_PARSER_CONTENT: {
5760 long cons;
5761 /*
5762 * Handle preparsed entities and charRef
5763 */
5764 if (ctxt->token != 0) {
5765 xmlChar chr[2] = { 0 , 0 } ;
5766
5767 chr[0] = (xmlChar) ctxt->token;
5768 htmlCheckParagraph(ctxt);
5769 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5770 ctxt->sax->characters(ctxt->userData, chr, 1);
5771 ctxt->token = 0;
5772 ctxt->checkIndex = 0;
5773 }
5774 if ((avail == 1) && (terminate)) {
5775 cur = in->cur[0];
5776 if ((cur != '<') && (cur != '&')) {
5777 if (ctxt->sax != NULL) {
William M. Brack76e95df2003-10-18 16:20:14 +00005778 if (IS_BLANK_CH(cur)) {
Daniel Veillardf933c892012-09-07 19:32:12 +08005779 if (ctxt->keepBlanks) {
5780 if (ctxt->sax->characters != NULL)
5781 ctxt->sax->characters(
Xin Li28c53d32017-03-07 00:33:02 +00005782 ctxt->userData, &in->cur[0], 1);
Daniel Veillardf933c892012-09-07 19:32:12 +08005783 } else {
5784 if (ctxt->sax->ignorableWhitespace != NULL)
5785 ctxt->sax->ignorableWhitespace(
Xin Li28c53d32017-03-07 00:33:02 +00005786 ctxt->userData, &in->cur[0], 1);
Daniel Veillardf933c892012-09-07 19:32:12 +08005787 }
Owen Taylor3473f882001-02-23 17:55:21 +00005788 } else {
5789 htmlCheckParagraph(ctxt);
5790 if (ctxt->sax->characters != NULL)
5791 ctxt->sax->characters(
Xin Li28c53d32017-03-07 00:33:02 +00005792 ctxt->userData, &in->cur[0], 1);
Owen Taylor3473f882001-02-23 17:55:21 +00005793 }
5794 }
5795 ctxt->token = 0;
5796 ctxt->checkIndex = 0;
Daniel Veillardbc6e1a32002-11-18 15:07:25 +00005797 in->cur++;
William M. Brack1633d182001-10-05 15:41:19 +00005798 break;
Owen Taylor3473f882001-02-23 17:55:21 +00005799 }
Owen Taylor3473f882001-02-23 17:55:21 +00005800 }
5801 if (avail < 2)
5802 goto done;
5803 cur = in->cur[0];
5804 next = in->cur[1];
5805 cons = ctxt->nbChars;
5806 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5807 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5808 /*
5809 * Handle SCRIPT/STYLE separately
5810 */
Daniel Veillard68716a72006-10-16 09:32:17 +00005811 if (!terminate) {
5812 int idx;
5813 xmlChar val;
5814
Denis Pauk91d239c2010-11-04 12:39:18 +01005815 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
Daniel Veillard68716a72006-10-16 09:32:17 +00005816 if (idx < 0)
5817 goto done;
5818 val = in->cur[idx + 2];
5819 if (val == 0) /* bad cut of input */
5820 goto done;
5821 }
Owen Taylor3473f882001-02-23 17:55:21 +00005822 htmlParseScript(ctxt);
5823 if ((cur == '<') && (next == '/')) {
5824 ctxt->instate = XML_PARSER_END_TAG;
5825 ctxt->checkIndex = 0;
5826#ifdef DEBUG_PUSH
5827 xmlGenericError(xmlGenericErrorContext,
5828 "HPP: entering END_TAG\n");
5829#endif
5830 break;
5831 }
5832 } else {
5833 /*
5834 * Sometimes DOCTYPE arrives in the middle of the document
5835 */
5836 if ((cur == '<') && (next == '!') &&
5837 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5838 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5839 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5840 (UPP(8) == 'E')) {
5841 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005842 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005843 goto done;
Daniel Veillardf403d292003-10-05 13:51:35 +00005844 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5845 "Misplaced DOCTYPE declaration\n",
5846 BAD_CAST "DOCTYPE" , NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005847 htmlParseDocTypeDecl(ctxt);
5848 } else if ((cur == '<') && (next == '!') &&
5849 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5850 if ((!terminate) &&
Daniel Veillard97e01882003-07-30 18:59:19 +00005851 (htmlParseLookupSequence(
Daniel Veillarde77db162009-08-22 11:32:38 +02005852 ctxt, '-', '-', '>', 1, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005853 goto done;
5854#ifdef DEBUG_PUSH
5855 xmlGenericError(xmlGenericErrorContext,
5856 "HPP: Parsing Comment\n");
5857#endif
5858 htmlParseComment(ctxt);
5859 ctxt->instate = XML_PARSER_CONTENT;
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005860 } else if ((cur == '<') && (next == '?')) {
5861 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005862 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Daniel Veillardfc484dd2004-10-22 14:34:23 +00005863 goto done;
5864#ifdef DEBUG_PUSH
5865 xmlGenericError(xmlGenericErrorContext,
5866 "HPP: Parsing PI\n");
5867#endif
5868 htmlParsePI(ctxt);
5869 ctxt->instate = XML_PARSER_CONTENT;
Owen Taylor3473f882001-02-23 17:55:21 +00005870 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5871 goto done;
5872 } else if ((cur == '<') && (next == '/')) {
5873 ctxt->instate = XML_PARSER_END_TAG;
5874 ctxt->checkIndex = 0;
5875#ifdef DEBUG_PUSH
5876 xmlGenericError(xmlGenericErrorContext,
5877 "HPP: entering END_TAG\n");
5878#endif
5879 break;
5880 } else if (cur == '<') {
5881 ctxt->instate = XML_PARSER_START_TAG;
5882 ctxt->checkIndex = 0;
5883#ifdef DEBUG_PUSH
5884 xmlGenericError(xmlGenericErrorContext,
5885 "HPP: entering START_TAG\n");
5886#endif
5887 break;
5888 } else if (cur == '&') {
5889 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005890 (htmlParseLookupChars(ctxt,
5891 BAD_CAST "; >/", 4) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005892 goto done;
5893#ifdef DEBUG_PUSH
5894 xmlGenericError(xmlGenericErrorContext,
5895 "HPP: Parsing Reference\n");
5896#endif
5897 /* TODO: check generation of subtrees if noent !!! */
5898 htmlParseReference(ctxt);
5899 } else {
Daniel Veillard14f752c2003-08-09 11:44:50 +00005900 /*
5901 * check that the text sequence is complete
5902 * before handing out the data to the parser
5903 * to avoid problems with erroneous end of
5904 * data detection.
Owen Taylor3473f882001-02-23 17:55:21 +00005905 */
Daniel Veillard14f752c2003-08-09 11:44:50 +00005906 if ((!terminate) &&
Markus Kull56a03032009-08-24 19:00:23 +02005907 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
Daniel Veillard14f752c2003-08-09 11:44:50 +00005908 goto done;
Owen Taylor3473f882001-02-23 17:55:21 +00005909 ctxt->checkIndex = 0;
5910#ifdef DEBUG_PUSH
5911 xmlGenericError(xmlGenericErrorContext,
5912 "HPP: Parsing char data\n");
5913#endif
Daniel Veillard56a4cb82001-03-24 17:00:36 +00005914 htmlParseCharData(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00005915 }
5916 }
5917 if (cons == ctxt->nbChars) {
5918 if (ctxt->node != NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00005919 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5920 "detected an error in element content\n",
5921 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005922 }
5923 NEXT;
5924 break;
5925 }
5926
5927 break;
5928 }
5929 case XML_PARSER_END_TAG:
5930 if (avail < 2)
5931 goto done;
5932 if ((!terminate) &&
Jiri Netolicky446e1262009-08-07 17:05:36 +02005933 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
Owen Taylor3473f882001-02-23 17:55:21 +00005934 goto done;
5935 htmlParseEndTag(ctxt);
5936 if (ctxt->nameNr == 0) {
5937 ctxt->instate = XML_PARSER_EPILOG;
5938 } else {
5939 ctxt->instate = XML_PARSER_CONTENT;
5940 }
5941 ctxt->checkIndex = 0;
5942#ifdef DEBUG_PUSH
5943 xmlGenericError(xmlGenericErrorContext,
5944 "HPP: entering CONTENT\n");
5945#endif
5946 break;
5947 case XML_PARSER_CDATA_SECTION:
Daniel Veillardf403d292003-10-05 13:51:35 +00005948 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5949 "HPP: internal error, state == CDATA\n",
5950 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005951 ctxt->instate = XML_PARSER_CONTENT;
5952 ctxt->checkIndex = 0;
5953#ifdef DEBUG_PUSH
5954 xmlGenericError(xmlGenericErrorContext,
5955 "HPP: entering CONTENT\n");
5956#endif
5957 break;
5958 case XML_PARSER_DTD:
Daniel Veillardf403d292003-10-05 13:51:35 +00005959 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5960 "HPP: internal error, state == DTD\n",
5961 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005962 ctxt->instate = XML_PARSER_CONTENT;
5963 ctxt->checkIndex = 0;
5964#ifdef DEBUG_PUSH
5965 xmlGenericError(xmlGenericErrorContext,
5966 "HPP: entering CONTENT\n");
5967#endif
5968 break;
5969 case XML_PARSER_COMMENT:
Daniel Veillardf403d292003-10-05 13:51:35 +00005970 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5971 "HPP: internal error, state == COMMENT\n",
5972 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005973 ctxt->instate = XML_PARSER_CONTENT;
5974 ctxt->checkIndex = 0;
5975#ifdef DEBUG_PUSH
5976 xmlGenericError(xmlGenericErrorContext,
5977 "HPP: entering CONTENT\n");
5978#endif
5979 break;
5980 case XML_PARSER_PI:
Daniel Veillardf403d292003-10-05 13:51:35 +00005981 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5982 "HPP: internal error, state == PI\n",
5983 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005984 ctxt->instate = XML_PARSER_CONTENT;
5985 ctxt->checkIndex = 0;
5986#ifdef DEBUG_PUSH
5987 xmlGenericError(xmlGenericErrorContext,
5988 "HPP: entering CONTENT\n");
5989#endif
5990 break;
5991 case XML_PARSER_ENTITY_DECL:
Daniel Veillardf403d292003-10-05 13:51:35 +00005992 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5993 "HPP: internal error, state == ENTITY_DECL\n",
5994 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00005995 ctxt->instate = XML_PARSER_CONTENT;
5996 ctxt->checkIndex = 0;
5997#ifdef DEBUG_PUSH
5998 xmlGenericError(xmlGenericErrorContext,
5999 "HPP: entering CONTENT\n");
6000#endif
6001 break;
6002 case XML_PARSER_ENTITY_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00006003 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6004 "HPP: internal error, state == ENTITY_VALUE\n",
6005 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006006 ctxt->instate = XML_PARSER_CONTENT;
6007 ctxt->checkIndex = 0;
6008#ifdef DEBUG_PUSH
6009 xmlGenericError(xmlGenericErrorContext,
6010 "HPP: entering DTD\n");
6011#endif
6012 break;
6013 case XML_PARSER_ATTRIBUTE_VALUE:
Daniel Veillardf403d292003-10-05 13:51:35 +00006014 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6015 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6016 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006017 ctxt->instate = XML_PARSER_START_TAG;
6018 ctxt->checkIndex = 0;
6019#ifdef DEBUG_PUSH
6020 xmlGenericError(xmlGenericErrorContext,
6021 "HPP: entering START_TAG\n");
6022#endif
6023 break;
6024 case XML_PARSER_SYSTEM_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00006025 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6026 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6027 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006028 ctxt->instate = XML_PARSER_CONTENT;
6029 ctxt->checkIndex = 0;
6030#ifdef DEBUG_PUSH
6031 xmlGenericError(xmlGenericErrorContext,
6032 "HPP: entering CONTENT\n");
6033#endif
6034 break;
6035 case XML_PARSER_IGNORE:
Daniel Veillardf403d292003-10-05 13:51:35 +00006036 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6037 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6038 NULL, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00006039 ctxt->instate = XML_PARSER_CONTENT;
6040 ctxt->checkIndex = 0;
6041#ifdef DEBUG_PUSH
6042 xmlGenericError(xmlGenericErrorContext,
6043 "HPP: entering CONTENT\n");
6044#endif
6045 break;
Daniel Veillard044fc6b2002-03-04 17:09:44 +00006046 case XML_PARSER_PUBLIC_LITERAL:
Daniel Veillardf403d292003-10-05 13:51:35 +00006047 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6048 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6049 NULL, NULL);
Daniel Veillard044fc6b2002-03-04 17:09:44 +00006050 ctxt->instate = XML_PARSER_CONTENT;
6051 ctxt->checkIndex = 0;
6052#ifdef DEBUG_PUSH
6053 xmlGenericError(xmlGenericErrorContext,
6054 "HPP: entering CONTENT\n");
6055#endif
6056 break;
6057
Owen Taylor3473f882001-02-23 17:55:21 +00006058 }
6059 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006060done:
Owen Taylor3473f882001-02-23 17:55:21 +00006061 if ((avail == 0) && (terminate)) {
Daniel Veillarda3bfca52001-04-12 15:42:58 +00006062 htmlAutoCloseOnEnd(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006063 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
Owen Taylor3473f882001-02-23 17:55:21 +00006064 /*
6065 * SAX: end of the document processing.
6066 */
6067 ctxt->instate = XML_PARSER_EOF;
6068 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6069 ctxt->sax->endDocument(ctxt->userData);
6070 }
6071 }
Arnold Hendriks826bc322013-11-29 14:12:12 +08006072 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
Owen Taylor3473f882001-02-23 17:55:21 +00006073 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6074 (ctxt->instate == XML_PARSER_EPILOG))) {
6075 xmlDtdPtr dtd;
6076 dtd = xmlGetIntSubset(ctxt->myDoc);
6077 if (dtd == NULL)
Daniel Veillarde77db162009-08-22 11:32:38 +02006078 ctxt->myDoc->intSubset =
6079 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
Owen Taylor3473f882001-02-23 17:55:21 +00006080 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6081 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6082 }
6083#ifdef DEBUG_PUSH
6084 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6085#endif
6086 return(ret);
6087}
6088
6089/**
Owen Taylor3473f882001-02-23 17:55:21 +00006090 * htmlParseChunk:
Daniel Veillardf403d292003-10-05 13:51:35 +00006091 * @ctxt: an HTML parser context
Owen Taylor3473f882001-02-23 17:55:21 +00006092 * @chunk: an char array
6093 * @size: the size in byte of the chunk
6094 * @terminate: last chunk indicator
6095 *
6096 * Parse a Chunk of memory
6097 *
6098 * Returns zero if no error, the xmlParserErrors otherwise.
6099 */
6100int
6101htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6102 int terminate) {
Daniel Veillarda03e3652004-11-02 18:45:30 +00006103 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6104 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6105 "htmlParseChunk: context error\n", NULL, NULL);
6106 return(XML_ERR_INTERNAL_ERROR);
6107 }
Owen Taylor3473f882001-02-23 17:55:21 +00006108 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6109 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006110 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6111 size_t cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillardd2755a82005-08-07 23:42:39 +00006112 int res;
Daniel Veillarde77db162009-08-22 11:32:38 +02006113
6114 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillardd2755a82005-08-07 23:42:39 +00006115 if (res < 0) {
6116 ctxt->errNo = XML_PARSER_EOF;
6117 ctxt->disableSAX = 1;
6118 return (XML_PARSER_EOF);
6119 }
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006120 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Owen Taylor3473f882001-02-23 17:55:21 +00006121#ifdef DEBUG_PUSH
6122 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6123#endif
6124
Daniel Veillard14f752c2003-08-09 11:44:50 +00006125#if 0
Owen Taylor3473f882001-02-23 17:55:21 +00006126 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6127 htmlParseTryOrFinish(ctxt, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006128#endif
Owen Taylor3473f882001-02-23 17:55:21 +00006129 } else if (ctxt->instate != XML_PARSER_EOF) {
Daniel Veillard14f752c2003-08-09 11:44:50 +00006130 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6131 xmlParserInputBufferPtr in = ctxt->input->buf;
6132 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6133 (in->raw != NULL)) {
6134 int nbchars;
Daniel Veillardde0cc202013-02-12 16:55:34 +08006135 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6136 size_t current = ctxt->input->cur - ctxt->input->base;
Daniel Veillarde77db162009-08-22 11:32:38 +02006137
Daniel Veillardbf058dc2013-02-13 18:19:42 +08006138 nbchars = xmlCharEncInput(in, terminate);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006139 if (nbchars < 0) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006140 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6141 "encoder error\n", NULL, NULL);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006142 return(XML_ERR_INVALID_ENCODING);
6143 }
Daniel Veillardde0cc202013-02-12 16:55:34 +08006144 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
Daniel Veillard14f752c2003-08-09 11:44:50 +00006145 }
6146 }
Owen Taylor3473f882001-02-23 17:55:21 +00006147 }
Daniel Veillard14f752c2003-08-09 11:44:50 +00006148 htmlParseTryOrFinish(ctxt, terminate);
Owen Taylor3473f882001-02-23 17:55:21 +00006149 if (terminate) {
6150 if ((ctxt->instate != XML_PARSER_EOF) &&
6151 (ctxt->instate != XML_PARSER_EPILOG) &&
6152 (ctxt->instate != XML_PARSER_MISC)) {
6153 ctxt->errNo = XML_ERR_DOCUMENT_END;
Owen Taylor3473f882001-02-23 17:55:21 +00006154 ctxt->wellFormed = 0;
Daniel Veillarde77db162009-08-22 11:32:38 +02006155 }
Owen Taylor3473f882001-02-23 17:55:21 +00006156 if (ctxt->instate != XML_PARSER_EOF) {
6157 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6158 ctxt->sax->endDocument(ctxt->userData);
6159 }
6160 ctxt->instate = XML_PARSER_EOF;
6161 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006162 return((xmlParserErrors) ctxt->errNo);
Owen Taylor3473f882001-02-23 17:55:21 +00006163}
6164
6165/************************************************************************
6166 * *
6167 * User entry points *
6168 * *
6169 ************************************************************************/
6170
6171/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006172 * htmlCreatePushParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006173 * @sax: a SAX handler
6174 * @user_data: The user data returned on SAX callbacks
6175 * @chunk: a pointer to an array of chars
6176 * @size: number of chars in the array
6177 * @filename: an optional file name or URI
6178 * @enc: an optional encoding
6179 *
6180 * Create a parser context for using the HTML parser in push mode
Owen Taylor3473f882001-02-23 17:55:21 +00006181 * The value of @filename is used for fetching external entities
6182 * and error/warning reports.
6183 *
6184 * Returns the new parser context or NULL
6185 */
6186htmlParserCtxtPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006187htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
Owen Taylor3473f882001-02-23 17:55:21 +00006188 const char *chunk, int size, const char *filename,
6189 xmlCharEncoding enc) {
6190 htmlParserCtxtPtr ctxt;
6191 htmlParserInputPtr inputStream;
6192 xmlParserInputBufferPtr buf;
6193
Daniel Veillardd0463562001-10-13 09:15:48 +00006194 xmlInitParser();
6195
Owen Taylor3473f882001-02-23 17:55:21 +00006196 buf = xmlAllocParserInputBuffer(enc);
6197 if (buf == NULL) return(NULL);
6198
Daniel Veillardf403d292003-10-05 13:51:35 +00006199 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006200 if (ctxt == NULL) {
Daniel Veillardf403d292003-10-05 13:51:35 +00006201 xmlFreeParserInputBuffer(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006202 return(NULL);
6203 }
Daniel Veillard77a90a72003-03-22 00:04:05 +00006204 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6205 ctxt->charset=XML_CHAR_ENCODING_UTF8;
Owen Taylor3473f882001-02-23 17:55:21 +00006206 if (sax != NULL) {
Daniel Veillard092643b2003-09-25 14:29:29 +00006207 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
Owen Taylor3473f882001-02-23 17:55:21 +00006208 xmlFree(ctxt->sax);
6209 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6210 if (ctxt->sax == NULL) {
6211 xmlFree(buf);
6212 xmlFree(ctxt);
6213 return(NULL);
6214 }
6215 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6216 if (user_data != NULL)
6217 ctxt->userData = user_data;
Daniel Veillarde77db162009-08-22 11:32:38 +02006218 }
Owen Taylor3473f882001-02-23 17:55:21 +00006219 if (filename == NULL) {
6220 ctxt->directory = NULL;
6221 } else {
6222 ctxt->directory = xmlParserGetDirectory(filename);
6223 }
6224
6225 inputStream = htmlNewInputStream(ctxt);
6226 if (inputStream == NULL) {
6227 xmlFreeParserCtxt(ctxt);
Daniel Veillard77a90a72003-03-22 00:04:05 +00006228 xmlFree(buf);
Owen Taylor3473f882001-02-23 17:55:21 +00006229 return(NULL);
6230 }
6231
6232 if (filename == NULL)
6233 inputStream->filename = NULL;
6234 else
Daniel Veillard5f704af2003-03-05 10:01:43 +00006235 inputStream->filename = (char *)
6236 xmlCanonicPath((const xmlChar *) filename);
Owen Taylor3473f882001-02-23 17:55:21 +00006237 inputStream->buf = buf;
Daniel Veillard61551a12012-07-16 16:28:47 +08006238 xmlBufResetInput(buf->buffer, inputStream);
Owen Taylor3473f882001-02-23 17:55:21 +00006239
6240 inputPush(ctxt, inputStream);
6241
6242 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
Daniel Veillarde77db162009-08-22 11:32:38 +02006243 (ctxt->input->buf != NULL)) {
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006244 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6245 size_t cur = ctxt->input->cur - ctxt->input->base;
Daniel Veillard5f704af2003-03-05 10:01:43 +00006246
Daniel Veillarde77db162009-08-22 11:32:38 +02006247 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
Daniel Veillard5f704af2003-03-05 10:01:43 +00006248
Daniel Veillard00ac0d32012-07-16 18:03:01 +08006249 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
Owen Taylor3473f882001-02-23 17:55:21 +00006250#ifdef DEBUG_PUSH
6251 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6252#endif
6253 }
Daniel Veillard68716a72006-10-16 09:32:17 +00006254 ctxt->progressive = 1;
Owen Taylor3473f882001-02-23 17:55:21 +00006255
6256 return(ctxt);
6257}
William M. Brack21e4ef22005-01-02 09:53:13 +00006258#endif /* LIBXML_PUSH_ENABLED */
Owen Taylor3473f882001-02-23 17:55:21 +00006259
6260/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006261 * htmlSAXParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006262 * @cur: a pointer to an array of xmlChar
6263 * @encoding: a free form C string describing the HTML document encoding, or NULL
6264 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006265 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006266 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006267 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6268 * to handle parse events. If sax is NULL, fallback to the default DOM
6269 * behavior and return a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006270 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006271 * Returns the resulting document tree unless SAX is NULL or the document is
6272 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006273 */
6274
6275htmlDocPtr
6276htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6277 htmlDocPtr ret;
6278 htmlParserCtxtPtr ctxt;
6279
Daniel Veillardd0463562001-10-13 09:15:48 +00006280 xmlInitParser();
6281
Owen Taylor3473f882001-02-23 17:55:21 +00006282 if (cur == NULL) return(NULL);
6283
6284
6285 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6286 if (ctxt == NULL) return(NULL);
Daniel Veillarde77db162009-08-22 11:32:38 +02006287 if (sax != NULL) {
Daniel Veillardb19ba832003-08-14 00:33:46 +00006288 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
Owen Taylor3473f882001-02-23 17:55:21 +00006289 ctxt->sax = sax;
6290 ctxt->userData = userData;
6291 }
6292
6293 htmlParseDocument(ctxt);
6294 ret = ctxt->myDoc;
6295 if (sax != NULL) {
6296 ctxt->sax = NULL;
6297 ctxt->userData = NULL;
6298 }
6299 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006300
Owen Taylor3473f882001-02-23 17:55:21 +00006301 return(ret);
6302}
6303
6304/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006305 * htmlParseDoc:
Owen Taylor3473f882001-02-23 17:55:21 +00006306 * @cur: a pointer to an array of xmlChar
6307 * @encoding: a free form C string describing the HTML document encoding, or NULL
6308 *
6309 * parse an HTML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006310 *
Owen Taylor3473f882001-02-23 17:55:21 +00006311 * Returns the resulting document tree
6312 */
6313
6314htmlDocPtr
6315htmlParseDoc(xmlChar *cur, const char *encoding) {
6316 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6317}
6318
6319
6320/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006321 * htmlCreateFileParserCtxt:
Owen Taylor3473f882001-02-23 17:55:21 +00006322 * @filename: the filename
6323 * @encoding: a free form C string describing the HTML document encoding, or NULL
6324 *
Daniel Veillarde77db162009-08-22 11:32:38 +02006325 * Create a parser context for a file content.
Owen Taylor3473f882001-02-23 17:55:21 +00006326 * Automatic support for ZLIB/Compress compressed document is provided
6327 * by default if found at compile-time.
6328 *
6329 * Returns the new parser context or NULL
6330 */
6331htmlParserCtxtPtr
6332htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6333{
6334 htmlParserCtxtPtr ctxt;
6335 htmlParserInputPtr inputStream;
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006336 char *canonicFilename;
Owen Taylor3473f882001-02-23 17:55:21 +00006337 /* htmlCharEncoding enc; */
6338 xmlChar *content, *content_line = (xmlChar *) "charset=";
6339
Daniel Veillarda03e3652004-11-02 18:45:30 +00006340 if (filename == NULL)
6341 return(NULL);
6342
Daniel Veillardf403d292003-10-05 13:51:35 +00006343 ctxt = htmlNewParserCtxt();
Owen Taylor3473f882001-02-23 17:55:21 +00006344 if (ctxt == NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00006345 return(NULL);
6346 }
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006347 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6348 if (canonicFilename == NULL) {
Daniel Veillard87247e82004-01-13 20:42:02 +00006349#ifdef LIBXML_SAX1_ENABLED
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006350 if (xmlDefaultSAXHandler.error != NULL) {
6351 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6352 }
Daniel Veillard87247e82004-01-13 20:42:02 +00006353#endif
Daniel Veillard104caa32003-05-13 22:54:05 +00006354 xmlFreeParserCtxt(ctxt);
Owen Taylor3473f882001-02-23 17:55:21 +00006355 return(NULL);
6356 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006357
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006358 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6359 xmlFree(canonicFilename);
6360 if (inputStream == NULL) {
6361 xmlFreeParserCtxt(ctxt);
6362 return(NULL);
6363 }
Owen Taylor3473f882001-02-23 17:55:21 +00006364
6365 inputPush(ctxt, inputStream);
Daniel Veillarde8b09e42003-05-13 22:14:13 +00006366
Owen Taylor3473f882001-02-23 17:55:21 +00006367 /* set encoding */
6368 if (encoding) {
Daniel Veillard292a9f22014-10-06 18:51:04 +08006369 size_t l = strlen(encoding);
6370
6371 if (l < 1000) {
6372 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6373 if (content) {
6374 strcpy ((char *)content, (char *)content_line);
6375 strcat ((char *)content, (char *)encoding);
6376 htmlCheckEncoding (ctxt, content);
6377 xmlFree (content);
6378 }
Owen Taylor3473f882001-02-23 17:55:21 +00006379 }
6380 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006381
Owen Taylor3473f882001-02-23 17:55:21 +00006382 return(ctxt);
6383}
6384
6385/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006386 * htmlSAXParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006387 * @filename: the filename
6388 * @encoding: a free form C string describing the HTML document encoding, or NULL
6389 * @sax: the SAX handler block
Daniel Veillarde77db162009-08-22 11:32:38 +02006390 * @userData: if using SAX, this pointer will be provided on callbacks.
Owen Taylor3473f882001-02-23 17:55:21 +00006391 *
6392 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6393 * compressed document is provided by default if found at compile-time.
6394 * It use the given SAX function block to handle the parsing callback.
6395 * If sax is NULL, fallback to the default DOM tree building routines.
6396 *
Daniel Veillard4d65a1c2001-07-04 22:06:23 +00006397 * Returns the resulting document tree unless SAX is NULL or the document is
6398 * not well formed.
Owen Taylor3473f882001-02-23 17:55:21 +00006399 */
6400
6401htmlDocPtr
Daniel Veillarde77db162009-08-22 11:32:38 +02006402htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
Owen Taylor3473f882001-02-23 17:55:21 +00006403 void *userData) {
6404 htmlDocPtr ret;
6405 htmlParserCtxtPtr ctxt;
6406 htmlSAXHandlerPtr oldsax = NULL;
6407
Daniel Veillardd0463562001-10-13 09:15:48 +00006408 xmlInitParser();
6409
Owen Taylor3473f882001-02-23 17:55:21 +00006410 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6411 if (ctxt == NULL) return(NULL);
6412 if (sax != NULL) {
6413 oldsax = ctxt->sax;
6414 ctxt->sax = sax;
6415 ctxt->userData = userData;
6416 }
6417
6418 htmlParseDocument(ctxt);
6419
6420 ret = ctxt->myDoc;
6421 if (sax != NULL) {
6422 ctxt->sax = oldsax;
6423 ctxt->userData = NULL;
6424 }
6425 htmlFreeParserCtxt(ctxt);
Daniel Veillarde77db162009-08-22 11:32:38 +02006426
Owen Taylor3473f882001-02-23 17:55:21 +00006427 return(ret);
6428}
6429
6430/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00006431 * htmlParseFile:
Owen Taylor3473f882001-02-23 17:55:21 +00006432 * @filename: the filename
6433 * @encoding: a free form C string describing the HTML document encoding, or NULL
6434 *
6435 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6436 * compressed document is provided by default if found at compile-time.
6437 *
6438 * Returns the resulting document tree
6439 */
6440
6441htmlDocPtr
6442htmlParseFile(const char *filename, const char *encoding) {
6443 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6444}
6445
6446/**
6447 * htmlHandleOmittedElem:
Daniel Veillarde77db162009-08-22 11:32:38 +02006448 * @val: int 0 or 1
Owen Taylor3473f882001-02-23 17:55:21 +00006449 *
6450 * Set and return the previous value for handling HTML omitted tags.
6451 *
6452 * Returns the last value for 0 for no handling, 1 for auto insertion.
6453 */
6454
6455int
6456htmlHandleOmittedElem(int val) {
6457 int old = htmlOmittedDefaultValue;
6458
6459 htmlOmittedDefaultValue = val;
6460 return(old);
6461}
6462
Daniel Veillard930dfb62003-02-05 10:17:38 +00006463/**
6464 * htmlElementAllowedHere:
6465 * @parent: HTML parent element
6466 * @elt: HTML element
6467 *
6468 * Checks whether an HTML element may be a direct child of a parent element.
6469 * Note - doesn't check for deprecated elements
6470 *
6471 * Returns 1 if allowed; 0 otherwise.
6472 */
6473int
6474htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6475 const char** p ;
6476
6477 if ( ! elt || ! parent || ! parent->subelts )
6478 return 0 ;
6479
6480 for ( p = parent->subelts; *p; ++p )
6481 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6482 return 1 ;
6483
6484 return 0 ;
6485}
6486/**
6487 * htmlElementStatusHere:
6488 * @parent: HTML parent element
6489 * @elt: HTML element
6490 *
6491 * Checks whether an HTML element may be a direct child of a parent element.
6492 * and if so whether it is valid or deprecated.
6493 *
6494 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6495 */
6496htmlStatus
6497htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6498 if ( ! parent || ! elt )
6499 return HTML_INVALID ;
6500 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6501 return HTML_INVALID ;
6502
6503 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6504}
6505/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006506 * htmlAttrAllowed:
Daniel Veillard930dfb62003-02-05 10:17:38 +00006507 * @elt: HTML element
6508 * @attr: HTML attribute
6509 * @legacy: whether to allow deprecated attributes
6510 *
6511 * Checks whether an attribute is valid for an element
6512 * Has full knowledge of Required and Deprecated attributes
6513 *
6514 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6515 */
6516htmlStatus
6517htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6518 const char** p ;
6519
6520 if ( !elt || ! attr )
6521 return HTML_INVALID ;
6522
6523 if ( elt->attrs_req )
6524 for ( p = elt->attrs_req; *p; ++p)
6525 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6526 return HTML_REQUIRED ;
6527
6528 if ( elt->attrs_opt )
6529 for ( p = elt->attrs_opt; *p; ++p)
6530 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6531 return HTML_VALID ;
6532
6533 if ( legacy && elt->attrs_depr )
6534 for ( p = elt->attrs_depr; *p; ++p)
6535 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6536 return HTML_DEPRECATED ;
6537
6538 return HTML_INVALID ;
6539}
6540/**
Daniel Veillard71531f32003-02-05 13:19:53 +00006541 * htmlNodeStatus:
Daniel Veillard1703c5f2003-02-10 14:28:44 +00006542 * @node: an htmlNodePtr in a tree
6543 * @legacy: whether to allow deprecated elements (YES is faster here
Daniel Veillard930dfb62003-02-05 10:17:38 +00006544 * for Element nodes)
6545 *
6546 * Checks whether the tree node is valid. Experimental (the author
6547 * only uses the HTML enhancements in a SAX parser)
6548 *
6549 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6550 * legacy allowed) or htmlElementStatusHere (otherwise).
6551 * for Attribute nodes, a return from htmlAttrAllowed
6552 * for other nodes, HTML_NA (no checks performed)
6553 */
6554htmlStatus
6555htmlNodeStatus(const htmlNodePtr node, int legacy) {
6556 if ( ! node )
6557 return HTML_INVALID ;
6558
6559 switch ( node->type ) {
6560 case XML_ELEMENT_NODE:
6561 return legacy
6562 ? ( htmlElementAllowedHere (
6563 htmlTagLookup(node->parent->name) , node->name
6564 ) ? HTML_VALID : HTML_INVALID )
6565 : htmlElementStatusHere(
6566 htmlTagLookup(node->parent->name) ,
6567 htmlTagLookup(node->name) )
6568 ;
6569 case XML_ATTRIBUTE_NODE:
6570 return htmlAttrAllowed(
6571 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6572 default: return HTML_NA ;
6573 }
6574}
Daniel Veillard9475a352003-09-26 12:47:50 +00006575/************************************************************************
6576 * *
6577 * New set (2.6.0) of simpler and more flexible APIs *
6578 * *
6579 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored. The expansion is wrapped in
 * do { } while (0) so the macro behaves as a single statement and can be
 * used safely inside if/else constructs; callers supply the trailing ';'.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6591
6592/**
6593 * htmlCtxtReset:
Daniel Veillardf403d292003-10-05 13:51:35 +00006594 * @ctxt: an HTML parser context
Daniel Veillard9475a352003-09-26 12:47:50 +00006595 *
6596 * Reset a parser context
6597 */
6598void
6599htmlCtxtReset(htmlParserCtxtPtr ctxt)
6600{
6601 xmlParserInputPtr input;
Daniel Veillarda03e3652004-11-02 18:45:30 +00006602 xmlDictPtr dict;
Daniel Veillarde77db162009-08-22 11:32:38 +02006603
Daniel Veillarda03e3652004-11-02 18:45:30 +00006604 if (ctxt == NULL)
6605 return;
6606
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006607 xmlInitParser();
Daniel Veillarda03e3652004-11-02 18:45:30 +00006608 dict = ctxt->dict;
Daniel Veillard9475a352003-09-26 12:47:50 +00006609
6610 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6611 xmlFreeInputStream(input);
6612 }
6613 ctxt->inputNr = 0;
6614 ctxt->input = NULL;
6615
6616 ctxt->spaceNr = 0;
Daniel Veillarda521d282004-11-09 14:59:59 +00006617 if (ctxt->spaceTab != NULL) {
6618 ctxt->spaceTab[0] = -1;
6619 ctxt->space = &ctxt->spaceTab[0];
6620 } else {
6621 ctxt->space = NULL;
6622 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006623
6624
6625 ctxt->nodeNr = 0;
6626 ctxt->node = NULL;
6627
6628 ctxt->nameNr = 0;
6629 ctxt->name = NULL;
6630
6631 DICT_FREE(ctxt->version);
6632 ctxt->version = NULL;
6633 DICT_FREE(ctxt->encoding);
6634 ctxt->encoding = NULL;
6635 DICT_FREE(ctxt->directory);
6636 ctxt->directory = NULL;
6637 DICT_FREE(ctxt->extSubURI);
6638 ctxt->extSubURI = NULL;
6639 DICT_FREE(ctxt->extSubSystem);
6640 ctxt->extSubSystem = NULL;
6641 if (ctxt->myDoc != NULL)
6642 xmlFreeDoc(ctxt->myDoc);
6643 ctxt->myDoc = NULL;
6644
6645 ctxt->standalone = -1;
6646 ctxt->hasExternalSubset = 0;
6647 ctxt->hasPErefs = 0;
6648 ctxt->html = 1;
6649 ctxt->external = 0;
6650 ctxt->instate = XML_PARSER_START;
6651 ctxt->token = 0;
6652
6653 ctxt->wellFormed = 1;
6654 ctxt->nsWellFormed = 1;
Daniel Veillard8ad29302010-10-28 11:51:22 +02006655 ctxt->disableSAX = 0;
Daniel Veillard9475a352003-09-26 12:47:50 +00006656 ctxt->valid = 1;
6657 ctxt->vctxt.userData = ctxt;
6658 ctxt->vctxt.error = xmlParserValidityError;
6659 ctxt->vctxt.warning = xmlParserValidityWarning;
6660 ctxt->record_info = 0;
6661 ctxt->nbChars = 0;
6662 ctxt->checkIndex = 0;
6663 ctxt->inSubset = 0;
6664 ctxt->errNo = XML_ERR_OK;
6665 ctxt->depth = 0;
Daniel Veillard772869f2006-11-08 09:16:56 +00006666 ctxt->charset = XML_CHAR_ENCODING_NONE;
Daniel Veillard9475a352003-09-26 12:47:50 +00006667 ctxt->catalogs = NULL;
6668 xmlInitNodeInfoSeq(&ctxt->node_seq);
6669
6670 if (ctxt->attsDefault != NULL) {
6671 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6672 ctxt->attsDefault = NULL;
6673 }
6674 if (ctxt->attsSpecial != NULL) {
6675 xmlHashFree(ctxt->attsSpecial, NULL);
6676 ctxt->attsSpecial = NULL;
6677 }
6678}
6679
6680/**
6681 * htmlCtxtUseOptions:
6682 * @ctxt: an HTML parser context
6683 * @options: a combination of htmlParserOption(s)
6684 *
6685 * Applies the options to the parser context
6686 *
6687 * Returns 0 in case of success, the set of unknown or unimplemented options
6688 * in case of error.
6689 */
6690int
6691htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6692{
Daniel Veillarda03e3652004-11-02 18:45:30 +00006693 if (ctxt == NULL)
6694 return(-1);
6695
Daniel Veillard9475a352003-09-26 12:47:50 +00006696 if (options & HTML_PARSE_NOWARNING) {
6697 ctxt->sax->warning = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006698 ctxt->vctxt.warning = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006699 options -= XML_PARSE_NOWARNING;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006700 ctxt->options |= XML_PARSE_NOWARNING;
Daniel Veillard9475a352003-09-26 12:47:50 +00006701 }
6702 if (options & HTML_PARSE_NOERROR) {
6703 ctxt->sax->error = NULL;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006704 ctxt->vctxt.error = NULL;
Daniel Veillard9475a352003-09-26 12:47:50 +00006705 ctxt->sax->fatalError = NULL;
6706 options -= XML_PARSE_NOERROR;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006707 ctxt->options |= XML_PARSE_NOERROR;
Daniel Veillard9475a352003-09-26 12:47:50 +00006708 }
6709 if (options & HTML_PARSE_PEDANTIC) {
6710 ctxt->pedantic = 1;
6711 options -= XML_PARSE_PEDANTIC;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006712 ctxt->options |= XML_PARSE_PEDANTIC;
Daniel Veillard9475a352003-09-26 12:47:50 +00006713 } else
6714 ctxt->pedantic = 0;
6715 if (options & XML_PARSE_NOBLANKS) {
6716 ctxt->keepBlanks = 0;
6717 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6718 options -= XML_PARSE_NOBLANKS;
Daniel Veillardd3669b22004-02-25 12:34:55 +00006719 ctxt->options |= XML_PARSE_NOBLANKS;
Daniel Veillard9475a352003-09-26 12:47:50 +00006720 } else
6721 ctxt->keepBlanks = 1;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006722 if (options & HTML_PARSE_RECOVER) {
6723 ctxt->recovery = 1;
Daniel Veillardaf616a72006-10-17 20:18:39 +00006724 options -= HTML_PARSE_RECOVER;
Daniel Veillardea4b0ba2005-08-23 16:06:08 +00006725 } else
6726 ctxt->recovery = 0;
Daniel Veillard8874b942005-08-25 13:19:21 +00006727 if (options & HTML_PARSE_COMPACT) {
6728 ctxt->options |= HTML_PARSE_COMPACT;
6729 options -= HTML_PARSE_COMPACT;
6730 }
Daniel Veillarde77db162009-08-22 11:32:38 +02006731 if (options & XML_PARSE_HUGE) {
6732 ctxt->options |= XML_PARSE_HUGE;
6733 options -= XML_PARSE_HUGE;
6734 }
Daniel Veillardf1121c42010-07-26 14:02:42 +02006735 if (options & HTML_PARSE_NODEFDTD) {
6736 ctxt->options |= HTML_PARSE_NODEFDTD;
6737 options -= HTML_PARSE_NODEFDTD;
6738 }
Daniel Veillardc62efc82011-05-16 16:03:50 +08006739 if (options & HTML_PARSE_IGNORE_ENC) {
6740 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6741 options -= HTML_PARSE_IGNORE_ENC;
6742 }
Martin Schröderb91111b2012-05-10 18:52:37 +08006743 if (options & HTML_PARSE_NOIMPLIED) {
6744 ctxt->options |= HTML_PARSE_NOIMPLIED;
6745 options -= HTML_PARSE_NOIMPLIED;
6746 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006747 ctxt->dictNames = 0;
6748 return (options);
6749}
6750
6751/**
6752 * htmlDoRead:
6753 * @ctxt: an HTML parser context
6754 * @URL: the base URL to use for the document
6755 * @encoding: the document encoding, or NULL
6756 * @options: a combination of htmlParserOption(s)
6757 * @reuse: keep the context for reuse
6758 *
6759 * Common front-end for the htmlRead functions
Daniel Veillarde77db162009-08-22 11:32:38 +02006760 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006761 * Returns the resulting document tree or NULL
6762 */
6763static htmlDocPtr
6764htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6765 int options, int reuse)
6766{
6767 htmlDocPtr ret;
Daniel Veillarde77db162009-08-22 11:32:38 +02006768
Daniel Veillard9475a352003-09-26 12:47:50 +00006769 htmlCtxtUseOptions(ctxt, options);
6770 ctxt->html = 1;
6771 if (encoding != NULL) {
6772 xmlCharEncodingHandlerPtr hdlr;
6773
6774 hdlr = xmlFindCharEncodingHandler(encoding);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006775 if (hdlr != NULL) {
Daniel Veillard9475a352003-09-26 12:47:50 +00006776 xmlSwitchToEncoding(ctxt, hdlr);
Daniel Veillard4cc67bb2008-08-29 19:58:23 +00006777 if (ctxt->input->encoding != NULL)
6778 xmlFree((xmlChar *) ctxt->input->encoding);
6779 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6780 }
Daniel Veillard9475a352003-09-26 12:47:50 +00006781 }
6782 if ((URL != NULL) && (ctxt->input != NULL) &&
6783 (ctxt->input->filename == NULL))
6784 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6785 htmlParseDocument(ctxt);
6786 ret = ctxt->myDoc;
6787 ctxt->myDoc = NULL;
6788 if (!reuse) {
6789 if ((ctxt->dictNames) &&
6790 (ret != NULL) &&
6791 (ret->dict == ctxt->dict))
6792 ctxt->dict = NULL;
6793 xmlFreeParserCtxt(ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00006794 }
6795 return (ret);
6796}
6797
6798/**
6799 * htmlReadDoc:
6800 * @cur: a pointer to a zero terminated string
6801 * @URL: the base URL to use for the document
6802 * @encoding: the document encoding, or NULL
6803 * @options: a combination of htmlParserOption(s)
6804 *
6805 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006806 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006807 * Returns the resulting document tree
6808 */
6809htmlDocPtr
6810htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6811{
6812 htmlParserCtxtPtr ctxt;
6813
6814 if (cur == NULL)
6815 return (NULL);
6816
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006817 xmlInitParser();
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006818 ctxt = htmlCreateDocParserCtxt(cur, NULL);
Daniel Veillard9475a352003-09-26 12:47:50 +00006819 if (ctxt == NULL)
6820 return (NULL);
6821 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6822}
6823
6824/**
6825 * htmlReadFile:
6826 * @filename: a file or URL
6827 * @encoding: the document encoding, or NULL
6828 * @options: a combination of htmlParserOption(s)
6829 *
6830 * parse an XML file from the filesystem or the network.
Daniel Veillarde77db162009-08-22 11:32:38 +02006831 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006832 * Returns the resulting document tree
6833 */
6834htmlDocPtr
6835htmlReadFile(const char *filename, const char *encoding, int options)
6836{
6837 htmlParserCtxtPtr ctxt;
6838
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006839 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006840 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6841 if (ctxt == NULL)
6842 return (NULL);
6843 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6844}
6845
6846/**
6847 * htmlReadMemory:
6848 * @buffer: a pointer to a char array
6849 * @size: the size of the array
6850 * @URL: the base URL to use for the document
6851 * @encoding: the document encoding, or NULL
6852 * @options: a combination of htmlParserOption(s)
6853 *
6854 * parse an XML in-memory document and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006855 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006856 * Returns the resulting document tree
6857 */
6858htmlDocPtr
6859htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6860{
6861 htmlParserCtxtPtr ctxt;
6862
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006863 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006864 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6865 if (ctxt == NULL)
6866 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006867 htmlDefaultSAXHandlerInit();
William M. Brackd43cdcd2004-08-03 15:13:29 +00006868 if (ctxt->sax != NULL)
6869 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
Daniel Veillard9475a352003-09-26 12:47:50 +00006870 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6871}
6872
6873/**
6874 * htmlReadFd:
6875 * @fd: an open file descriptor
6876 * @URL: the base URL to use for the document
6877 * @encoding: the document encoding, or NULL
6878 * @options: a combination of htmlParserOption(s)
6879 *
6880 * parse an XML from a file descriptor and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006881 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006882 * Returns the resulting document tree
6883 */
6884htmlDocPtr
6885htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6886{
6887 htmlParserCtxtPtr ctxt;
6888 xmlParserInputBufferPtr input;
6889 xmlParserInputPtr stream;
6890
6891 if (fd < 0)
6892 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08006893 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006894
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006895 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006896 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6897 if (input == NULL)
6898 return (NULL);
6899 ctxt = xmlNewParserCtxt();
6900 if (ctxt == NULL) {
6901 xmlFreeParserInputBuffer(input);
6902 return (NULL);
6903 }
6904 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6905 if (stream == NULL) {
6906 xmlFreeParserInputBuffer(input);
6907 xmlFreeParserCtxt(ctxt);
6908 return (NULL);
6909 }
6910 inputPush(ctxt, stream);
6911 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6912}
6913
6914/**
6915 * htmlReadIO:
6916 * @ioread: an I/O read function
6917 * @ioclose: an I/O close function
6918 * @ioctx: an I/O handler
6919 * @URL: the base URL to use for the document
6920 * @encoding: the document encoding, or NULL
6921 * @options: a combination of htmlParserOption(s)
6922 *
6923 * parse an HTML document from I/O functions and source and build a tree.
Daniel Veillarde77db162009-08-22 11:32:38 +02006924 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006925 * Returns the resulting document tree
6926 */
6927htmlDocPtr
6928htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6929 void *ioctx, const char *URL, const char *encoding, int options)
6930{
6931 htmlParserCtxtPtr ctxt;
6932 xmlParserInputBufferPtr input;
6933 xmlParserInputPtr stream;
6934
6935 if (ioread == NULL)
6936 return (NULL);
Daniel Veillardf1a27c62006-10-13 22:33:03 +00006937 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006938
6939 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6940 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006941 if (input == NULL) {
6942 if (ioclose != NULL)
6943 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00006944 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08006945 }
Daniel Veillard8a82ae12006-10-17 20:04:10 +00006946 ctxt = htmlNewParserCtxt();
Daniel Veillard9475a352003-09-26 12:47:50 +00006947 if (ctxt == NULL) {
6948 xmlFreeParserInputBuffer(input);
6949 return (NULL);
6950 }
6951 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6952 if (stream == NULL) {
6953 xmlFreeParserInputBuffer(input);
6954 xmlFreeParserCtxt(ctxt);
6955 return (NULL);
6956 }
6957 inputPush(ctxt, stream);
6958 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6959}
6960
6961/**
6962 * htmlCtxtReadDoc:
6963 * @ctxt: an HTML parser context
6964 * @cur: a pointer to a zero terminated string
6965 * @URL: the base URL to use for the document
6966 * @encoding: the document encoding, or NULL
6967 * @options: a combination of htmlParserOption(s)
6968 *
6969 * parse an XML in-memory document and build a tree.
6970 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02006971 *
Daniel Veillard9475a352003-09-26 12:47:50 +00006972 * Returns the resulting document tree
6973 */
6974htmlDocPtr
6975htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6976 const char *URL, const char *encoding, int options)
6977{
6978 xmlParserInputPtr stream;
6979
6980 if (cur == NULL)
6981 return (NULL);
6982 if (ctxt == NULL)
6983 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08006984 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00006985
6986 htmlCtxtReset(ctxt);
6987
6988 stream = xmlNewStringInputStream(ctxt, cur);
6989 if (stream == NULL) {
6990 return (NULL);
6991 }
6992 inputPush(ctxt, stream);
6993 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6994}
6995
6996/**
6997 * htmlCtxtReadFile:
6998 * @ctxt: an HTML parser context
6999 * @filename: a file or URL
7000 * @encoding: the document encoding, or NULL
7001 * @options: a combination of htmlParserOption(s)
7002 *
7003 * parse an XML file from the filesystem or the network.
7004 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007005 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007006 * Returns the resulting document tree
7007 */
7008htmlDocPtr
7009htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7010 const char *encoding, int options)
7011{
7012 xmlParserInputPtr stream;
7013
7014 if (filename == NULL)
7015 return (NULL);
7016 if (ctxt == NULL)
7017 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007018 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007019
7020 htmlCtxtReset(ctxt);
7021
Daniel Veillard29614c72004-11-26 10:47:26 +00007022 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
Daniel Veillard9475a352003-09-26 12:47:50 +00007023 if (stream == NULL) {
7024 return (NULL);
7025 }
7026 inputPush(ctxt, stream);
7027 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7028}
7029
7030/**
7031 * htmlCtxtReadMemory:
7032 * @ctxt: an HTML parser context
7033 * @buffer: a pointer to a char array
7034 * @size: the size of the array
7035 * @URL: the base URL to use for the document
7036 * @encoding: the document encoding, or NULL
7037 * @options: a combination of htmlParserOption(s)
7038 *
7039 * parse an XML in-memory document and build a tree.
7040 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007041 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007042 * Returns the resulting document tree
7043 */
7044htmlDocPtr
7045htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7046 const char *URL, const char *encoding, int options)
7047{
7048 xmlParserInputBufferPtr input;
7049 xmlParserInputPtr stream;
7050
7051 if (ctxt == NULL)
7052 return (NULL);
7053 if (buffer == NULL)
7054 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007055 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007056
7057 htmlCtxtReset(ctxt);
7058
7059 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7060 if (input == NULL) {
7061 return(NULL);
7062 }
7063
7064 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7065 if (stream == NULL) {
7066 xmlFreeParserInputBuffer(input);
7067 return(NULL);
7068 }
7069
7070 inputPush(ctxt, stream);
7071 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7072}
7073
7074/**
7075 * htmlCtxtReadFd:
7076 * @ctxt: an HTML parser context
7077 * @fd: an open file descriptor
7078 * @URL: the base URL to use for the document
7079 * @encoding: the document encoding, or NULL
7080 * @options: a combination of htmlParserOption(s)
7081 *
7082 * parse an XML from a file descriptor and build a tree.
7083 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007084 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007085 * Returns the resulting document tree
7086 */
7087htmlDocPtr
7088htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7089 const char *URL, const char *encoding, int options)
7090{
7091 xmlParserInputBufferPtr input;
7092 xmlParserInputPtr stream;
7093
7094 if (fd < 0)
7095 return (NULL);
7096 if (ctxt == NULL)
7097 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007098 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007099
7100 htmlCtxtReset(ctxt);
7101
7102
7103 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7104 if (input == NULL)
7105 return (NULL);
7106 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7107 if (stream == NULL) {
7108 xmlFreeParserInputBuffer(input);
7109 return (NULL);
7110 }
7111 inputPush(ctxt, stream);
7112 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7113}
7114
7115/**
7116 * htmlCtxtReadIO:
7117 * @ctxt: an HTML parser context
7118 * @ioread: an I/O read function
7119 * @ioclose: an I/O close function
7120 * @ioctx: an I/O handler
7121 * @URL: the base URL to use for the document
7122 * @encoding: the document encoding, or NULL
7123 * @options: a combination of htmlParserOption(s)
7124 *
7125 * parse an HTML document from I/O functions and source and build a tree.
7126 * This reuses the existing @ctxt parser context
Daniel Veillarde77db162009-08-22 11:32:38 +02007127 *
Daniel Veillard9475a352003-09-26 12:47:50 +00007128 * Returns the resulting document tree
7129 */
7130htmlDocPtr
7131htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7132 xmlInputCloseCallback ioclose, void *ioctx,
7133 const char *URL,
7134 const char *encoding, int options)
7135{
7136 xmlParserInputBufferPtr input;
7137 xmlParserInputPtr stream;
7138
7139 if (ioread == NULL)
7140 return (NULL);
7141 if (ctxt == NULL)
7142 return (NULL);
Daniel Veillard4e1476c2013-12-09 15:23:40 +08007143 xmlInitParser();
Daniel Veillard9475a352003-09-26 12:47:50 +00007144
7145 htmlCtxtReset(ctxt);
7146
7147 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7148 XML_CHAR_ENCODING_NONE);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007149 if (input == NULL) {
7150 if (ioclose != NULL)
7151 ioclose(ioctx);
Daniel Veillard9475a352003-09-26 12:47:50 +00007152 return (NULL);
Lin Yi-Li24464be2012-05-10 16:14:55 +08007153 }
Daniel Veillard9475a352003-09-26 12:47:50 +00007154 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7155 if (stream == NULL) {
7156 xmlFreeParserInputBuffer(input);
7157 return (NULL);
7158 }
7159 inputPush(ctxt, stream);
7160 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7161}
7162
Daniel Veillard5d4644e2005-04-01 13:11:58 +00007163#define bottom_HTMLparser
7164#include "elfgcchack.h"
Owen Taylor3473f882001-02-23 17:55:21 +00007165#endif /* LIBXML_HTML_ENABLED */