blob: 9dffd58c118b11183b9d8c71d7e763406bcc7ca0 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#include "win32config.h"
11#else
12#include "config.h"
13#endif
14
15#include <libxml/xmlversion.h>
16#ifdef LIBXML_HTML_ENABLED
17#include <stdio.h>
18#include <string.h>
19#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
23#include <stdlib.h>
24#endif
25#ifdef HAVE_SYS_STAT_H
26#include <sys/stat.h>
27#endif
28#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
38#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
40#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
42#include <libxml/xmlerror.h>
43#include <libxml/HTMLparser.h>
44#include <libxml/entities.h>
45#include <libxml/encoding.h>
46#include <libxml/valid.h>
47#include <libxml/xmlIO.h>
48
49#define HTML_MAX_NAMELEN 1000
50#define HTML_PARSER_BIG_BUFFER_SIZE 1000
51#define HTML_PARSER_BUFFER_SIZE 100
52
53/* #define DEBUG */
54/* #define DEBUG_PUSH */
55
/*
 * Global switch consulted by htmlCheckImplied() and htmlCheckParagraph():
 * when it is zero, htmlCheckImplied() returns without synthesizing any
 * element and htmlCheckParagraph() only implies a p element when no
 * element is open at all. Non-zero (the default) enables both.
 */
int htmlOmittedDefaultValue = 1;
57
58/************************************************************************
59 * *
60 * Parser stacks related functions and macros *
61 * *
62 ************************************************************************/
63
64/*
65 * Generic function for accessing stacks in the Parser Context
66 */
67
68#define PUSH_AND_POP(scope, type, name) \
69scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
70 if (ctxt->name##Nr >= ctxt->name##Max) { \
71 ctxt->name##Max *= 2; \
72 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
73 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74 if (ctxt->name##Tab == NULL) { \
75 xmlGenericError(xmlGenericErrorContext, \
76 "realloc failed !\n"); \
77 return(0); \
78 } \
79 } \
80 ctxt->name##Tab[ctxt->name##Nr] = value; \
81 ctxt->name = value; \
82 return(ctxt->name##Nr++); \
83} \
84scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
85 type ret; \
86 if (ctxt->name##Nr < 0) return(0); \
87 ctxt->name##Nr--; \
88 if (ctxt->name##Nr < 0) return(0); \
89 if (ctxt->name##Nr > 0) \
90 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
91 else \
92 ctxt->name = NULL; \
93 ret = ctxt->name##Tab[ctxt->name##Nr]; \
94 ctxt->name##Tab[ctxt->name##Nr] = 0; \
95 return(ret); \
96} \
97
98PUSH_AND_POP(extern, xmlNodePtr, node)
99PUSH_AND_POP(extern, xmlChar*, name)
100
/*
 * Macros for accessing the content. These should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them:
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding:
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
128
/*
 * Every macro below expects a variable named `ctxt` (the parser context)
 * to be in scope at the point of use. Expansions and arguments are fully
 * parenthesized so the macros compose safely inside larger expressions.
 */

/* Current input byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes and count them in nbChars. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* The byte located val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current input byte (usable as an lvalue). */
#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int, without any multi-byte decoding. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))

/* Step to the next character through xmlNextChar() and count it. */
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* Current input byte, or -1 while a pushed-back token is pending. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Consume one character that occupies l bytes: maintain line/column
 * information, clear any pending token and count a single character.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/* Decode the current character; its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v (l bytes long once encoded) to buffer b at index i,
 * advancing i. Wrapped in do/while so it behaves as a single statement.
 */
#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
177
178/**
179 * htmlCurrentChar:
180 * @ctxt: the HTML parser context
181 * @len: pointer to the length of the char read
182 *
183 * The current char value, if using UTF-8 this may actaully span multiple
184 * bytes in the input buffer. Implement the end of line normalization:
185 * 2.11 End-of-Line Handling
186 * If the encoding is unspecified, in the case we find an ISO-Latin-1
187 * char, then the encoding converter is plugged in automatically.
188 *
189 * Returns the current char value and its lenght
190 */
191
192int
193htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
194 if (ctxt->instate == XML_PARSER_EOF)
195 return(0);
196
197 if (ctxt->token != 0) {
198 *len = 0;
199 return(ctxt->token);
200 }
201 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
202 /*
203 * We are supposed to handle UTF8, check it's valid
204 * From rfc2044: encoding of the Unicode values on UTF-8:
205 *
206 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
207 * 0000 0000-0000 007F 0xxxxxxx
208 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
209 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
210 *
211 * Check for the 0x110000 limit too
212 */
213 const unsigned char *cur = ctxt->input->cur;
214 unsigned char c;
215 unsigned int val;
216
217 c = *cur;
218 if (c & 0x80) {
219 if (cur[1] == 0)
220 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
221 if ((cur[1] & 0xc0) != 0x80)
222 goto encoding_error;
223 if ((c & 0xe0) == 0xe0) {
224
225 if (cur[2] == 0)
226 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
227 if ((cur[2] & 0xc0) != 0x80)
228 goto encoding_error;
229 if ((c & 0xf0) == 0xf0) {
230 if (cur[3] == 0)
231 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
232 if (((c & 0xf8) != 0xf0) ||
233 ((cur[3] & 0xc0) != 0x80))
234 goto encoding_error;
235 /* 4-byte code */
236 *len = 4;
237 val = (cur[0] & 0x7) << 18;
238 val |= (cur[1] & 0x3f) << 12;
239 val |= (cur[2] & 0x3f) << 6;
240 val |= cur[3] & 0x3f;
241 } else {
242 /* 3-byte code */
243 *len = 3;
244 val = (cur[0] & 0xf) << 12;
245 val |= (cur[1] & 0x3f) << 6;
246 val |= cur[2] & 0x3f;
247 }
248 } else {
249 /* 2-byte code */
250 *len = 2;
251 val = (cur[0] & 0x1f) << 6;
252 val |= cur[1] & 0x3f;
253 }
254 if (!IS_CHAR(val)) {
255 ctxt->errNo = XML_ERR_INVALID_ENCODING;
256 if ((ctxt->sax != NULL) &&
257 (ctxt->sax->error != NULL))
258 ctxt->sax->error(ctxt->userData,
259 "Char 0x%X out of allowed range\n", val);
260 ctxt->wellFormed = 0;
261 ctxt->disableSAX = 1;
262 }
263 return(val);
264 } else {
265 /* 1-byte code */
266 *len = 1;
267 return((int) *ctxt->input->cur);
268 }
269 }
270 /*
271 * Assume it's a fixed lenght encoding (1) with
272 * a compatibke encoding for the ASCII set, since
273 * XML constructs only use < 128 chars
274 */
275 *len = 1;
276 if ((int) *ctxt->input->cur < 0x80)
277 return((int) *ctxt->input->cur);
278
279 /*
280 * Humm this is bad, do an automatic flow conversion
281 */
282 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
283 ctxt->charset = XML_CHAR_ENCODING_UTF8;
284 return(xmlCurrentChar(ctxt, len));
285
286encoding_error:
287 /*
288 * If we detect an UTF8 error that probably mean that the
289 * input encoding didn't get properly advertized in the
290 * declaration header. Report the error and switch the encoding
291 * to ISO-Latin-1 (if you don't like this policy, just declare the
292 * encoding !)
293 */
294 ctxt->errNo = XML_ERR_INVALID_ENCODING;
295 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
296 ctxt->sax->error(ctxt->userData,
297 "Input is not proper UTF-8, indicate encoding !\n");
298 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
299 ctxt->input->cur[0], ctxt->input->cur[1],
300 ctxt->input->cur[2], ctxt->input->cur[3]);
301 }
302
303 ctxt->charset = XML_CHAR_ENCODING_8859_1;
304 *len = 1;
305 return((int) *ctxt->input->cur);
306}
307
308/**
309 * htmlNextChar:
310 * @ctxt: the HTML parser context
311 *
312 * Skip to the next char input char.
313 */
314
315void
316htmlNextChar(htmlParserCtxtPtr ctxt) {
317 if (ctxt->instate == XML_PARSER_EOF)
318 return;
319 if ((*ctxt->input->cur == 0) &&
320 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
321 xmlPopInput(ctxt);
322 } else {
323 if (*(ctxt->input->cur) == '\n') {
324 ctxt->input->line++; ctxt->input->col = 1;
325 } else ctxt->input->col++;
326 ctxt->input->cur++;
327 ctxt->nbChars++;
328 if (*ctxt->input->cur == 0)
329 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
330 }
331}
332
333/**
334 * htmlSkipBlankChars:
335 * @ctxt: the HTML parser context
336 *
337 * skip all blanks character found at that point in the input streams.
338 *
339 * Returns the number of space chars skipped
340 */
341
342int
343htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
344 int res = 0;
345
346 while (IS_BLANK(*(ctxt->input->cur))) {
347 if ((*ctxt->input->cur == 0) &&
348 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
349 xmlPopInput(ctxt);
350 } else {
351 if (*(ctxt->input->cur) == '\n') {
352 ctxt->input->line++; ctxt->input->col = 1;
353 } else ctxt->input->col++;
354 ctxt->input->cur++;
355 ctxt->nbChars++;
356 if (*ctxt->input->cur == 0)
357 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
358 }
359 res++;
360 }
361 return(res);
362}
363
364
365
366/************************************************************************
367 * *
368 * The list of HTML elements and their properties *
369 * *
370 ************************************************************************/
371
372/*
373 * Start Tag: 1 means the start tag can be ommited
374 * End Tag: 1 means the end tag can be ommited
375 * 2 means it's forbidden (empty elements)
376 * Depr: this element is deprecated
377 * DTD: 1 means that this element is valid only in the Loose DTD
378 * 2 means that this element is valid only in the Frameset DTD
379 *
380 * Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
381 */
382htmlElemDesc html40ElementTable[] = {
383{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
384{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
385{ "acronym", 0, 0, 0, 0, 0, 0, "" },
386{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
387{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
388{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
389{ "b", 0, 0, 0, 0, 0, 0, "bold text style" },
390{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
391{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
392{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
393{ "big", 0, 0, 0, 0, 0, 0, "large text style" },
394{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
395{ "body", 1, 1, 0, 0, 0, 0, "document body " },
396{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
397{ "button", 0, 0, 0, 0, 0, 0, "push button " },
398{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
399{ "center", 0, 0, 0, 0, 1, 1, "shorthand for div align=center " },
400{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
401{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
402{ "col", 0, 2, 2, 1, 0, 0, "table column " },
403{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
404{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
405{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
406{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
407{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
408{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
409{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
410{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
411{ "em", 0, 0, 0, 0, 0, 0, "emphasis" },
412{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
413{ "font", 0, 0, 0, 0, 1, 1, "local change to font " },
414{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
415{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
416{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
417{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
418{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
419{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
420{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
421{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
422{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
423{ "head", 1, 1, 0, 0, 0, 0, "document head " },
424{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
425{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
426{ "i", 0, 0, 0, 0, 0, 0, "italic text style" },
427{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
428{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
429{ "input", 0, 2, 2, 1, 0, 0, "form control " },
430{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
431{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
432{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
433{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
434{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
435{ "li", 0, 1, 1, 0, 0, 0, "list item " },
436{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
437{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
438{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
439{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
440{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
441{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
442{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
443{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
444{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
445{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
446{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
447{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
448{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
449{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
450{ "s", 0, 0, 0, 0, 1, 1, "strike-through text style" },
451{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
452{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
453{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
454{ "small", 0, 0, 0, 0, 0, 0, "small text style" },
455{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
456{ "strike", 0, 0, 0, 0, 1, 1, "strike-through text" },
457{ "strong", 0, 0, 0, 0, 0, 0, "strong emphasis" },
458{ "style", 0, 0, 0, 0, 0, 0, "style info " },
459{ "sub", 0, 0, 0, 0, 0, 0, "subscript" },
460{ "sup", 0, 0, 0, 0, 0, 0, "superscript " },
461{ "table", 0, 0, 0, 0, 0, 0, "&#160;" },
462{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
463{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
464{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
465{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
466{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
467{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
468{ "title", 0, 0, 0, 0, 0, 0, "document title " },
469{ "tr", 0, 1, 0, 0, 0, 0, "table row " },
470{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
471{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
472{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
473{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
474};
475
/*
 * Start tags that imply the end of a current element.
 * The table is a sequence of NULL-terminated groups, closed by an extra
 * NULL: any tag of a group implies the end of the current element if the
 * type of that element belongs to the same group.
 *
 * According to the HTML DTD, HR should be added to the headings group, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
char *htmlEquEnd[] = {
    "dt", "dd", "li", "option",
    NULL,
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    NULL
};
492
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups, closed by an extra
 * NULL. In each group the first string is the name of the new start tag
 * and the following strings are the names of the open elements which
 * that start tag closes implicitly.
 */
char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", NULL,
    "td",         "th", "td", "p", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
552
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = { "html", "head", "body", NULL };
566
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
592
593
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group of that table. Filled by htmlInitAutoClose(), which
 * also raises the flag below so the work is done only once.
 */
static char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
596
597/************************************************************************
598 * *
599 * functions to handle HTML specific data *
600 * *
601 ************************************************************************/
602
603/**
604 * htmlInitAutoClose:
605 *
606 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
607 * This is not reentrant. Call xmlInitParser() once before processing in
608 * case of use in multithreaded programs.
609 */
610void
611htmlInitAutoClose(void) {
612 int index, i = 0;
613
614 if (htmlStartCloseIndexinitialized) return;
615
616 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
617 index = 0;
618 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
619 htmlStartCloseIndex[index++] = &htmlStartClose[i];
620 while (htmlStartClose[i] != NULL) i++;
621 i++;
622 }
623 htmlStartCloseIndexinitialized = 1;
624}
625
626/**
627 * htmlTagLookup:
628 * @tag: The tag name in lowercase
629 *
630 * Lookup the HTML tag in the ElementTable
631 *
632 * Returns the related htmlElemDescPtr or NULL if not found.
633 */
634htmlElemDescPtr
635htmlTagLookup(const xmlChar *tag) {
636 int i;
637
638 for (i = 0; i < (sizeof(html40ElementTable) /
639 sizeof(html40ElementTable[0]));i++) {
640 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
641 return(&html40ElementTable[i]);
642 }
643 return(NULL);
644}
645
646/**
647 * htmlCheckAutoClose:
648 * @newtag: The new tag name
649 * @oldtag: The old tag name
650 *
651 * Checks wether the new tag is one of the registered valid tags for closing old.
652 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
653 *
654 * Returns 0 if no, 1 if yes.
655 */
656int
657htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
658 int i, index;
659 char **close = NULL;
660
661 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
662
663 /* inefficient, but not a big deal */
664 for (index = 0; index < 100;index++) {
665 close = htmlStartCloseIndex[index];
666 if (close == NULL) return(0);
667 if (xmlStrEqual(BAD_CAST *close, newtag)) break;
668 }
669
670 i = close - htmlStartClose;
671 i++;
672 while (htmlStartClose[i] != NULL) {
673 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
674 return(1);
675 }
676 i++;
677 }
678 return(0);
679}
680
681/**
682 * htmlAutoCloseOnClose:
683 * @ctxt: an HTML parser context
684 * @newtag: The new tag name
685 *
686 * The HTmL DtD allows an ending tag to implicitely close other tags.
687 */
688void
689htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
690 htmlElemDescPtr info;
691 xmlChar *oldname;
692 int i;
693
694#ifdef DEBUG
695 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
696 for (i = 0;i < ctxt->nameNr;i++)
697 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
698#endif
699
700 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
701 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
702 }
703 if (i < 0) return;
704
705 while (!xmlStrEqual(newtag, ctxt->name)) {
706 info = htmlTagLookup(ctxt->name);
707 if ((info == NULL) || (info->endTag == 1)) {
708#ifdef DEBUG
709 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
710#endif
711 } else {
712 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
713 ctxt->sax->error(ctxt->userData,
714 "Opening and ending tag mismatch: %s and %s\n",
715 newtag, ctxt->name);
716 ctxt->wellFormed = 0;
717 }
718 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
719 ctxt->sax->endElement(ctxt->userData, ctxt->name);
720 oldname = htmlnamePop(ctxt);
721 if (oldname != NULL) {
722#ifdef DEBUG
723 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
724#endif
725 xmlFree(oldname);
726 }
727 }
728}
729
730/**
731 * htmlAutoClose:
732 * @ctxt: an HTML parser context
733 * @newtag: The new tag name or NULL
734 *
735 * The HTmL DtD allows a tag to implicitely close other tags.
736 * The list is kept in htmlStartClose array. This function is
737 * called when a new tag has been detected and generates the
738 * appropriates closes if possible/needed.
739 * If newtag is NULL this mean we are at the end of the resource
740 * and we should check
741 */
742void
743htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
744 xmlChar *oldname;
745 while ((newtag != NULL) && (ctxt->name != NULL) &&
746 (htmlCheckAutoClose(newtag, ctxt->name))) {
747#ifdef DEBUG
748 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
749#endif
750 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
751 ctxt->sax->endElement(ctxt->userData, ctxt->name);
752 oldname = htmlnamePop(ctxt);
753 if (oldname != NULL) {
754#ifdef DEBUG
755 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
756#endif
757 xmlFree(oldname);
758 }
759 }
760 if (newtag == NULL) {
761 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
762 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
763 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
764 }
765 while ((newtag == NULL) && (ctxt->name != NULL) &&
766 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
767 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
768 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
769#ifdef DEBUG
770 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
771#endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776#ifdef DEBUG
777 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
778#endif
779 xmlFree(oldname);
780 }
781 }
782
783}
784
785/**
786 * htmlAutoCloseTag:
787 * @doc: the HTML document
788 * @name: The tag name
789 * @elem: the HTML element
790 *
791 * The HTmL DtD allows a tag to implicitely close other tags.
792 * The list is kept in htmlStartClose array. This function checks
793 * if the element or one of it's children would autoclose the
794 * given tag.
795 *
796 * Returns 1 if autoclose, 0 otherwise
797 */
798int
799htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
800 htmlNodePtr child;
801
802 if (elem == NULL) return(1);
803 if (xmlStrEqual(name, elem->name)) return(0);
804 if (htmlCheckAutoClose(elem->name, name)) return(1);
805 child = elem->children;
806 while (child != NULL) {
807 if (htmlAutoCloseTag(doc, name, child)) return(1);
808 child = child->next;
809 }
810 return(0);
811}
812
813/**
814 * htmlIsAutoClosed:
815 * @doc: the HTML document
816 * @elem: the HTML element
817 *
818 * The HTmL DtD allows a tag to implicitely close other tags.
819 * The list is kept in htmlStartClose array. This function checks
820 * if a tag is autoclosed by one of it's child
821 *
822 * Returns 1 if autoclosed, 0 otherwise
823 */
824int
825htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
826 htmlNodePtr child;
827
828 if (elem == NULL) return(1);
829 child = elem->children;
830 while (child != NULL) {
831 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
832 child = child->next;
833 }
834 return(0);
835}
836
837/**
838 * htmlCheckImplied:
839 * @ctxt: an HTML parser context
840 * @newtag: The new tag name
841 *
842 * The HTML DtD allows a tag to exists only implicitely
843 * called when a new tag has been detected and generates the
844 * appropriates implicit tags if missing
845 */
846void
847htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
848 if (!htmlOmittedDefaultValue)
849 return;
850 if (xmlStrEqual(newtag, BAD_CAST"html"))
851 return;
852 if (ctxt->nameNr <= 0) {
853#ifdef DEBUG
854 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
855#endif
856 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
857 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
858 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
859 }
860 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
861 return;
862 if ((ctxt->nameNr <= 1) &&
863 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
864 (xmlStrEqual(newtag, BAD_CAST"style")) ||
865 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
866 (xmlStrEqual(newtag, BAD_CAST"link")) ||
867 (xmlStrEqual(newtag, BAD_CAST"title")) ||
868 (xmlStrEqual(newtag, BAD_CAST"base")))) {
869 /*
870 * dropped OBJECT ... i you put it first BODY will be
871 * assumed !
872 */
873#ifdef DEBUG
874 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
875#endif
876 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
877 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
878 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
879 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
880 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
881 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
882 int i;
883 for (i = 0;i < ctxt->nameNr;i++) {
884 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
885 return;
886 }
887 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
888 return;
889 }
890 }
891
892#ifdef DEBUG
893 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
894#endif
895 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
896 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
897 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
898 }
899}
900
901/**
902 * htmlCheckParagraph
903 * @ctxt: an HTML parser context
904 *
905 * Check whether a p element need to be implied before inserting
906 * characters in the current element.
907 *
908 * Returns 1 if a paragraph has been inserted, 0 if not and -1
909 * in case of error.
910 */
911
912int
913htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
914 const xmlChar *tag;
915 int i;
916
917 if (ctxt == NULL)
918 return(-1);
919 tag = ctxt->name;
920 if (tag == NULL) {
921 htmlAutoClose(ctxt, BAD_CAST"p");
922 htmlCheckImplied(ctxt, BAD_CAST"p");
923 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
924 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
925 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
926 return(1);
927 }
928 if (!htmlOmittedDefaultValue)
929 return(0);
930 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
931 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
932#ifdef DEBUG
933 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
934#endif
935 htmlAutoClose(ctxt, BAD_CAST"p");
936 htmlCheckImplied(ctxt, BAD_CAST"p");
937 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
938 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
939 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
940 return(1);
941 }
942 }
943 return(0);
944}
945
946/**
947 * htmlIsScriptAttribute:
948 * @name: an attribute name
949 *
950 * Check if an attribute is of content type Script
951 *
952 * Returns 1 is the attribute is a script 0 otherwise
953 */
954int
955htmlIsScriptAttribute(const xmlChar *name) {
956 int i;
957
958 if (name == NULL)
959 return(0);
960 /*
961 * all script attributes start with 'on'
962 */
963 if ((name[0] != 'o') || (name[1] != 'n'))
964 return(0);
965 for (i = 0;
966 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
967 i++) {
968 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
969 return(1);
970 }
971 return(0);
972}
973
974/************************************************************************
975 * *
976 * The list of HTML predefined entities *
977 * *
978 ************************************************************************/
979
980
981htmlEntityDesc html40EntitiesTable[] = {
982/*
983 * the 4 absolute ones, plus apostrophe.
984 */
985{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
986{ 38, "amp", "ampersand, U+0026 ISOnum" },
987{ 39, "apos", "single quote" },
988{ 60, "lt", "less-than sign, U+003C ISOnum" },
989{ 62, "gt", "greater-than sign, U+003E ISOnum" },
990
991/*
992 * A bunch still in the 128-255 range
993 * Replacing them depend really on the charset used.
994 */
995{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
996{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
997{ 162, "cent", "cent sign, U+00A2 ISOnum" },
998{ 163, "pound","pound sign, U+00A3 ISOnum" },
999{ 164, "curren","currency sign, U+00A4 ISOnum" },
1000{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1001{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1002{ 167, "sect", "section sign, U+00A7 ISOnum" },
1003{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1004{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1005{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1006{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1007{ 172, "not", "not sign, U+00AC ISOnum" },
1008{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1009{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1010{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1011{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1012{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1013{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1014{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1015{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1016{ 181, "micro","micro sign, U+00B5 ISOnum" },
1017{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1018{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1019{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1020{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1021{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1022{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1023{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1024{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1025{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1026{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1027{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1028{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1029{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1030{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1031{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1032{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1033{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1034{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1035{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1036{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1037{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1038{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1039{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1040{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1041{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1042{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1043{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1044{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1045{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1046{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1047{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1048{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1049{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1050{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1051{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1052{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1053{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1054{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1055{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1056{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1057{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1058{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1059{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1060{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1061{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1062{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1063{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1064{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1065{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1066{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1067{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1068{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1069{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1070{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1071{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1072{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1073{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1074{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1075{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1076{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1077{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1078{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1079{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1080{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1081{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1082{ 247, "divide","division sign, U+00F7 ISOnum" },
1083{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1084{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1085{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1086{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1087{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1088{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1089{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1090{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1091
1092{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1093{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1094{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1095{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1096{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1097
1098/*
1099 * Anything below should really be kept as entities references
1100 */
1101{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1102
1103{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1104{ 732, "tilde","small tilde, U+02DC ISOdia" },
1105
1106{ 913, "Alpha","greek capital letter alpha, U+0391" },
1107{ 914, "Beta", "greek capital letter beta, U+0392" },
1108{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1109{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1110{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1111{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1112{ 919, "Eta", "greek capital letter eta, U+0397" },
1113{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1114{ 921, "Iota", "greek capital letter iota, U+0399" },
1115{ 922, "Kappa","greek capital letter kappa, U+039A" },
1116{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1117{ 924, "Mu", "greek capital letter mu, U+039C" },
1118{ 925, "Nu", "greek capital letter nu, U+039D" },
1119{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1120{ 927, "Omicron","greek capital letter omicron, U+039F" },
1121{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1122{ 929, "Rho", "greek capital letter rho, U+03A1" },
1123{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1124{ 932, "Tau", "greek capital letter tau, U+03A4" },
1125{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1126{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1127{ 935, "Chi", "greek capital letter chi, U+03A7" },
1128{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1129{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1130
1131{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1132{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1133{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1134{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1135{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1136{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1137{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1138{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1139{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1140{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1141{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1142{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1143{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1144{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1145{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1146{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1147{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1148{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1149{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1150{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1151{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1152{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1153{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1154{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1155{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1156{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1157{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1158{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1159
1160{ 8194, "ensp", "en space, U+2002 ISOpub" },
1161{ 8195, "emsp", "em space, U+2003 ISOpub" },
1162{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1163{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1164{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1165{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1166{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1167{ 8211, "ndash","en dash, U+2013 ISOpub" },
1168{ 8212, "mdash","em dash, U+2014 ISOpub" },
1169{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1170{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1171{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1172{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1173{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1174{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1175{ 8224, "dagger","dagger, U+2020 ISOpub" },
1176{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1177
1178{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1179{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1180
1181{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1182
1183{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1184{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1185
1186{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1187{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1188
1189{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1190{ 8260, "frasl","fraction slash, U+2044 NEW" },
1191
1192{ 8364, "euro", "euro sign, U+20AC NEW" },
1193
1194{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1195{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1196{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1197{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1198{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1199{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1200{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1201{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1202{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1203{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1204{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1205{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1206{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1207{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1208{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1209{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1210
1211{ 8704, "forall","for all, U+2200 ISOtech" },
1212{ 8706, "part", "partial differential, U+2202 ISOtech" },
1213{ 8707, "exist","there exists, U+2203 ISOtech" },
1214{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1215{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1216{ 8712, "isin", "element of, U+2208 ISOtech" },
1217{ 8713, "notin","not an element of, U+2209 ISOtech" },
1218{ 8715, "ni", "contains as member, U+220B ISOtech" },
1219{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1220{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1221{ 8722, "minus","minus sign, U+2212 ISOtech" },
1222{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1223{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1224{ 8733, "prop", "proportional to, U+221D ISOtech" },
1225{ 8734, "infin","infinity, U+221E ISOtech" },
1226{ 8736, "ang", "angle, U+2220 ISOamso" },
1227{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1228{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1229{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1230{ 8746, "cup", "union = cup, U+222A ISOtech" },
1231{ 8747, "int", "integral, U+222B ISOtech" },
1232{ 8756, "there4","therefore, U+2234 ISOtech" },
1233{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1234{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1235{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1236{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1237{ 8801, "equiv","identical to, U+2261 ISOtech" },
1238{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1239{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1240{ 8834, "sub", "subset of, U+2282 ISOtech" },
1241{ 8835, "sup", "superset of, U+2283 ISOtech" },
1242{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1243{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1244{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1245{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1246{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1247{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1248{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1249{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1250{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1251{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1252{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1253{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1254{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1255{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1256
1257{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1258{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1259{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1260{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1261
1262};
1263
1264/************************************************************************
1265 * *
1266 * Commodity functions to handle entities *
1267 * *
1268 ************************************************************************/
1269
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size (a variable with that exact name must be in
 * scope) and reallocates @buffer to the new size.
 *
 * On allocation failure the macro makes the ENCLOSING function return
 * NULL, so it can only be used in functions returning a pointer. The
 * reallocation goes through a temporary so that the previous block is
 * still reachable and can be released before returning; assigning the
 * result of xmlRealloc() straight to @buffer would leak it.
 *
 * The expansion is deliberately kept as a plain brace block (not
 * do/while) so existing "growBuffer(x);" call sites behave as before.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_tmp;						\
    buffer##_size *= 2;							\
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,			\
	                     buffer##_size * sizeof(xmlChar));		\
    if (growBuffer_tmp == NULL) {					\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	buffer = NULL;							\
	return(NULL);							\
    }									\
    buffer = growBuffer_tmp;						\
}
1281
1282/**
1283 * htmlEntityLookup:
1284 * @name: the entity name
1285 *
1286 * Lookup the given entity in EntitiesTable
1287 *
1288 * TODO: the linear scan is really ugly, an hash table is really needed.
1289 *
1290 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1291 */
1292htmlEntityDescPtr
1293htmlEntityLookup(const xmlChar *name) {
1294 int i;
1295
1296 for (i = 0;i < (sizeof(html40EntitiesTable)/
1297 sizeof(html40EntitiesTable[0]));i++) {
1298 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1299#ifdef DEBUG
1300 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1301#endif
1302 return(&html40EntitiesTable[i]);
1303 }
1304 }
1305 return(NULL);
1306}
1307
1308/**
1309 * htmlEntityValueLookup:
1310 * @value: the entity's unicode value
1311 *
1312 * Lookup the given entity in EntitiesTable
1313 *
1314 * TODO: the linear scan is really ugly, an hash table is really needed.
1315 *
1316 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1317 */
1318htmlEntityDescPtr
1319htmlEntityValueLookup(int value) {
1320 int i;
1321#ifdef DEBUG
1322 int lv = 0;
1323#endif
1324
1325 for (i = 0;i < (sizeof(html40EntitiesTable)/
1326 sizeof(html40EntitiesTable[0]));i++) {
1327 if ((unsigned int) html40EntitiesTable[i].value >= value) {
1328 if ((unsigned int) html40EntitiesTable[i].value > value)
1329 break;
1330#ifdef DEBUG
1331 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1332#endif
1333 return(&html40EntitiesTable[i]);
1334 }
1335#ifdef DEBUG
1336 if (lv > html40EntitiesTable[i].value) {
1337 xmlGenericError(xmlGenericErrorContext,
1338 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1339 lv, html40EntitiesTable[i].value);
1340 }
1341 lv = html40EntitiesTable[i].value;
1342#endif
1343 }
1344 return(NULL);
1345}
1346
1347/**
1348 * UTF8ToHtml:
1349 * @out: a pointer to an array of bytes to store the result
1350 * @outlen: the length of @out
1351 * @in: a pointer to an array of UTF-8 chars
1352 * @inlen: the length of @in
1353 *
1354 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1355 * plus HTML entities block of chars out.
1356 *
1357 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1358 * The value of @inlen after return is the number of octets consumed
1359 * as the return value is positive, else unpredictiable.
1360 * The value of @outlen after return is the number of octets consumed.
1361 */
1362int
1363UTF8ToHtml(unsigned char* out, int *outlen,
1364 const unsigned char* in, int *inlen) {
1365 const unsigned char* processed = in;
1366 const unsigned char* outend;
1367 const unsigned char* outstart = out;
1368 const unsigned char* instart = in;
1369 const unsigned char* inend;
1370 unsigned int c, d;
1371 int trailing;
1372
1373 if (in == NULL) {
1374 /*
1375 * initialization nothing to do
1376 */
1377 *outlen = 0;
1378 *inlen = 0;
1379 return(0);
1380 }
1381 inend = in + (*inlen);
1382 outend = out + (*outlen);
1383 while (in < inend) {
1384 d = *in++;
1385 if (d < 0x80) { c= d; trailing= 0; }
1386 else if (d < 0xC0) {
1387 /* trailing byte in leading position */
1388 *outlen = out - outstart;
1389 *inlen = processed - instart;
1390 return(-2);
1391 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1392 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1393 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1394 else {
1395 /* no chance for this in Ascii */
1396 *outlen = out - outstart;
1397 *inlen = processed - instart;
1398 return(-2);
1399 }
1400
1401 if (inend - in < trailing) {
1402 break;
1403 }
1404
1405 for ( ; trailing; trailing--) {
1406 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1407 break;
1408 c <<= 6;
1409 c |= d & 0x3F;
1410 }
1411
1412 /* assertion: c is a single UTF-4 value */
1413 if (c < 0x80) {
1414 if (out + 1 >= outend)
1415 break;
1416 *out++ = c;
1417 } else {
1418 int len;
1419 htmlEntityDescPtr ent;
1420
1421 /*
1422 * Try to lookup a predefined HTML entity for it
1423 */
1424
1425 ent = htmlEntityValueLookup(c);
1426 if (ent == NULL) {
1427 /* no chance for this in Ascii */
1428 *outlen = out - outstart;
1429 *inlen = processed - instart;
1430 return(-2);
1431 }
1432 len = strlen(ent->name);
1433 if (out + 2 + len >= outend)
1434 break;
1435 *out++ = '&';
1436 memcpy(out, ent->name, len);
1437 out += len;
1438 *out++ = ';';
1439 }
1440 processed = in;
1441 }
1442 *outlen = out - outstart;
1443 *inlen = processed - instart;
1444 return(0);
1445}
1446
1447/**
1448 * htmlEncodeEntities:
1449 * @out: a pointer to an array of bytes to store the result
1450 * @outlen: the length of @out
1451 * @in: a pointer to an array of UTF-8 chars
1452 * @inlen: the length of @in
1453 * @quoteChar: the quote character to escape (' or ") or zero.
1454 *
1455 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1456 * plus HTML entities block of chars out.
1457 *
1458 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1459 * The value of @inlen after return is the number of octets consumed
1460 * as the return value is positive, else unpredictiable.
1461 * The value of @outlen after return is the number of octets consumed.
1462 */
1463int
1464htmlEncodeEntities(unsigned char* out, int *outlen,
1465 const unsigned char* in, int *inlen, int quoteChar) {
1466 const unsigned char* processed = in;
1467 const unsigned char* outend = out + (*outlen);
1468 const unsigned char* outstart = out;
1469 const unsigned char* instart = in;
1470 const unsigned char* inend = in + (*inlen);
1471 unsigned int c, d;
1472 int trailing;
1473
1474 while (in < inend) {
1475 d = *in++;
1476 if (d < 0x80) { c= d; trailing= 0; }
1477 else if (d < 0xC0) {
1478 /* trailing byte in leading position */
1479 *outlen = out - outstart;
1480 *inlen = processed - instart;
1481 return(-2);
1482 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1483 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1484 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1485 else {
1486 /* no chance for this in Ascii */
1487 *outlen = out - outstart;
1488 *inlen = processed - instart;
1489 return(-2);
1490 }
1491
1492 if (inend - in < trailing)
1493 break;
1494
1495 while (trailing--) {
1496 if (((d= *in++) & 0xC0) != 0x80) {
1497 *outlen = out - outstart;
1498 *inlen = processed - instart;
1499 return(-2);
1500 }
1501 c <<= 6;
1502 c |= d & 0x3F;
1503 }
1504
1505 /* assertion: c is a single UTF-4 value */
1506 if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1507 if (out >= outend)
1508 break;
1509 *out++ = c;
1510 } else {
1511 htmlEntityDescPtr ent;
1512 const char *cp;
1513 char nbuf[16];
1514 int len;
1515
1516 /*
1517 * Try to lookup a predefined HTML entity for it
1518 */
1519 ent = htmlEntityValueLookup(c);
1520 if (ent == NULL) {
1521 sprintf(nbuf, "#%u", c);
1522 cp = nbuf;
1523 }
1524 else
1525 cp = ent->name;
1526 len = strlen(cp);
1527 if (out + 2 + len > outend)
1528 break;
1529 *out++ = '&';
1530 memcpy(out, cp, len);
1531 out += len;
1532 *out++ = ';';
1533 }
1534 processed = in;
1535 }
1536 *outlen = out - outstart;
1537 *inlen = processed - instart;
1538 return(0);
1539}
1540
1541/**
1542 * htmlDecodeEntities:
1543 * @ctxt: the parser context
1544 * @len: the len to decode (in bytes !), -1 for no size limit
1545 * @end: an end marker xmlChar, 0 if none
1546 * @end2: an end marker xmlChar, 0 if none
1547 * @end3: an end marker xmlChar, 0 if none
1548 *
1549 * Subtitute the HTML entities by their value
1550 *
1551 * DEPRECATED !!!!
1552 *
1553 * Returns A newly allocated string with the substitution done. The caller
1554 * must deallocate it !
1555 */
1556xmlChar *
1557htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
1558 xmlChar end, xmlChar end2, xmlChar end3) {
1559 xmlChar *name = NULL;
1560 xmlChar *buffer = NULL;
1561 unsigned int buffer_size = 0;
1562 unsigned int nbchars = 0;
1563 htmlEntityDescPtr ent;
1564 unsigned int max = (unsigned int) len;
1565 int c,l;
1566
1567 if (ctxt->depth > 40) {
1568 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1569 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1570 ctxt->sax->error(ctxt->userData,
1571 "Detected entity reference loop\n");
1572 ctxt->wellFormed = 0;
1573 ctxt->disableSAX = 1;
1574 return(NULL);
1575 }
1576
1577 /*
1578 * allocate a translation buffer.
1579 */
1580 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1581 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1582 if (buffer == NULL) {
1583 perror("xmlDecodeEntities: malloc failed");
1584 return(NULL);
1585 }
1586
1587 /*
1588 * Ok loop until we reach one of the ending char or a size limit.
1589 */
1590 c = CUR_CHAR(l);
1591 while ((nbchars < max) && (c != end) &&
1592 (c != end2) && (c != end3)) {
1593
1594 if (c == 0) break;
1595 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1596 int val = htmlParseCharRef(ctxt);
1597 COPY_BUF(0,buffer,nbchars,val);
1598 NEXTL(l);
1599 } else if ((c == '&') && (ctxt->token != '&')) {
1600 ent = htmlParseEntityRef(ctxt, &name);
1601 if (name != NULL) {
1602 if (ent != NULL) {
1603 int val = ent->value;
1604 COPY_BUF(0,buffer,nbchars,val);
1605 NEXTL(l);
1606 } else {
1607 const xmlChar *cur = name;
1608
1609 buffer[nbchars++] = '&';
1610 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1611 growBuffer(buffer);
1612 }
1613 while (*cur != 0) {
1614 buffer[nbchars++] = *cur++;
1615 }
1616 buffer[nbchars++] = ';';
1617 }
1618 }
1619 } else {
1620 COPY_BUF(l,buffer,nbchars,c);
1621 NEXTL(l);
1622 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1623 growBuffer(buffer);
1624 }
1625 }
1626 c = CUR_CHAR(l);
1627 }
1628 buffer[nbchars++] = 0;
1629 return(buffer);
1630}
1631
1632/************************************************************************
1633 * *
1634 * Commodity functions to handle streams *
1635 * *
1636 ************************************************************************/
1637
1638/**
1639 * htmlFreeInputStream:
1640 * @input: an htmlParserInputPtr
1641 *
1642 * Free up an input stream.
1643 */
1644void
1645htmlFreeInputStream(htmlParserInputPtr input) {
1646 if (input == NULL) return;
1647
1648 if (input->filename != NULL) xmlFree((char *) input->filename);
1649 if (input->directory != NULL) xmlFree((char *) input->directory);
1650 if ((input->free != NULL) && (input->base != NULL))
1651 input->free((xmlChar *) input->base);
1652 if (input->buf != NULL)
1653 xmlFreeParserInputBuffer(input->buf);
Daniel Veillard48b2f892001-02-25 16:11:03 +00001654 MEM_CLEANUP(input, sizeof(htmlParserInput));
Owen Taylor3473f882001-02-23 17:55:21 +00001655 xmlFree(input);
1656}
1657
1658/**
1659 * htmlNewInputStream:
1660 * @ctxt: an HTML parser context
1661 *
1662 * Create a new input stream structure
1663 * Returns the new input stream or NULL
1664 */
1665htmlParserInputPtr
1666htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1667 htmlParserInputPtr input;
1668
1669 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1670 if (input == NULL) {
1671 ctxt->errNo = XML_ERR_NO_MEMORY;
1672 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1673 ctxt->sax->error(ctxt->userData,
1674 "malloc: couldn't allocate a new input stream\n");
1675 return(NULL);
1676 }
1677 memset(input, 0, sizeof(htmlParserInput));
1678 input->filename = NULL;
1679 input->directory = NULL;
1680 input->base = NULL;
1681 input->cur = NULL;
1682 input->buf = NULL;
1683 input->line = 1;
1684 input->col = 1;
1685 input->buf = NULL;
1686 input->free = NULL;
1687 input->version = NULL;
1688 input->consumed = 0;
1689 input->length = 0;
1690 return(input);
1691}
1692
1693
1694/************************************************************************
1695 * *
1696 * Commodity functions, cleanup needed ? *
1697 * *
1698 ************************************************************************/
1699
1700/**
1701 * areBlanks:
1702 * @ctxt: an HTML parser context
1703 * @str: a xmlChar *
1704 * @len: the size of @str
1705 *
1706 * Is this a sequence of blank chars that one can ignore ?
1707 *
1708 * Returns 1 if ignorable 0 otherwise.
1709 */
1710
1711static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1712 int i;
1713 xmlNodePtr lastChild;
1714
1715 for (i = 0;i < len;i++)
1716 if (!(IS_BLANK(str[i]))) return(0);
1717
1718 if (CUR == 0) return(1);
1719 if (CUR != '<') return(0);
1720 if (ctxt->name == NULL)
1721 return(1);
1722 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1723 return(1);
1724 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1725 return(1);
1726 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1727 return(1);
1728 if (ctxt->node == NULL) return(0);
1729 lastChild = xmlGetLastChild(ctxt->node);
1730 if (lastChild == NULL) {
1731 if (ctxt->node->content != NULL) return(0);
1732 } else if (xmlNodeIsText(lastChild)) {
1733 return(0);
1734 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1735 return(0);
1736 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1737 return(0);
1738 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1739 return(0);
1740 }
1741 return(1);
1742}
1743
1744/**
1745 * htmlHandleEntity:
1746 * @ctxt: an HTML parser context
1747 * @entity: an XML entity pointer.
1748 *
1749 * Default handling of an HTML entity, call the parser with the
1750 * substitution string
1751 */
1752
1753void
1754htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1755 int len;
1756
1757 if (entity->content == NULL) {
1758 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1759 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1760 entity->name);
1761 ctxt->wellFormed = 0;
1762 return;
1763 }
1764 len = xmlStrlen(entity->content);
1765
1766 /*
1767 * Just handle the content as a set of chars.
1768 */
1769 htmlCheckParagraph(ctxt);
1770 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1771 ctxt->sax->characters(ctxt->userData, entity->content, len);
1772
1773}
1774
1775/**
1776 * htmlNewDocNoDtD:
1777 * @URI: URI for the dtd, or NULL
1778 * @ExternalID: the external ID of the DTD, or NULL
1779 *
1780 * Returns a new document, do not intialize the DTD if not provided
1781 */
1782htmlDocPtr
1783htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1784 xmlDocPtr cur;
1785
1786 /*
1787 * Allocate a new document and fill the fields.
1788 */
1789 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1790 if (cur == NULL) {
1791 xmlGenericError(xmlGenericErrorContext,
1792 "xmlNewDoc : malloc failed\n");
1793 return(NULL);
1794 }
1795 memset(cur, 0, sizeof(xmlDoc));
1796
1797 cur->type = XML_HTML_DOCUMENT_NODE;
1798 cur->version = NULL;
1799 cur->intSubset = NULL;
1800 if ((ExternalID != NULL) ||
1801 (URI != NULL))
1802 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1803 cur->doc = cur;
1804 cur->name = NULL;
1805 cur->children = NULL;
1806 cur->extSubset = NULL;
1807 cur->oldNs = NULL;
1808 cur->encoding = NULL;
1809 cur->standalone = 1;
1810 cur->compression = 0;
1811 cur->ids = NULL;
1812 cur->refs = NULL;
1813#ifndef XML_WITHOUT_CORBA
1814 cur->_private = NULL;
1815#endif
1816 return(cur);
1817}
1818
1819/**
1820 * htmlNewDoc:
1821 * @URI: URI for the dtd, or NULL
1822 * @ExternalID: the external ID of the DTD, or NULL
1823 *
1824 * Returns a new document
1825 */
1826htmlDocPtr
1827htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1828 if ((URI == NULL) && (ExternalID == NULL))
1829 return(htmlNewDocNoDtD(
1830 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1831 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1832
1833 return(htmlNewDocNoDtD(URI, ExternalID));
1834}
1835
1836
1837/************************************************************************
1838 * *
1839 * The parser itself *
1840 * Relates to http://www.w3.org/TR/html40 *
1841 * *
1842 ************************************************************************/
1843
1844/************************************************************************
1845 * *
1846 * The parser itself *
1847 * *
1848 ************************************************************************/
1849
1850/**
1851 * htmlParseHTMLName:
1852 * @ctxt: an HTML parser context
1853 *
1854 * parse an HTML tag or attribute name, note that we convert it to lowercase
1855 * since HTML names are not case-sensitive.
1856 *
1857 * Returns the Tag Name parsed or NULL
1858 */
1859
1860xmlChar *
1861htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1862 xmlChar *ret = NULL;
1863 int i = 0;
1864 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1865
1866 if (!IS_LETTER(CUR) && (CUR != '_') &&
1867 (CUR != ':')) return(NULL);
1868
1869 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1870 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1871 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1872 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1873 else loc[i] = CUR;
1874 i++;
1875
1876 NEXT;
1877 }
1878
1879 ret = xmlStrndup(loc, i);
1880
1881 return(ret);
1882}
1883
1884/**
1885 * htmlParseName:
1886 * @ctxt: an HTML parser context
1887 *
1888 * parse an HTML name, this routine is case sensistive.
1889 *
1890 * Returns the Name parsed or NULL
1891 */
1892
1893xmlChar *
1894htmlParseName(htmlParserCtxtPtr ctxt) {
1895 xmlChar buf[HTML_MAX_NAMELEN];
1896 int len = 0;
1897
1898 GROW;
1899 if (!IS_LETTER(CUR) && (CUR != '_')) {
1900 return(NULL);
1901 }
1902
1903 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1904 (CUR == '.') || (CUR == '-') ||
1905 (CUR == '_') || (CUR == ':') ||
1906 (IS_COMBINING(CUR)) ||
1907 (IS_EXTENDER(CUR))) {
1908 buf[len++] = CUR;
1909 NEXT;
1910 if (len >= HTML_MAX_NAMELEN) {
1911 xmlGenericError(xmlGenericErrorContext,
1912 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1913 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1914 (CUR == '.') || (CUR == '-') ||
1915 (CUR == '_') || (CUR == ':') ||
1916 (IS_COMBINING(CUR)) ||
1917 (IS_EXTENDER(CUR)))
1918 NEXT;
1919 break;
1920 }
1921 }
1922 return(xmlStrndup(buf, len));
1923}
1924
1925/**
1926 * htmlParseHTMLAttribute:
1927 * @ctxt: an HTML parser context
1928 * @stop: a char stop value
1929 *
1930 * parse an HTML attribute value till the stop (quote), if
1931 * stop is 0 then it stops at the first space
1932 *
1933 * Returns the attribute parsed or NULL
1934 */
1935
1936xmlChar *
1937htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1938 xmlChar *buffer = NULL;
1939 int buffer_size = 0;
1940 xmlChar *out = NULL;
1941 xmlChar *name = NULL;
1942
1943 xmlChar *cur = NULL;
1944 htmlEntityDescPtr ent;
1945
1946 /*
1947 * allocate a translation buffer.
1948 */
1949 buffer_size = HTML_PARSER_BUFFER_SIZE;
1950 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1951 if (buffer == NULL) {
1952 perror("htmlParseHTMLAttribute: malloc failed");
1953 return(NULL);
1954 }
1955 out = buffer;
1956
1957 /*
1958 * Ok loop until we reach one of the ending chars
1959 */
1960 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1961 if ((stop == 0) && (IS_BLANK(CUR))) break;
1962 if (CUR == '&') {
1963 if (NXT(1) == '#') {
1964 unsigned int c;
1965 int bits;
1966
1967 c = htmlParseCharRef(ctxt);
1968 if (c < 0x80)
1969 { *out++ = c; bits= -6; }
1970 else if (c < 0x800)
1971 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1972 else if (c < 0x10000)
1973 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1974 else
1975 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1976
1977 for ( ; bits >= 0; bits-= 6) {
1978 *out++ = ((c >> bits) & 0x3F) | 0x80;
1979 }
1980 } else {
1981 ent = htmlParseEntityRef(ctxt, &name);
1982 if (name == NULL) {
1983 *out++ = '&';
1984 if (out - buffer > buffer_size - 100) {
1985 int index = out - buffer;
1986
1987 growBuffer(buffer);
1988 out = &buffer[index];
1989 }
1990 } else if (ent == NULL) {
1991 *out++ = '&';
1992 cur = name;
1993 while (*cur != 0) {
1994 if (out - buffer > buffer_size - 100) {
1995 int index = out - buffer;
1996
1997 growBuffer(buffer);
1998 out = &buffer[index];
1999 }
2000 *out++ = *cur++;
2001 }
2002 xmlFree(name);
2003 } else {
2004 unsigned int c;
2005 int bits;
2006
2007 if (out - buffer > buffer_size - 100) {
2008 int index = out - buffer;
2009
2010 growBuffer(buffer);
2011 out = &buffer[index];
2012 }
2013 c = (xmlChar)ent->value;
2014 if (c < 0x80)
2015 { *out++ = c; bits= -6; }
2016 else if (c < 0x800)
2017 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2018 else if (c < 0x10000)
2019 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2020 else
2021 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2022
2023 for ( ; bits >= 0; bits-= 6) {
2024 *out++ = ((c >> bits) & 0x3F) | 0x80;
2025 }
2026 xmlFree(name);
2027 }
2028 }
2029 } else {
2030 unsigned int c;
2031 int bits, l;
2032
2033 if (out - buffer > buffer_size - 100) {
2034 int index = out - buffer;
2035
2036 growBuffer(buffer);
2037 out = &buffer[index];
2038 }
2039 c = CUR_CHAR(l);
2040 if (c < 0x80)
2041 { *out++ = c; bits= -6; }
2042 else if (c < 0x800)
2043 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2044 else if (c < 0x10000)
2045 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2046 else
2047 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2048
2049 for ( ; bits >= 0; bits-= 6) {
2050 *out++ = ((c >> bits) & 0x3F) | 0x80;
2051 }
2052 NEXT;
2053 }
2054 }
2055 *out++ = 0;
2056 return(buffer);
2057}
2058
2059/**
2060 * htmlParseNmtoken:
2061 * @ctxt: an HTML parser context
2062 *
2063 * parse an HTML Nmtoken.
2064 *
2065 * Returns the Nmtoken parsed or NULL
2066 */
2067
2068xmlChar *
2069htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
2070 xmlChar buf[HTML_MAX_NAMELEN];
2071 int len = 0;
2072
2073 GROW;
2074 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2075 (CUR == '.') || (CUR == '-') ||
2076 (CUR == '_') || (CUR == ':') ||
2077 (IS_COMBINING(CUR)) ||
2078 (IS_EXTENDER(CUR))) {
2079 buf[len++] = CUR;
2080 NEXT;
2081 if (len >= HTML_MAX_NAMELEN) {
2082 xmlGenericError(xmlGenericErrorContext,
2083 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2084 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2085 (CUR == '.') || (CUR == '-') ||
2086 (CUR == '_') || (CUR == ':') ||
2087 (IS_COMBINING(CUR)) ||
2088 (IS_EXTENDER(CUR)))
2089 NEXT;
2090 break;
2091 }
2092 }
2093 return(xmlStrndup(buf, len));
2094}
2095
2096/**
2097 * htmlParseEntityRef:
2098 * @ctxt: an HTML parser context
2099 * @str: location to store the entity name
2100 *
2101 * parse an HTML ENTITY references
2102 *
2103 * [68] EntityRef ::= '&' Name ';'
2104 *
2105 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2106 * if non-NULL *str will have to be freed by the caller.
2107 */
2108htmlEntityDescPtr
2109htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2110 xmlChar *name;
2111 htmlEntityDescPtr ent = NULL;
2112 *str = NULL;
2113
2114 if (CUR == '&') {
2115 NEXT;
2116 name = htmlParseName(ctxt);
2117 if (name == NULL) {
2118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2119 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2120 ctxt->wellFormed = 0;
2121 } else {
2122 GROW;
2123 if (CUR == ';') {
2124 *str = name;
2125
2126 /*
2127 * Lookup the entity in the table.
2128 */
2129 ent = htmlEntityLookup(name);
2130 if (ent != NULL) /* OK that's ugly !!! */
2131 NEXT;
2132 } else {
2133 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2134 ctxt->sax->error(ctxt->userData,
2135 "htmlParseEntityRef: expecting ';'\n");
2136 *str = name;
2137 }
2138 }
2139 }
2140 return(ent);
2141}
2142
2143/**
2144 * htmlParseAttValue:
2145 * @ctxt: an HTML parser context
2146 *
2147 * parse a value for an attribute
2148 * Note: the parser won't do substitution of entities here, this
2149 * will be handled later in xmlStringGetNodeList, unless it was
2150 * asked for ctxt->replaceEntities != 0
2151 *
2152 * Returns the AttValue parsed or NULL.
2153 */
2154
2155xmlChar *
2156htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2157 xmlChar *ret = NULL;
2158
2159 if (CUR == '"') {
2160 NEXT;
2161 ret = htmlParseHTMLAttribute(ctxt, '"');
2162 if (CUR != '"') {
2163 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2164 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2165 ctxt->wellFormed = 0;
2166 } else
2167 NEXT;
2168 } else if (CUR == '\'') {
2169 NEXT;
2170 ret = htmlParseHTMLAttribute(ctxt, '\'');
2171 if (CUR != '\'') {
2172 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2173 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2174 ctxt->wellFormed = 0;
2175 } else
2176 NEXT;
2177 } else {
2178 /*
2179 * That's an HTMLism, the attribute value may not be quoted
2180 */
2181 ret = htmlParseHTMLAttribute(ctxt, 0);
2182 if (ret == NULL) {
2183 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2184 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2185 ctxt->wellFormed = 0;
2186 }
2187 }
2188 return(ret);
2189}
2190
2191/**
2192 * htmlParseSystemLiteral:
2193 * @ctxt: an HTML parser context
2194 *
2195 * parse an HTML Literal
2196 *
2197 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2198 *
2199 * Returns the SystemLiteral parsed or NULL
2200 */
2201
2202xmlChar *
2203htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2204 const xmlChar *q;
2205 xmlChar *ret = NULL;
2206
2207 if (CUR == '"') {
2208 NEXT;
2209 q = CUR_PTR;
2210 while ((IS_CHAR(CUR)) && (CUR != '"'))
2211 NEXT;
2212 if (!IS_CHAR(CUR)) {
2213 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2214 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2215 ctxt->wellFormed = 0;
2216 } else {
2217 ret = xmlStrndup(q, CUR_PTR - q);
2218 NEXT;
2219 }
2220 } else if (CUR == '\'') {
2221 NEXT;
2222 q = CUR_PTR;
2223 while ((IS_CHAR(CUR)) && (CUR != '\''))
2224 NEXT;
2225 if (!IS_CHAR(CUR)) {
2226 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2227 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2228 ctxt->wellFormed = 0;
2229 } else {
2230 ret = xmlStrndup(q, CUR_PTR - q);
2231 NEXT;
2232 }
2233 } else {
2234 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2235 ctxt->sax->error(ctxt->userData,
2236 "SystemLiteral \" or ' expected\n");
2237 ctxt->wellFormed = 0;
2238 }
2239
2240 return(ret);
2241}
2242
2243/**
2244 * htmlParsePubidLiteral:
2245 * @ctxt: an HTML parser context
2246 *
2247 * parse an HTML public literal
2248 *
2249 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2250 *
2251 * Returns the PubidLiteral parsed or NULL.
2252 */
2253
2254xmlChar *
2255htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2256 const xmlChar *q;
2257 xmlChar *ret = NULL;
2258 /*
2259 * Name ::= (Letter | '_') (NameChar)*
2260 */
2261 if (CUR == '"') {
2262 NEXT;
2263 q = CUR_PTR;
2264 while (IS_PUBIDCHAR(CUR)) NEXT;
2265 if (CUR != '"') {
2266 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2267 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2268 ctxt->wellFormed = 0;
2269 } else {
2270 ret = xmlStrndup(q, CUR_PTR - q);
2271 NEXT;
2272 }
2273 } else if (CUR == '\'') {
2274 NEXT;
2275 q = CUR_PTR;
2276 while ((IS_LETTER(CUR)) && (CUR != '\''))
2277 NEXT;
2278 if (!IS_LETTER(CUR)) {
2279 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2280 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2281 ctxt->wellFormed = 0;
2282 } else {
2283 ret = xmlStrndup(q, CUR_PTR - q);
2284 NEXT;
2285 }
2286 } else {
2287 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2288 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2289 ctxt->wellFormed = 0;
2290 }
2291
2292 return(ret);
2293}
2294
2295/**
2296 * htmlParseScript:
2297 * @ctxt: an HTML parser context
2298 *
2299 * parse the content of an HTML SCRIPT or STYLE element
2300 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2301 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2302 * http://www.w3.org/TR/html4/types.html#type-script
2303 * http://www.w3.org/TR/html4/types.html#h-6.15
2304 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2305 *
2306 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2307 * element and the value of intrinsic event attributes. User agents must
2308 * not evaluate script data as HTML markup but instead must pass it on as
2309 * data to a script engine.
2310 * NOTES:
2311 * - The content is passed like CDATA
2312 * - the attributes for style and scripting "onXXX" are also described
2313 * as CDATA but SGML allows entities references in attributes so their
2314 * processing is identical as other attributes
2315 */
2316void
2317htmlParseScript(htmlParserCtxtPtr ctxt) {
2318 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2319 int nbchar = 0;
2320 xmlChar cur;
2321
2322 SHRINK;
2323 cur = CUR;
2324 while (IS_CHAR(cur)) {
2325 if ((cur == '<') && (NXT(1) == '/')) {
2326 /*
2327 * One should break here, the specification is clear:
2328 * Authors should therefore escape "</" within the content.
2329 * Escape mechanisms are specific to each scripting or
2330 * style sheet language.
2331 */
2332 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2333 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2334 break; /* while */
2335 }
2336 buf[nbchar++] = cur;
2337 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2338 if (ctxt->sax->cdataBlock!= NULL) {
2339 /*
2340 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2341 */
2342 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2343 }
2344 nbchar = 0;
2345 }
2346 NEXT;
2347 cur = CUR;
2348 }
2349 if (!(IS_CHAR(cur))) {
2350 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2351 ctxt->sax->error(ctxt->userData,
2352 "Invalid char in CDATA 0x%X\n", cur);
2353 ctxt->wellFormed = 0;
2354 NEXT;
2355 }
2356
2357 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2358 if (ctxt->sax->cdataBlock!= NULL) {
2359 /*
2360 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2361 */
2362 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2363 }
2364 }
2365}
2366
2367
2368/**
2369 * htmlParseCharData:
2370 * @ctxt: an HTML parser context
2371 * @cdata: int indicating whether we are within a CDATA section
2372 *
2373 * parse a CharData section.
2374 * if we are within a CDATA section ']]>' marks an end of section.
2375 *
2376 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2377 */
2378
2379void
2380htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
2381 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2382 int nbchar = 0;
2383 int cur, l;
2384
2385 SHRINK;
2386 cur = CUR_CHAR(l);
2387 while (((cur != '<') || (ctxt->token == '<')) &&
2388 ((cur != '&') || (ctxt->token == '&')) &&
2389 (IS_CHAR(cur))) {
2390 COPY_BUF(l,buf,nbchar,cur);
2391 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2392 /*
2393 * Ok the segment is to be consumed as chars.
2394 */
2395 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2396 if (areBlanks(ctxt, buf, nbchar)) {
2397 if (ctxt->sax->ignorableWhitespace != NULL)
2398 ctxt->sax->ignorableWhitespace(ctxt->userData,
2399 buf, nbchar);
2400 } else {
2401 htmlCheckParagraph(ctxt);
2402 if (ctxt->sax->characters != NULL)
2403 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2404 }
2405 }
2406 nbchar = 0;
2407 }
2408 NEXTL(l);
2409 cur = CUR_CHAR(l);
2410 }
2411 if (nbchar != 0) {
2412 /*
2413 * Ok the segment is to be consumed as chars.
2414 */
2415 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2416 if (areBlanks(ctxt, buf, nbchar)) {
2417 if (ctxt->sax->ignorableWhitespace != NULL)
2418 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2419 } else {
2420 htmlCheckParagraph(ctxt);
2421 if (ctxt->sax->characters != NULL)
2422 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2423 }
2424 }
2425 }
2426}
2427
2428/**
2429 * htmlParseExternalID:
2430 * @ctxt: an HTML parser context
2431 * @publicID: a xmlChar** receiving PubidLiteral
2432 * @strict: indicate whether we should restrict parsing to only
2433 * production [75], see NOTE below
2434 *
2435 * Parse an External ID or a Public ID
2436 *
2437 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2438 * 'PUBLIC' S PubidLiteral S SystemLiteral
2439 *
2440 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2441 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2442 *
2443 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2444 *
2445 * Returns the function returns SystemLiteral and in the second
2446 * case publicID receives PubidLiteral, is strict is off
2447 * it is possible to return NULL and have publicID set.
2448 */
2449
2450xmlChar *
2451htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2452 xmlChar *URI = NULL;
2453
2454 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2455 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2456 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2457 SKIP(6);
2458 if (!IS_BLANK(CUR)) {
2459 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2460 ctxt->sax->error(ctxt->userData,
2461 "Space required after 'SYSTEM'\n");
2462 ctxt->wellFormed = 0;
2463 }
2464 SKIP_BLANKS;
2465 URI = htmlParseSystemLiteral(ctxt);
2466 if (URI == NULL) {
2467 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2468 ctxt->sax->error(ctxt->userData,
2469 "htmlParseExternalID: SYSTEM, no URI\n");
2470 ctxt->wellFormed = 0;
2471 }
2472 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2473 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2474 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2475 SKIP(6);
2476 if (!IS_BLANK(CUR)) {
2477 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2478 ctxt->sax->error(ctxt->userData,
2479 "Space required after 'PUBLIC'\n");
2480 ctxt->wellFormed = 0;
2481 }
2482 SKIP_BLANKS;
2483 *publicID = htmlParsePubidLiteral(ctxt);
2484 if (*publicID == NULL) {
2485 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2486 ctxt->sax->error(ctxt->userData,
2487 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2488 ctxt->wellFormed = 0;
2489 }
2490 SKIP_BLANKS;
2491 if ((CUR == '"') || (CUR == '\'')) {
2492 URI = htmlParseSystemLiteral(ctxt);
2493 }
2494 }
2495 return(URI);
2496}
2497
2498/**
2499 * htmlParseComment:
2500 * @ctxt: an HTML parser context
2501 *
2502 * Parse an XML (SGML) comment <!-- .... -->
2503 *
2504 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2505 */
2506void
2507htmlParseComment(htmlParserCtxtPtr ctxt) {
2508 xmlChar *buf = NULL;
2509 int len;
2510 int size = HTML_PARSER_BUFFER_SIZE;
2511 int q, ql;
2512 int r, rl;
2513 int cur, l;
2514 xmlParserInputState state;
2515
2516 /*
2517 * Check that there is a comment right here.
2518 */
2519 if ((RAW != '<') || (NXT(1) != '!') ||
2520 (NXT(2) != '-') || (NXT(3) != '-')) return;
2521
2522 state = ctxt->instate;
2523 ctxt->instate = XML_PARSER_COMMENT;
2524 SHRINK;
2525 SKIP(4);
2526 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2527 if (buf == NULL) {
2528 xmlGenericError(xmlGenericErrorContext,
2529 "malloc of %d byte failed\n", size);
2530 ctxt->instate = state;
2531 return;
2532 }
2533 q = CUR_CHAR(ql);
2534 NEXTL(ql);
2535 r = CUR_CHAR(rl);
2536 NEXTL(rl);
2537 cur = CUR_CHAR(l);
2538 len = 0;
2539 while (IS_CHAR(cur) &&
2540 ((cur != '>') ||
2541 (r != '-') || (q != '-'))) {
2542 if (len + 5 >= size) {
2543 size *= 2;
2544 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2545 if (buf == NULL) {
2546 xmlGenericError(xmlGenericErrorContext,
2547 "realloc of %d byte failed\n", size);
2548 ctxt->instate = state;
2549 return;
2550 }
2551 }
2552 COPY_BUF(ql,buf,len,q);
2553 q = r;
2554 ql = rl;
2555 r = cur;
2556 rl = l;
2557 NEXTL(l);
2558 cur = CUR_CHAR(l);
2559 if (cur == 0) {
2560 SHRINK;
2561 GROW;
2562 cur = CUR_CHAR(l);
2563 }
2564 }
2565 buf[len] = 0;
2566 if (!IS_CHAR(cur)) {
2567 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2568 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2569 ctxt->sax->error(ctxt->userData,
2570 "Comment not terminated \n<!--%.50s\n", buf);
2571 ctxt->wellFormed = 0;
2572 xmlFree(buf);
2573 } else {
2574 NEXT;
2575 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2576 (!ctxt->disableSAX))
2577 ctxt->sax->comment(ctxt->userData, buf);
2578 xmlFree(buf);
2579 }
2580 ctxt->instate = state;
2581}
2582
2583/**
2584 * htmlParseCharRef:
2585 * @ctxt: an HTML parser context
2586 *
2587 * parse Reference declarations
2588 *
2589 * [66] CharRef ::= '&#' [0-9]+ ';' |
2590 * '&#x' [0-9a-fA-F]+ ';'
2591 *
2592 * Returns the value parsed (as an int)
2593 */
2594int
2595htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2596 int val = 0;
2597
2598 if ((CUR == '&') && (NXT(1) == '#') &&
2599 (NXT(2) == 'x')) {
2600 SKIP(3);
2601 while (CUR != ';') {
2602 if ((CUR >= '0') && (CUR <= '9'))
2603 val = val * 16 + (CUR - '0');
2604 else if ((CUR >= 'a') && (CUR <= 'f'))
2605 val = val * 16 + (CUR - 'a') + 10;
2606 else if ((CUR >= 'A') && (CUR <= 'F'))
2607 val = val * 16 + (CUR - 'A') + 10;
2608 else {
2609 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2610 ctxt->sax->error(ctxt->userData,
2611 "htmlParseCharRef: invalid hexadecimal value\n");
2612 ctxt->wellFormed = 0;
2613 return(0);
2614 }
2615 NEXT;
2616 }
2617 if (CUR == ';')
2618 NEXT;
2619 } else if ((CUR == '&') && (NXT(1) == '#')) {
2620 SKIP(2);
2621 while (CUR != ';') {
2622 if ((CUR >= '0') && (CUR <= '9'))
2623 val = val * 10 + (CUR - '0');
2624 else {
2625 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2626 ctxt->sax->error(ctxt->userData,
2627 "htmlParseCharRef: invalid decimal value\n");
2628 ctxt->wellFormed = 0;
2629 return(0);
2630 }
2631 NEXT;
2632 }
2633 if (CUR == ';')
2634 NEXT;
2635 } else {
2636 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2637 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2638 ctxt->wellFormed = 0;
2639 }
2640 /*
2641 * Check the value IS_CHAR ...
2642 */
2643 if (IS_CHAR(val)) {
2644 return(val);
2645 } else {
2646 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2647 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2648 val);
2649 ctxt->wellFormed = 0;
2650 }
2651 return(0);
2652}
2653
2654
2655/**
2656 * htmlParseDocTypeDecl :
2657 * @ctxt: an HTML parser context
2658 *
2659 * parse a DOCTYPE declaration
2660 *
2661 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2662 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2663 */
2664
2665void
2666htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2667 xmlChar *name;
2668 xmlChar *ExternalID = NULL;
2669 xmlChar *URI = NULL;
2670
2671 /*
2672 * We know that '<!DOCTYPE' has been detected.
2673 */
2674 SKIP(9);
2675
2676 SKIP_BLANKS;
2677
2678 /*
2679 * Parse the DOCTYPE name.
2680 */
2681 name = htmlParseName(ctxt);
2682 if (name == NULL) {
2683 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2684 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2685 ctxt->wellFormed = 0;
2686 }
2687 /*
2688 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2689 */
2690
2691 SKIP_BLANKS;
2692
2693 /*
2694 * Check for SystemID and ExternalID
2695 */
2696 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
2697 SKIP_BLANKS;
2698
2699 /*
2700 * We should be at the end of the DOCTYPE declaration.
2701 */
2702 if (CUR != '>') {
2703 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2704 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2705 ctxt->wellFormed = 0;
2706 /* We shouldn't try to resynchronize ... */
2707 }
2708 NEXT;
2709
2710 /*
2711 * Create or update the document accordingly to the DOCTYPE
2712 */
2713 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2714 (!ctxt->disableSAX))
2715 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2716
2717 /*
2718 * Cleanup, since we don't use all those identifiers
2719 */
2720 if (URI != NULL) xmlFree(URI);
2721 if (ExternalID != NULL) xmlFree(ExternalID);
2722 if (name != NULL) xmlFree(name);
2723}
2724
2725/**
2726 * htmlParseAttribute:
2727 * @ctxt: an HTML parser context
2728 * @value: a xmlChar ** used to store the value of the attribute
2729 *
2730 * parse an attribute
2731 *
2732 * [41] Attribute ::= Name Eq AttValue
2733 *
2734 * [25] Eq ::= S? '=' S?
2735 *
2736 * With namespace:
2737 *
2738 * [NS 11] Attribute ::= QName Eq AttValue
2739 *
2740 * Also the case QName == xmlns:??? is handled independently as a namespace
2741 * definition.
2742 *
2743 * Returns the attribute name, and the value in *value.
2744 */
2745
2746xmlChar *
2747htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2748 xmlChar *name, *val = NULL;
2749
2750 *value = NULL;
2751 name = htmlParseHTMLName(ctxt);
2752 if (name == NULL) {
2753 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2754 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2755 ctxt->wellFormed = 0;
2756 return(NULL);
2757 }
2758
2759 /*
2760 * read the value
2761 */
2762 SKIP_BLANKS;
2763 if (CUR == '=') {
2764 NEXT;
2765 SKIP_BLANKS;
2766 val = htmlParseAttValue(ctxt);
2767 /******
2768 } else {
2769 * TODO : some attribute must have values, some may not
2770 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2771 ctxt->sax->warning(ctxt->userData,
2772 "No value for attribute %s\n", name); */
2773 }
2774
2775 *value = val;
2776 return(name);
2777}
2778
2779/**
2780 * htmlCheckEncoding:
2781 * @ctxt: an HTML parser context
2782 * @attvalue: the attribute value
2783 *
2784 * Checks an http-equiv attribute from a Meta tag to detect
2785 * the encoding
2786 * If a new encoding is detected the parser is switched to decode
2787 * it and pass UTF8
2788 */
2789void
2790htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2791 const xmlChar *encoding;
2792
2793 if ((ctxt == NULL) || (attvalue == NULL))
2794 return;
2795
2796 /* do not change encoding */
2797 if (ctxt->input->encoding != NULL)
2798 return;
2799
2800 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2801 if (encoding != NULL) {
2802 encoding += 8;
2803 } else {
2804 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2805 if (encoding != NULL)
2806 encoding += 9;
2807 }
2808 if (encoding != NULL) {
2809 xmlCharEncoding enc;
2810 xmlCharEncodingHandlerPtr handler;
2811
2812 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2813
2814 if (ctxt->input->encoding != NULL)
2815 xmlFree((xmlChar *) ctxt->input->encoding);
2816 ctxt->input->encoding = xmlStrdup(encoding);
2817
2818 enc = xmlParseCharEncoding((const char *) encoding);
2819 /*
2820 * registered set of known encodings
2821 */
2822 if (enc != XML_CHAR_ENCODING_ERROR) {
2823 xmlSwitchEncoding(ctxt, enc);
2824 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2825 } else {
2826 /*
2827 * fallback for unknown encodings
2828 */
2829 handler = xmlFindCharEncodingHandler((const char *) encoding);
2830 if (handler != NULL) {
2831 xmlSwitchToEncoding(ctxt, handler);
2832 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2833 } else {
2834 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2835 }
2836 }
2837
2838 if ((ctxt->input->buf != NULL) &&
2839 (ctxt->input->buf->encoder != NULL) &&
2840 (ctxt->input->buf->raw != NULL) &&
2841 (ctxt->input->buf->buffer != NULL)) {
2842 int nbchars;
2843 int processed;
2844
2845 /*
2846 * convert as much as possible to the parser reading buffer.
2847 */
2848 processed = ctxt->input->cur - ctxt->input->base;
2849 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2850 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2851 ctxt->input->buf->buffer,
2852 ctxt->input->buf->raw);
2853 if (nbchars < 0) {
2854 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2855 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2856 ctxt->sax->error(ctxt->userData,
2857 "htmlCheckEncoding: encoder error\n");
2858 }
2859 ctxt->input->base =
2860 ctxt->input->cur = ctxt->input->buf->buffer->content;
2861 }
2862 }
2863}
2864
2865/**
2866 * htmlCheckMeta:
2867 * @ctxt: an HTML parser context
2868 * @atts: the attributes values
2869 *
2870 * Checks an attributes from a Meta tag
2871 */
2872void
2873htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2874 int i;
2875 const xmlChar *att, *value;
2876 int http = 0;
2877 const xmlChar *content = NULL;
2878
2879 if ((ctxt == NULL) || (atts == NULL))
2880 return;
2881
2882 i = 0;
2883 att = atts[i++];
2884 while (att != NULL) {
2885 value = atts[i++];
2886 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2887 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2888 http = 1;
2889 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2890 content = value;
2891 att = atts[i++];
2892 }
2893 if ((http) && (content != NULL))
2894 htmlCheckEncoding(ctxt, content);
2895
2896}
2897
2898/**
2899 * htmlParseStartTag:
2900 * @ctxt: an HTML parser context
2901 *
2902 * parse a start of tag either for rule element or
2903 * EmptyElement. In both case we don't parse the tag closing chars.
2904 *
2905 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2906 *
2907 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2908 *
2909 * With namespace:
2910 *
2911 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2912 *
2913 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2914 *
2915 */
2916
2917void
2918htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2919 xmlChar *name;
2920 xmlChar *attname;
2921 xmlChar *attvalue;
2922 const xmlChar **atts = NULL;
2923 int nbatts = 0;
2924 int maxatts = 0;
2925 int meta = 0;
2926 int i;
2927
2928 if (CUR != '<') return;
2929 NEXT;
2930
2931 GROW;
2932 name = htmlParseHTMLName(ctxt);
2933 if (name == NULL) {
2934 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2935 ctxt->sax->error(ctxt->userData,
2936 "htmlParseStartTag: invalid element name\n");
2937 ctxt->wellFormed = 0;
2938 /* Dump the bogus tag like browsers do */
2939 while ((IS_CHAR(CUR)) && (CUR != '>'))
2940 NEXT;
2941 return;
2942 }
2943 if (xmlStrEqual(name, BAD_CAST"meta"))
2944 meta = 1;
2945
2946 /*
2947 * Check for auto-closure of HTML elements.
2948 */
2949 htmlAutoClose(ctxt, name);
2950
2951 /*
2952 * Check for implied HTML elements.
2953 */
2954 htmlCheckImplied(ctxt, name);
2955
2956 /*
2957 * Avoid html at any level > 0, head at any level != 1
2958 * or any attempt to recurse body
2959 */
2960 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2961 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2962 ctxt->sax->error(ctxt->userData,
2963 "htmlParseStartTag: misplaced <html> tag\n");
2964 ctxt->wellFormed = 0;
2965 xmlFree(name);
2966 return;
2967 }
2968 if ((ctxt->nameNr != 1) &&
2969 (xmlStrEqual(name, BAD_CAST"head"))) {
2970 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2971 ctxt->sax->error(ctxt->userData,
2972 "htmlParseStartTag: misplaced <head> tag\n");
2973 ctxt->wellFormed = 0;
2974 xmlFree(name);
2975 return;
2976 }
2977 if (xmlStrEqual(name, BAD_CAST"body")) {
2978 int i;
2979 for (i = 0;i < ctxt->nameNr;i++) {
2980 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
2981 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2982 ctxt->sax->error(ctxt->userData,
2983 "htmlParseStartTag: misplaced <body> tag\n");
2984 ctxt->wellFormed = 0;
2985 xmlFree(name);
2986 return;
2987 }
2988 }
2989 }
2990
2991 /*
2992 * Now parse the attributes, it ends up with the ending
2993 *
2994 * (S Attribute)* S?
2995 */
2996 SKIP_BLANKS;
2997 while ((IS_CHAR(CUR)) &&
2998 (CUR != '>') &&
2999 ((CUR != '/') || (NXT(1) != '>'))) {
3000 long cons = ctxt->nbChars;
3001
3002 GROW;
3003 attname = htmlParseAttribute(ctxt, &attvalue);
3004 if (attname != NULL) {
3005
3006 /*
3007 * Well formedness requires at most one declaration of an attribute
3008 */
3009 for (i = 0; i < nbatts;i += 2) {
3010 if (xmlStrEqual(atts[i], attname)) {
3011 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3012 ctxt->sax->error(ctxt->userData,
3013 "Attribute %s redefined\n",
3014 attname);
3015 ctxt->wellFormed = 0;
3016 xmlFree(attname);
3017 if (attvalue != NULL)
3018 xmlFree(attvalue);
3019 goto failed;
3020 }
3021 }
3022
3023 /*
3024 * Add the pair to atts
3025 */
3026 if (atts == NULL) {
3027 maxatts = 10;
3028 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3029 if (atts == NULL) {
3030 xmlGenericError(xmlGenericErrorContext,
3031 "malloc of %ld byte failed\n",
3032 maxatts * (long)sizeof(xmlChar *));
3033 if (name != NULL) xmlFree(name);
3034 return;
3035 }
3036 } else if (nbatts + 4 > maxatts) {
3037 maxatts *= 2;
3038 atts = (const xmlChar **) xmlRealloc((void *) atts,
3039 maxatts * sizeof(xmlChar *));
3040 if (atts == NULL) {
3041 xmlGenericError(xmlGenericErrorContext,
3042 "realloc of %ld byte failed\n",
3043 maxatts * (long)sizeof(xmlChar *));
3044 if (name != NULL) xmlFree(name);
3045 return;
3046 }
3047 }
3048 atts[nbatts++] = attname;
3049 atts[nbatts++] = attvalue;
3050 atts[nbatts] = NULL;
3051 atts[nbatts + 1] = NULL;
3052 }
3053 else {
3054 /* Dump the bogus attribute string up to the next blank or
3055 * the end of the tag. */
3056 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3057 && ((CUR != '/') || (NXT(1) != '>')))
3058 NEXT;
3059 }
3060
3061failed:
3062 SKIP_BLANKS;
3063 if (cons == ctxt->nbChars) {
3064 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3065 ctxt->sax->error(ctxt->userData,
3066 "htmlParseStartTag: problem parsing attributes\n");
3067 ctxt->wellFormed = 0;
3068 break;
3069 }
3070 }
3071
3072 /*
3073 * Handle specific association to the META tag
3074 */
3075 if (meta)
3076 htmlCheckMeta(ctxt, atts);
3077
3078 /*
3079 * SAX: Start of Element !
3080 */
3081 htmlnamePush(ctxt, xmlStrdup(name));
3082#ifdef DEBUG
3083 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3084#endif
3085 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3086 ctxt->sax->startElement(ctxt->userData, name, atts);
3087
3088 if (atts != NULL) {
3089 for (i = 0;i < nbatts;i++) {
3090 if (atts[i] != NULL)
3091 xmlFree((xmlChar *) atts[i]);
3092 }
3093 xmlFree((void *) atts);
3094 }
3095 if (name != NULL) xmlFree(name);
3096}
3097
3098/**
3099 * htmlParseEndTag:
3100 * @ctxt: an HTML parser context
3101 *
3102 * parse an end of tag
3103 *
3104 * [42] ETag ::= '</' Name S? '>'
3105 *
3106 * With namespace
3107 *
3108 * [NS 9] ETag ::= '</' QName S? '>'
3109 */
3110
3111void
3112htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3113 xmlChar *name;
3114 xmlChar *oldname;
3115 int i;
3116
3117 if ((CUR != '<') || (NXT(1) != '/')) {
3118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3119 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3120 ctxt->wellFormed = 0;
3121 return;
3122 }
3123 SKIP(2);
3124
3125 name = htmlParseHTMLName(ctxt);
3126 if (name == NULL) return;
3127
3128 /*
3129 * We should definitely be at the ending "S? '>'" part
3130 */
3131 SKIP_BLANKS;
3132 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3133 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3134 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3135 ctxt->wellFormed = 0;
3136 } else
3137 NEXT;
3138
3139 /*
3140 * If the name read is not one of the element in the parsing stack
3141 * then return, it's just an error.
3142 */
3143 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3144 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3145 }
3146 if (i < 0) {
3147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3148 ctxt->sax->error(ctxt->userData,
3149 "Unexpected end tag : %s\n", name);
3150 xmlFree(name);
3151 ctxt->wellFormed = 0;
3152 return;
3153 }
3154
3155
3156 /*
3157 * Check for auto-closure of HTML elements.
3158 */
3159
3160 htmlAutoCloseOnClose(ctxt, name);
3161
3162 /*
3163 * Well formedness constraints, opening and closing must match.
3164 * With the exception that the autoclose may have popped stuff out
3165 * of the stack.
3166 */
3167 if (!xmlStrEqual(name, ctxt->name)) {
3168#ifdef DEBUG
3169 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3170#endif
3171 if ((ctxt->name != NULL) &&
3172 (!xmlStrEqual(ctxt->name, name))) {
3173 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3174 ctxt->sax->error(ctxt->userData,
3175 "Opening and ending tag mismatch: %s and %s\n",
3176 name, ctxt->name);
3177 ctxt->wellFormed = 0;
3178 }
3179 }
3180
3181 /*
3182 * SAX: End of Tag
3183 */
3184 oldname = ctxt->name;
3185 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3186 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3187 ctxt->sax->endElement(ctxt->userData, name);
3188 oldname = htmlnamePop(ctxt);
3189 if (oldname != NULL) {
3190#ifdef DEBUG
3191 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3192#endif
3193 xmlFree(oldname);
3194#ifdef DEBUG
3195 } else {
3196 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3197#endif
3198 }
3199 }
3200
3201 if (name != NULL)
3202 xmlFree(name);
3203
3204 return;
3205}
3206
3207
3208/**
3209 * htmlParseReference:
3210 * @ctxt: an HTML parser context
3211 *
3212 * parse and handle entity references in content,
3213 * this will end-up in a call to character() since this is either a
3214 * CharRef, or a predefined entity.
3215 */
3216void
3217htmlParseReference(htmlParserCtxtPtr ctxt) {
3218 htmlEntityDescPtr ent;
3219 xmlChar out[6];
3220 xmlChar *name;
3221 if (CUR != '&') return;
3222
3223 if (NXT(1) == '#') {
3224 unsigned int c;
3225 int bits, i = 0;
3226
3227 c = htmlParseCharRef(ctxt);
3228 if (c == 0)
3229 return;
3230
3231 if (c < 0x80) { out[i++]= c; bits= -6; }
3232 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3233 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3234 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3235
3236 for ( ; bits >= 0; bits-= 6) {
3237 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3238 }
3239 out[i] = 0;
3240
3241 htmlCheckParagraph(ctxt);
3242 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3243 ctxt->sax->characters(ctxt->userData, out, i);
3244 } else {
3245 ent = htmlParseEntityRef(ctxt, &name);
3246 if (name == NULL) {
3247 htmlCheckParagraph(ctxt);
3248 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3249 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3250 return;
3251 }
3252 if ((ent == NULL) || (ent->value <= 0)) {
3253 htmlCheckParagraph(ctxt);
3254 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3255 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3256 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3257 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3258 }
3259 } else {
3260 unsigned int c;
3261 int bits, i = 0;
3262
3263 c = ent->value;
3264 if (c < 0x80)
3265 { out[i++]= c; bits= -6; }
3266 else if (c < 0x800)
3267 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3268 else if (c < 0x10000)
3269 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3270 else
3271 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3272
3273 for ( ; bits >= 0; bits-= 6) {
3274 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3275 }
3276 out[i] = 0;
3277
3278 htmlCheckParagraph(ctxt);
3279 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3280 ctxt->sax->characters(ctxt->userData, out, i);
3281 }
3282 xmlFree(name);
3283 }
3284}
3285
3286/**
3287 * htmlParseContent:
3288 * @ctxt: an HTML parser context
3289 * @name: the node name
3290 *
3291 * Parse a content: comment, sub-element, reference or text.
3292 *
3293 */
3294
3295void
3296htmlParseContent(htmlParserCtxtPtr ctxt) {
3297 xmlChar *currentNode;
3298 int depth;
3299
3300 currentNode = xmlStrdup(ctxt->name);
3301 depth = ctxt->nameNr;
3302 while (1) {
3303 long cons = ctxt->nbChars;
3304
3305 GROW;
3306 /*
3307 * Our tag or one of it's parent or children is ending.
3308 */
3309 if ((CUR == '<') && (NXT(1) == '/')) {
3310 htmlParseEndTag(ctxt);
3311 if (currentNode != NULL) xmlFree(currentNode);
3312 return;
3313 }
3314
3315 /*
3316 * Has this node been popped out during parsing of
3317 * the next element
3318 */
3319 if ((!xmlStrEqual(currentNode, ctxt->name)) &&
3320 (depth >= ctxt->nameNr)) {
3321 if (currentNode != NULL) xmlFree(currentNode);
3322 return;
3323 }
3324
3325 if ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3326 (xmlStrEqual(currentNode, BAD_CAST"style"))) {
3327 /*
3328 * Handle SCRIPT/STYLE separately
3329 */
3330 htmlParseScript(ctxt);
3331 } else {
3332 /*
3333 * Sometimes DOCTYPE arrives in the middle of the document
3334 */
3335 if ((CUR == '<') && (NXT(1) == '!') &&
3336 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3337 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3338 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3339 (UPP(8) == 'E')) {
3340 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3341 ctxt->sax->error(ctxt->userData,
3342 "Misplaced DOCTYPE declaration\n");
3343 ctxt->wellFormed = 0;
3344 htmlParseDocTypeDecl(ctxt);
3345 }
3346
3347 /*
3348 * First case : a comment
3349 */
3350 if ((CUR == '<') && (NXT(1) == '!') &&
3351 (NXT(2) == '-') && (NXT(3) == '-')) {
3352 htmlParseComment(ctxt);
3353 }
3354
3355 /*
3356 * Second case : a sub-element.
3357 */
3358 else if (CUR == '<') {
3359 htmlParseElement(ctxt);
3360 }
3361
3362 /*
3363 * Third case : a reference. If if has not been resolved,
3364 * parsing returns it's Name, create the node
3365 */
3366 else if (CUR == '&') {
3367 htmlParseReference(ctxt);
3368 }
3369
3370 /*
3371 * Fourth : end of the resource
3372 */
3373 else if (CUR == 0) {
3374 htmlAutoClose(ctxt, NULL);
3375 }
3376
3377 /*
3378 * Last case, text. Note that References are handled directly.
3379 */
3380 else {
3381 htmlParseCharData(ctxt, 0);
3382 }
3383
3384 if (cons == ctxt->nbChars) {
3385 if (ctxt->node != NULL) {
3386 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3387 ctxt->sax->error(ctxt->userData,
3388 "detected an error in element content\n");
3389 ctxt->wellFormed = 0;
3390 }
3391 break;
3392 }
3393 }
3394 GROW;
3395 }
3396 if (currentNode != NULL) xmlFree(currentNode);
3397}
3398
3399/**
3400 * htmlParseElement:
3401 * @ctxt: an HTML parser context
3402 *
3403 * parse an HTML element, this is highly recursive
3404 *
3405 * [39] element ::= EmptyElemTag | STag content ETag
3406 *
3407 * [41] Attribute ::= Name Eq AttValue
3408 */
3409
3410void
3411htmlParseElement(htmlParserCtxtPtr ctxt) {
3412 xmlChar *name;
3413 xmlChar *currentNode = NULL;
3414 htmlElemDescPtr info;
3415 htmlParserNodeInfo node_info;
3416 xmlChar *oldname;
3417 int depth = ctxt->nameNr;
3418
3419 /* Capture start position */
3420 if (ctxt->record_info) {
3421 node_info.begin_pos = ctxt->input->consumed +
3422 (CUR_PTR - ctxt->input->base);
3423 node_info.begin_line = ctxt->input->line;
3424 }
3425
3426 oldname = xmlStrdup(ctxt->name);
3427 htmlParseStartTag(ctxt);
3428 name = ctxt->name;
3429#ifdef DEBUG
3430 if (oldname == NULL)
3431 xmlGenericError(xmlGenericErrorContext,
3432 "Start of element %s\n", name);
3433 else if (name == NULL)
3434 xmlGenericError(xmlGenericErrorContext,
3435 "Start of element failed, was %s\n", oldname);
3436 else
3437 xmlGenericError(xmlGenericErrorContext,
3438 "Start of element %s, was %s\n", name, oldname);
3439#endif
3440 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3441 (name == NULL)) {
3442 if (CUR == '>')
3443 NEXT;
3444 if (oldname != NULL)
3445 xmlFree(oldname);
3446 return;
3447 }
3448 if (oldname != NULL)
3449 xmlFree(oldname);
3450
3451 /*
3452 * Lookup the info for that element.
3453 */
3454 info = htmlTagLookup(name);
3455 if (info == NULL) {
3456 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3457 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3458 name);
3459 ctxt->wellFormed = 0;
3460 } else if (info->depr) {
3461/***************************
3462 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3463 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3464 name);
3465 ***************************/
3466 }
3467
3468 /*
3469 * Check for an Empty Element labelled the XML/SGML way
3470 */
3471 if ((CUR == '/') && (NXT(1) == '>')) {
3472 SKIP(2);
3473 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3474 ctxt->sax->endElement(ctxt->userData, name);
3475 oldname = htmlnamePop(ctxt);
3476#ifdef DEBUG
3477 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3478#endif
3479 if (oldname != NULL)
3480 xmlFree(oldname);
3481 return;
3482 }
3483
3484 if (CUR == '>') {
3485 NEXT;
3486 } else {
3487 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3488 ctxt->sax->error(ctxt->userData,
3489 "Couldn't find end of Start Tag %s\n",
3490 name);
3491 ctxt->wellFormed = 0;
3492
3493 /*
3494 * end of parsing of this node.
3495 */
3496 if (xmlStrEqual(name, ctxt->name)) {
3497 nodePop(ctxt);
3498 oldname = htmlnamePop(ctxt);
3499#ifdef DEBUG
3500 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3501#endif
3502 if (oldname != NULL)
3503 xmlFree(oldname);
3504 }
3505
3506 /*
3507 * Capture end position and add node
3508 */
3509 if ( currentNode != NULL && ctxt->record_info ) {
3510 node_info.end_pos = ctxt->input->consumed +
3511 (CUR_PTR - ctxt->input->base);
3512 node_info.end_line = ctxt->input->line;
3513 node_info.node = ctxt->node;
3514 xmlParserAddNodeInfo(ctxt, &node_info);
3515 }
3516 return;
3517 }
3518
3519 /*
3520 * Check for an Empty Element from DTD definition
3521 */
3522 if ((info != NULL) && (info->empty)) {
3523 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3524 ctxt->sax->endElement(ctxt->userData, name);
3525 oldname = htmlnamePop(ctxt);
3526#ifdef DEBUG
3527 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3528#endif
3529 if (oldname != NULL)
3530 xmlFree(oldname);
3531 return;
3532 }
3533
3534 /*
3535 * Parse the content of the element:
3536 */
3537 currentNode = xmlStrdup(ctxt->name);
3538 depth = ctxt->nameNr;
3539 while (IS_CHAR(CUR)) {
3540 htmlParseContent(ctxt);
3541 if (ctxt->nameNr < depth) break;
3542 }
3543
3544 if (!IS_CHAR(CUR)) {
3545 /************
3546 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3547 ctxt->sax->error(ctxt->userData,
3548 "Premature end of data in tag %s\n", currentNode);
3549 ctxt->wellFormed = 0;
3550 *************/
3551
3552 /*
3553 * end of parsing of this node.
3554 */
3555 nodePop(ctxt);
3556 oldname = htmlnamePop(ctxt);
3557#ifdef DEBUG
3558 xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
3559#endif
3560 if (oldname != NULL)
3561 xmlFree(oldname);
3562 if (currentNode != NULL)
3563 xmlFree(currentNode);
3564 return;
3565 }
3566
3567 /*
3568 * Capture end position and add node
3569 */
3570 if ( currentNode != NULL && ctxt->record_info ) {
3571 node_info.end_pos = ctxt->input->consumed +
3572 (CUR_PTR - ctxt->input->base);
3573 node_info.end_line = ctxt->input->line;
3574 node_info.node = ctxt->node;
3575 xmlParserAddNodeInfo(ctxt, &node_info);
3576 }
3577 if (currentNode != NULL)
3578 xmlFree(currentNode);
3579}
3580
3581/**
3582 * htmlParseDocument :
3583 * @ctxt: an HTML parser context
3584 *
3585 * parse an HTML document (and build a tree if using the standard SAX
3586 * interface).
3587 *
3588 * Returns 0, -1 in case of error. the parser context is augmented
3589 * as a result of the parsing.
3590 */
3591
3592int
3593htmlParseDocument(htmlParserCtxtPtr ctxt) {
3594 xmlDtdPtr dtd;
3595
3596 htmlDefaultSAXHandlerInit();
3597 ctxt->html = 1;
3598
3599 GROW;
3600 /*
3601 * SAX: beginning of the document processing.
3602 */
3603 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3604 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3605
3606 /*
3607 * Wipe out everything which is before the first '<'
3608 */
3609 SKIP_BLANKS;
3610 if (CUR == 0) {
3611 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3612 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3613 ctxt->wellFormed = 0;
3614 }
3615
3616 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3617 ctxt->sax->startDocument(ctxt->userData);
3618
3619
3620 /*
3621 * Parse possible comments before any content
3622 */
3623 while ((CUR == '<') && (NXT(1) == '!') &&
3624 (NXT(2) == '-') && (NXT(3) == '-')) {
3625 htmlParseComment(ctxt);
3626 SKIP_BLANKS;
3627 }
3628
3629
3630 /*
3631 * Then possibly doc type declaration(s) and more Misc
3632 * (doctypedecl Misc*)?
3633 */
3634 if ((CUR == '<') && (NXT(1) == '!') &&
3635 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3636 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3637 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3638 (UPP(8) == 'E')) {
3639 htmlParseDocTypeDecl(ctxt);
3640 }
3641 SKIP_BLANKS;
3642
3643 /*
3644 * Parse possible comments before any content
3645 */
3646 while ((CUR == '<') && (NXT(1) == '!') &&
3647 (NXT(2) == '-') && (NXT(3) == '-')) {
3648 htmlParseComment(ctxt);
3649 SKIP_BLANKS;
3650 }
3651
3652 /*
3653 * Time to start parsing the tree itself
3654 */
3655 htmlParseContent(ctxt);
3656
3657 /*
3658 * autoclose
3659 */
3660 if (CUR == 0)
3661 htmlAutoClose(ctxt, NULL);
3662
3663
3664 /*
3665 * SAX: end of the document processing.
3666 */
3667 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3668 ctxt->sax->endDocument(ctxt->userData);
3669
3670 if (ctxt->myDoc != NULL) {
3671 dtd = xmlGetIntSubset(ctxt->myDoc);
3672 if (dtd == NULL)
3673 ctxt->myDoc->intSubset =
3674 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3675 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3676 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3677 }
3678 if (! ctxt->wellFormed) return(-1);
3679 return(0);
3680}
3681
3682
3683/************************************************************************
3684 * *
3685 * Parser contexts handling *
3686 * *
3687 ************************************************************************/
3688
3689/**
3690 * xmlInitParserCtxt:
3691 * @ctxt: an HTML parser context
3692 *
3693 * Initialize a parser context
3694 */
3695
3696void
3697htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3698{
3699 htmlSAXHandler *sax;
3700
3701 if (ctxt == NULL) return;
3702 memset(ctxt, 0, sizeof(htmlParserCtxt));
3703
3704 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3705 if (sax == NULL) {
3706 xmlGenericError(xmlGenericErrorContext,
3707 "htmlInitParserCtxt: out of memory\n");
3708 }
3709 else
3710 memset(sax, 0, sizeof(htmlSAXHandler));
3711
3712 /* Allocate the Input stack */
3713 ctxt->inputTab = (htmlParserInputPtr *)
3714 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3715 if (ctxt->inputTab == NULL) {
3716 xmlGenericError(xmlGenericErrorContext,
3717 "htmlInitParserCtxt: out of memory\n");
3718 ctxt->inputNr = 0;
3719 ctxt->inputMax = 0;
3720 ctxt->input = NULL;
3721 return;
3722 }
3723 ctxt->inputNr = 0;
3724 ctxt->inputMax = 5;
3725 ctxt->input = NULL;
3726 ctxt->version = NULL;
3727 ctxt->encoding = NULL;
3728 ctxt->standalone = -1;
3729 ctxt->instate = XML_PARSER_START;
3730
3731 /* Allocate the Node stack */
3732 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3733 if (ctxt->nodeTab == NULL) {
3734 xmlGenericError(xmlGenericErrorContext,
3735 "htmlInitParserCtxt: out of memory\n");
3736 ctxt->nodeNr = 0;
3737 ctxt->nodeMax = 0;
3738 ctxt->node = NULL;
3739 ctxt->inputNr = 0;
3740 ctxt->inputMax = 0;
3741 ctxt->input = NULL;
3742 return;
3743 }
3744 ctxt->nodeNr = 0;
3745 ctxt->nodeMax = 10;
3746 ctxt->node = NULL;
3747
3748 /* Allocate the Name stack */
3749 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3750 if (ctxt->nameTab == NULL) {
3751 xmlGenericError(xmlGenericErrorContext,
3752 "htmlInitParserCtxt: out of memory\n");
3753 ctxt->nameNr = 0;
3754 ctxt->nameMax = 10;
3755 ctxt->name = NULL;
3756 ctxt->nodeNr = 0;
3757 ctxt->nodeMax = 0;
3758 ctxt->node = NULL;
3759 ctxt->inputNr = 0;
3760 ctxt->inputMax = 0;
3761 ctxt->input = NULL;
3762 return;
3763 }
3764 ctxt->nameNr = 0;
3765 ctxt->nameMax = 10;
3766 ctxt->name = NULL;
3767
3768 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3769 else {
3770 ctxt->sax = sax;
3771 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3772 }
3773 ctxt->userData = ctxt;
3774 ctxt->myDoc = NULL;
3775 ctxt->wellFormed = 1;
3776 ctxt->replaceEntities = 0;
3777 ctxt->html = 1;
3778 ctxt->record_info = 0;
3779 ctxt->validate = 0;
3780 ctxt->nbChars = 0;
3781 ctxt->checkIndex = 0;
3782 xmlInitNodeInfoSeq(&ctxt->node_seq);
3783}
3784
3785/**
3786 * htmlFreeParserCtxt:
3787 * @ctxt: an HTML parser context
3788 *
3789 * Free all the memory used by a parser context. However the parsed
3790 * document in ctxt->myDoc is not freed.
3791 */
3792
3793void
3794htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3795{
3796 xmlFreeParserCtxt(ctxt);
3797}
3798
3799/**
3800 * htmlCreateDocParserCtxt :
3801 * @cur: a pointer to an array of xmlChar
3802 * @encoding: a free form C string describing the HTML document encoding, or NULL
3803 *
3804 * Create a parser context for an HTML document.
3805 *
3806 * Returns the new parser context or NULL
3807 */
3808htmlParserCtxtPtr
3809htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
3810 htmlParserCtxtPtr ctxt;
3811 htmlParserInputPtr input;
3812 /* htmlCharEncoding enc; */
3813
3814 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3815 if (ctxt == NULL) {
3816 perror("malloc");
3817 return(NULL);
3818 }
3819 htmlInitParserCtxt(ctxt);
3820 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3821 if (input == NULL) {
3822 perror("malloc");
3823 xmlFree(ctxt);
3824 return(NULL);
3825 }
3826 memset(input, 0, sizeof(htmlParserInput));
3827
3828 input->line = 1;
3829 input->col = 1;
3830 input->base = cur;
3831 input->cur = cur;
3832
3833 inputPush(ctxt, input);
3834 return(ctxt);
3835}
3836
3837/************************************************************************
3838 * *
3839 * Progressive parsing interfaces *
3840 * *
3841 ************************************************************************/
3842
3843/**
3844 * htmlParseLookupSequence:
3845 * @ctxt: an HTML parser context
3846 * @first: the first char to lookup
3847 * @next: the next char to lookup or zero
3848 * @third: the next char to lookup or zero
3849 *
3850 * Try to find if a sequence (first, next, third) or just (first next) or
3851 * (first) is available in the input stream.
3852 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3853 * to avoid rescanning sequences of bytes, it DOES change the state of the
3854 * parser, do not use liberally.
3855 * This is basically similar to xmlParseLookupSequence()
3856 *
3857 * Returns the index to the current parsing point if the full sequence
3858 * is available, -1 otherwise.
3859 */
3860int
3861htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3862 xmlChar next, xmlChar third) {
3863 int base, len;
3864 htmlParserInputPtr in;
3865 const xmlChar *buf;
3866
3867 in = ctxt->input;
3868 if (in == NULL) return(-1);
3869 base = in->cur - in->base;
3870 if (base < 0) return(-1);
3871 if (ctxt->checkIndex > base)
3872 base = ctxt->checkIndex;
3873 if (in->buf == NULL) {
3874 buf = in->base;
3875 len = in->length;
3876 } else {
3877 buf = in->buf->buffer->content;
3878 len = in->buf->buffer->use;
3879 }
3880 /* take into account the sequence length */
3881 if (third) len -= 2;
3882 else if (next) len --;
3883 for (;base < len;base++) {
3884 if (buf[base] == first) {
3885 if (third != 0) {
3886 if ((buf[base + 1] != next) ||
3887 (buf[base + 2] != third)) continue;
3888 } else if (next != 0) {
3889 if (buf[base + 1] != next) continue;
3890 }
3891 ctxt->checkIndex = 0;
3892#ifdef DEBUG_PUSH
3893 if (next == 0)
3894 xmlGenericError(xmlGenericErrorContext,
3895 "HPP: lookup '%c' found at %d\n",
3896 first, base);
3897 else if (third == 0)
3898 xmlGenericError(xmlGenericErrorContext,
3899 "HPP: lookup '%c%c' found at %d\n",
3900 first, next, base);
3901 else
3902 xmlGenericError(xmlGenericErrorContext,
3903 "HPP: lookup '%c%c%c' found at %d\n",
3904 first, next, third, base);
3905#endif
3906 return(base - (in->cur - in->base));
3907 }
3908 }
3909 ctxt->checkIndex = base;
3910#ifdef DEBUG_PUSH
3911 if (next == 0)
3912 xmlGenericError(xmlGenericErrorContext,
3913 "HPP: lookup '%c' failed\n", first);
3914 else if (third == 0)
3915 xmlGenericError(xmlGenericErrorContext,
3916 "HPP: lookup '%c%c' failed\n", first, next);
3917 else
3918 xmlGenericError(xmlGenericErrorContext,
3919 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3920#endif
3921 return(-1);
3922}
3923
3924/**
3925 * htmlParseTryOrFinish:
3926 * @ctxt: an HTML parser context
3927 * @terminate: last chunk indicator
3928 *
3929 * Try to progress on parsing
3930 *
3931 * Returns zero if no parsing was possible
3932 */
3933int
3934htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3935 int ret = 0;
3936 htmlParserInputPtr in;
3937 int avail = 0;
3938 xmlChar cur, next;
3939
3940#ifdef DEBUG_PUSH
3941 switch (ctxt->instate) {
3942 case XML_PARSER_EOF:
3943 xmlGenericError(xmlGenericErrorContext,
3944 "HPP: try EOF\n"); break;
3945 case XML_PARSER_START:
3946 xmlGenericError(xmlGenericErrorContext,
3947 "HPP: try START\n"); break;
3948 case XML_PARSER_MISC:
3949 xmlGenericError(xmlGenericErrorContext,
3950 "HPP: try MISC\n");break;
3951 case XML_PARSER_COMMENT:
3952 xmlGenericError(xmlGenericErrorContext,
3953 "HPP: try COMMENT\n");break;
3954 case XML_PARSER_PROLOG:
3955 xmlGenericError(xmlGenericErrorContext,
3956 "HPP: try PROLOG\n");break;
3957 case XML_PARSER_START_TAG:
3958 xmlGenericError(xmlGenericErrorContext,
3959 "HPP: try START_TAG\n");break;
3960 case XML_PARSER_CONTENT:
3961 xmlGenericError(xmlGenericErrorContext,
3962 "HPP: try CONTENT\n");break;
3963 case XML_PARSER_CDATA_SECTION:
3964 xmlGenericError(xmlGenericErrorContext,
3965 "HPP: try CDATA_SECTION\n");break;
3966 case XML_PARSER_END_TAG:
3967 xmlGenericError(xmlGenericErrorContext,
3968 "HPP: try END_TAG\n");break;
3969 case XML_PARSER_ENTITY_DECL:
3970 xmlGenericError(xmlGenericErrorContext,
3971 "HPP: try ENTITY_DECL\n");break;
3972 case XML_PARSER_ENTITY_VALUE:
3973 xmlGenericError(xmlGenericErrorContext,
3974 "HPP: try ENTITY_VALUE\n");break;
3975 case XML_PARSER_ATTRIBUTE_VALUE:
3976 xmlGenericError(xmlGenericErrorContext,
3977 "HPP: try ATTRIBUTE_VALUE\n");break;
3978 case XML_PARSER_DTD:
3979 xmlGenericError(xmlGenericErrorContext,
3980 "HPP: try DTD\n");break;
3981 case XML_PARSER_EPILOG:
3982 xmlGenericError(xmlGenericErrorContext,
3983 "HPP: try EPILOG\n");break;
3984 case XML_PARSER_PI:
3985 xmlGenericError(xmlGenericErrorContext,
3986 "HPP: try PI\n");break;
3987 case XML_PARSER_SYSTEM_LITERAL:
3988 xmlGenericError(xmlGenericErrorContext,
3989 "HPP: try SYSTEM_LITERAL\n");break;
3990 }
3991#endif
3992
3993 while (1) {
3994
3995 in = ctxt->input;
3996 if (in == NULL) break;
3997 if (in->buf == NULL)
3998 avail = in->length - (in->cur - in->base);
3999 else
4000 avail = in->buf->buffer->use - (in->cur - in->base);
4001 if ((avail == 0) && (terminate)) {
4002 htmlAutoClose(ctxt, NULL);
4003 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4004 /*
4005 * SAX: end of the document processing.
4006 */
4007 ctxt->instate = XML_PARSER_EOF;
4008 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4009 ctxt->sax->endDocument(ctxt->userData);
4010 }
4011 }
4012 if (avail < 1)
4013 goto done;
4014 switch (ctxt->instate) {
4015 case XML_PARSER_EOF:
4016 /*
4017 * Document parsing is done !
4018 */
4019 goto done;
4020 case XML_PARSER_START:
4021 /*
4022 * Very first chars read from the document flow.
4023 */
4024 cur = in->cur[0];
4025 if (IS_BLANK(cur)) {
4026 SKIP_BLANKS;
4027 if (in->buf == NULL)
4028 avail = in->length - (in->cur - in->base);
4029 else
4030 avail = in->buf->buffer->use - (in->cur - in->base);
4031 }
4032 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4033 ctxt->sax->setDocumentLocator(ctxt->userData,
4034 &xmlDefaultSAXLocator);
4035 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4036 (!ctxt->disableSAX))
4037 ctxt->sax->startDocument(ctxt->userData);
4038
4039 cur = in->cur[0];
4040 next = in->cur[1];
4041 if ((cur == '<') && (next == '!') &&
4042 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4043 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4044 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4045 (UPP(8) == 'E')) {
4046 if ((!terminate) &&
4047 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4048 goto done;
4049#ifdef DEBUG_PUSH
4050 xmlGenericError(xmlGenericErrorContext,
4051 "HPP: Parsing internal subset\n");
4052#endif
4053 htmlParseDocTypeDecl(ctxt);
4054 ctxt->instate = XML_PARSER_PROLOG;
4055#ifdef DEBUG_PUSH
4056 xmlGenericError(xmlGenericErrorContext,
4057 "HPP: entering PROLOG\n");
4058#endif
4059 } else {
4060 ctxt->instate = XML_PARSER_MISC;
4061 }
4062#ifdef DEBUG_PUSH
4063 xmlGenericError(xmlGenericErrorContext,
4064 "HPP: entering MISC\n");
4065#endif
4066 break;
4067 case XML_PARSER_MISC:
4068 SKIP_BLANKS;
4069 if (in->buf == NULL)
4070 avail = in->length - (in->cur - in->base);
4071 else
4072 avail = in->buf->buffer->use - (in->cur - in->base);
4073 if (avail < 2)
4074 goto done;
4075 cur = in->cur[0];
4076 next = in->cur[1];
4077 if ((cur == '<') && (next == '!') &&
4078 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4079 if ((!terminate) &&
4080 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4081 goto done;
4082#ifdef DEBUG_PUSH
4083 xmlGenericError(xmlGenericErrorContext,
4084 "HPP: Parsing Comment\n");
4085#endif
4086 htmlParseComment(ctxt);
4087 ctxt->instate = XML_PARSER_MISC;
4088 } else if ((cur == '<') && (next == '!') &&
4089 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4090 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4091 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4092 (UPP(8) == 'E')) {
4093 if ((!terminate) &&
4094 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4095 goto done;
4096#ifdef DEBUG_PUSH
4097 xmlGenericError(xmlGenericErrorContext,
4098 "HPP: Parsing internal subset\n");
4099#endif
4100 htmlParseDocTypeDecl(ctxt);
4101 ctxt->instate = XML_PARSER_PROLOG;
4102#ifdef DEBUG_PUSH
4103 xmlGenericError(xmlGenericErrorContext,
4104 "HPP: entering PROLOG\n");
4105#endif
4106 } else if ((cur == '<') && (next == '!') &&
4107 (avail < 9)) {
4108 goto done;
4109 } else {
4110 ctxt->instate = XML_PARSER_START_TAG;
4111#ifdef DEBUG_PUSH
4112 xmlGenericError(xmlGenericErrorContext,
4113 "HPP: entering START_TAG\n");
4114#endif
4115 }
4116 break;
4117 case XML_PARSER_PROLOG:
4118 SKIP_BLANKS;
4119 if (in->buf == NULL)
4120 avail = in->length - (in->cur - in->base);
4121 else
4122 avail = in->buf->buffer->use - (in->cur - in->base);
4123 if (avail < 2)
4124 goto done;
4125 cur = in->cur[0];
4126 next = in->cur[1];
4127 if ((cur == '<') && (next == '!') &&
4128 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4129 if ((!terminate) &&
4130 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4131 goto done;
4132#ifdef DEBUG_PUSH
4133 xmlGenericError(xmlGenericErrorContext,
4134 "HPP: Parsing Comment\n");
4135#endif
4136 htmlParseComment(ctxt);
4137 ctxt->instate = XML_PARSER_PROLOG;
4138 } else if ((cur == '<') && (next == '!') &&
4139 (avail < 4)) {
4140 goto done;
4141 } else {
4142 ctxt->instate = XML_PARSER_START_TAG;
4143#ifdef DEBUG_PUSH
4144 xmlGenericError(xmlGenericErrorContext,
4145 "HPP: entering START_TAG\n");
4146#endif
4147 }
4148 break;
4149 case XML_PARSER_EPILOG:
4150 if (in->buf == NULL)
4151 avail = in->length - (in->cur - in->base);
4152 else
4153 avail = in->buf->buffer->use - (in->cur - in->base);
4154 if (avail < 1)
4155 goto done;
4156 cur = in->cur[0];
4157 if (IS_BLANK(cur)) {
4158 htmlParseCharData(ctxt, 0);
4159 goto done;
4160 }
4161 if (avail < 2)
4162 goto done;
4163 next = in->cur[1];
4164 if ((cur == '<') && (next == '!') &&
4165 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4166 if ((!terminate) &&
4167 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4168 goto done;
4169#ifdef DEBUG_PUSH
4170 xmlGenericError(xmlGenericErrorContext,
4171 "HPP: Parsing Comment\n");
4172#endif
4173 htmlParseComment(ctxt);
4174 ctxt->instate = XML_PARSER_EPILOG;
4175 } else if ((cur == '<') && (next == '!') &&
4176 (avail < 4)) {
4177 goto done;
4178 } else {
4179 ctxt->errNo = XML_ERR_DOCUMENT_END;
4180 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4181 ctxt->sax->error(ctxt->userData,
4182 "Extra content at the end of the document\n");
4183 ctxt->wellFormed = 0;
4184 ctxt->instate = XML_PARSER_EOF;
4185#ifdef DEBUG_PUSH
4186 xmlGenericError(xmlGenericErrorContext,
4187 "HPP: entering EOF\n");
4188#endif
4189 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4190 ctxt->sax->endDocument(ctxt->userData);
4191 goto done;
4192 }
4193 break;
4194 case XML_PARSER_START_TAG: {
4195 xmlChar *name, *oldname;
4196 int depth = ctxt->nameNr;
4197 htmlElemDescPtr info;
4198
4199 if (avail < 2)
4200 goto done;
4201 cur = in->cur[0];
4202 if (cur != '<') {
4203 ctxt->instate = XML_PARSER_CONTENT;
4204#ifdef DEBUG_PUSH
4205 xmlGenericError(xmlGenericErrorContext,
4206 "HPP: entering CONTENT\n");
4207#endif
4208 break;
4209 }
4210 if ((!terminate) &&
4211 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4212 goto done;
4213
4214 oldname = xmlStrdup(ctxt->name);
4215 htmlParseStartTag(ctxt);
4216 name = ctxt->name;
4217#ifdef DEBUG
4218 if (oldname == NULL)
4219 xmlGenericError(xmlGenericErrorContext,
4220 "Start of element %s\n", name);
4221 else if (name == NULL)
4222 xmlGenericError(xmlGenericErrorContext,
4223 "Start of element failed, was %s\n",
4224 oldname);
4225 else
4226 xmlGenericError(xmlGenericErrorContext,
4227 "Start of element %s, was %s\n",
4228 name, oldname);
4229#endif
4230 if (((depth == ctxt->nameNr) &&
4231 (xmlStrEqual(oldname, ctxt->name))) ||
4232 (name == NULL)) {
4233 if (CUR == '>')
4234 NEXT;
4235 if (oldname != NULL)
4236 xmlFree(oldname);
4237 break;
4238 }
4239 if (oldname != NULL)
4240 xmlFree(oldname);
4241
4242 /*
4243 * Lookup the info for that element.
4244 */
4245 info = htmlTagLookup(name);
4246 if (info == NULL) {
4247 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4248 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4249 name);
4250 ctxt->wellFormed = 0;
4251 } else if (info->depr) {
4252 /***************************
4253 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4254 ctxt->sax->warning(ctxt->userData,
4255 "Tag %s is deprecated\n",
4256 name);
4257 ***************************/
4258 }
4259
4260 /*
4261 * Check for an Empty Element labelled the XML/SGML way
4262 */
4263 if ((CUR == '/') && (NXT(1) == '>')) {
4264 SKIP(2);
4265 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4266 ctxt->sax->endElement(ctxt->userData, name);
4267 oldname = htmlnamePop(ctxt);
4268#ifdef DEBUG
4269 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4270 oldname);
4271#endif
4272 if (oldname != NULL)
4273 xmlFree(oldname);
4274 ctxt->instate = XML_PARSER_CONTENT;
4275#ifdef DEBUG_PUSH
4276 xmlGenericError(xmlGenericErrorContext,
4277 "HPP: entering CONTENT\n");
4278#endif
4279 break;
4280 }
4281
4282 if (CUR == '>') {
4283 NEXT;
4284 } else {
4285 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4286 ctxt->sax->error(ctxt->userData,
4287 "Couldn't find end of Start Tag %s\n",
4288 name);
4289 ctxt->wellFormed = 0;
4290
4291 /*
4292 * end of parsing of this node.
4293 */
4294 if (xmlStrEqual(name, ctxt->name)) {
4295 nodePop(ctxt);
4296 oldname = htmlnamePop(ctxt);
4297#ifdef DEBUG
4298 xmlGenericError(xmlGenericErrorContext,
4299 "End of start tag problem: popping out %s\n", oldname);
4300#endif
4301 if (oldname != NULL)
4302 xmlFree(oldname);
4303 }
4304
4305 ctxt->instate = XML_PARSER_CONTENT;
4306#ifdef DEBUG_PUSH
4307 xmlGenericError(xmlGenericErrorContext,
4308 "HPP: entering CONTENT\n");
4309#endif
4310 break;
4311 }
4312
4313 /*
4314 * Check for an Empty Element from DTD definition
4315 */
4316 if ((info != NULL) && (info->empty)) {
4317 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4318 ctxt->sax->endElement(ctxt->userData, name);
4319 oldname = htmlnamePop(ctxt);
4320#ifdef DEBUG
4321 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4322#endif
4323 if (oldname != NULL)
4324 xmlFree(oldname);
4325 }
4326 ctxt->instate = XML_PARSER_CONTENT;
4327#ifdef DEBUG_PUSH
4328 xmlGenericError(xmlGenericErrorContext,
4329 "HPP: entering CONTENT\n");
4330#endif
4331 break;
4332 }
4333 case XML_PARSER_CONTENT: {
4334 long cons;
4335 /*
4336 * Handle preparsed entities and charRef
4337 */
4338 if (ctxt->token != 0) {
4339 xmlChar chr[2] = { 0 , 0 } ;
4340
4341 chr[0] = (xmlChar) ctxt->token;
4342 htmlCheckParagraph(ctxt);
4343 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4344 ctxt->sax->characters(ctxt->userData, chr, 1);
4345 ctxt->token = 0;
4346 ctxt->checkIndex = 0;
4347 }
4348 if ((avail == 1) && (terminate)) {
4349 cur = in->cur[0];
4350 if ((cur != '<') && (cur != '&')) {
4351 if (ctxt->sax != NULL) {
4352 if (IS_BLANK(cur)) {
4353 if (ctxt->sax->ignorableWhitespace != NULL)
4354 ctxt->sax->ignorableWhitespace(
4355 ctxt->userData, &cur, 1);
4356 } else {
4357 htmlCheckParagraph(ctxt);
4358 if (ctxt->sax->characters != NULL)
4359 ctxt->sax->characters(
4360 ctxt->userData, &cur, 1);
4361 }
4362 }
4363 ctxt->token = 0;
4364 ctxt->checkIndex = 0;
4365 NEXT;
4366 }
4367 break;
4368 }
4369 if (avail < 2)
4370 goto done;
4371 cur = in->cur[0];
4372 next = in->cur[1];
4373 cons = ctxt->nbChars;
4374 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4375 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4376 /*
4377 * Handle SCRIPT/STYLE separately
4378 */
4379 if ((!terminate) &&
4380 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4381 goto done;
4382 htmlParseScript(ctxt);
4383 if ((cur == '<') && (next == '/')) {
4384 ctxt->instate = XML_PARSER_END_TAG;
4385 ctxt->checkIndex = 0;
4386#ifdef DEBUG_PUSH
4387 xmlGenericError(xmlGenericErrorContext,
4388 "HPP: entering END_TAG\n");
4389#endif
4390 break;
4391 }
4392 } else {
4393 /*
4394 * Sometimes DOCTYPE arrives in the middle of the document
4395 */
4396 if ((cur == '<') && (next == '!') &&
4397 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4398 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4399 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4400 (UPP(8) == 'E')) {
4401 if ((!terminate) &&
4402 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4403 goto done;
4404 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4405 ctxt->sax->error(ctxt->userData,
4406 "Misplaced DOCTYPE declaration\n");
4407 ctxt->wellFormed = 0;
4408 htmlParseDocTypeDecl(ctxt);
4409 } else if ((cur == '<') && (next == '!') &&
4410 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4411 if ((!terminate) &&
4412 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4413 goto done;
4414#ifdef DEBUG_PUSH
4415 xmlGenericError(xmlGenericErrorContext,
4416 "HPP: Parsing Comment\n");
4417#endif
4418 htmlParseComment(ctxt);
4419 ctxt->instate = XML_PARSER_CONTENT;
4420 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4421 goto done;
4422 } else if ((cur == '<') && (next == '/')) {
4423 ctxt->instate = XML_PARSER_END_TAG;
4424 ctxt->checkIndex = 0;
4425#ifdef DEBUG_PUSH
4426 xmlGenericError(xmlGenericErrorContext,
4427 "HPP: entering END_TAG\n");
4428#endif
4429 break;
4430 } else if (cur == '<') {
4431 ctxt->instate = XML_PARSER_START_TAG;
4432 ctxt->checkIndex = 0;
4433#ifdef DEBUG_PUSH
4434 xmlGenericError(xmlGenericErrorContext,
4435 "HPP: entering START_TAG\n");
4436#endif
4437 break;
4438 } else if (cur == '&') {
4439 if ((!terminate) &&
4440 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4441 goto done;
4442#ifdef DEBUG_PUSH
4443 xmlGenericError(xmlGenericErrorContext,
4444 "HPP: Parsing Reference\n");
4445#endif
4446 /* TODO: check generation of subtrees if noent !!! */
4447 htmlParseReference(ctxt);
4448 } else {
4449 /* TODO Avoid the extra copy, handle directly !!!!!! */
4450 /*
4451 * Goal of the following test is :
4452 * - minimize calls to the SAX 'character' callback
4453 * when they are mergeable
4454 */
4455 if ((ctxt->inputNr == 1) &&
4456 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4457 if ((!terminate) &&
4458 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4459 goto done;
4460 }
4461 ctxt->checkIndex = 0;
4462#ifdef DEBUG_PUSH
4463 xmlGenericError(xmlGenericErrorContext,
4464 "HPP: Parsing char data\n");
4465#endif
4466 htmlParseCharData(ctxt, 0);
4467 }
4468 }
4469 if (cons == ctxt->nbChars) {
4470 if (ctxt->node != NULL) {
4471 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4472 ctxt->sax->error(ctxt->userData,
4473 "detected an error in element content\n");
4474 ctxt->wellFormed = 0;
4475 }
4476 NEXT;
4477 break;
4478 }
4479
4480 break;
4481 }
4482 case XML_PARSER_END_TAG:
4483 if (avail < 2)
4484 goto done;
4485 if ((!terminate) &&
4486 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4487 goto done;
4488 htmlParseEndTag(ctxt);
4489 if (ctxt->nameNr == 0) {
4490 ctxt->instate = XML_PARSER_EPILOG;
4491 } else {
4492 ctxt->instate = XML_PARSER_CONTENT;
4493 }
4494 ctxt->checkIndex = 0;
4495#ifdef DEBUG_PUSH
4496 xmlGenericError(xmlGenericErrorContext,
4497 "HPP: entering CONTENT\n");
4498#endif
4499 break;
4500 case XML_PARSER_CDATA_SECTION:
4501 xmlGenericError(xmlGenericErrorContext,
4502 "HPP: internal error, state == CDATA\n");
4503 ctxt->instate = XML_PARSER_CONTENT;
4504 ctxt->checkIndex = 0;
4505#ifdef DEBUG_PUSH
4506 xmlGenericError(xmlGenericErrorContext,
4507 "HPP: entering CONTENT\n");
4508#endif
4509 break;
4510 case XML_PARSER_DTD:
4511 xmlGenericError(xmlGenericErrorContext,
4512 "HPP: internal error, state == DTD\n");
4513 ctxt->instate = XML_PARSER_CONTENT;
4514 ctxt->checkIndex = 0;
4515#ifdef DEBUG_PUSH
4516 xmlGenericError(xmlGenericErrorContext,
4517 "HPP: entering CONTENT\n");
4518#endif
4519 break;
4520 case XML_PARSER_COMMENT:
4521 xmlGenericError(xmlGenericErrorContext,
4522 "HPP: internal error, state == COMMENT\n");
4523 ctxt->instate = XML_PARSER_CONTENT;
4524 ctxt->checkIndex = 0;
4525#ifdef DEBUG_PUSH
4526 xmlGenericError(xmlGenericErrorContext,
4527 "HPP: entering CONTENT\n");
4528#endif
4529 break;
4530 case XML_PARSER_PI:
4531 xmlGenericError(xmlGenericErrorContext,
4532 "HPP: internal error, state == PI\n");
4533 ctxt->instate = XML_PARSER_CONTENT;
4534 ctxt->checkIndex = 0;
4535#ifdef DEBUG_PUSH
4536 xmlGenericError(xmlGenericErrorContext,
4537 "HPP: entering CONTENT\n");
4538#endif
4539 break;
4540 case XML_PARSER_ENTITY_DECL:
4541 xmlGenericError(xmlGenericErrorContext,
4542 "HPP: internal error, state == ENTITY_DECL\n");
4543 ctxt->instate = XML_PARSER_CONTENT;
4544 ctxt->checkIndex = 0;
4545#ifdef DEBUG_PUSH
4546 xmlGenericError(xmlGenericErrorContext,
4547 "HPP: entering CONTENT\n");
4548#endif
4549 break;
4550 case XML_PARSER_ENTITY_VALUE:
4551 xmlGenericError(xmlGenericErrorContext,
4552 "HPP: internal error, state == ENTITY_VALUE\n");
4553 ctxt->instate = XML_PARSER_CONTENT;
4554 ctxt->checkIndex = 0;
4555#ifdef DEBUG_PUSH
4556 xmlGenericError(xmlGenericErrorContext,
4557 "HPP: entering DTD\n");
4558#endif
4559 break;
4560 case XML_PARSER_ATTRIBUTE_VALUE:
4561 xmlGenericError(xmlGenericErrorContext,
4562 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4563 ctxt->instate = XML_PARSER_START_TAG;
4564 ctxt->checkIndex = 0;
4565#ifdef DEBUG_PUSH
4566 xmlGenericError(xmlGenericErrorContext,
4567 "HPP: entering START_TAG\n");
4568#endif
4569 break;
4570 case XML_PARSER_SYSTEM_LITERAL:
4571 xmlGenericError(xmlGenericErrorContext,
4572 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4573 ctxt->instate = XML_PARSER_CONTENT;
4574 ctxt->checkIndex = 0;
4575#ifdef DEBUG_PUSH
4576 xmlGenericError(xmlGenericErrorContext,
4577 "HPP: entering CONTENT\n");
4578#endif
4579 break;
4580 case XML_PARSER_IGNORE:
4581 xmlGenericError(xmlGenericErrorContext,
4582 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4583 ctxt->instate = XML_PARSER_CONTENT;
4584 ctxt->checkIndex = 0;
4585#ifdef DEBUG_PUSH
4586 xmlGenericError(xmlGenericErrorContext,
4587 "HPP: entering CONTENT\n");
4588#endif
4589 break;
4590 }
4591 }
4592done:
4593 if ((avail == 0) && (terminate)) {
4594 htmlAutoClose(ctxt, NULL);
4595 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4596 /*
4597 * SAX: end of the document processing.
4598 */
4599 ctxt->instate = XML_PARSER_EOF;
4600 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4601 ctxt->sax->endDocument(ctxt->userData);
4602 }
4603 }
4604 if ((ctxt->myDoc != NULL) &&
4605 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4606 (ctxt->instate == XML_PARSER_EPILOG))) {
4607 xmlDtdPtr dtd;
4608 dtd = xmlGetIntSubset(ctxt->myDoc);
4609 if (dtd == NULL)
4610 ctxt->myDoc->intSubset =
4611 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4612 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4613 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4614 }
4615#ifdef DEBUG_PUSH
4616 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4617#endif
4618 return(ret);
4619}
4620
4621/**
4622 * htmlParseTry:
4623 * @ctxt: an HTML parser context
4624 *
4625 * Try to progress on parsing
4626 *
4627 * Returns zero if no parsing was possible
4628 */
4629int
4630htmlParseTry(htmlParserCtxtPtr ctxt) {
4631 return(htmlParseTryOrFinish(ctxt, 0));
4632}
4633
4634/**
4635 * htmlParseChunk:
4636 * @ctxt: an XML parser context
4637 * @chunk: an char array
4638 * @size: the size in byte of the chunk
4639 * @terminate: last chunk indicator
4640 *
4641 * Parse a Chunk of memory
4642 *
4643 * Returns zero if no error, the xmlParserErrors otherwise.
4644 */
4645int
4646htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4647 int terminate) {
4648 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4649 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4650 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4651 int cur = ctxt->input->cur - ctxt->input->base;
4652
4653 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4654 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4655 ctxt->input->cur = ctxt->input->base + cur;
4656#ifdef DEBUG_PUSH
4657 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4658#endif
4659
4660 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4661 htmlParseTryOrFinish(ctxt, terminate);
4662 } else if (ctxt->instate != XML_PARSER_EOF) {
4663 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4664 htmlParseTryOrFinish(ctxt, terminate);
4665 }
4666 if (terminate) {
4667 if ((ctxt->instate != XML_PARSER_EOF) &&
4668 (ctxt->instate != XML_PARSER_EPILOG) &&
4669 (ctxt->instate != XML_PARSER_MISC)) {
4670 ctxt->errNo = XML_ERR_DOCUMENT_END;
4671 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4672 ctxt->sax->error(ctxt->userData,
4673 "Extra content at the end of the document\n");
4674 ctxt->wellFormed = 0;
4675 }
4676 if (ctxt->instate != XML_PARSER_EOF) {
4677 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4678 ctxt->sax->endDocument(ctxt->userData);
4679 }
4680 ctxt->instate = XML_PARSER_EOF;
4681 }
4682 return((xmlParserErrors) ctxt->errNo);
4683}
4684
4685/************************************************************************
4686 * *
4687 * User entry points *
4688 * *
4689 ************************************************************************/
4690
4691/**
4692 * htmlCreatePushParserCtxt :
4693 * @sax: a SAX handler
4694 * @user_data: The user data returned on SAX callbacks
4695 * @chunk: a pointer to an array of chars
4696 * @size: number of chars in the array
4697 * @filename: an optional file name or URI
4698 * @enc: an optional encoding
4699 *
4700 * Create a parser context for using the HTML parser in push mode
4701 * To allow content encoding detection, @size should be >= 4
4702 * The value of @filename is used for fetching external entities
4703 * and error/warning reports.
4704 *
4705 * Returns the new parser context or NULL
4706 */
4707htmlParserCtxtPtr
4708htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4709 const char *chunk, int size, const char *filename,
4710 xmlCharEncoding enc) {
4711 htmlParserCtxtPtr ctxt;
4712 htmlParserInputPtr inputStream;
4713 xmlParserInputBufferPtr buf;
4714
4715 buf = xmlAllocParserInputBuffer(enc);
4716 if (buf == NULL) return(NULL);
4717
4718 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4719 if (ctxt == NULL) {
4720 xmlFree(buf);
4721 return(NULL);
4722 }
4723 memset(ctxt, 0, sizeof(htmlParserCtxt));
4724 htmlInitParserCtxt(ctxt);
4725 if (sax != NULL) {
4726 if (ctxt->sax != &htmlDefaultSAXHandler)
4727 xmlFree(ctxt->sax);
4728 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4729 if (ctxt->sax == NULL) {
4730 xmlFree(buf);
4731 xmlFree(ctxt);
4732 return(NULL);
4733 }
4734 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4735 if (user_data != NULL)
4736 ctxt->userData = user_data;
4737 }
4738 if (filename == NULL) {
4739 ctxt->directory = NULL;
4740 } else {
4741 ctxt->directory = xmlParserGetDirectory(filename);
4742 }
4743
4744 inputStream = htmlNewInputStream(ctxt);
4745 if (inputStream == NULL) {
4746 xmlFreeParserCtxt(ctxt);
4747 return(NULL);
4748 }
4749
4750 if (filename == NULL)
4751 inputStream->filename = NULL;
4752 else
4753 inputStream->filename = xmlMemStrdup(filename);
4754 inputStream->buf = buf;
4755 inputStream->base = inputStream->buf->buffer->content;
4756 inputStream->cur = inputStream->buf->buffer->content;
4757
4758 inputPush(ctxt, inputStream);
4759
4760 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4761 (ctxt->input->buf != NULL)) {
4762 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4763#ifdef DEBUG_PUSH
4764 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4765#endif
4766 }
4767
4768 return(ctxt);
4769}
4770
4771/**
4772 * htmlSAXParseDoc :
4773 * @cur: a pointer to an array of xmlChar
4774 * @encoding: a free form C string describing the HTML document encoding, or NULL
4775 * @sax: the SAX handler block
4776 * @userData: if using SAX, this pointer will be provided on callbacks.
4777 *
4778 * parse an HTML in-memory document and build a tree.
4779 * It use the given SAX function block to handle the parsing callback.
4780 * If sax is NULL, fallback to the default DOM tree building routines.
4781 *
4782 * Returns the resulting document tree
4783 */
4784
4785htmlDocPtr
4786htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4787 htmlDocPtr ret;
4788 htmlParserCtxtPtr ctxt;
4789
4790 if (cur == NULL) return(NULL);
4791
4792
4793 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4794 if (ctxt == NULL) return(NULL);
4795 if (sax != NULL) {
4796 ctxt->sax = sax;
4797 ctxt->userData = userData;
4798 }
4799
4800 htmlParseDocument(ctxt);
4801 ret = ctxt->myDoc;
4802 if (sax != NULL) {
4803 ctxt->sax = NULL;
4804 ctxt->userData = NULL;
4805 }
4806 htmlFreeParserCtxt(ctxt);
4807
4808 return(ret);
4809}
4810
4811/**
4812 * htmlParseDoc :
4813 * @cur: a pointer to an array of xmlChar
4814 * @encoding: a free form C string describing the HTML document encoding, or NULL
4815 *
4816 * parse an HTML in-memory document and build a tree.
4817 *
4818 * Returns the resulting document tree
4819 */
4820
4821htmlDocPtr
4822htmlParseDoc(xmlChar *cur, const char *encoding) {
4823 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4824}
4825
4826
4827/**
4828 * htmlCreateFileParserCtxt :
4829 * @filename: the filename
4830 * @encoding: a free form C string describing the HTML document encoding, or NULL
4831 *
4832 * Create a parser context for a file content.
4833 * Automatic support for ZLIB/Compress compressed document is provided
4834 * by default if found at compile-time.
4835 *
4836 * Returns the new parser context or NULL
4837 */
4838htmlParserCtxtPtr
4839htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4840{
4841 htmlParserCtxtPtr ctxt;
4842 htmlParserInputPtr inputStream;
4843 xmlParserInputBufferPtr buf;
4844 /* htmlCharEncoding enc; */
4845 xmlChar *content, *content_line = (xmlChar *) "charset=";
4846
4847 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4848 if (buf == NULL) return(NULL);
4849
4850 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4851 if (ctxt == NULL) {
4852 perror("malloc");
4853 return(NULL);
4854 }
4855 memset(ctxt, 0, sizeof(htmlParserCtxt));
4856 htmlInitParserCtxt(ctxt);
4857 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4858 if (inputStream == NULL) {
4859 perror("malloc");
4860 xmlFree(ctxt);
4861 return(NULL);
4862 }
4863 memset(inputStream, 0, sizeof(htmlParserInput));
4864
4865 inputStream->filename = xmlMemStrdup(filename);
4866 inputStream->line = 1;
4867 inputStream->col = 1;
4868 inputStream->buf = buf;
4869 inputStream->directory = NULL;
4870
4871 inputStream->base = inputStream->buf->buffer->content;
4872 inputStream->cur = inputStream->buf->buffer->content;
4873 inputStream->free = NULL;
4874
4875 inputPush(ctxt, inputStream);
4876
4877 /* set encoding */
4878 if (encoding) {
4879 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4880 if (content) {
4881 strcpy ((char *)content, (char *)content_line);
4882 strcat ((char *)content, (char *)encoding);
4883 htmlCheckEncoding (ctxt, content);
4884 xmlFree (content);
4885 }
4886 }
4887
4888 return(ctxt);
4889}
4890
4891/**
4892 * htmlSAXParseFile :
4893 * @filename: the filename
4894 * @encoding: a free form C string describing the HTML document encoding, or NULL
4895 * @sax: the SAX handler block
4896 * @userData: if using SAX, this pointer will be provided on callbacks.
4897 *
4898 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4899 * compressed document is provided by default if found at compile-time.
4900 * It use the given SAX function block to handle the parsing callback.
4901 * If sax is NULL, fallback to the default DOM tree building routines.
4902 *
4903 * Returns the resulting document tree
4904 */
4905
4906htmlDocPtr
4907htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4908 void *userData) {
4909 htmlDocPtr ret;
4910 htmlParserCtxtPtr ctxt;
4911 htmlSAXHandlerPtr oldsax = NULL;
4912
4913 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4914 if (ctxt == NULL) return(NULL);
4915 if (sax != NULL) {
4916 oldsax = ctxt->sax;
4917 ctxt->sax = sax;
4918 ctxt->userData = userData;
4919 }
4920
4921 htmlParseDocument(ctxt);
4922
4923 ret = ctxt->myDoc;
4924 if (sax != NULL) {
4925 ctxt->sax = oldsax;
4926 ctxt->userData = NULL;
4927 }
4928 htmlFreeParserCtxt(ctxt);
4929
4930 return(ret);
4931}
4932
4933/**
4934 * htmlParseFile :
4935 * @filename: the filename
4936 * @encoding: a free form C string describing the HTML document encoding, or NULL
4937 *
4938 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4939 * compressed document is provided by default if found at compile-time.
4940 *
4941 * Returns the resulting document tree
4942 */
4943
4944htmlDocPtr
4945htmlParseFile(const char *filename, const char *encoding) {
4946 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4947}
4948
4949/**
4950 * htmlHandleOmittedElem:
4951 * @val: int 0 or 1
4952 *
4953 * Set and return the previous value for handling HTML omitted tags.
4954 *
4955 * Returns the last value for 0 for no handling, 1 for auto insertion.
4956 */
4957
4958int
4959htmlHandleOmittedElem(int val) {
4960 int old = htmlOmittedDefaultValue;
4961
4962 htmlOmittedDefaultValue = val;
4963 return(old);
4964}
4965
4966#endif /* LIBXML_HTML_ENABLED */