blob: a83b669bb5b338fc6e2656cc04d1df58fa7224a6 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9#ifdef WIN32
10#include "win32config.h"
11#else
12#include "config.h"
13#endif
14
15#include <libxml/xmlversion.h>
16#ifdef LIBXML_HTML_ENABLED
17#include <stdio.h>
18#include <string.h>
19#ifdef HAVE_CTYPE_H
20#include <ctype.h>
21#endif
22#ifdef HAVE_STDLIB_H
23#include <stdlib.h>
24#endif
25#ifdef HAVE_SYS_STAT_H
26#include <sys/stat.h>
27#endif
28#ifdef HAVE_FCNTL_H
29#include <fcntl.h>
30#endif
31#ifdef HAVE_UNISTD_H
32#include <unistd.h>
33#endif
34#ifdef HAVE_ZLIB_H
35#include <zlib.h>
36#endif
37
38#include <libxml/xmlmemory.h>
39#include <libxml/tree.h>
40#include <libxml/parser.h>
41#include <libxml/parserInternals.h>
42#include <libxml/xmlerror.h>
43#include <libxml/HTMLparser.h>
44#include <libxml/entities.h>
45#include <libxml/encoding.h>
46#include <libxml/valid.h>
47#include <libxml/xmlIO.h>
48
/*
 * Size limits.
 * NOTE(review): none of these three constants is referenced in the part of
 * the file visible here; the names suggest a name-length cap and two
 * scratch-buffer sizes -- confirm against the parsing routines.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment DEBUG to compile in the xmlGenericError() traces guarded by #ifdef DEBUG. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * Global switch for implied elements: when zero, htmlCheckImplied() returns
 * immediately and htmlCheckParagraph() only implies a "p" when no element
 * is currently open.
 */
int htmlOmittedDefaultValue = 1;
57
58/************************************************************************
59 * *
60 * Parser stacks related functions and macros *
61 * *
62 ************************************************************************/
63
64/*
65 * Generic function for accessing stacks in the Parser Context
66 */
67
68#define PUSH_AND_POP(scope, type, name) \
69scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
70 if (ctxt->name##Nr >= ctxt->name##Max) { \
71 ctxt->name##Max *= 2; \
72 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
73 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74 if (ctxt->name##Tab == NULL) { \
75 xmlGenericError(xmlGenericErrorContext, \
76 "realloc failed !\n"); \
77 return(0); \
78 } \
79 } \
80 ctxt->name##Tab[ctxt->name##Nr] = value; \
81 ctxt->name = value; \
82 return(ctxt->name##Nr++); \
83} \
84scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
85 type ret; \
86 if (ctxt->name##Nr < 0) return(0); \
87 ctxt->name##Nr--; \
88 if (ctxt->name##Nr < 0) return(0); \
89 if (ctxt->name##Nr > 0) \
90 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
91 else \
92 ctxt->name = NULL; \
93 ret = ctxt->name##Tab[ctxt->name##Nr]; \
94 ctxt->name##Tab[ctxt->name##Nr] = 0; \
95 return(ret); \
96} \
97
98PUSH_AND_POP(extern, xmlNodePtr, node)
99PUSH_AND_POP(extern, xmlChar*, name)
100
/*
 * Macros for accessing the content. These should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them:
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare against ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare against ASCII based substrings.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding:
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
128
/*
 * Cursor access macros. Every one of them expects a variable named ctxt
 * (the parser context) to be in scope at the point of use.
 */

/* Byte under the cursor, passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance both the character counter and the cursor by val. Wrapped in
 * parentheses so the comma expression stays one expression wherever the
 * macro is used (initializer, function argument, ...).
 */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* Byte located val positions after the cursor. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The cursor itself (usable as an lvalue). */
#define CUR_PTR ctxt->input->cur

/* Forward to xmlParserInputShrink() on the current input. */
#define SHRINK xmlParserInputShrink(ctxt->input)

/* Forward to xmlParserInputGrow() on the current input, asking for INPUT_CHUNK. */
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Byte under the cursor, as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blanks via htmlSkipBlankChars(); yields its return value. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

/* Step one character with xmlNextChar() and bump the character counter. */
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* -1 while ctxt->token is non-zero, otherwise the byte under the cursor. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
156
157
/*
 * NEXTL(l): account for the character under the cursor (a '\n' starts a
 * new line at column 1, anything else advances the column), clear
 * ctxt->token, move the cursor forward by l bytes and count one character.
 * Expects a variable named ctxt in scope.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) != '\n') {                                  \
        ctxt->input->col++;                                             \
    } else {                                                            \
        ctxt->input->line++;                                            \
        ctxt->input->col = 1;                                           \
    }                                                                   \
    ctxt->token = 0;                                                    \
    ctxt->input->cur += (l);                                            \
    ctxt->nbChars++;                                                    \
  } while (0)
164
165/************
166 \
167 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
168 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
169 ************/
170
/* Decode the character under the cursor; its byte length is stored in l. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Same for the string s, through xmlStringCurrentChar(). */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * COPY_BUF(l, b, i, v): append character v, whose encoded length is l, to
 * buffer b at index i and advance i. A length of 1 is stored directly as
 * one xmlChar; any other length goes through xmlCopyChar(), whose return
 * value is added to i.
 * Arguments are parenthesized so expression arguments keep their meaning,
 * and the body is a do/while(0) so the macro is a single statement.
 */
#define COPY_BUF(l,b,i,v)                                               \
    do {                                                                \
        if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                       \
        else (i) += xmlCopyChar((l), &(b)[(i)], (v));                   \
    } while (0)
177
178/**
179 * htmlCurrentChar:
180 * @ctxt: the HTML parser context
181 * @len: pointer to the length of the char read
182 *
183 * The current char value, if using UTF-8 this may actaully span multiple
184 * bytes in the input buffer. Implement the end of line normalization:
185 * 2.11 End-of-Line Handling
186 * If the encoding is unspecified, in the case we find an ISO-Latin-1
187 * char, then the encoding converter is plugged in automatically.
188 *
189 * Returns the current char value and its lenght
190 */
191
192int
193htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
194 if (ctxt->instate == XML_PARSER_EOF)
195 return(0);
196
197 if (ctxt->token != 0) {
198 *len = 0;
199 return(ctxt->token);
200 }
201 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
202 /*
203 * We are supposed to handle UTF8, check it's valid
204 * From rfc2044: encoding of the Unicode values on UTF-8:
205 *
206 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
207 * 0000 0000-0000 007F 0xxxxxxx
208 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
209 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
210 *
211 * Check for the 0x110000 limit too
212 */
213 const unsigned char *cur = ctxt->input->cur;
214 unsigned char c;
215 unsigned int val;
216
217 c = *cur;
218 if (c & 0x80) {
219 if (cur[1] == 0)
220 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
221 if ((cur[1] & 0xc0) != 0x80)
222 goto encoding_error;
223 if ((c & 0xe0) == 0xe0) {
224
225 if (cur[2] == 0)
226 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
227 if ((cur[2] & 0xc0) != 0x80)
228 goto encoding_error;
229 if ((c & 0xf0) == 0xf0) {
230 if (cur[3] == 0)
231 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
232 if (((c & 0xf8) != 0xf0) ||
233 ((cur[3] & 0xc0) != 0x80))
234 goto encoding_error;
235 /* 4-byte code */
236 *len = 4;
237 val = (cur[0] & 0x7) << 18;
238 val |= (cur[1] & 0x3f) << 12;
239 val |= (cur[2] & 0x3f) << 6;
240 val |= cur[3] & 0x3f;
241 } else {
242 /* 3-byte code */
243 *len = 3;
244 val = (cur[0] & 0xf) << 12;
245 val |= (cur[1] & 0x3f) << 6;
246 val |= cur[2] & 0x3f;
247 }
248 } else {
249 /* 2-byte code */
250 *len = 2;
251 val = (cur[0] & 0x1f) << 6;
252 val |= cur[1] & 0x3f;
253 }
254 if (!IS_CHAR(val)) {
255 ctxt->errNo = XML_ERR_INVALID_ENCODING;
256 if ((ctxt->sax != NULL) &&
257 (ctxt->sax->error != NULL))
258 ctxt->sax->error(ctxt->userData,
259 "Char 0x%X out of allowed range\n", val);
260 ctxt->wellFormed = 0;
261 ctxt->disableSAX = 1;
262 }
263 return(val);
264 } else {
265 /* 1-byte code */
266 *len = 1;
267 return((int) *ctxt->input->cur);
268 }
269 }
270 /*
271 * Assume it's a fixed lenght encoding (1) with
272 * a compatibke encoding for the ASCII set, since
273 * XML constructs only use < 128 chars
274 */
275 *len = 1;
276 if ((int) *ctxt->input->cur < 0x80)
277 return((int) *ctxt->input->cur);
278
279 /*
280 * Humm this is bad, do an automatic flow conversion
281 */
282 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
283 ctxt->charset = XML_CHAR_ENCODING_UTF8;
284 return(xmlCurrentChar(ctxt, len));
285
286encoding_error:
287 /*
288 * If we detect an UTF8 error that probably mean that the
289 * input encoding didn't get properly advertized in the
290 * declaration header. Report the error and switch the encoding
291 * to ISO-Latin-1 (if you don't like this policy, just declare the
292 * encoding !)
293 */
294 ctxt->errNo = XML_ERR_INVALID_ENCODING;
295 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
296 ctxt->sax->error(ctxt->userData,
297 "Input is not proper UTF-8, indicate encoding !\n");
298 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
299 ctxt->input->cur[0], ctxt->input->cur[1],
300 ctxt->input->cur[2], ctxt->input->cur[3]);
301 }
302
303 ctxt->charset = XML_CHAR_ENCODING_8859_1;
304 *len = 1;
305 return((int) *ctxt->input->cur);
306}
307
308/**
309 * htmlNextChar:
310 * @ctxt: the HTML parser context
311 *
312 * Skip to the next char input char.
313 */
314
315void
316htmlNextChar(htmlParserCtxtPtr ctxt) {
317 if (ctxt->instate == XML_PARSER_EOF)
318 return;
319 if ((*ctxt->input->cur == 0) &&
320 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
321 xmlPopInput(ctxt);
322 } else {
323 if (*(ctxt->input->cur) == '\n') {
324 ctxt->input->line++; ctxt->input->col = 1;
325 } else ctxt->input->col++;
326 ctxt->input->cur++;
327 ctxt->nbChars++;
328 if (*ctxt->input->cur == 0)
329 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
330 }
331}
332
333/**
334 * htmlSkipBlankChars:
335 * @ctxt: the HTML parser context
336 *
337 * skip all blanks character found at that point in the input streams.
338 *
339 * Returns the number of space chars skipped
340 */
341
342int
343htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
344 int res = 0;
345
346 while (IS_BLANK(*(ctxt->input->cur))) {
347 if ((*ctxt->input->cur == 0) &&
348 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
349 xmlPopInput(ctxt);
350 } else {
351 if (*(ctxt->input->cur) == '\n') {
352 ctxt->input->line++; ctxt->input->col = 1;
353 } else ctxt->input->col++;
354 ctxt->input->cur++;
355 ctxt->nbChars++;
356 if (*ctxt->input->cur == 0)
357 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
358 }
359 res++;
360 }
361 return(res);
362}
363
364
365
366/************************************************************************
367 * *
368 * The list of HTML elements and their properties *
369 * *
370 ************************************************************************/
371
/*
 * Table of the known HTML 4.0 elements, one row per element, searched
 * linearly by name in htmlTagLookup().
 *
 * Columns, in order:
 *  Name,Start Tag,End Tag,Save End, Empty, Depr., DTD, Description
 *
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * NOTE(review): the htmlElemDesc structure is declared outside this file
 * chunk; the mapping of these columns to its fields (e.g. the endTag field
 * read by htmlAutoCloseOnClose()) should be confirmed against the header.
 */
htmlElemDesc html40ElementTable[] = {
{ "a", 0, 0, 0, 0, 0, 0, "anchor " },
{ "abbr", 0, 0, 0, 0, 0, 0, "abbreviated form" },
{ "acronym", 0, 0, 0, 0, 0, 0, "" },
{ "address", 0, 0, 0, 0, 0, 0, "information on author " },
{ "applet", 0, 0, 0, 0, 1, 1, "java applet " },
{ "area", 0, 2, 2, 1, 0, 0, "client-side image map area " },
{ "b", 0, 0, 0, 0, 0, 0, "bold text style" },
{ "base", 0, 2, 2, 1, 0, 0, "document base uri " },
{ "basefont", 0, 2, 2, 1, 1, 1, "base font size " },
{ "bdo", 0, 0, 0, 0, 0, 0, "i18n bidi over-ride " },
{ "big", 0, 0, 0, 0, 0, 0, "large text style" },
{ "blockquote", 0, 0, 0, 0, 0, 0, "long quotation " },
{ "body", 1, 1, 0, 0, 0, 0, "document body " },
{ "br", 0, 2, 2, 1, 0, 0, "forced line break " },
{ "button", 0, 0, 0, 0, 0, 0, "push button " },
{ "caption", 0, 0, 0, 0, 0, 0, "table caption " },
{ "center", 0, 0, 0, 0, 1, 1, "shorthand for div align=center " },
{ "cite", 0, 0, 0, 0, 0, 0, "citation" },
{ "code", 0, 0, 0, 0, 0, 0, "computer code fragment" },
{ "col", 0, 2, 2, 1, 0, 0, "table column " },
{ "colgroup", 0, 1, 0, 0, 0, 0, "table column group " },
{ "dd", 0, 1, 0, 0, 0, 0, "definition description " },
{ "del", 0, 0, 0, 0, 0, 0, "deleted text " },
{ "dfn", 0, 0, 0, 0, 0, 0, "instance definition" },
{ "dir", 0, 0, 0, 0, 1, 1, "directory list" },
{ "div", 0, 0, 0, 0, 0, 0, "generic language/style container"},
{ "dl", 0, 0, 0, 0, 0, 0, "definition list " },
{ "dt", 0, 1, 0, 0, 0, 0, "definition term " },
{ "em", 0, 0, 0, 0, 0, 0, "emphasis" },
{ "fieldset", 0, 0, 0, 0, 0, 0, "form control group " },
{ "font", 0, 0, 0, 0, 1, 1, "local change to font " },
{ "form", 0, 0, 0, 0, 0, 0, "interactive form " },
{ "frame", 0, 2, 2, 1, 0, 2, "subwindow " },
{ "frameset", 0, 0, 0, 0, 0, 2, "window subdivision" },
{ "h1", 0, 0, 0, 0, 0, 0, "heading " },
{ "h2", 0, 0, 0, 0, 0, 0, "heading " },
{ "h3", 0, 0, 0, 0, 0, 0, "heading " },
{ "h4", 0, 0, 0, 0, 0, 0, "heading " },
{ "h5", 0, 0, 0, 0, 0, 0, "heading " },
{ "h6", 0, 0, 0, 0, 0, 0, "heading " },
{ "head", 1, 1, 0, 0, 0, 0, "document head " },
{ "hr", 0, 2, 2, 1, 0, 0, "horizontal rule " },
{ "html", 1, 1, 0, 0, 0, 0, "document root element " },
{ "i", 0, 0, 0, 0, 0, 0, "italic text style" },
{ "iframe", 0, 0, 0, 0, 0, 1, "inline subwindow " },
{ "img", 0, 2, 2, 1, 0, 0, "embedded image " },
{ "input", 0, 2, 2, 1, 0, 0, "form control " },
{ "ins", 0, 0, 0, 0, 0, 0, "inserted text" },
{ "isindex", 0, 2, 2, 1, 1, 1, "single line prompt " },
{ "kbd", 0, 0, 0, 0, 0, 0, "text to be entered by the user" },
{ "label", 0, 0, 0, 0, 0, 0, "form field label text " },
{ "legend", 0, 0, 0, 0, 0, 0, "fieldset legend " },
{ "li", 0, 1, 1, 0, 0, 0, "list item " },
{ "link", 0, 2, 2, 1, 0, 0, "a media-independent link " },
{ "map", 0, 0, 0, 0, 0, 0, "client-side image map " },
{ "menu", 0, 0, 0, 0, 1, 1, "menu list " },
{ "meta", 0, 2, 2, 1, 0, 0, "generic metainformation " },
{ "noframes", 0, 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
{ "noscript", 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
{ "object", 0, 0, 0, 0, 0, 0, "generic embedded object " },
{ "ol", 0, 0, 0, 0, 0, 0, "ordered list " },
{ "optgroup", 0, 0, 0, 0, 0, 0, "option group " },
{ "option", 0, 1, 0, 0, 0, 0, "selectable choice " },
{ "p", 0, 1, 1, 0, 0, 0, "paragraph " },
{ "param", 0, 2, 2, 1, 0, 0, "named property value " },
{ "pre", 0, 0, 0, 0, 0, 0, "preformatted text " },
{ "q", 0, 0, 0, 0, 0, 0, "short inline quotation " },
{ "s", 0, 0, 0, 0, 1, 1, "strike-through text style" },
{ "samp", 0, 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
{ "script", 0, 0, 0, 0, 0, 0, "script statements " },
{ "select", 0, 0, 0, 0, 0, 0, "option selector " },
{ "small", 0, 0, 0, 0, 0, 0, "small text style" },
{ "span", 0, 0, 0, 0, 0, 0, "generic language/style container " },
{ "strike", 0, 0, 0, 0, 1, 1, "strike-through text" },
{ "strong", 0, 0, 0, 0, 0, 0, "strong emphasis" },
{ "style", 0, 0, 0, 0, 0, 0, "style info " },
{ "sub", 0, 0, 0, 0, 0, 0, "subscript" },
{ "sup", 0, 0, 0, 0, 0, 0, "superscript " },
{ "table", 0, 0, 0, 0, 0, 0, "&#160;" },
{ "tbody", 1, 0, 0, 0, 0, 0, "table body " },
{ "td", 0, 0, 0, 0, 0, 0, "table data cell" },
{ "textarea", 0, 0, 0, 0, 0, 0, "multi-line text field " },
{ "tfoot", 0, 1, 0, 0, 0, 0, "table footer " },
{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
{ "title", 0, 0, 0, 0, 0, 0, "document title " },
{ "tr", 0, 1, 0, 0, 0, 0, "table row " },
{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
{ "var", 0, 0, 0, 0, 0, 0, "instance of a variable or program argument" },
};
475
/*
 * Start tags that imply the end of a current element:
 * any tag of each line implies the end of the current element if the type of
 * that element is in the same line.
 *
 * Layout: each line is a NULL-terminated group of tag names; an extra NULL
 * ends the whole table.
 * NOTE(review): this table is not referenced in the part of the file
 * visible here -- confirm whether it is still used.
 */
char *htmlEquEnd[] = {
"dt", "dd", "li", "option", NULL,
"h1", "h2", "h3", "h4", "h5", "h6", NULL,
"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
NULL
};
/*
 * According to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within an H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
492
/*
 * Start tags that imply the end of the current element.
 *
 * Layout: a sequence of NULL-terminated groups, closed by one extra NULL.
 * In each group the FIRST name is the newly opened tag and the remaining
 * names are the currently open elements which that tag closes
 * implicitly (this is how htmlCheckAutoClose() reads it).
 * htmlInitAutoClose() records the address of the first entry of every
 * group in htmlStartCloseIndex.
 */
char *htmlStartClose[] = {
"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
 "dl", "ul", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", "head", NULL,
"head", "p", NULL,
"title", "p", NULL,
"body", "head", "style", "link", "title", "p", NULL,
"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
 "pre", "listing", "xmp", "head", "li", NULL,
"hr", "p", "head", NULL,
"h1", "p", "head", NULL,
"h2", "p", "head", NULL,
"h3", "p", "head", NULL,
"h4", "p", "head", NULL,
"h5", "p", "head", NULL,
"h6", "p", "head", NULL,
"dir", "p", "head", NULL,
"address", "p", "head", "ul", NULL,
"pre", "p", "head", "ul", NULL,
"listing", "p", "head", NULL,
"xmp", "p", "head", NULL,
"blockquote", "p", "head", NULL,
"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
 "xmp", "head", NULL,
"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dd", NULL,
"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
 "head", "dt", NULL,
"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
 "listing", "xmp", NULL,
"ol", "p", "head", "ul", NULL,
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div", "p", "head", NULL,
"noscript", "p", "head", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
 "listing", "xmp", "a", NULL,
"th", "th", "td", NULL,
"td", "th", "td", "p", NULL,
"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead", "caption", "col", "colgroup", NULL,
"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tbody", "p", NULL,
"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
 "tfoot", "tbody", "p", NULL,
"optgroup", "option", NULL,
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
 "pre", "listing", "xmp", "a", NULL,
NULL
};
552
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * NULL-terminated; scanned by htmlCheckParagraph() against the name of the
 * currently open element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = {
 "html",
 "head",
 "body",
 NULL
};
566
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The table has no NULL terminator: htmlIsScriptAttribute() derives the
 * number of entries with sizeof.
 */
static char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
592
593
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * entry of one group, unused slots are NULL. Filled once by
 * htmlInitAutoClose(), which sets the flag below when done.
 */
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
596
597/************************************************************************
598 * *
599 * functions to handle HTML specific data *
600 * *
601 ************************************************************************/
602
603/**
604 * htmlInitAutoClose:
605 *
606 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
607 * This is not reentrant. Call xmlInitParser() once before processing in
608 * case of use in multithreaded programs.
609 */
610void
611htmlInitAutoClose(void) {
612 int index, i = 0;
613
614 if (htmlStartCloseIndexinitialized) return;
615
616 for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
617 index = 0;
618 while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
619 htmlStartCloseIndex[index++] = &htmlStartClose[i];
620 while (htmlStartClose[i] != NULL) i++;
621 i++;
622 }
623 htmlStartCloseIndexinitialized = 1;
624}
625
626/**
627 * htmlTagLookup:
628 * @tag: The tag name in lowercase
629 *
630 * Lookup the HTML tag in the ElementTable
631 *
632 * Returns the related htmlElemDescPtr or NULL if not found.
633 */
634htmlElemDescPtr
635htmlTagLookup(const xmlChar *tag) {
636 int i;
637
638 for (i = 0; i < (sizeof(html40ElementTable) /
639 sizeof(html40ElementTable[0]));i++) {
640 if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
641 return(&html40ElementTable[i]);
642 }
643 return(NULL);
644}
645
646/**
647 * htmlCheckAutoClose:
648 * @newtag: The new tag name
649 * @oldtag: The old tag name
650 *
651 * Checks wether the new tag is one of the registered valid tags for closing old.
652 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
653 *
654 * Returns 0 if no, 1 if yes.
655 */
656int
657htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
658 int i, index;
659 char **close = NULL;
660
661 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
662
663 /* inefficient, but not a big deal */
664 for (index = 0; index < 100;index++) {
665 close = htmlStartCloseIndex[index];
666 if (close == NULL) return(0);
667 if (xmlStrEqual(BAD_CAST *close, newtag)) break;
668 }
669
670 i = close - htmlStartClose;
671 i++;
672 while (htmlStartClose[i] != NULL) {
673 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
674 return(1);
675 }
676 i++;
677 }
678 return(0);
679}
680
681/**
682 * htmlAutoCloseOnClose:
683 * @ctxt: an HTML parser context
684 * @newtag: The new tag name
685 *
686 * The HTmL DtD allows an ending tag to implicitely close other tags.
687 */
688void
689htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
690 htmlElemDescPtr info;
691 xmlChar *oldname;
692 int i;
693
694#ifdef DEBUG
695 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
696 for (i = 0;i < ctxt->nameNr;i++)
697 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
698#endif
699
700 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
701 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
702 }
703 if (i < 0) return;
704
705 while (!xmlStrEqual(newtag, ctxt->name)) {
706 info = htmlTagLookup(ctxt->name);
707 if ((info == NULL) || (info->endTag == 1)) {
708#ifdef DEBUG
709 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
710#endif
711 } else {
712 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
713 ctxt->sax->error(ctxt->userData,
714 "Opening and ending tag mismatch: %s and %s\n",
715 newtag, ctxt->name);
716 ctxt->wellFormed = 0;
717 }
718 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
719 ctxt->sax->endElement(ctxt->userData, ctxt->name);
720 oldname = htmlnamePop(ctxt);
721 if (oldname != NULL) {
722#ifdef DEBUG
723 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
724#endif
725 xmlFree(oldname);
726 }
727 }
728}
729
730/**
731 * htmlAutoClose:
732 * @ctxt: an HTML parser context
733 * @newtag: The new tag name or NULL
734 *
735 * The HTmL DtD allows a tag to implicitely close other tags.
736 * The list is kept in htmlStartClose array. This function is
737 * called when a new tag has been detected and generates the
738 * appropriates closes if possible/needed.
739 * If newtag is NULL this mean we are at the end of the resource
740 * and we should check
741 */
742void
743htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
744 xmlChar *oldname;
745 while ((newtag != NULL) && (ctxt->name != NULL) &&
746 (htmlCheckAutoClose(newtag, ctxt->name))) {
747#ifdef DEBUG
748 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
749#endif
750 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
751 ctxt->sax->endElement(ctxt->userData, ctxt->name);
752 oldname = htmlnamePop(ctxt);
753 if (oldname != NULL) {
754#ifdef DEBUG
755 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
756#endif
757 xmlFree(oldname);
758 }
759 }
760 if (newtag == NULL) {
761 htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
762 htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
763 htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
764 }
765 while ((newtag == NULL) && (ctxt->name != NULL) &&
766 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
767 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
768 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
769#ifdef DEBUG
770 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
771#endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776#ifdef DEBUG
777 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
778#endif
779 xmlFree(oldname);
780 }
781 }
782
783}
784
785/**
786 * htmlAutoCloseTag:
787 * @doc: the HTML document
788 * @name: The tag name
789 * @elem: the HTML element
790 *
791 * The HTmL DtD allows a tag to implicitely close other tags.
792 * The list is kept in htmlStartClose array. This function checks
793 * if the element or one of it's children would autoclose the
794 * given tag.
795 *
796 * Returns 1 if autoclose, 0 otherwise
797 */
798int
799htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
800 htmlNodePtr child;
801
802 if (elem == NULL) return(1);
803 if (xmlStrEqual(name, elem->name)) return(0);
804 if (htmlCheckAutoClose(elem->name, name)) return(1);
805 child = elem->children;
806 while (child != NULL) {
807 if (htmlAutoCloseTag(doc, name, child)) return(1);
808 child = child->next;
809 }
810 return(0);
811}
812
813/**
814 * htmlIsAutoClosed:
815 * @doc: the HTML document
816 * @elem: the HTML element
817 *
818 * The HTmL DtD allows a tag to implicitely close other tags.
819 * The list is kept in htmlStartClose array. This function checks
820 * if a tag is autoclosed by one of it's child
821 *
822 * Returns 1 if autoclosed, 0 otherwise
823 */
824int
825htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
826 htmlNodePtr child;
827
828 if (elem == NULL) return(1);
829 child = elem->children;
830 while (child != NULL) {
831 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
832 child = child->next;
833 }
834 return(0);
835}
836
837/**
838 * htmlCheckImplied:
839 * @ctxt: an HTML parser context
840 * @newtag: The new tag name
841 *
842 * The HTML DtD allows a tag to exists only implicitely
843 * called when a new tag has been detected and generates the
844 * appropriates implicit tags if missing
845 */
846void
847htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
848 if (!htmlOmittedDefaultValue)
849 return;
850 if (xmlStrEqual(newtag, BAD_CAST"html"))
851 return;
852 if (ctxt->nameNr <= 0) {
853#ifdef DEBUG
854 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
855#endif
856 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
857 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
858 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
859 }
860 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
861 return;
862 if ((ctxt->nameNr <= 1) &&
863 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
864 (xmlStrEqual(newtag, BAD_CAST"style")) ||
865 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
866 (xmlStrEqual(newtag, BAD_CAST"link")) ||
867 (xmlStrEqual(newtag, BAD_CAST"title")) ||
868 (xmlStrEqual(newtag, BAD_CAST"base")))) {
869 /*
870 * dropped OBJECT ... i you put it first BODY will be
871 * assumed !
872 */
873#ifdef DEBUG
874 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
875#endif
876 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
877 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
878 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
879 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
880 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
881 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
882 int i;
883 for (i = 0;i < ctxt->nameNr;i++) {
884 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
885 return;
886 }
887 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
888 return;
889 }
890 }
891
892#ifdef DEBUG
893 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
894#endif
895 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
896 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
897 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
898 }
899}
900
901/**
902 * htmlCheckParagraph
903 * @ctxt: an HTML parser context
904 *
905 * Check whether a p element need to be implied before inserting
906 * characters in the current element.
907 *
908 * Returns 1 if a paragraph has been inserted, 0 if not and -1
909 * in case of error.
910 */
911
912int
913htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
914 const xmlChar *tag;
915 int i;
916
917 if (ctxt == NULL)
918 return(-1);
919 tag = ctxt->name;
920 if (tag == NULL) {
921 htmlAutoClose(ctxt, BAD_CAST"p");
922 htmlCheckImplied(ctxt, BAD_CAST"p");
923 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
924 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
925 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
926 return(1);
927 }
928 if (!htmlOmittedDefaultValue)
929 return(0);
930 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
931 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
932#ifdef DEBUG
933 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
934#endif
935 htmlAutoClose(ctxt, BAD_CAST"p");
936 htmlCheckImplied(ctxt, BAD_CAST"p");
937 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
938 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
939 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
940 return(1);
941 }
942 }
943 return(0);
944}
945
946/**
947 * htmlIsScriptAttribute:
948 * @name: an attribute name
949 *
950 * Check if an attribute is of content type Script
951 *
952 * Returns 1 is the attribute is a script 0 otherwise
953 */
954int
955htmlIsScriptAttribute(const xmlChar *name) {
956 int i;
957
958 if (name == NULL)
959 return(0);
960 /*
961 * all script attributes start with 'on'
962 */
963 if ((name[0] != 'o') || (name[1] != 'n'))
964 return(0);
965 for (i = 0;
966 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
967 i++) {
968 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
969 return(1);
970 }
971 return(0);
972}
973
974/************************************************************************
975 * *
976 * The list of HTML predefined entities *
977 * *
978 ************************************************************************/
979
980
981htmlEntityDesc html40EntitiesTable[] = {
982/*
983 * the 4 absolute ones, plus apostrophe.
984 */
985{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
986{ 38, "amp", "ampersand, U+0026 ISOnum" },
987{ 39, "apos", "single quote" },
988{ 60, "lt", "less-than sign, U+003C ISOnum" },
989{ 62, "gt", "greater-than sign, U+003E ISOnum" },
990
/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
995{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
996{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
997{ 162, "cent", "cent sign, U+00A2 ISOnum" },
998{ 163, "pound","pound sign, U+00A3 ISOnum" },
999{ 164, "curren","currency sign, U+00A4 ISOnum" },
1000{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1001{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1002{ 167, "sect", "section sign, U+00A7 ISOnum" },
1003{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1004{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1005{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1006{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1007{ 172, "not", "not sign, U+00AC ISOnum" },
1008{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1009{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1010{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1011{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1012{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1013{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1014{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1015{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1016{ 181, "micro","micro sign, U+00B5 ISOnum" },
1017{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1018{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1019{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1020{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1021{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1022{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1023{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1024{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1025{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1026{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1027{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1028{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1029{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1030{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1031{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1032{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1033{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1034{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1035{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1036{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1037{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1038{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1039{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1040{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1041{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1042{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1043{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1044{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1045{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1046{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1047{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1048{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1049{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1050{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1051{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1052{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1053{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1054{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1055{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1056{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1057{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1058{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1059{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1060{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1061{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1062{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1063{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1064{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1065{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1066{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1067{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1068{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1069{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1070{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1071{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1072{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1073{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1074{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1075{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1076{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1077{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1078{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1079{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1080{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1081{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1082{ 247, "divide","division sign, U+00F7 ISOnum" },
1083{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1084{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1085{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1086{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1087{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1088{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1089{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1090{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1091
1092{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1093{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1094{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1095{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1096{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1097
1098/*
1099 * Anything below should really be kept as entities references
1100 */
1101{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1102
1103{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1104{ 732, "tilde","small tilde, U+02DC ISOdia" },
1105
1106{ 913, "Alpha","greek capital letter alpha, U+0391" },
1107{ 914, "Beta", "greek capital letter beta, U+0392" },
1108{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1109{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1110{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1111{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1112{ 919, "Eta", "greek capital letter eta, U+0397" },
1113{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1114{ 921, "Iota", "greek capital letter iota, U+0399" },
1115{ 922, "Kappa","greek capital letter kappa, U+039A" },
1116{ 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1117{ 924, "Mu", "greek capital letter mu, U+039C" },
1118{ 925, "Nu", "greek capital letter nu, U+039D" },
1119{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1120{ 927, "Omicron","greek capital letter omicron, U+039F" },
1121{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1122{ 929, "Rho", "greek capital letter rho, U+03A1" },
1123{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1124{ 932, "Tau", "greek capital letter tau, U+03A4" },
1125{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1126{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1127{ 935, "Chi", "greek capital letter chi, U+03A7" },
1128{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1129{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1130
1131{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1132{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1133{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1134{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1135{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1136{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1137{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1138{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1139{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1140{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1141{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1142{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1143{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1144{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1145{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1146{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1147{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1148{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1149{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1150{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1151{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1152{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1153{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1154{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1155{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1156{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1157{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1158{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1159
1160{ 8194, "ensp", "en space, U+2002 ISOpub" },
1161{ 8195, "emsp", "em space, U+2003 ISOpub" },
1162{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1163{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1164{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1165{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1166{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1167{ 8211, "ndash","en dash, U+2013 ISOpub" },
1168{ 8212, "mdash","em dash, U+2014 ISOpub" },
1169{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1170{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1171{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1172{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1173{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1174{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1175{ 8224, "dagger","dagger, U+2020 ISOpub" },
1176{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1177
1178{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1179{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1180
1181{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1182
1183{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1184{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1185
1186{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1187{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1188
1189{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1190{ 8260, "frasl","fraction slash, U+2044 NEW" },
1191
1192{ 8364, "euro", "euro sign, U+20AC NEW" },
1193
1194{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1195{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1196{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1197{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1198{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1199{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1200{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1201{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1202{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1203{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1204{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1205{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1206{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1207{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1208{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1209{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1210
1211{ 8704, "forall","for all, U+2200 ISOtech" },
1212{ 8706, "part", "partial differential, U+2202 ISOtech" },
1213{ 8707, "exist","there exists, U+2203 ISOtech" },
1214{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1215{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1216{ 8712, "isin", "element of, U+2208 ISOtech" },
1217{ 8713, "notin","not an element of, U+2209 ISOtech" },
1218{ 8715, "ni", "contains as member, U+220B ISOtech" },
1219{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1220{ 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1221{ 8722, "minus","minus sign, U+2212 ISOtech" },
1222{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1223{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1224{ 8733, "prop", "proportional to, U+221D ISOtech" },
1225{ 8734, "infin","infinity, U+221E ISOtech" },
1226{ 8736, "ang", "angle, U+2220 ISOamso" },
1227{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1228{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1229{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1230{ 8746, "cup", "union = cup, U+222A ISOtech" },
1231{ 8747, "int", "integral, U+222B ISOtech" },
1232{ 8756, "there4","therefore, U+2234 ISOtech" },
1233{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1234{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1235{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1236{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1237{ 8801, "equiv","identical to, U+2261 ISOtech" },
1238{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1239{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1240{ 8834, "sub", "subset of, U+2282 ISOtech" },
1241{ 8835, "sup", "superset of, U+2283 ISOtech" },
1242{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1243{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1244{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1245{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1246{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1247{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1248{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1249{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1250{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1251{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1252{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1253{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1254{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1255{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1256
1257{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1258{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1259{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1260{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1261
1262};
1263
1264/************************************************************************
1265 * *
1266 * Commodity functions to handle entities *
1267 * *
1268 ************************************************************************/
1269
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size.
 * The result of xmlRealloc() is kept in a temporary so that, when the
 * reallocation fails, the previous block can still be released instead
 * of being leaked.  On failure the enclosing function returns NULL, so
 * the macro can only be used inside functions returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *grown_##buffer;                                            \
    buffer##_size *= 2;                                                 \
    grown_##buffer = (xmlChar *) xmlRealloc(buffer,                     \
                                  buffer##_size * sizeof(xmlChar));     \
    if (grown_##buffer == NULL) {                                       \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        buffer = NULL;                                                  \
        return(NULL);                                                   \
    }                                                                   \
    buffer = grown_##buffer;                                            \
}
1281
1282/**
1283 * htmlEntityLookup:
1284 * @name: the entity name
1285 *
1286 * Lookup the given entity in EntitiesTable
1287 *
1288 * TODO: the linear scan is really ugly, an hash table is really needed.
1289 *
1290 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1291 */
1292htmlEntityDescPtr
1293htmlEntityLookup(const xmlChar *name) {
1294 int i;
1295
1296 for (i = 0;i < (sizeof(html40EntitiesTable)/
1297 sizeof(html40EntitiesTable[0]));i++) {
1298 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1299#ifdef DEBUG
1300 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1301#endif
1302 return(&html40EntitiesTable[i]);
1303 }
1304 }
1305 return(NULL);
1306}
1307
1308/**
1309 * htmlEntityValueLookup:
1310 * @value: the entity's unicode value
1311 *
1312 * Lookup the given entity in EntitiesTable
1313 *
1314 * TODO: the linear scan is really ugly, an hash table is really needed.
1315 *
1316 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1317 */
1318htmlEntityDescPtr
1319htmlEntityValueLookup(int value) {
1320 int i;
1321#ifdef DEBUG
1322 int lv = 0;
1323#endif
1324
1325 for (i = 0;i < (sizeof(html40EntitiesTable)/
1326 sizeof(html40EntitiesTable[0]));i++) {
1327 if ((unsigned int) html40EntitiesTable[i].value >= value) {
1328 if ((unsigned int) html40EntitiesTable[i].value > value)
1329 break;
1330#ifdef DEBUG
1331 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1332#endif
1333 return(&html40EntitiesTable[i]);
1334 }
1335#ifdef DEBUG
1336 if (lv > html40EntitiesTable[i].value) {
1337 xmlGenericError(xmlGenericErrorContext,
1338 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1339 lv, html40EntitiesTable[i].value);
1340 }
1341 lv = html40EntitiesTable[i].value;
1342#endif
1343 }
1344 return(NULL);
1345}
1346
1347/**
1348 * UTF8ToHtml:
1349 * @out: a pointer to an array of bytes to store the result
1350 * @outlen: the length of @out
1351 * @in: a pointer to an array of UTF-8 chars
1352 * @inlen: the length of @in
1353 *
1354 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1355 * plus HTML entities block of chars out.
1356 *
1357 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1358 * The value of @inlen after return is the number of octets consumed
1359 * as the return value is positive, else unpredictiable.
1360 * The value of @outlen after return is the number of octets consumed.
1361 */
1362int
1363UTF8ToHtml(unsigned char* out, int *outlen,
1364 const unsigned char* in, int *inlen) {
1365 const unsigned char* processed = in;
1366 const unsigned char* outend;
1367 const unsigned char* outstart = out;
1368 const unsigned char* instart = in;
1369 const unsigned char* inend;
1370 unsigned int c, d;
1371 int trailing;
1372
1373 if (in == NULL) {
1374 /*
1375 * initialization nothing to do
1376 */
1377 *outlen = 0;
1378 *inlen = 0;
1379 return(0);
1380 }
1381 inend = in + (*inlen);
1382 outend = out + (*outlen);
1383 while (in < inend) {
1384 d = *in++;
1385 if (d < 0x80) { c= d; trailing= 0; }
1386 else if (d < 0xC0) {
1387 /* trailing byte in leading position */
1388 *outlen = out - outstart;
1389 *inlen = processed - instart;
1390 return(-2);
1391 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1392 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1393 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1394 else {
1395 /* no chance for this in Ascii */
1396 *outlen = out - outstart;
1397 *inlen = processed - instart;
1398 return(-2);
1399 }
1400
1401 if (inend - in < trailing) {
1402 break;
1403 }
1404
1405 for ( ; trailing; trailing--) {
1406 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1407 break;
1408 c <<= 6;
1409 c |= d & 0x3F;
1410 }
1411
1412 /* assertion: c is a single UTF-4 value */
1413 if (c < 0x80) {
1414 if (out + 1 >= outend)
1415 break;
1416 *out++ = c;
1417 } else {
1418 int len;
1419 htmlEntityDescPtr ent;
1420
1421 /*
1422 * Try to lookup a predefined HTML entity for it
1423 */
1424
1425 ent = htmlEntityValueLookup(c);
1426 if (ent == NULL) {
1427 /* no chance for this in Ascii */
1428 *outlen = out - outstart;
1429 *inlen = processed - instart;
1430 return(-2);
1431 }
1432 len = strlen(ent->name);
1433 if (out + 2 + len >= outend)
1434 break;
1435 *out++ = '&';
1436 memcpy(out, ent->name, len);
1437 out += len;
1438 *out++ = ';';
1439 }
1440 processed = in;
1441 }
1442 *outlen = out - outstart;
1443 *inlen = processed - instart;
1444 return(0);
1445}
1446
1447/**
1448 * htmlEncodeEntities:
1449 * @out: a pointer to an array of bytes to store the result
1450 * @outlen: the length of @out
1451 * @in: a pointer to an array of UTF-8 chars
1452 * @inlen: the length of @in
1453 * @quoteChar: the quote character to escape (' or ") or zero.
1454 *
1455 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1456 * plus HTML entities block of chars out.
1457 *
1458 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1459 * The value of @inlen after return is the number of octets consumed
1460 * as the return value is positive, else unpredictiable.
1461 * The value of @outlen after return is the number of octets consumed.
1462 */
1463int
1464htmlEncodeEntities(unsigned char* out, int *outlen,
1465 const unsigned char* in, int *inlen, int quoteChar) {
1466 const unsigned char* processed = in;
1467 const unsigned char* outend = out + (*outlen);
1468 const unsigned char* outstart = out;
1469 const unsigned char* instart = in;
1470 const unsigned char* inend = in + (*inlen);
1471 unsigned int c, d;
1472 int trailing;
1473
1474 while (in < inend) {
1475 d = *in++;
1476 if (d < 0x80) { c= d; trailing= 0; }
1477 else if (d < 0xC0) {
1478 /* trailing byte in leading position */
1479 *outlen = out - outstart;
1480 *inlen = processed - instart;
1481 return(-2);
1482 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1483 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1484 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1485 else {
1486 /* no chance for this in Ascii */
1487 *outlen = out - outstart;
1488 *inlen = processed - instart;
1489 return(-2);
1490 }
1491
1492 if (inend - in < trailing)
1493 break;
1494
1495 while (trailing--) {
1496 if (((d= *in++) & 0xC0) != 0x80) {
1497 *outlen = out - outstart;
1498 *inlen = processed - instart;
1499 return(-2);
1500 }
1501 c <<= 6;
1502 c |= d & 0x3F;
1503 }
1504
1505 /* assertion: c is a single UTF-4 value */
1506 if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1507 if (out >= outend)
1508 break;
1509 *out++ = c;
1510 } else {
1511 htmlEntityDescPtr ent;
1512 const char *cp;
1513 char nbuf[16];
1514 int len;
1515
1516 /*
1517 * Try to lookup a predefined HTML entity for it
1518 */
1519 ent = htmlEntityValueLookup(c);
1520 if (ent == NULL) {
1521 sprintf(nbuf, "#%u", c);
1522 cp = nbuf;
1523 }
1524 else
1525 cp = ent->name;
1526 len = strlen(cp);
1527 if (out + 2 + len > outend)
1528 break;
1529 *out++ = '&';
1530 memcpy(out, cp, len);
1531 out += len;
1532 *out++ = ';';
1533 }
1534 processed = in;
1535 }
1536 *outlen = out - outstart;
1537 *inlen = processed - instart;
1538 return(0);
1539}
1540
1541/**
1542 * htmlDecodeEntities:
1543 * @ctxt: the parser context
1544 * @len: the len to decode (in bytes !), -1 for no size limit
1545 * @end: an end marker xmlChar, 0 if none
1546 * @end2: an end marker xmlChar, 0 if none
1547 * @end3: an end marker xmlChar, 0 if none
1548 *
1549 * Subtitute the HTML entities by their value
1550 *
1551 * DEPRECATED !!!!
1552 *
1553 * Returns A newly allocated string with the substitution done. The caller
1554 * must deallocate it !
1555 */
1556xmlChar *
1557htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
1558 xmlChar end, xmlChar end2, xmlChar end3) {
1559 xmlChar *name = NULL;
1560 xmlChar *buffer = NULL;
1561 unsigned int buffer_size = 0;
1562 unsigned int nbchars = 0;
1563 htmlEntityDescPtr ent;
1564 unsigned int max = (unsigned int) len;
1565 int c,l;
1566
1567 if (ctxt->depth > 40) {
1568 ctxt->errNo = XML_ERR_ENTITY_LOOP;
1569 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1570 ctxt->sax->error(ctxt->userData,
1571 "Detected entity reference loop\n");
1572 ctxt->wellFormed = 0;
1573 ctxt->disableSAX = 1;
1574 return(NULL);
1575 }
1576
1577 /*
1578 * allocate a translation buffer.
1579 */
1580 buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1581 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1582 if (buffer == NULL) {
1583 perror("xmlDecodeEntities: malloc failed");
1584 return(NULL);
1585 }
1586
1587 /*
1588 * Ok loop until we reach one of the ending char or a size limit.
1589 */
1590 c = CUR_CHAR(l);
1591 while ((nbchars < max) && (c != end) &&
1592 (c != end2) && (c != end3)) {
1593
1594 if (c == 0) break;
1595 if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1596 int val = htmlParseCharRef(ctxt);
1597 COPY_BUF(0,buffer,nbchars,val);
1598 NEXTL(l);
1599 } else if ((c == '&') && (ctxt->token != '&')) {
1600 ent = htmlParseEntityRef(ctxt, &name);
1601 if (name != NULL) {
1602 if (ent != NULL) {
1603 int val = ent->value;
1604 COPY_BUF(0,buffer,nbchars,val);
1605 NEXTL(l);
1606 } else {
1607 const xmlChar *cur = name;
1608
1609 buffer[nbchars++] = '&';
1610 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1611 growBuffer(buffer);
1612 }
1613 while (*cur != 0) {
1614 buffer[nbchars++] = *cur++;
1615 }
1616 buffer[nbchars++] = ';';
1617 }
1618 }
1619 } else {
1620 COPY_BUF(l,buffer,nbchars,c);
1621 NEXTL(l);
1622 if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1623 growBuffer(buffer);
1624 }
1625 }
1626 c = CUR_CHAR(l);
1627 }
1628 buffer[nbchars++] = 0;
1629 return(buffer);
1630}
1631
1632/************************************************************************
1633 * *
1634 * Commodity functions to handle streams *
1635 * *
1636 ************************************************************************/
1637
1638/**
1639 * htmlFreeInputStream:
1640 * @input: an htmlParserInputPtr
1641 *
1642 * Free up an input stream.
1643 */
1644void
1645htmlFreeInputStream(htmlParserInputPtr input) {
1646 if (input == NULL) return;
1647
1648 if (input->filename != NULL) xmlFree((char *) input->filename);
1649 if (input->directory != NULL) xmlFree((char *) input->directory);
1650 if ((input->free != NULL) && (input->base != NULL))
1651 input->free((xmlChar *) input->base);
1652 if (input->buf != NULL)
1653 xmlFreeParserInputBuffer(input->buf);
Daniel Veillard48b2f892001-02-25 16:11:03 +00001654 MEM_CLEANUP(input, sizeof(htmlParserInput));
Owen Taylor3473f882001-02-23 17:55:21 +00001655 xmlFree(input);
1656}
1657
1658/**
1659 * htmlNewInputStream:
1660 * @ctxt: an HTML parser context
1661 *
1662 * Create a new input stream structure
1663 * Returns the new input stream or NULL
1664 */
1665htmlParserInputPtr
1666htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1667 htmlParserInputPtr input;
1668
1669 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1670 if (input == NULL) {
1671 ctxt->errNo = XML_ERR_NO_MEMORY;
1672 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1673 ctxt->sax->error(ctxt->userData,
1674 "malloc: couldn't allocate a new input stream\n");
1675 return(NULL);
1676 }
1677 memset(input, 0, sizeof(htmlParserInput));
1678 input->filename = NULL;
1679 input->directory = NULL;
1680 input->base = NULL;
1681 input->cur = NULL;
1682 input->buf = NULL;
1683 input->line = 1;
1684 input->col = 1;
1685 input->buf = NULL;
1686 input->free = NULL;
1687 input->version = NULL;
1688 input->consumed = 0;
1689 input->length = 0;
1690 return(input);
1691}
1692
1693
1694/************************************************************************
1695 * *
1696 * Commodity functions, cleanup needed ? *
1697 * *
1698 ************************************************************************/
1699
1700/**
1701 * areBlanks:
1702 * @ctxt: an HTML parser context
1703 * @str: a xmlChar *
1704 * @len: the size of @str
1705 *
1706 * Is this a sequence of blank chars that one can ignore ?
1707 *
1708 * Returns 1 if ignorable 0 otherwise.
1709 */
1710
1711static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1712 int i;
1713 xmlNodePtr lastChild;
1714
1715 for (i = 0;i < len;i++)
1716 if (!(IS_BLANK(str[i]))) return(0);
1717
1718 if (CUR == 0) return(1);
1719 if (CUR != '<') return(0);
1720 if (ctxt->name == NULL)
1721 return(1);
1722 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1723 return(1);
1724 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1725 return(1);
1726 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1727 return(1);
1728 if (ctxt->node == NULL) return(0);
1729 lastChild = xmlGetLastChild(ctxt->node);
1730 if (lastChild == NULL) {
1731 if (ctxt->node->content != NULL) return(0);
1732 } else if (xmlNodeIsText(lastChild)) {
1733 return(0);
1734 } else if (xmlStrEqual(lastChild->name, BAD_CAST"b")) {
1735 return(0);
1736 } else if (xmlStrEqual(lastChild->name, BAD_CAST"bold")) {
1737 return(0);
1738 } else if (xmlStrEqual(lastChild->name, BAD_CAST"em")) {
1739 return(0);
1740 }
1741 return(1);
1742}
1743
1744/**
1745 * htmlHandleEntity:
1746 * @ctxt: an HTML parser context
1747 * @entity: an XML entity pointer.
1748 *
1749 * Default handling of an HTML entity, call the parser with the
1750 * substitution string
1751 */
1752
1753void
1754htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1755 int len;
1756
1757 if (entity->content == NULL) {
1758 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1759 ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1760 entity->name);
1761 ctxt->wellFormed = 0;
1762 return;
1763 }
1764 len = xmlStrlen(entity->content);
1765
1766 /*
1767 * Just handle the content as a set of chars.
1768 */
1769 htmlCheckParagraph(ctxt);
1770 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1771 ctxt->sax->characters(ctxt->userData, entity->content, len);
1772
1773}
1774
1775/**
1776 * htmlNewDocNoDtD:
1777 * @URI: URI for the dtd, or NULL
1778 * @ExternalID: the external ID of the DTD, or NULL
1779 *
1780 * Returns a new document, do not intialize the DTD if not provided
1781 */
1782htmlDocPtr
1783htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1784 xmlDocPtr cur;
1785
1786 /*
1787 * Allocate a new document and fill the fields.
1788 */
1789 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1790 if (cur == NULL) {
1791 xmlGenericError(xmlGenericErrorContext,
1792 "xmlNewDoc : malloc failed\n");
1793 return(NULL);
1794 }
1795 memset(cur, 0, sizeof(xmlDoc));
1796
1797 cur->type = XML_HTML_DOCUMENT_NODE;
1798 cur->version = NULL;
1799 cur->intSubset = NULL;
1800 if ((ExternalID != NULL) ||
1801 (URI != NULL))
1802 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1803 cur->doc = cur;
1804 cur->name = NULL;
1805 cur->children = NULL;
1806 cur->extSubset = NULL;
1807 cur->oldNs = NULL;
1808 cur->encoding = NULL;
1809 cur->standalone = 1;
1810 cur->compression = 0;
1811 cur->ids = NULL;
1812 cur->refs = NULL;
1813#ifndef XML_WITHOUT_CORBA
1814 cur->_private = NULL;
1815#endif
1816 return(cur);
1817}
1818
1819/**
1820 * htmlNewDoc:
1821 * @URI: URI for the dtd, or NULL
1822 * @ExternalID: the external ID of the DTD, or NULL
1823 *
1824 * Returns a new document
1825 */
1826htmlDocPtr
1827htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1828 if ((URI == NULL) && (ExternalID == NULL))
1829 return(htmlNewDocNoDtD(
1830 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1831 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1832
1833 return(htmlNewDocNoDtD(URI, ExternalID));
1834}
1835
1836
1837/************************************************************************
1838 * *
1839 * The parser itself *
1840 * Relates to http://www.w3.org/TR/html40 *
1841 * *
1842 ************************************************************************/
1843
1844/************************************************************************
1845 * *
1846 * The parser itself *
1847 * *
1848 ************************************************************************/
1849
1850/**
1851 * htmlParseHTMLName:
1852 * @ctxt: an HTML parser context
1853 *
1854 * parse an HTML tag or attribute name, note that we convert it to lowercase
1855 * since HTML names are not case-sensitive.
1856 *
1857 * Returns the Tag Name parsed or NULL
1858 */
1859
1860xmlChar *
1861htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1862 xmlChar *ret = NULL;
1863 int i = 0;
1864 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1865
1866 if (!IS_LETTER(CUR) && (CUR != '_') &&
1867 (CUR != ':')) return(NULL);
1868
1869 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1870 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1871 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1872 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1873 else loc[i] = CUR;
1874 i++;
1875
1876 NEXT;
1877 }
1878
1879 ret = xmlStrndup(loc, i);
1880
1881 return(ret);
1882}
1883
1884/**
1885 * htmlParseName:
1886 * @ctxt: an HTML parser context
1887 *
1888 * parse an HTML name, this routine is case sensistive.
1889 *
1890 * Returns the Name parsed or NULL
1891 */
1892
1893xmlChar *
1894htmlParseName(htmlParserCtxtPtr ctxt) {
1895 xmlChar buf[HTML_MAX_NAMELEN];
1896 int len = 0;
1897
1898 GROW;
1899 if (!IS_LETTER(CUR) && (CUR != '_')) {
1900 return(NULL);
1901 }
1902
1903 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1904 (CUR == '.') || (CUR == '-') ||
1905 (CUR == '_') || (CUR == ':') ||
1906 (IS_COMBINING(CUR)) ||
1907 (IS_EXTENDER(CUR))) {
1908 buf[len++] = CUR;
1909 NEXT;
1910 if (len >= HTML_MAX_NAMELEN) {
1911 xmlGenericError(xmlGenericErrorContext,
1912 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1913 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1914 (CUR == '.') || (CUR == '-') ||
1915 (CUR == '_') || (CUR == ':') ||
1916 (IS_COMBINING(CUR)) ||
1917 (IS_EXTENDER(CUR)))
1918 NEXT;
1919 break;
1920 }
1921 }
1922 return(xmlStrndup(buf, len));
1923}
1924
1925/**
1926 * htmlParseHTMLAttribute:
1927 * @ctxt: an HTML parser context
1928 * @stop: a char stop value
1929 *
1930 * parse an HTML attribute value till the stop (quote), if
1931 * stop is 0 then it stops at the first space
1932 *
1933 * Returns the attribute parsed or NULL
1934 */
1935
1936xmlChar *
1937htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1938 xmlChar *buffer = NULL;
1939 int buffer_size = 0;
1940 xmlChar *out = NULL;
1941 xmlChar *name = NULL;
1942
1943 xmlChar *cur = NULL;
1944 htmlEntityDescPtr ent;
1945
1946 /*
1947 * allocate a translation buffer.
1948 */
1949 buffer_size = HTML_PARSER_BUFFER_SIZE;
1950 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1951 if (buffer == NULL) {
1952 perror("htmlParseHTMLAttribute: malloc failed");
1953 return(NULL);
1954 }
1955 out = buffer;
1956
1957 /*
1958 * Ok loop until we reach one of the ending chars
1959 */
1960 while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1961 if ((stop == 0) && (IS_BLANK(CUR))) break;
1962 if (CUR == '&') {
1963 if (NXT(1) == '#') {
1964 unsigned int c;
1965 int bits;
1966
1967 c = htmlParseCharRef(ctxt);
1968 if (c < 0x80)
1969 { *out++ = c; bits= -6; }
1970 else if (c < 0x800)
1971 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1972 else if (c < 0x10000)
1973 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1974 else
1975 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1976
1977 for ( ; bits >= 0; bits-= 6) {
1978 *out++ = ((c >> bits) & 0x3F) | 0x80;
1979 }
1980 } else {
1981 ent = htmlParseEntityRef(ctxt, &name);
1982 if (name == NULL) {
1983 *out++ = '&';
1984 if (out - buffer > buffer_size - 100) {
1985 int index = out - buffer;
1986
1987 growBuffer(buffer);
1988 out = &buffer[index];
1989 }
1990 } else if (ent == NULL) {
1991 *out++ = '&';
1992 cur = name;
1993 while (*cur != 0) {
1994 if (out - buffer > buffer_size - 100) {
1995 int index = out - buffer;
1996
1997 growBuffer(buffer);
1998 out = &buffer[index];
1999 }
2000 *out++ = *cur++;
2001 }
2002 xmlFree(name);
2003 } else {
2004 unsigned int c;
2005 int bits;
2006
2007 if (out - buffer > buffer_size - 100) {
2008 int index = out - buffer;
2009
2010 growBuffer(buffer);
2011 out = &buffer[index];
2012 }
2013 c = (xmlChar)ent->value;
2014 if (c < 0x80)
2015 { *out++ = c; bits= -6; }
2016 else if (c < 0x800)
2017 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2018 else if (c < 0x10000)
2019 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2020 else
2021 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2022
2023 for ( ; bits >= 0; bits-= 6) {
2024 *out++ = ((c >> bits) & 0x3F) | 0x80;
2025 }
2026 xmlFree(name);
2027 }
2028 }
2029 } else {
2030 unsigned int c;
2031 int bits, l;
2032
2033 if (out - buffer > buffer_size - 100) {
2034 int index = out - buffer;
2035
2036 growBuffer(buffer);
2037 out = &buffer[index];
2038 }
2039 c = CUR_CHAR(l);
2040 if (c < 0x80)
2041 { *out++ = c; bits= -6; }
2042 else if (c < 0x800)
2043 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2044 else if (c < 0x10000)
2045 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2046 else
2047 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2048
2049 for ( ; bits >= 0; bits-= 6) {
2050 *out++ = ((c >> bits) & 0x3F) | 0x80;
2051 }
2052 NEXT;
2053 }
2054 }
2055 *out++ = 0;
2056 return(buffer);
2057}
2058
2059/**
2060 * htmlParseNmtoken:
2061 * @ctxt: an HTML parser context
2062 *
2063 * parse an HTML Nmtoken.
2064 *
2065 * Returns the Nmtoken parsed or NULL
2066 */
2067
2068xmlChar *
2069htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
2070 xmlChar buf[HTML_MAX_NAMELEN];
2071 int len = 0;
2072
2073 GROW;
2074 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2075 (CUR == '.') || (CUR == '-') ||
2076 (CUR == '_') || (CUR == ':') ||
2077 (IS_COMBINING(CUR)) ||
2078 (IS_EXTENDER(CUR))) {
2079 buf[len++] = CUR;
2080 NEXT;
2081 if (len >= HTML_MAX_NAMELEN) {
2082 xmlGenericError(xmlGenericErrorContext,
2083 "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2084 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2085 (CUR == '.') || (CUR == '-') ||
2086 (CUR == '_') || (CUR == ':') ||
2087 (IS_COMBINING(CUR)) ||
2088 (IS_EXTENDER(CUR)))
2089 NEXT;
2090 break;
2091 }
2092 }
2093 return(xmlStrndup(buf, len));
2094}
2095
2096/**
2097 * htmlParseEntityRef:
2098 * @ctxt: an HTML parser context
2099 * @str: location to store the entity name
2100 *
2101 * parse an HTML ENTITY references
2102 *
2103 * [68] EntityRef ::= '&' Name ';'
2104 *
2105 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2106 * if non-NULL *str will have to be freed by the caller.
2107 */
2108htmlEntityDescPtr
2109htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2110 xmlChar *name;
2111 htmlEntityDescPtr ent = NULL;
2112 *str = NULL;
2113
2114 if (CUR == '&') {
2115 NEXT;
2116 name = htmlParseName(ctxt);
2117 if (name == NULL) {
2118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2119 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2120 ctxt->wellFormed = 0;
2121 } else {
2122 GROW;
2123 if (CUR == ';') {
2124 *str = name;
2125
2126 /*
2127 * Lookup the entity in the table.
2128 */
2129 ent = htmlEntityLookup(name);
2130 if (ent != NULL) /* OK that's ugly !!! */
2131 NEXT;
2132 } else {
2133 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2134 ctxt->sax->error(ctxt->userData,
2135 "htmlParseEntityRef: expecting ';'\n");
2136 *str = name;
2137 }
2138 }
2139 }
2140 return(ent);
2141}
2142
2143/**
2144 * htmlParseAttValue:
2145 * @ctxt: an HTML parser context
2146 *
2147 * parse a value for an attribute
2148 * Note: the parser won't do substitution of entities here, this
2149 * will be handled later in xmlStringGetNodeList, unless it was
2150 * asked for ctxt->replaceEntities != 0
2151 *
2152 * Returns the AttValue parsed or NULL.
2153 */
2154
2155xmlChar *
2156htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2157 xmlChar *ret = NULL;
2158
2159 if (CUR == '"') {
2160 NEXT;
2161 ret = htmlParseHTMLAttribute(ctxt, '"');
2162 if (CUR != '"') {
2163 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2164 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2165 ctxt->wellFormed = 0;
2166 } else
2167 NEXT;
2168 } else if (CUR == '\'') {
2169 NEXT;
2170 ret = htmlParseHTMLAttribute(ctxt, '\'');
2171 if (CUR != '\'') {
2172 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2173 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2174 ctxt->wellFormed = 0;
2175 } else
2176 NEXT;
2177 } else {
2178 /*
2179 * That's an HTMLism, the attribute value may not be quoted
2180 */
2181 ret = htmlParseHTMLAttribute(ctxt, 0);
2182 if (ret == NULL) {
2183 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2184 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2185 ctxt->wellFormed = 0;
2186 }
2187 }
2188 return(ret);
2189}
2190
2191/**
2192 * htmlParseSystemLiteral:
2193 * @ctxt: an HTML parser context
2194 *
2195 * parse an HTML Literal
2196 *
2197 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2198 *
2199 * Returns the SystemLiteral parsed or NULL
2200 */
2201
2202xmlChar *
2203htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2204 const xmlChar *q;
2205 xmlChar *ret = NULL;
2206
2207 if (CUR == '"') {
2208 NEXT;
2209 q = CUR_PTR;
2210 while ((IS_CHAR(CUR)) && (CUR != '"'))
2211 NEXT;
2212 if (!IS_CHAR(CUR)) {
2213 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2214 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2215 ctxt->wellFormed = 0;
2216 } else {
2217 ret = xmlStrndup(q, CUR_PTR - q);
2218 NEXT;
2219 }
2220 } else if (CUR == '\'') {
2221 NEXT;
2222 q = CUR_PTR;
2223 while ((IS_CHAR(CUR)) && (CUR != '\''))
2224 NEXT;
2225 if (!IS_CHAR(CUR)) {
2226 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2227 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2228 ctxt->wellFormed = 0;
2229 } else {
2230 ret = xmlStrndup(q, CUR_PTR - q);
2231 NEXT;
2232 }
2233 } else {
2234 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2235 ctxt->sax->error(ctxt->userData,
2236 "SystemLiteral \" or ' expected\n");
2237 ctxt->wellFormed = 0;
2238 }
2239
2240 return(ret);
2241}
2242
2243/**
2244 * htmlParsePubidLiteral:
2245 * @ctxt: an HTML parser context
2246 *
2247 * parse an HTML public literal
2248 *
2249 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2250 *
2251 * Returns the PubidLiteral parsed or NULL.
2252 */
2253
2254xmlChar *
2255htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2256 const xmlChar *q;
2257 xmlChar *ret = NULL;
2258 /*
2259 * Name ::= (Letter | '_') (NameChar)*
2260 */
2261 if (CUR == '"') {
2262 NEXT;
2263 q = CUR_PTR;
2264 while (IS_PUBIDCHAR(CUR)) NEXT;
2265 if (CUR != '"') {
2266 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2267 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2268 ctxt->wellFormed = 0;
2269 } else {
2270 ret = xmlStrndup(q, CUR_PTR - q);
2271 NEXT;
2272 }
2273 } else if (CUR == '\'') {
2274 NEXT;
2275 q = CUR_PTR;
2276 while ((IS_LETTER(CUR)) && (CUR != '\''))
2277 NEXT;
2278 if (!IS_LETTER(CUR)) {
2279 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2280 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2281 ctxt->wellFormed = 0;
2282 } else {
2283 ret = xmlStrndup(q, CUR_PTR - q);
2284 NEXT;
2285 }
2286 } else {
2287 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2288 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2289 ctxt->wellFormed = 0;
2290 }
2291
2292 return(ret);
2293}
2294
2295/**
2296 * htmlParseScript:
2297 * @ctxt: an HTML parser context
2298 *
2299 * parse the content of an HTML SCRIPT or STYLE element
2300 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2301 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2302 * http://www.w3.org/TR/html4/types.html#type-script
2303 * http://www.w3.org/TR/html4/types.html#h-6.15
2304 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2305 *
2306 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2307 * element and the value of intrinsic event attributes. User agents must
2308 * not evaluate script data as HTML markup but instead must pass it on as
2309 * data to a script engine.
2310 * NOTES:
2311 * - The content is passed like CDATA
2312 * - the attributes for style and scripting "onXXX" are also described
2313 * as CDATA but SGML allows entities references in attributes so their
2314 * processing is identical as other attributes
2315 */
2316void
2317htmlParseScript(htmlParserCtxtPtr ctxt) {
2318 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2319 int nbchar = 0;
2320 xmlChar cur;
2321
2322 SHRINK;
2323 cur = CUR;
2324 while (IS_CHAR(cur)) {
2325 if ((cur == '<') && (NXT(1) == '/')) {
2326 /*
2327 * One should break here, the specification is clear:
2328 * Authors should therefore escape "</" within the content.
2329 * Escape mechanisms are specific to each scripting or
2330 * style sheet language.
2331 */
2332 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2333 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2334 break; /* while */
2335 }
2336 buf[nbchar++] = cur;
2337 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2338 if (ctxt->sax->cdataBlock!= NULL) {
2339 /*
2340 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2341 */
2342 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2343 }
2344 nbchar = 0;
2345 }
2346 NEXT;
2347 cur = CUR;
2348 }
2349 if (!(IS_CHAR(cur))) {
2350 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2351 ctxt->sax->error(ctxt->userData,
2352 "Invalid char in CDATA 0x%X\n", cur);
2353 ctxt->wellFormed = 0;
2354 NEXT;
2355 }
2356
2357 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2358 if (ctxt->sax->cdataBlock!= NULL) {
2359 /*
2360 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2361 */
2362 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2363 }
2364 }
2365}
2366
2367
2368/**
2369 * htmlParseCharData:
2370 * @ctxt: an HTML parser context
2371 * @cdata: int indicating whether we are within a CDATA section
2372 *
2373 * parse a CharData section.
2374 * if we are within a CDATA section ']]>' marks an end of section.
2375 *
2376 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2377 */
2378
2379void
2380htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
2381 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2382 int nbchar = 0;
2383 int cur, l;
2384
2385 SHRINK;
2386 cur = CUR_CHAR(l);
2387 while (((cur != '<') || (ctxt->token == '<')) &&
2388 ((cur != '&') || (ctxt->token == '&')) &&
2389 (IS_CHAR(cur))) {
2390 COPY_BUF(l,buf,nbchar,cur);
2391 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2392 /*
2393 * Ok the segment is to be consumed as chars.
2394 */
2395 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2396 if (areBlanks(ctxt, buf, nbchar)) {
2397 if (ctxt->sax->ignorableWhitespace != NULL)
2398 ctxt->sax->ignorableWhitespace(ctxt->userData,
2399 buf, nbchar);
2400 } else {
2401 htmlCheckParagraph(ctxt);
2402 if (ctxt->sax->characters != NULL)
2403 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2404 }
2405 }
2406 nbchar = 0;
2407 }
2408 NEXTL(l);
2409 cur = CUR_CHAR(l);
2410 }
2411 if (nbchar != 0) {
2412 /*
2413 * Ok the segment is to be consumed as chars.
2414 */
2415 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2416 if (areBlanks(ctxt, buf, nbchar)) {
2417 if (ctxt->sax->ignorableWhitespace != NULL)
2418 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2419 } else {
2420 htmlCheckParagraph(ctxt);
2421 if (ctxt->sax->characters != NULL)
2422 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2423 }
2424 }
2425 }
2426}
2427
2428/**
2429 * htmlParseExternalID:
2430 * @ctxt: an HTML parser context
2431 * @publicID: a xmlChar** receiving PubidLiteral
2432 * @strict: indicate whether we should restrict parsing to only
2433 * production [75], see NOTE below
2434 *
2435 * Parse an External ID or a Public ID
2436 *
2437 * NOTE: Productions [75] and [83] interract badly since [75] can generate
2438 * 'PUBLIC' S PubidLiteral S SystemLiteral
2439 *
2440 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2441 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2442 *
2443 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2444 *
2445 * Returns the function returns SystemLiteral and in the second
2446 * case publicID receives PubidLiteral, is strict is off
2447 * it is possible to return NULL and have publicID set.
2448 */
2449
2450xmlChar *
2451htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2452 xmlChar *URI = NULL;
2453
2454 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2455 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2456 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2457 SKIP(6);
2458 if (!IS_BLANK(CUR)) {
2459 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2460 ctxt->sax->error(ctxt->userData,
2461 "Space required after 'SYSTEM'\n");
2462 ctxt->wellFormed = 0;
2463 }
2464 SKIP_BLANKS;
2465 URI = htmlParseSystemLiteral(ctxt);
2466 if (URI == NULL) {
2467 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2468 ctxt->sax->error(ctxt->userData,
2469 "htmlParseExternalID: SYSTEM, no URI\n");
2470 ctxt->wellFormed = 0;
2471 }
2472 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2473 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2474 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2475 SKIP(6);
2476 if (!IS_BLANK(CUR)) {
2477 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2478 ctxt->sax->error(ctxt->userData,
2479 "Space required after 'PUBLIC'\n");
2480 ctxt->wellFormed = 0;
2481 }
2482 SKIP_BLANKS;
2483 *publicID = htmlParsePubidLiteral(ctxt);
2484 if (*publicID == NULL) {
2485 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2486 ctxt->sax->error(ctxt->userData,
2487 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2488 ctxt->wellFormed = 0;
2489 }
2490 SKIP_BLANKS;
2491 if ((CUR == '"') || (CUR == '\'')) {
2492 URI = htmlParseSystemLiteral(ctxt);
2493 }
2494 }
2495 return(URI);
2496}
2497
2498/**
2499 * htmlParseComment:
2500 * @ctxt: an HTML parser context
2501 *
2502 * Parse an XML (SGML) comment <!-- .... -->
2503 *
2504 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2505 */
2506void
2507htmlParseComment(htmlParserCtxtPtr ctxt) {
2508 xmlChar *buf = NULL;
2509 int len;
2510 int size = HTML_PARSER_BUFFER_SIZE;
2511 int q, ql;
2512 int r, rl;
2513 int cur, l;
2514 xmlParserInputState state;
2515
2516 /*
2517 * Check that there is a comment right here.
2518 */
2519 if ((RAW != '<') || (NXT(1) != '!') ||
2520 (NXT(2) != '-') || (NXT(3) != '-')) return;
2521
2522 state = ctxt->instate;
2523 ctxt->instate = XML_PARSER_COMMENT;
2524 SHRINK;
2525 SKIP(4);
2526 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2527 if (buf == NULL) {
2528 xmlGenericError(xmlGenericErrorContext,
2529 "malloc of %d byte failed\n", size);
2530 ctxt->instate = state;
2531 return;
2532 }
2533 q = CUR_CHAR(ql);
2534 NEXTL(ql);
2535 r = CUR_CHAR(rl);
2536 NEXTL(rl);
2537 cur = CUR_CHAR(l);
2538 len = 0;
2539 while (IS_CHAR(cur) &&
2540 ((cur != '>') ||
2541 (r != '-') || (q != '-'))) {
2542 if (len + 5 >= size) {
2543 size *= 2;
2544 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2545 if (buf == NULL) {
2546 xmlGenericError(xmlGenericErrorContext,
2547 "realloc of %d byte failed\n", size);
2548 ctxt->instate = state;
2549 return;
2550 }
2551 }
2552 COPY_BUF(ql,buf,len,q);
2553 q = r;
2554 ql = rl;
2555 r = cur;
2556 rl = l;
2557 NEXTL(l);
2558 cur = CUR_CHAR(l);
2559 if (cur == 0) {
2560 SHRINK;
2561 GROW;
2562 cur = CUR_CHAR(l);
2563 }
2564 }
2565 buf[len] = 0;
2566 if (!IS_CHAR(cur)) {
2567 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2568 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2569 ctxt->sax->error(ctxt->userData,
2570 "Comment not terminated \n<!--%.50s\n", buf);
2571 ctxt->wellFormed = 0;
2572 xmlFree(buf);
2573 } else {
2574 NEXT;
2575 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2576 (!ctxt->disableSAX))
2577 ctxt->sax->comment(ctxt->userData, buf);
2578 xmlFree(buf);
2579 }
2580 ctxt->instate = state;
2581}
2582
2583/**
2584 * htmlParseCharRef:
2585 * @ctxt: an HTML parser context
2586 *
2587 * parse Reference declarations
2588 *
2589 * [66] CharRef ::= '&#' [0-9]+ ';' |
2590 * '&#x' [0-9a-fA-F]+ ';'
2591 *
2592 * Returns the value parsed (as an int)
2593 */
2594int
2595htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2596 int val = 0;
2597
2598 if ((CUR == '&') && (NXT(1) == '#') &&
2599 (NXT(2) == 'x')) {
2600 SKIP(3);
2601 while (CUR != ';') {
2602 if ((CUR >= '0') && (CUR <= '9'))
2603 val = val * 16 + (CUR - '0');
2604 else if ((CUR >= 'a') && (CUR <= 'f'))
2605 val = val * 16 + (CUR - 'a') + 10;
2606 else if ((CUR >= 'A') && (CUR <= 'F'))
2607 val = val * 16 + (CUR - 'A') + 10;
2608 else {
2609 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2610 ctxt->sax->error(ctxt->userData,
2611 "htmlParseCharRef: invalid hexadecimal value\n");
2612 ctxt->wellFormed = 0;
2613 return(0);
2614 }
2615 NEXT;
2616 }
2617 if (CUR == ';')
2618 NEXT;
2619 } else if ((CUR == '&') && (NXT(1) == '#')) {
2620 SKIP(2);
2621 while (CUR != ';') {
2622 if ((CUR >= '0') && (CUR <= '9'))
2623 val = val * 10 + (CUR - '0');
2624 else {
2625 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2626 ctxt->sax->error(ctxt->userData,
2627 "htmlParseCharRef: invalid decimal value\n");
2628 ctxt->wellFormed = 0;
2629 return(0);
2630 }
2631 NEXT;
2632 }
2633 if (CUR == ';')
2634 NEXT;
2635 } else {
2636 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2637 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2638 ctxt->wellFormed = 0;
2639 }
2640 /*
2641 * Check the value IS_CHAR ...
2642 */
2643 if (IS_CHAR(val)) {
2644 return(val);
2645 } else {
2646 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2647 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2648 val);
2649 ctxt->wellFormed = 0;
2650 }
2651 return(0);
2652}
2653
2654
2655/**
2656 * htmlParseDocTypeDecl :
2657 * @ctxt: an HTML parser context
2658 *
2659 * parse a DOCTYPE declaration
2660 *
2661 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2662 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2663 */
2664
2665void
2666htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2667 xmlChar *name;
2668 xmlChar *ExternalID = NULL;
2669 xmlChar *URI = NULL;
2670
2671 /*
2672 * We know that '<!DOCTYPE' has been detected.
2673 */
2674 SKIP(9);
2675
2676 SKIP_BLANKS;
2677
2678 /*
2679 * Parse the DOCTYPE name.
2680 */
2681 name = htmlParseName(ctxt);
2682 if (name == NULL) {
2683 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2684 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2685 ctxt->wellFormed = 0;
2686 }
2687 /*
2688 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2689 */
2690
2691 SKIP_BLANKS;
2692
2693 /*
2694 * Check for SystemID and ExternalID
2695 */
2696 URI = htmlParseExternalID(ctxt, &ExternalID, 0);
2697 SKIP_BLANKS;
2698
2699 /*
2700 * We should be at the end of the DOCTYPE declaration.
2701 */
2702 if (CUR != '>') {
2703 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2704 ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2705 ctxt->wellFormed = 0;
2706 /* We shouldn't try to resynchronize ... */
2707 }
2708 NEXT;
2709
2710 /*
2711 * Create or update the document accordingly to the DOCTYPE
2712 */
2713 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2714 (!ctxt->disableSAX))
2715 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2716
2717 /*
2718 * Cleanup, since we don't use all those identifiers
2719 */
2720 if (URI != NULL) xmlFree(URI);
2721 if (ExternalID != NULL) xmlFree(ExternalID);
2722 if (name != NULL) xmlFree(name);
2723}
2724
2725/**
2726 * htmlParseAttribute:
2727 * @ctxt: an HTML parser context
2728 * @value: a xmlChar ** used to store the value of the attribute
2729 *
2730 * parse an attribute
2731 *
2732 * [41] Attribute ::= Name Eq AttValue
2733 *
2734 * [25] Eq ::= S? '=' S?
2735 *
2736 * With namespace:
2737 *
2738 * [NS 11] Attribute ::= QName Eq AttValue
2739 *
2740 * Also the case QName == xmlns:??? is handled independently as a namespace
2741 * definition.
2742 *
2743 * Returns the attribute name, and the value in *value.
2744 */
2745
2746xmlChar *
2747htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2748 xmlChar *name, *val = NULL;
2749
2750 *value = NULL;
2751 name = htmlParseHTMLName(ctxt);
2752 if (name == NULL) {
2753 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2754 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2755 ctxt->wellFormed = 0;
2756 return(NULL);
2757 }
2758
2759 /*
2760 * read the value
2761 */
2762 SKIP_BLANKS;
2763 if (CUR == '=') {
2764 NEXT;
2765 SKIP_BLANKS;
2766 val = htmlParseAttValue(ctxt);
2767 /******
2768 } else {
2769 * TODO : some attribute must have values, some may not
2770 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2771 ctxt->sax->warning(ctxt->userData,
2772 "No value for attribute %s\n", name); */
2773 }
2774
2775 *value = val;
2776 return(name);
2777}
2778
2779/**
2780 * htmlCheckEncoding:
2781 * @ctxt: an HTML parser context
2782 * @attvalue: the attribute value
2783 *
2784 * Checks an http-equiv attribute from a Meta tag to detect
2785 * the encoding
2786 * If a new encoding is detected the parser is switched to decode
2787 * it and pass UTF8
2788 */
2789void
2790htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2791 const xmlChar *encoding;
2792
2793 if ((ctxt == NULL) || (attvalue == NULL))
2794 return;
2795
2796 /* do not change encoding */
2797 if (ctxt->input->encoding != NULL)
2798 return;
2799
2800 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2801 if (encoding != NULL) {
2802 encoding += 8;
2803 } else {
2804 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2805 if (encoding != NULL)
2806 encoding += 9;
2807 }
2808 if (encoding != NULL) {
2809 xmlCharEncoding enc;
2810 xmlCharEncodingHandlerPtr handler;
2811
2812 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2813
2814 if (ctxt->input->encoding != NULL)
2815 xmlFree((xmlChar *) ctxt->input->encoding);
2816 ctxt->input->encoding = xmlStrdup(encoding);
2817
2818 enc = xmlParseCharEncoding((const char *) encoding);
2819 /*
2820 * registered set of known encodings
2821 */
2822 if (enc != XML_CHAR_ENCODING_ERROR) {
2823 xmlSwitchEncoding(ctxt, enc);
2824 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2825 } else {
2826 /*
2827 * fallback for unknown encodings
2828 */
2829 handler = xmlFindCharEncodingHandler((const char *) encoding);
2830 if (handler != NULL) {
2831 xmlSwitchToEncoding(ctxt, handler);
2832 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2833 } else {
2834 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2835 }
2836 }
2837
2838 if ((ctxt->input->buf != NULL) &&
2839 (ctxt->input->buf->encoder != NULL) &&
2840 (ctxt->input->buf->raw != NULL) &&
2841 (ctxt->input->buf->buffer != NULL)) {
2842 int nbchars;
2843 int processed;
2844
2845 /*
2846 * convert as much as possible to the parser reading buffer.
2847 */
2848 processed = ctxt->input->cur - ctxt->input->base;
2849 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2850 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2851 ctxt->input->buf->buffer,
2852 ctxt->input->buf->raw);
2853 if (nbchars < 0) {
2854 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2855 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2856 ctxt->sax->error(ctxt->userData,
2857 "htmlCheckEncoding: encoder error\n");
2858 }
2859 ctxt->input->base =
2860 ctxt->input->cur = ctxt->input->buf->buffer->content;
2861 }
2862 }
2863}
2864
2865/**
2866 * htmlCheckMeta:
2867 * @ctxt: an HTML parser context
2868 * @atts: the attributes values
2869 *
2870 * Checks an attributes from a Meta tag
2871 */
2872void
2873htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2874 int i;
2875 const xmlChar *att, *value;
2876 int http = 0;
2877 const xmlChar *content = NULL;
2878
2879 if ((ctxt == NULL) || (atts == NULL))
2880 return;
2881
2882 i = 0;
2883 att = atts[i++];
2884 while (att != NULL) {
2885 value = atts[i++];
2886 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2887 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2888 http = 1;
2889 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2890 content = value;
2891 att = atts[i++];
2892 }
2893 if ((http) && (content != NULL))
2894 htmlCheckEncoding(ctxt, content);
2895
2896}
2897
2898/**
2899 * htmlParseStartTag:
2900 * @ctxt: an HTML parser context
2901 *
2902 * parse a start of tag either for rule element or
2903 * EmptyElement. In both case we don't parse the tag closing chars.
2904 *
2905 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2906 *
2907 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2908 *
2909 * With namespace:
2910 *
2911 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2912 *
2913 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2914 *
2915 */
2916
2917void
2918htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2919 xmlChar *name;
2920 xmlChar *attname;
2921 xmlChar *attvalue;
2922 const xmlChar **atts = NULL;
2923 int nbatts = 0;
2924 int maxatts = 0;
2925 int meta = 0;
2926 int i;
2927
2928 if (CUR != '<') return;
2929 NEXT;
2930
2931 GROW;
2932 name = htmlParseHTMLName(ctxt);
2933 if (name == NULL) {
2934 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2935 ctxt->sax->error(ctxt->userData,
2936 "htmlParseStartTag: invalid element name\n");
2937 ctxt->wellFormed = 0;
2938 /* Dump the bogus tag like browsers do */
2939 while ((IS_CHAR(CUR)) && (CUR != '>'))
2940 NEXT;
2941 return;
2942 }
2943 if (xmlStrEqual(name, BAD_CAST"meta"))
2944 meta = 1;
2945
2946 /*
2947 * Check for auto-closure of HTML elements.
2948 */
2949 htmlAutoClose(ctxt, name);
2950
2951 /*
2952 * Check for implied HTML elements.
2953 */
2954 htmlCheckImplied(ctxt, name);
2955
2956 /*
2957 * Avoid html at any level > 0, head at any level != 1
2958 * or any attempt to recurse body
2959 */
2960 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2961 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2962 ctxt->sax->error(ctxt->userData,
2963 "htmlParseStartTag: misplaced <html> tag\n");
2964 ctxt->wellFormed = 0;
2965 xmlFree(name);
2966 return;
2967 }
2968 if ((ctxt->nameNr != 1) &&
2969 (xmlStrEqual(name, BAD_CAST"head"))) {
2970 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2971 ctxt->sax->error(ctxt->userData,
2972 "htmlParseStartTag: misplaced <head> tag\n");
2973 ctxt->wellFormed = 0;
2974 xmlFree(name);
2975 return;
2976 }
2977 if (xmlStrEqual(name, BAD_CAST"body")) {
2978 int i;
2979 for (i = 0;i < ctxt->nameNr;i++) {
2980 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
2981 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2982 ctxt->sax->error(ctxt->userData,
2983 "htmlParseStartTag: misplaced <body> tag\n");
2984 ctxt->wellFormed = 0;
2985 xmlFree(name);
2986 return;
2987 }
2988 }
2989 }
2990
2991 /*
2992 * Now parse the attributes, it ends up with the ending
2993 *
2994 * (S Attribute)* S?
2995 */
2996 SKIP_BLANKS;
2997 while ((IS_CHAR(CUR)) &&
2998 (CUR != '>') &&
2999 ((CUR != '/') || (NXT(1) != '>'))) {
3000 long cons = ctxt->nbChars;
3001
3002 GROW;
3003 attname = htmlParseAttribute(ctxt, &attvalue);
3004 if (attname != NULL) {
3005
3006 /*
3007 * Well formedness requires at most one declaration of an attribute
3008 */
3009 for (i = 0; i < nbatts;i += 2) {
3010 if (xmlStrEqual(atts[i], attname)) {
3011 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3012 ctxt->sax->error(ctxt->userData,
3013 "Attribute %s redefined\n",
3014 attname);
3015 ctxt->wellFormed = 0;
3016 xmlFree(attname);
3017 if (attvalue != NULL)
3018 xmlFree(attvalue);
3019 goto failed;
3020 }
3021 }
3022
3023 /*
3024 * Add the pair to atts
3025 */
3026 if (atts == NULL) {
3027 maxatts = 10;
3028 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3029 if (atts == NULL) {
3030 xmlGenericError(xmlGenericErrorContext,
3031 "malloc of %ld byte failed\n",
3032 maxatts * (long)sizeof(xmlChar *));
3033 if (name != NULL) xmlFree(name);
3034 return;
3035 }
3036 } else if (nbatts + 4 > maxatts) {
3037 maxatts *= 2;
3038 atts = (const xmlChar **) xmlRealloc((void *) atts,
3039 maxatts * sizeof(xmlChar *));
3040 if (atts == NULL) {
3041 xmlGenericError(xmlGenericErrorContext,
3042 "realloc of %ld byte failed\n",
3043 maxatts * (long)sizeof(xmlChar *));
3044 if (name != NULL) xmlFree(name);
3045 return;
3046 }
3047 }
3048 atts[nbatts++] = attname;
3049 atts[nbatts++] = attvalue;
3050 atts[nbatts] = NULL;
3051 atts[nbatts + 1] = NULL;
3052 }
3053 else {
3054 /* Dump the bogus attribute string up to the next blank or
3055 * the end of the tag. */
3056 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3057 && ((CUR != '/') || (NXT(1) != '>')))
3058 NEXT;
3059 }
3060
3061failed:
3062 SKIP_BLANKS;
3063 if (cons == ctxt->nbChars) {
3064 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3065 ctxt->sax->error(ctxt->userData,
3066 "htmlParseStartTag: problem parsing attributes\n");
3067 ctxt->wellFormed = 0;
3068 break;
3069 }
3070 }
3071
3072 /*
3073 * Handle specific association to the META tag
3074 */
3075 if (meta)
3076 htmlCheckMeta(ctxt, atts);
3077
3078 /*
3079 * SAX: Start of Element !
3080 */
3081 htmlnamePush(ctxt, xmlStrdup(name));
3082#ifdef DEBUG
3083 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3084#endif
3085 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3086 ctxt->sax->startElement(ctxt->userData, name, atts);
3087
3088 if (atts != NULL) {
3089 for (i = 0;i < nbatts;i++) {
3090 if (atts[i] != NULL)
3091 xmlFree((xmlChar *) atts[i]);
3092 }
3093 xmlFree((void *) atts);
3094 }
3095 if (name != NULL) xmlFree(name);
3096}
3097
3098/**
3099 * htmlParseEndTag:
3100 * @ctxt: an HTML parser context
3101 *
3102 * parse an end of tag
3103 *
3104 * [42] ETag ::= '</' Name S? '>'
3105 *
3106 * With namespace
3107 *
3108 * [NS 9] ETag ::= '</' QName S? '>'
3109 */
3110
3111void
3112htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3113 xmlChar *name;
3114 xmlChar *oldname;
3115 int i;
3116
3117 if ((CUR != '<') || (NXT(1) != '/')) {
3118 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3119 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3120 ctxt->wellFormed = 0;
3121 return;
3122 }
3123 SKIP(2);
3124
3125 name = htmlParseHTMLName(ctxt);
3126 if (name == NULL) return;
3127
3128 /*
3129 * We should definitely be at the ending "S? '>'" part
3130 */
3131 SKIP_BLANKS;
3132 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3133 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3134 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3135 ctxt->wellFormed = 0;
3136 } else
3137 NEXT;
3138
3139 /*
3140 * If the name read is not one of the element in the parsing stack
3141 * then return, it's just an error.
3142 */
3143 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3144 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3145 }
3146 if (i < 0) {
3147 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3148 ctxt->sax->error(ctxt->userData,
3149 "Unexpected end tag : %s\n", name);
3150 xmlFree(name);
3151 ctxt->wellFormed = 0;
3152 return;
3153 }
3154
3155
3156 /*
3157 * Check for auto-closure of HTML elements.
3158 */
3159
3160 htmlAutoCloseOnClose(ctxt, name);
3161
3162 /*
3163 * Well formedness constraints, opening and closing must match.
3164 * With the exception that the autoclose may have popped stuff out
3165 * of the stack.
3166 */
3167 if (!xmlStrEqual(name, ctxt->name)) {
3168#ifdef DEBUG
3169 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3170#endif
3171 if ((ctxt->name != NULL) &&
3172 (!xmlStrEqual(ctxt->name, name))) {
3173 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3174 ctxt->sax->error(ctxt->userData,
3175 "Opening and ending tag mismatch: %s and %s\n",
3176 name, ctxt->name);
3177 ctxt->wellFormed = 0;
3178 }
3179 }
3180
3181 /*
3182 * SAX: End of Tag
3183 */
3184 oldname = ctxt->name;
3185 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3186 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3187 ctxt->sax->endElement(ctxt->userData, name);
3188 oldname = htmlnamePop(ctxt);
3189 if (oldname != NULL) {
3190#ifdef DEBUG
3191 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3192#endif
3193 xmlFree(oldname);
3194#ifdef DEBUG
3195 } else {
3196 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3197#endif
3198 }
3199 }
3200
3201 if (name != NULL)
3202 xmlFree(name);
3203
3204 return;
3205}
3206
3207
3208/**
3209 * htmlParseReference:
3210 * @ctxt: an HTML parser context
3211 *
3212 * parse and handle entity references in content,
3213 * this will end-up in a call to character() since this is either a
3214 * CharRef, or a predefined entity.
3215 */
3216void
3217htmlParseReference(htmlParserCtxtPtr ctxt) {
3218 htmlEntityDescPtr ent;
3219 xmlChar out[6];
3220 xmlChar *name;
3221 if (CUR != '&') return;
3222
3223 if (NXT(1) == '#') {
3224 unsigned int c;
3225 int bits, i = 0;
3226
3227 c = htmlParseCharRef(ctxt);
3228 if (c == 0)
3229 return;
3230
3231 if (c < 0x80) { out[i++]= c; bits= -6; }
3232 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3233 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3234 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3235
3236 for ( ; bits >= 0; bits-= 6) {
3237 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3238 }
3239 out[i] = 0;
3240
3241 htmlCheckParagraph(ctxt);
3242 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3243 ctxt->sax->characters(ctxt->userData, out, i);
3244 } else {
3245 ent = htmlParseEntityRef(ctxt, &name);
3246 if (name == NULL) {
3247 htmlCheckParagraph(ctxt);
3248 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3249 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3250 return;
3251 }
3252 if ((ent == NULL) || (ent->value <= 0)) {
3253 htmlCheckParagraph(ctxt);
3254 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3255 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3256 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3257 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3258 }
3259 } else {
3260 unsigned int c;
3261 int bits, i = 0;
3262
3263 c = ent->value;
3264 if (c < 0x80)
3265 { out[i++]= c; bits= -6; }
3266 else if (c < 0x800)
3267 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3268 else if (c < 0x10000)
3269 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3270 else
3271 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3272
3273 for ( ; bits >= 0; bits-= 6) {
3274 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3275 }
3276 out[i] = 0;
3277
3278 htmlCheckParagraph(ctxt);
3279 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3280 ctxt->sax->characters(ctxt->userData, out, i);
3281 }
3282 xmlFree(name);
3283 }
3284}
3285
/**
 * htmlParseContent:
 * @ctxt:  an HTML parser context
 *
 * Parse a content: comment, sub-element, reference or text.
 *
 * The name on top of the name stack and the stack depth are recorded on
 * entry.  The function then loops over the input and returns when an end
 * tag has been handled, when the recorded element is no longer the
 * current one, when the end of the input is reached, or when an
 * iteration fails to consume any input.
 */

void
htmlParseContent(htmlParserCtxtPtr ctxt) {
    xmlChar *currentNode;
    int depth;

    /* Private copy of the current element name, freed on every exit. */
    currentNode = xmlStrdup(ctxt->name);
    depth = ctxt->nameNr;
    while (1) {
        /* Snapshot used further down to detect a lack of progress. */
        long cons = ctxt->nbChars;

        GROW;
        /*
         * Our tag or one of it's parent or children is ending.
         */
        if ((CUR == '<') && (NXT(1) == '/')) {
            htmlParseEndTag(ctxt);
            if (currentNode != NULL) xmlFree(currentNode);
            return;
        }

        /*
         * Has this node been popped out during parsing of
         * the next element
         */
        if ((!xmlStrEqual(currentNode, ctxt->name)) &&
            (depth >= ctxt->nameNr)) {
            if (currentNode != NULL) xmlFree(currentNode);
            return;
        }

        if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
            (xmlStrEqual(currentNode, BAD_CAST"style")))) {
            /*
             * Handle SCRIPT/STYLE separately
             */
            htmlParseScript(ctxt);
        } else {
            /*
             * Sometimes DOCTYPE arrives in the middle of the document
             */
            if ((CUR == '<') && (NXT(1) == '!') &&
                (UPP(2) == 'D') && (UPP(3) == 'O') &&
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                (UPP(8) == 'E')) {
                if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                    ctxt->sax->error(ctxt->userData,
                                 "Misplaced DOCTYPE declaration\n");
                ctxt->wellFormed = 0;
                htmlParseDocTypeDecl(ctxt);
            }

            /*
             * First case : a comment
             */
            if ((CUR == '<') && (NXT(1) == '!') &&
                (NXT(2) == '-') && (NXT(3) == '-')) {
                htmlParseComment(ctxt);
            }

            /*
             * Second case : a sub-element.
             */
            else if (CUR == '<') {
                htmlParseElement(ctxt);
            }

            /*
             * Third case : a reference. If if has not been resolved,
             * parsing returns it's Name, create the node
             */
            else if (CUR == '&') {
                htmlParseReference(ctxt);
            }

            /*
             * Fourth : end of the resource
             */
            else if (CUR == 0) {
                /*
                 * Leave the loop once htmlAutoClose() no longer changes
                 * the depth of the node stack.
                 */
                int level = ctxt->nodeNr;
                htmlAutoClose(ctxt, NULL);
                if (level == ctxt->nodeNr)
                    break;
            }

            /*
             * Last case, text. Note that References are handled directly.
             */
            else {
                htmlParseCharData(ctxt, 0);
            }

            /*
             * No character was consumed by this iteration: stop here
             * instead of looping forever.  Note that this guard only
             * covers the non SCRIPT/STYLE branch.
             */
            if (cons == ctxt->nbChars) {
                if (ctxt->node != NULL) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                        ctxt->sax->error(ctxt->userData,
                             "detected an error in element content\n");
                    ctxt->wellFormed = 0;
                }
                break;
            }
        }
        GROW;
    }
    if (currentNode != NULL) xmlFree(currentNode);
}
3401
/**
 * htmlParseElement:
 * @ctxt:  an HTML parser context
 *
 * parse an HTML element, this is highly recursive
 *
 * [39] element ::= EmptyElemTag | STag content ETag
 *
 * [41] Attribute ::= Name Eq AttValue
 *
 * The start tag is parsed first; when it did not result in a new name
 * on the name stack the function only consumes a pending '>' and
 * returns.  Otherwise the element is closed at once when it is written
 * as '<x/>' or when its htmlTagLookup() description marks it as empty,
 * else its content is parsed until the name stack drops below the depth
 * reached after the start tag or the input runs out.
 */

void
htmlParseElement(htmlParserCtxtPtr ctxt) {
    xmlChar *name;
    xmlChar *currentNode = NULL;
    htmlElemDescPtr info;
    /* Only filled in when ctxt->record_info is set. */
    htmlParserNodeInfo node_info;
    xmlChar *oldname;
    int depth = ctxt->nameNr;

    /* Capture start position */
    if (ctxt->record_info) {
        node_info.begin_pos = ctxt->input->consumed +
                          (CUR_PTR - ctxt->input->base);
        node_info.begin_line = ctxt->input->line;
    }

    /*
     * Keep a copy of the previous top of the name stack so that a start
     * tag which pushed nothing can be detected afterwards.
     */
    oldname = xmlStrdup(ctxt->name);
    htmlParseStartTag(ctxt);
    /* name aliases the top of the name stack, it is not a copy. */
    name = ctxt->name;
#ifdef DEBUG
    if (oldname == NULL)
        xmlGenericError(xmlGenericErrorContext,
                "Start of element %s\n", name);
    else if (name == NULL)
        xmlGenericError(xmlGenericErrorContext,
                "Start of element failed, was %s\n", oldname);
    else
        xmlGenericError(xmlGenericErrorContext,
                "Start of element %s, was %s\n", name, oldname);
#endif
    /*
     * Same depth and same name as before, or an empty stack: the start
     * tag was not pushed.  Skip a pending '>' and give up on this tag.
     */
    if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
        (name == NULL)) {
        if (CUR == '>')
            NEXT;
        if (oldname != NULL)
            xmlFree(oldname);
        return;
    }
    if (oldname != NULL)
        xmlFree(oldname);

    /*
     * Lookup the info for that element.
     */
    info = htmlTagLookup(name);
    if (info == NULL) {
        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
            ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
                             name);
        ctxt->wellFormed = 0;
    } else if (info->depr) {
/***************************
        if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
            ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
                               name);
 ***************************/
    }

    /*
     * Check for an Empty Element labelled the XML/SGML way
     */
    if ((CUR == '/') && (NXT(1) == '>')) {
        SKIP(2);
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
            ctxt->sax->endElement(ctxt->userData, name);
        oldname = htmlnamePop(ctxt);
#ifdef DEBUG
        xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
#endif
        if (oldname != NULL)
            xmlFree(oldname);
        return;
    }

    if (CUR == '>') {
        NEXT;
    } else {
        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
            ctxt->sax->error(ctxt->userData,
                             "Couldn't find end of Start Tag %s\n",
                             name);
        ctxt->wellFormed = 0;

        /*
         * end of parsing of this node.
         */
        if (xmlStrEqual(name, ctxt->name)) {
            nodePop(ctxt);
            oldname = htmlnamePop(ctxt);
#ifdef DEBUG
            xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
#endif
            if (oldname != NULL)
                xmlFree(oldname);
        }

        /*
         * Capture end position and add node
         */
        /*
         * NOTE(review): currentNode is still NULL on this path, it is
         * only assigned further down, so this branch cannot be taken.
         */
        if ( currentNode != NULL && ctxt->record_info ) {
            node_info.end_pos = ctxt->input->consumed +
                               (CUR_PTR - ctxt->input->base);
            node_info.end_line = ctxt->input->line;
            node_info.node = ctxt->node;
            xmlParserAddNodeInfo(ctxt, &node_info);
        }
        return;
    }

    /*
     * Check for an Empty Element from DTD definition
     */
    if ((info != NULL) && (info->empty)) {
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
            ctxt->sax->endElement(ctxt->userData, name);
        oldname = htmlnamePop(ctxt);
#ifdef DEBUG
        xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
#endif
        if (oldname != NULL)
            xmlFree(oldname);
        return;
    }

    /*
     * Parse the content of the element:
     */
    currentNode = xmlStrdup(ctxt->name);
    /* From here on depth is the stack depth with this element open. */
    depth = ctxt->nameNr;
    while (IS_CHAR(CUR)) {
        htmlParseContent(ctxt);
        /* The element itself has been popped: its content is over. */
        if (ctxt->nameNr < depth) break;
    }

    if (!IS_CHAR(CUR)) {
        /************
        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
            ctxt->sax->error(ctxt->userData,
                 "Premature end of data in tag %s\n", currentNode);
        ctxt->wellFormed = 0;
         *************/

        /*
         * end of parsing of this node.
         */
        nodePop(ctxt);
        oldname = htmlnamePop(ctxt);
#ifdef DEBUG
        xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
#endif
        if (oldname != NULL)
            xmlFree(oldname);
        if (currentNode != NULL)
            xmlFree(currentNode);
        return;
    }

    /*
     * Capture end position and add node
     */
    if ( currentNode != NULL && ctxt->record_info ) {
        node_info.end_pos = ctxt->input->consumed +
                           (CUR_PTR - ctxt->input->base);
        node_info.end_line = ctxt->input->line;
        node_info.node = ctxt->node;
        xmlParserAddNodeInfo(ctxt, &node_info);
    }
    if (currentNode != NULL)
        xmlFree(currentNode);
}
3583
3584/**
3585 * htmlParseDocument :
3586 * @ctxt: an HTML parser context
3587 *
3588 * parse an HTML document (and build a tree if using the standard SAX
3589 * interface).
3590 *
3591 * Returns 0, -1 in case of error. the parser context is augmented
3592 * as a result of the parsing.
3593 */
3594
3595int
3596htmlParseDocument(htmlParserCtxtPtr ctxt) {
3597 xmlDtdPtr dtd;
3598
3599 htmlDefaultSAXHandlerInit();
3600 ctxt->html = 1;
3601
3602 GROW;
3603 /*
3604 * SAX: beginning of the document processing.
3605 */
3606 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3607 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3608
3609 /*
3610 * Wipe out everything which is before the first '<'
3611 */
3612 SKIP_BLANKS;
3613 if (CUR == 0) {
3614 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3615 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3616 ctxt->wellFormed = 0;
3617 }
3618
3619 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3620 ctxt->sax->startDocument(ctxt->userData);
3621
3622
3623 /*
3624 * Parse possible comments before any content
3625 */
3626 while ((CUR == '<') && (NXT(1) == '!') &&
3627 (NXT(2) == '-') && (NXT(3) == '-')) {
3628 htmlParseComment(ctxt);
3629 SKIP_BLANKS;
3630 }
3631
3632
3633 /*
3634 * Then possibly doc type declaration(s) and more Misc
3635 * (doctypedecl Misc*)?
3636 */
3637 if ((CUR == '<') && (NXT(1) == '!') &&
3638 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3639 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3640 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3641 (UPP(8) == 'E')) {
3642 htmlParseDocTypeDecl(ctxt);
3643 }
3644 SKIP_BLANKS;
3645
3646 /*
3647 * Parse possible comments before any content
3648 */
3649 while ((CUR == '<') && (NXT(1) == '!') &&
3650 (NXT(2) == '-') && (NXT(3) == '-')) {
3651 htmlParseComment(ctxt);
3652 SKIP_BLANKS;
3653 }
3654
3655 /*
3656 * Time to start parsing the tree itself
3657 */
3658 htmlParseContent(ctxt);
3659
3660 /*
3661 * autoclose
3662 */
3663 if (CUR == 0)
3664 htmlAutoClose(ctxt, NULL);
3665
3666
3667 /*
3668 * SAX: end of the document processing.
3669 */
3670 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3671 ctxt->sax->endDocument(ctxt->userData);
3672
3673 if (ctxt->myDoc != NULL) {
3674 dtd = xmlGetIntSubset(ctxt->myDoc);
3675 if (dtd == NULL)
3676 ctxt->myDoc->intSubset =
3677 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3678 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3679 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3680 }
3681 if (! ctxt->wellFormed) return(-1);
3682 return(0);
3683}
3684
3685
3686/************************************************************************
3687 * *
3688 * Parser contexts handling *
3689 * *
3690 ************************************************************************/
3691
3692/**
3693 * xmlInitParserCtxt:
3694 * @ctxt: an HTML parser context
3695 *
3696 * Initialize a parser context
3697 */
3698
3699void
3700htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3701{
3702 htmlSAXHandler *sax;
3703
3704 if (ctxt == NULL) return;
3705 memset(ctxt, 0, sizeof(htmlParserCtxt));
3706
3707 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3708 if (sax == NULL) {
3709 xmlGenericError(xmlGenericErrorContext,
3710 "htmlInitParserCtxt: out of memory\n");
3711 }
3712 else
3713 memset(sax, 0, sizeof(htmlSAXHandler));
3714
3715 /* Allocate the Input stack */
3716 ctxt->inputTab = (htmlParserInputPtr *)
3717 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3718 if (ctxt->inputTab == NULL) {
3719 xmlGenericError(xmlGenericErrorContext,
3720 "htmlInitParserCtxt: out of memory\n");
3721 ctxt->inputNr = 0;
3722 ctxt->inputMax = 0;
3723 ctxt->input = NULL;
3724 return;
3725 }
3726 ctxt->inputNr = 0;
3727 ctxt->inputMax = 5;
3728 ctxt->input = NULL;
3729 ctxt->version = NULL;
3730 ctxt->encoding = NULL;
3731 ctxt->standalone = -1;
3732 ctxt->instate = XML_PARSER_START;
3733
3734 /* Allocate the Node stack */
3735 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3736 if (ctxt->nodeTab == NULL) {
3737 xmlGenericError(xmlGenericErrorContext,
3738 "htmlInitParserCtxt: out of memory\n");
3739 ctxt->nodeNr = 0;
3740 ctxt->nodeMax = 0;
3741 ctxt->node = NULL;
3742 ctxt->inputNr = 0;
3743 ctxt->inputMax = 0;
3744 ctxt->input = NULL;
3745 return;
3746 }
3747 ctxt->nodeNr = 0;
3748 ctxt->nodeMax = 10;
3749 ctxt->node = NULL;
3750
3751 /* Allocate the Name stack */
3752 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3753 if (ctxt->nameTab == NULL) {
3754 xmlGenericError(xmlGenericErrorContext,
3755 "htmlInitParserCtxt: out of memory\n");
3756 ctxt->nameNr = 0;
3757 ctxt->nameMax = 10;
3758 ctxt->name = NULL;
3759 ctxt->nodeNr = 0;
3760 ctxt->nodeMax = 0;
3761 ctxt->node = NULL;
3762 ctxt->inputNr = 0;
3763 ctxt->inputMax = 0;
3764 ctxt->input = NULL;
3765 return;
3766 }
3767 ctxt->nameNr = 0;
3768 ctxt->nameMax = 10;
3769 ctxt->name = NULL;
3770
3771 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3772 else {
3773 ctxt->sax = sax;
3774 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3775 }
3776 ctxt->userData = ctxt;
3777 ctxt->myDoc = NULL;
3778 ctxt->wellFormed = 1;
3779 ctxt->replaceEntities = 0;
3780 ctxt->html = 1;
3781 ctxt->record_info = 0;
3782 ctxt->validate = 0;
3783 ctxt->nbChars = 0;
3784 ctxt->checkIndex = 0;
3785 xmlInitNodeInfoSeq(&ctxt->node_seq);
3786}
3787
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a plain wrapper: the whole job is delegated to
 * xmlFreeParserCtxt(), so the statements above describe that function's
 * behaviour (NOTE(review): not visible from here, confirm against
 * xmlFreeParserCtxt()).
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
3801
3802/**
3803 * htmlCreateDocParserCtxt :
3804 * @cur: a pointer to an array of xmlChar
3805 * @encoding: a free form C string describing the HTML document encoding, or NULL
3806 *
3807 * Create a parser context for an HTML document.
3808 *
3809 * Returns the new parser context or NULL
3810 */
3811htmlParserCtxtPtr
3812htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
3813 htmlParserCtxtPtr ctxt;
3814 htmlParserInputPtr input;
3815 /* htmlCharEncoding enc; */
3816
3817 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3818 if (ctxt == NULL) {
3819 perror("malloc");
3820 return(NULL);
3821 }
3822 htmlInitParserCtxt(ctxt);
3823 input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
3824 if (input == NULL) {
3825 perror("malloc");
3826 xmlFree(ctxt);
3827 return(NULL);
3828 }
3829 memset(input, 0, sizeof(htmlParserInput));
3830
3831 input->line = 1;
3832 input->col = 1;
3833 input->base = cur;
3834 input->cur = cur;
3835
3836 inputPush(ctxt, input);
3837 return(ctxt);
3838}
3839
3840/************************************************************************
3841 * *
3842 * Progressive parsing interfaces *
3843 * *
3844 ************************************************************************/
3845
3846/**
3847 * htmlParseLookupSequence:
3848 * @ctxt: an HTML parser context
3849 * @first: the first char to lookup
3850 * @next: the next char to lookup or zero
3851 * @third: the next char to lookup or zero
3852 *
3853 * Try to find if a sequence (first, next, third) or just (first next) or
3854 * (first) is available in the input stream.
3855 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3856 * to avoid rescanning sequences of bytes, it DOES change the state of the
3857 * parser, do not use liberally.
3858 * This is basically similar to xmlParseLookupSequence()
3859 *
3860 * Returns the index to the current parsing point if the full sequence
3861 * is available, -1 otherwise.
3862 */
3863int
3864htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3865 xmlChar next, xmlChar third) {
3866 int base, len;
3867 htmlParserInputPtr in;
3868 const xmlChar *buf;
3869
3870 in = ctxt->input;
3871 if (in == NULL) return(-1);
3872 base = in->cur - in->base;
3873 if (base < 0) return(-1);
3874 if (ctxt->checkIndex > base)
3875 base = ctxt->checkIndex;
3876 if (in->buf == NULL) {
3877 buf = in->base;
3878 len = in->length;
3879 } else {
3880 buf = in->buf->buffer->content;
3881 len = in->buf->buffer->use;
3882 }
3883 /* take into account the sequence length */
3884 if (third) len -= 2;
3885 else if (next) len --;
3886 for (;base < len;base++) {
3887 if (buf[base] == first) {
3888 if (third != 0) {
3889 if ((buf[base + 1] != next) ||
3890 (buf[base + 2] != third)) continue;
3891 } else if (next != 0) {
3892 if (buf[base + 1] != next) continue;
3893 }
3894 ctxt->checkIndex = 0;
3895#ifdef DEBUG_PUSH
3896 if (next == 0)
3897 xmlGenericError(xmlGenericErrorContext,
3898 "HPP: lookup '%c' found at %d\n",
3899 first, base);
3900 else if (third == 0)
3901 xmlGenericError(xmlGenericErrorContext,
3902 "HPP: lookup '%c%c' found at %d\n",
3903 first, next, base);
3904 else
3905 xmlGenericError(xmlGenericErrorContext,
3906 "HPP: lookup '%c%c%c' found at %d\n",
3907 first, next, third, base);
3908#endif
3909 return(base - (in->cur - in->base));
3910 }
3911 }
3912 ctxt->checkIndex = base;
3913#ifdef DEBUG_PUSH
3914 if (next == 0)
3915 xmlGenericError(xmlGenericErrorContext,
3916 "HPP: lookup '%c' failed\n", first);
3917 else if (third == 0)
3918 xmlGenericError(xmlGenericErrorContext,
3919 "HPP: lookup '%c%c' failed\n", first, next);
3920 else
3921 xmlGenericError(xmlGenericErrorContext,
3922 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3923#endif
3924 return(-1);
3925}
3926
3927/**
3928 * htmlParseTryOrFinish:
3929 * @ctxt: an HTML parser context
3930 * @terminate: last chunk indicator
3931 *
3932 * Try to progress on parsing
3933 *
3934 * Returns zero if no parsing was possible
3935 */
3936int
3937htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3938 int ret = 0;
3939 htmlParserInputPtr in;
3940 int avail = 0;
3941 xmlChar cur, next;
3942
3943#ifdef DEBUG_PUSH
3944 switch (ctxt->instate) {
3945 case XML_PARSER_EOF:
3946 xmlGenericError(xmlGenericErrorContext,
3947 "HPP: try EOF\n"); break;
3948 case XML_PARSER_START:
3949 xmlGenericError(xmlGenericErrorContext,
3950 "HPP: try START\n"); break;
3951 case XML_PARSER_MISC:
3952 xmlGenericError(xmlGenericErrorContext,
3953 "HPP: try MISC\n");break;
3954 case XML_PARSER_COMMENT:
3955 xmlGenericError(xmlGenericErrorContext,
3956 "HPP: try COMMENT\n");break;
3957 case XML_PARSER_PROLOG:
3958 xmlGenericError(xmlGenericErrorContext,
3959 "HPP: try PROLOG\n");break;
3960 case XML_PARSER_START_TAG:
3961 xmlGenericError(xmlGenericErrorContext,
3962 "HPP: try START_TAG\n");break;
3963 case XML_PARSER_CONTENT:
3964 xmlGenericError(xmlGenericErrorContext,
3965 "HPP: try CONTENT\n");break;
3966 case XML_PARSER_CDATA_SECTION:
3967 xmlGenericError(xmlGenericErrorContext,
3968 "HPP: try CDATA_SECTION\n");break;
3969 case XML_PARSER_END_TAG:
3970 xmlGenericError(xmlGenericErrorContext,
3971 "HPP: try END_TAG\n");break;
3972 case XML_PARSER_ENTITY_DECL:
3973 xmlGenericError(xmlGenericErrorContext,
3974 "HPP: try ENTITY_DECL\n");break;
3975 case XML_PARSER_ENTITY_VALUE:
3976 xmlGenericError(xmlGenericErrorContext,
3977 "HPP: try ENTITY_VALUE\n");break;
3978 case XML_PARSER_ATTRIBUTE_VALUE:
3979 xmlGenericError(xmlGenericErrorContext,
3980 "HPP: try ATTRIBUTE_VALUE\n");break;
3981 case XML_PARSER_DTD:
3982 xmlGenericError(xmlGenericErrorContext,
3983 "HPP: try DTD\n");break;
3984 case XML_PARSER_EPILOG:
3985 xmlGenericError(xmlGenericErrorContext,
3986 "HPP: try EPILOG\n");break;
3987 case XML_PARSER_PI:
3988 xmlGenericError(xmlGenericErrorContext,
3989 "HPP: try PI\n");break;
3990 case XML_PARSER_SYSTEM_LITERAL:
3991 xmlGenericError(xmlGenericErrorContext,
3992 "HPP: try SYSTEM_LITERAL\n");break;
3993 }
3994#endif
3995
3996 while (1) {
3997
3998 in = ctxt->input;
3999 if (in == NULL) break;
4000 if (in->buf == NULL)
4001 avail = in->length - (in->cur - in->base);
4002 else
4003 avail = in->buf->buffer->use - (in->cur - in->base);
4004 if ((avail == 0) && (terminate)) {
4005 htmlAutoClose(ctxt, NULL);
4006 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4007 /*
4008 * SAX: end of the document processing.
4009 */
4010 ctxt->instate = XML_PARSER_EOF;
4011 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4012 ctxt->sax->endDocument(ctxt->userData);
4013 }
4014 }
4015 if (avail < 1)
4016 goto done;
4017 switch (ctxt->instate) {
4018 case XML_PARSER_EOF:
4019 /*
4020 * Document parsing is done !
4021 */
4022 goto done;
4023 case XML_PARSER_START:
4024 /*
4025 * Very first chars read from the document flow.
4026 */
4027 cur = in->cur[0];
4028 if (IS_BLANK(cur)) {
4029 SKIP_BLANKS;
4030 if (in->buf == NULL)
4031 avail = in->length - (in->cur - in->base);
4032 else
4033 avail = in->buf->buffer->use - (in->cur - in->base);
4034 }
4035 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4036 ctxt->sax->setDocumentLocator(ctxt->userData,
4037 &xmlDefaultSAXLocator);
4038 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4039 (!ctxt->disableSAX))
4040 ctxt->sax->startDocument(ctxt->userData);
4041
4042 cur = in->cur[0];
4043 next = in->cur[1];
4044 if ((cur == '<') && (next == '!') &&
4045 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4046 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4047 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4048 (UPP(8) == 'E')) {
4049 if ((!terminate) &&
4050 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4051 goto done;
4052#ifdef DEBUG_PUSH
4053 xmlGenericError(xmlGenericErrorContext,
4054 "HPP: Parsing internal subset\n");
4055#endif
4056 htmlParseDocTypeDecl(ctxt);
4057 ctxt->instate = XML_PARSER_PROLOG;
4058#ifdef DEBUG_PUSH
4059 xmlGenericError(xmlGenericErrorContext,
4060 "HPP: entering PROLOG\n");
4061#endif
4062 } else {
4063 ctxt->instate = XML_PARSER_MISC;
4064 }
4065#ifdef DEBUG_PUSH
4066 xmlGenericError(xmlGenericErrorContext,
4067 "HPP: entering MISC\n");
4068#endif
4069 break;
4070 case XML_PARSER_MISC:
4071 SKIP_BLANKS;
4072 if (in->buf == NULL)
4073 avail = in->length - (in->cur - in->base);
4074 else
4075 avail = in->buf->buffer->use - (in->cur - in->base);
4076 if (avail < 2)
4077 goto done;
4078 cur = in->cur[0];
4079 next = in->cur[1];
4080 if ((cur == '<') && (next == '!') &&
4081 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4082 if ((!terminate) &&
4083 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4084 goto done;
4085#ifdef DEBUG_PUSH
4086 xmlGenericError(xmlGenericErrorContext,
4087 "HPP: Parsing Comment\n");
4088#endif
4089 htmlParseComment(ctxt);
4090 ctxt->instate = XML_PARSER_MISC;
4091 } else if ((cur == '<') && (next == '!') &&
4092 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4093 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4094 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4095 (UPP(8) == 'E')) {
4096 if ((!terminate) &&
4097 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4098 goto done;
4099#ifdef DEBUG_PUSH
4100 xmlGenericError(xmlGenericErrorContext,
4101 "HPP: Parsing internal subset\n");
4102#endif
4103 htmlParseDocTypeDecl(ctxt);
4104 ctxt->instate = XML_PARSER_PROLOG;
4105#ifdef DEBUG_PUSH
4106 xmlGenericError(xmlGenericErrorContext,
4107 "HPP: entering PROLOG\n");
4108#endif
4109 } else if ((cur == '<') && (next == '!') &&
4110 (avail < 9)) {
4111 goto done;
4112 } else {
4113 ctxt->instate = XML_PARSER_START_TAG;
4114#ifdef DEBUG_PUSH
4115 xmlGenericError(xmlGenericErrorContext,
4116 "HPP: entering START_TAG\n");
4117#endif
4118 }
4119 break;
4120 case XML_PARSER_PROLOG:
4121 SKIP_BLANKS;
4122 if (in->buf == NULL)
4123 avail = in->length - (in->cur - in->base);
4124 else
4125 avail = in->buf->buffer->use - (in->cur - in->base);
4126 if (avail < 2)
4127 goto done;
4128 cur = in->cur[0];
4129 next = in->cur[1];
4130 if ((cur == '<') && (next == '!') &&
4131 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4132 if ((!terminate) &&
4133 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4134 goto done;
4135#ifdef DEBUG_PUSH
4136 xmlGenericError(xmlGenericErrorContext,
4137 "HPP: Parsing Comment\n");
4138#endif
4139 htmlParseComment(ctxt);
4140 ctxt->instate = XML_PARSER_PROLOG;
4141 } else if ((cur == '<') && (next == '!') &&
4142 (avail < 4)) {
4143 goto done;
4144 } else {
4145 ctxt->instate = XML_PARSER_START_TAG;
4146#ifdef DEBUG_PUSH
4147 xmlGenericError(xmlGenericErrorContext,
4148 "HPP: entering START_TAG\n");
4149#endif
4150 }
4151 break;
4152 case XML_PARSER_EPILOG:
4153 if (in->buf == NULL)
4154 avail = in->length - (in->cur - in->base);
4155 else
4156 avail = in->buf->buffer->use - (in->cur - in->base);
4157 if (avail < 1)
4158 goto done;
4159 cur = in->cur[0];
4160 if (IS_BLANK(cur)) {
4161 htmlParseCharData(ctxt, 0);
4162 goto done;
4163 }
4164 if (avail < 2)
4165 goto done;
4166 next = in->cur[1];
4167 if ((cur == '<') && (next == '!') &&
4168 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4169 if ((!terminate) &&
4170 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4171 goto done;
4172#ifdef DEBUG_PUSH
4173 xmlGenericError(xmlGenericErrorContext,
4174 "HPP: Parsing Comment\n");
4175#endif
4176 htmlParseComment(ctxt);
4177 ctxt->instate = XML_PARSER_EPILOG;
4178 } else if ((cur == '<') && (next == '!') &&
4179 (avail < 4)) {
4180 goto done;
4181 } else {
4182 ctxt->errNo = XML_ERR_DOCUMENT_END;
4183 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4184 ctxt->sax->error(ctxt->userData,
4185 "Extra content at the end of the document\n");
4186 ctxt->wellFormed = 0;
4187 ctxt->instate = XML_PARSER_EOF;
4188#ifdef DEBUG_PUSH
4189 xmlGenericError(xmlGenericErrorContext,
4190 "HPP: entering EOF\n");
4191#endif
4192 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4193 ctxt->sax->endDocument(ctxt->userData);
4194 goto done;
4195 }
4196 break;
4197 case XML_PARSER_START_TAG: {
4198 xmlChar *name, *oldname;
4199 int depth = ctxt->nameNr;
4200 htmlElemDescPtr info;
4201
4202 if (avail < 2)
4203 goto done;
4204 cur = in->cur[0];
4205 if (cur != '<') {
4206 ctxt->instate = XML_PARSER_CONTENT;
4207#ifdef DEBUG_PUSH
4208 xmlGenericError(xmlGenericErrorContext,
4209 "HPP: entering CONTENT\n");
4210#endif
4211 break;
4212 }
4213 if ((!terminate) &&
4214 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4215 goto done;
4216
4217 oldname = xmlStrdup(ctxt->name);
4218 htmlParseStartTag(ctxt);
4219 name = ctxt->name;
4220#ifdef DEBUG
4221 if (oldname == NULL)
4222 xmlGenericError(xmlGenericErrorContext,
4223 "Start of element %s\n", name);
4224 else if (name == NULL)
4225 xmlGenericError(xmlGenericErrorContext,
4226 "Start of element failed, was %s\n",
4227 oldname);
4228 else
4229 xmlGenericError(xmlGenericErrorContext,
4230 "Start of element %s, was %s\n",
4231 name, oldname);
4232#endif
4233 if (((depth == ctxt->nameNr) &&
4234 (xmlStrEqual(oldname, ctxt->name))) ||
4235 (name == NULL)) {
4236 if (CUR == '>')
4237 NEXT;
4238 if (oldname != NULL)
4239 xmlFree(oldname);
4240 break;
4241 }
4242 if (oldname != NULL)
4243 xmlFree(oldname);
4244
4245 /*
4246 * Lookup the info for that element.
4247 */
4248 info = htmlTagLookup(name);
4249 if (info == NULL) {
4250 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4251 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4252 name);
4253 ctxt->wellFormed = 0;
4254 } else if (info->depr) {
4255 /***************************
4256 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4257 ctxt->sax->warning(ctxt->userData,
4258 "Tag %s is deprecated\n",
4259 name);
4260 ***************************/
4261 }
4262
4263 /*
4264 * Check for an Empty Element labelled the XML/SGML way
4265 */
4266 if ((CUR == '/') && (NXT(1) == '>')) {
4267 SKIP(2);
4268 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4269 ctxt->sax->endElement(ctxt->userData, name);
4270 oldname = htmlnamePop(ctxt);
4271#ifdef DEBUG
4272 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4273 oldname);
4274#endif
4275 if (oldname != NULL)
4276 xmlFree(oldname);
4277 ctxt->instate = XML_PARSER_CONTENT;
4278#ifdef DEBUG_PUSH
4279 xmlGenericError(xmlGenericErrorContext,
4280 "HPP: entering CONTENT\n");
4281#endif
4282 break;
4283 }
4284
4285 if (CUR == '>') {
4286 NEXT;
4287 } else {
4288 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4289 ctxt->sax->error(ctxt->userData,
4290 "Couldn't find end of Start Tag %s\n",
4291 name);
4292 ctxt->wellFormed = 0;
4293
4294 /*
4295 * end of parsing of this node.
4296 */
4297 if (xmlStrEqual(name, ctxt->name)) {
4298 nodePop(ctxt);
4299 oldname = htmlnamePop(ctxt);
4300#ifdef DEBUG
4301 xmlGenericError(xmlGenericErrorContext,
4302 "End of start tag problem: popping out %s\n", oldname);
4303#endif
4304 if (oldname != NULL)
4305 xmlFree(oldname);
4306 }
4307
4308 ctxt->instate = XML_PARSER_CONTENT;
4309#ifdef DEBUG_PUSH
4310 xmlGenericError(xmlGenericErrorContext,
4311 "HPP: entering CONTENT\n");
4312#endif
4313 break;
4314 }
4315
4316 /*
4317 * Check for an Empty Element from DTD definition
4318 */
4319 if ((info != NULL) && (info->empty)) {
4320 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4321 ctxt->sax->endElement(ctxt->userData, name);
4322 oldname = htmlnamePop(ctxt);
4323#ifdef DEBUG
4324 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4325#endif
4326 if (oldname != NULL)
4327 xmlFree(oldname);
4328 }
4329 ctxt->instate = XML_PARSER_CONTENT;
4330#ifdef DEBUG_PUSH
4331 xmlGenericError(xmlGenericErrorContext,
4332 "HPP: entering CONTENT\n");
4333#endif
4334 break;
4335 }
4336 case XML_PARSER_CONTENT: {
4337 long cons;
4338 /*
4339 * Handle preparsed entities and charRef
4340 */
4341 if (ctxt->token != 0) {
4342 xmlChar chr[2] = { 0 , 0 } ;
4343
4344 chr[0] = (xmlChar) ctxt->token;
4345 htmlCheckParagraph(ctxt);
4346 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4347 ctxt->sax->characters(ctxt->userData, chr, 1);
4348 ctxt->token = 0;
4349 ctxt->checkIndex = 0;
4350 }
4351 if ((avail == 1) && (terminate)) {
4352 cur = in->cur[0];
4353 if ((cur != '<') && (cur != '&')) {
4354 if (ctxt->sax != NULL) {
4355 if (IS_BLANK(cur)) {
4356 if (ctxt->sax->ignorableWhitespace != NULL)
4357 ctxt->sax->ignorableWhitespace(
4358 ctxt->userData, &cur, 1);
4359 } else {
4360 htmlCheckParagraph(ctxt);
4361 if (ctxt->sax->characters != NULL)
4362 ctxt->sax->characters(
4363 ctxt->userData, &cur, 1);
4364 }
4365 }
4366 ctxt->token = 0;
4367 ctxt->checkIndex = 0;
4368 NEXT;
4369 }
4370 break;
4371 }
4372 if (avail < 2)
4373 goto done;
4374 cur = in->cur[0];
4375 next = in->cur[1];
4376 cons = ctxt->nbChars;
4377 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4378 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4379 /*
4380 * Handle SCRIPT/STYLE separately
4381 */
4382 if ((!terminate) &&
4383 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4384 goto done;
4385 htmlParseScript(ctxt);
4386 if ((cur == '<') && (next == '/')) {
4387 ctxt->instate = XML_PARSER_END_TAG;
4388 ctxt->checkIndex = 0;
4389#ifdef DEBUG_PUSH
4390 xmlGenericError(xmlGenericErrorContext,
4391 "HPP: entering END_TAG\n");
4392#endif
4393 break;
4394 }
4395 } else {
4396 /*
4397 * Sometimes DOCTYPE arrives in the middle of the document
4398 */
4399 if ((cur == '<') && (next == '!') &&
4400 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4401 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4402 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4403 (UPP(8) == 'E')) {
4404 if ((!terminate) &&
4405 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4406 goto done;
4407 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4408 ctxt->sax->error(ctxt->userData,
4409 "Misplaced DOCTYPE declaration\n");
4410 ctxt->wellFormed = 0;
4411 htmlParseDocTypeDecl(ctxt);
4412 } else if ((cur == '<') && (next == '!') &&
4413 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4414 if ((!terminate) &&
4415 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4416 goto done;
4417#ifdef DEBUG_PUSH
4418 xmlGenericError(xmlGenericErrorContext,
4419 "HPP: Parsing Comment\n");
4420#endif
4421 htmlParseComment(ctxt);
4422 ctxt->instate = XML_PARSER_CONTENT;
4423 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4424 goto done;
4425 } else if ((cur == '<') && (next == '/')) {
4426 ctxt->instate = XML_PARSER_END_TAG;
4427 ctxt->checkIndex = 0;
4428#ifdef DEBUG_PUSH
4429 xmlGenericError(xmlGenericErrorContext,
4430 "HPP: entering END_TAG\n");
4431#endif
4432 break;
4433 } else if (cur == '<') {
4434 ctxt->instate = XML_PARSER_START_TAG;
4435 ctxt->checkIndex = 0;
4436#ifdef DEBUG_PUSH
4437 xmlGenericError(xmlGenericErrorContext,
4438 "HPP: entering START_TAG\n");
4439#endif
4440 break;
4441 } else if (cur == '&') {
4442 if ((!terminate) &&
4443 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4444 goto done;
4445#ifdef DEBUG_PUSH
4446 xmlGenericError(xmlGenericErrorContext,
4447 "HPP: Parsing Reference\n");
4448#endif
4449 /* TODO: check generation of subtrees if noent !!! */
4450 htmlParseReference(ctxt);
4451 } else {
4452 /* TODO Avoid the extra copy, handle directly !!!!!! */
4453 /*
4454 * Goal of the following test is :
4455 * - minimize calls to the SAX 'character' callback
4456 * when they are mergeable
4457 */
4458 if ((ctxt->inputNr == 1) &&
4459 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4460 if ((!terminate) &&
4461 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4462 goto done;
4463 }
4464 ctxt->checkIndex = 0;
4465#ifdef DEBUG_PUSH
4466 xmlGenericError(xmlGenericErrorContext,
4467 "HPP: Parsing char data\n");
4468#endif
4469 htmlParseCharData(ctxt, 0);
4470 }
4471 }
4472 if (cons == ctxt->nbChars) {
4473 if (ctxt->node != NULL) {
4474 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4475 ctxt->sax->error(ctxt->userData,
4476 "detected an error in element content\n");
4477 ctxt->wellFormed = 0;
4478 }
4479 NEXT;
4480 break;
4481 }
4482
4483 break;
4484 }
4485 case XML_PARSER_END_TAG:
4486 if (avail < 2)
4487 goto done;
4488 if ((!terminate) &&
4489 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4490 goto done;
4491 htmlParseEndTag(ctxt);
4492 if (ctxt->nameNr == 0) {
4493 ctxt->instate = XML_PARSER_EPILOG;
4494 } else {
4495 ctxt->instate = XML_PARSER_CONTENT;
4496 }
4497 ctxt->checkIndex = 0;
4498#ifdef DEBUG_PUSH
4499 xmlGenericError(xmlGenericErrorContext,
4500 "HPP: entering CONTENT\n");
4501#endif
4502 break;
4503 case XML_PARSER_CDATA_SECTION:
4504 xmlGenericError(xmlGenericErrorContext,
4505 "HPP: internal error, state == CDATA\n");
4506 ctxt->instate = XML_PARSER_CONTENT;
4507 ctxt->checkIndex = 0;
4508#ifdef DEBUG_PUSH
4509 xmlGenericError(xmlGenericErrorContext,
4510 "HPP: entering CONTENT\n");
4511#endif
4512 break;
4513 case XML_PARSER_DTD:
4514 xmlGenericError(xmlGenericErrorContext,
4515 "HPP: internal error, state == DTD\n");
4516 ctxt->instate = XML_PARSER_CONTENT;
4517 ctxt->checkIndex = 0;
4518#ifdef DEBUG_PUSH
4519 xmlGenericError(xmlGenericErrorContext,
4520 "HPP: entering CONTENT\n");
4521#endif
4522 break;
4523 case XML_PARSER_COMMENT:
4524 xmlGenericError(xmlGenericErrorContext,
4525 "HPP: internal error, state == COMMENT\n");
4526 ctxt->instate = XML_PARSER_CONTENT;
4527 ctxt->checkIndex = 0;
4528#ifdef DEBUG_PUSH
4529 xmlGenericError(xmlGenericErrorContext,
4530 "HPP: entering CONTENT\n");
4531#endif
4532 break;
4533 case XML_PARSER_PI:
4534 xmlGenericError(xmlGenericErrorContext,
4535 "HPP: internal error, state == PI\n");
4536 ctxt->instate = XML_PARSER_CONTENT;
4537 ctxt->checkIndex = 0;
4538#ifdef DEBUG_PUSH
4539 xmlGenericError(xmlGenericErrorContext,
4540 "HPP: entering CONTENT\n");
4541#endif
4542 break;
4543 case XML_PARSER_ENTITY_DECL:
4544 xmlGenericError(xmlGenericErrorContext,
4545 "HPP: internal error, state == ENTITY_DECL\n");
4546 ctxt->instate = XML_PARSER_CONTENT;
4547 ctxt->checkIndex = 0;
4548#ifdef DEBUG_PUSH
4549 xmlGenericError(xmlGenericErrorContext,
4550 "HPP: entering CONTENT\n");
4551#endif
4552 break;
4553 case XML_PARSER_ENTITY_VALUE:
4554 xmlGenericError(xmlGenericErrorContext,
4555 "HPP: internal error, state == ENTITY_VALUE\n");
4556 ctxt->instate = XML_PARSER_CONTENT;
4557 ctxt->checkIndex = 0;
4558#ifdef DEBUG_PUSH
4559 xmlGenericError(xmlGenericErrorContext,
4560 "HPP: entering DTD\n");
4561#endif
4562 break;
4563 case XML_PARSER_ATTRIBUTE_VALUE:
4564 xmlGenericError(xmlGenericErrorContext,
4565 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4566 ctxt->instate = XML_PARSER_START_TAG;
4567 ctxt->checkIndex = 0;
4568#ifdef DEBUG_PUSH
4569 xmlGenericError(xmlGenericErrorContext,
4570 "HPP: entering START_TAG\n");
4571#endif
4572 break;
4573 case XML_PARSER_SYSTEM_LITERAL:
4574 xmlGenericError(xmlGenericErrorContext,
4575 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4576 ctxt->instate = XML_PARSER_CONTENT;
4577 ctxt->checkIndex = 0;
4578#ifdef DEBUG_PUSH
4579 xmlGenericError(xmlGenericErrorContext,
4580 "HPP: entering CONTENT\n");
4581#endif
4582 break;
4583 case XML_PARSER_IGNORE:
4584 xmlGenericError(xmlGenericErrorContext,
4585 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4586 ctxt->instate = XML_PARSER_CONTENT;
4587 ctxt->checkIndex = 0;
4588#ifdef DEBUG_PUSH
4589 xmlGenericError(xmlGenericErrorContext,
4590 "HPP: entering CONTENT\n");
4591#endif
4592 break;
4593 }
4594 }
4595done:
4596 if ((avail == 0) && (terminate)) {
4597 htmlAutoClose(ctxt, NULL);
4598 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4599 /*
4600 * SAX: end of the document processing.
4601 */
4602 ctxt->instate = XML_PARSER_EOF;
4603 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4604 ctxt->sax->endDocument(ctxt->userData);
4605 }
4606 }
4607 if ((ctxt->myDoc != NULL) &&
4608 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4609 (ctxt->instate == XML_PARSER_EPILOG))) {
4610 xmlDtdPtr dtd;
4611 dtd = xmlGetIntSubset(ctxt->myDoc);
4612 if (dtd == NULL)
4613 ctxt->myDoc->intSubset =
4614 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4615 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4616 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4617 }
4618#ifdef DEBUG_PUSH
4619 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4620#endif
4621 return(ret);
4622}
4623
4624/**
4625 * htmlParseTry:
4626 * @ctxt: an HTML parser context
4627 *
4628 * Try to progress on parsing
4629 *
4630 * Returns zero if no parsing was possible
4631 */
4632int
4633htmlParseTry(htmlParserCtxtPtr ctxt) {
4634 return(htmlParseTryOrFinish(ctxt, 0));
4635}
4636
4637/**
4638 * htmlParseChunk:
4639 * @ctxt: an XML parser context
4640 * @chunk: an char array
4641 * @size: the size in byte of the chunk
4642 * @terminate: last chunk indicator
4643 *
4644 * Parse a Chunk of memory
4645 *
4646 * Returns zero if no error, the xmlParserErrors otherwise.
4647 */
4648int
4649htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4650 int terminate) {
4651 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4652 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4653 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4654 int cur = ctxt->input->cur - ctxt->input->base;
4655
4656 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4657 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4658 ctxt->input->cur = ctxt->input->base + cur;
4659#ifdef DEBUG_PUSH
4660 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4661#endif
4662
4663 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4664 htmlParseTryOrFinish(ctxt, terminate);
4665 } else if (ctxt->instate != XML_PARSER_EOF) {
4666 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4667 htmlParseTryOrFinish(ctxt, terminate);
4668 }
4669 if (terminate) {
4670 if ((ctxt->instate != XML_PARSER_EOF) &&
4671 (ctxt->instate != XML_PARSER_EPILOG) &&
4672 (ctxt->instate != XML_PARSER_MISC)) {
4673 ctxt->errNo = XML_ERR_DOCUMENT_END;
4674 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4675 ctxt->sax->error(ctxt->userData,
4676 "Extra content at the end of the document\n");
4677 ctxt->wellFormed = 0;
4678 }
4679 if (ctxt->instate != XML_PARSER_EOF) {
4680 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4681 ctxt->sax->endDocument(ctxt->userData);
4682 }
4683 ctxt->instate = XML_PARSER_EOF;
4684 }
4685 return((xmlParserErrors) ctxt->errNo);
4686}
4687
4688/************************************************************************
4689 * *
4690 * User entry points *
4691 * *
4692 ************************************************************************/
4693
4694/**
4695 * htmlCreatePushParserCtxt :
4696 * @sax: a SAX handler
4697 * @user_data: The user data returned on SAX callbacks
4698 * @chunk: a pointer to an array of chars
4699 * @size: number of chars in the array
4700 * @filename: an optional file name or URI
4701 * @enc: an optional encoding
4702 *
4703 * Create a parser context for using the HTML parser in push mode
4704 * To allow content encoding detection, @size should be >= 4
4705 * The value of @filename is used for fetching external entities
4706 * and error/warning reports.
4707 *
4708 * Returns the new parser context or NULL
4709 */
4710htmlParserCtxtPtr
4711htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4712 const char *chunk, int size, const char *filename,
4713 xmlCharEncoding enc) {
4714 htmlParserCtxtPtr ctxt;
4715 htmlParserInputPtr inputStream;
4716 xmlParserInputBufferPtr buf;
4717
4718 buf = xmlAllocParserInputBuffer(enc);
4719 if (buf == NULL) return(NULL);
4720
4721 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4722 if (ctxt == NULL) {
4723 xmlFree(buf);
4724 return(NULL);
4725 }
4726 memset(ctxt, 0, sizeof(htmlParserCtxt));
4727 htmlInitParserCtxt(ctxt);
4728 if (sax != NULL) {
4729 if (ctxt->sax != &htmlDefaultSAXHandler)
4730 xmlFree(ctxt->sax);
4731 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4732 if (ctxt->sax == NULL) {
4733 xmlFree(buf);
4734 xmlFree(ctxt);
4735 return(NULL);
4736 }
4737 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4738 if (user_data != NULL)
4739 ctxt->userData = user_data;
4740 }
4741 if (filename == NULL) {
4742 ctxt->directory = NULL;
4743 } else {
4744 ctxt->directory = xmlParserGetDirectory(filename);
4745 }
4746
4747 inputStream = htmlNewInputStream(ctxt);
4748 if (inputStream == NULL) {
4749 xmlFreeParserCtxt(ctxt);
4750 return(NULL);
4751 }
4752
4753 if (filename == NULL)
4754 inputStream->filename = NULL;
4755 else
4756 inputStream->filename = xmlMemStrdup(filename);
4757 inputStream->buf = buf;
4758 inputStream->base = inputStream->buf->buffer->content;
4759 inputStream->cur = inputStream->buf->buffer->content;
4760
4761 inputPush(ctxt, inputStream);
4762
4763 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4764 (ctxt->input->buf != NULL)) {
4765 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4766#ifdef DEBUG_PUSH
4767 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4768#endif
4769 }
4770
4771 return(ctxt);
4772}
4773
4774/**
4775 * htmlSAXParseDoc :
4776 * @cur: a pointer to an array of xmlChar
4777 * @encoding: a free form C string describing the HTML document encoding, or NULL
4778 * @sax: the SAX handler block
4779 * @userData: if using SAX, this pointer will be provided on callbacks.
4780 *
4781 * parse an HTML in-memory document and build a tree.
4782 * It use the given SAX function block to handle the parsing callback.
4783 * If sax is NULL, fallback to the default DOM tree building routines.
4784 *
4785 * Returns the resulting document tree
4786 */
4787
4788htmlDocPtr
4789htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4790 htmlDocPtr ret;
4791 htmlParserCtxtPtr ctxt;
4792
4793 if (cur == NULL) return(NULL);
4794
4795
4796 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4797 if (ctxt == NULL) return(NULL);
4798 if (sax != NULL) {
4799 ctxt->sax = sax;
4800 ctxt->userData = userData;
4801 }
4802
4803 htmlParseDocument(ctxt);
4804 ret = ctxt->myDoc;
4805 if (sax != NULL) {
4806 ctxt->sax = NULL;
4807 ctxt->userData = NULL;
4808 }
4809 htmlFreeParserCtxt(ctxt);
4810
4811 return(ret);
4812}
4813
4814/**
4815 * htmlParseDoc :
4816 * @cur: a pointer to an array of xmlChar
4817 * @encoding: a free form C string describing the HTML document encoding, or NULL
4818 *
4819 * parse an HTML in-memory document and build a tree.
4820 *
4821 * Returns the resulting document tree
4822 */
4823
4824htmlDocPtr
4825htmlParseDoc(xmlChar *cur, const char *encoding) {
4826 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4827}
4828
4829
4830/**
4831 * htmlCreateFileParserCtxt :
4832 * @filename: the filename
4833 * @encoding: a free form C string describing the HTML document encoding, or NULL
4834 *
4835 * Create a parser context for a file content.
4836 * Automatic support for ZLIB/Compress compressed document is provided
4837 * by default if found at compile-time.
4838 *
4839 * Returns the new parser context or NULL
4840 */
4841htmlParserCtxtPtr
4842htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4843{
4844 htmlParserCtxtPtr ctxt;
4845 htmlParserInputPtr inputStream;
4846 xmlParserInputBufferPtr buf;
4847 /* htmlCharEncoding enc; */
4848 xmlChar *content, *content_line = (xmlChar *) "charset=";
4849
4850 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4851 if (buf == NULL) return(NULL);
4852
4853 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4854 if (ctxt == NULL) {
4855 perror("malloc");
4856 return(NULL);
4857 }
4858 memset(ctxt, 0, sizeof(htmlParserCtxt));
4859 htmlInitParserCtxt(ctxt);
4860 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4861 if (inputStream == NULL) {
4862 perror("malloc");
4863 xmlFree(ctxt);
4864 return(NULL);
4865 }
4866 memset(inputStream, 0, sizeof(htmlParserInput));
4867
4868 inputStream->filename = xmlMemStrdup(filename);
4869 inputStream->line = 1;
4870 inputStream->col = 1;
4871 inputStream->buf = buf;
4872 inputStream->directory = NULL;
4873
4874 inputStream->base = inputStream->buf->buffer->content;
4875 inputStream->cur = inputStream->buf->buffer->content;
4876 inputStream->free = NULL;
4877
4878 inputPush(ctxt, inputStream);
4879
4880 /* set encoding */
4881 if (encoding) {
4882 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4883 if (content) {
4884 strcpy ((char *)content, (char *)content_line);
4885 strcat ((char *)content, (char *)encoding);
4886 htmlCheckEncoding (ctxt, content);
4887 xmlFree (content);
4888 }
4889 }
4890
4891 return(ctxt);
4892}
4893
4894/**
4895 * htmlSAXParseFile :
4896 * @filename: the filename
4897 * @encoding: a free form C string describing the HTML document encoding, or NULL
4898 * @sax: the SAX handler block
4899 * @userData: if using SAX, this pointer will be provided on callbacks.
4900 *
4901 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4902 * compressed document is provided by default if found at compile-time.
4903 * It use the given SAX function block to handle the parsing callback.
4904 * If sax is NULL, fallback to the default DOM tree building routines.
4905 *
4906 * Returns the resulting document tree
4907 */
4908
4909htmlDocPtr
4910htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4911 void *userData) {
4912 htmlDocPtr ret;
4913 htmlParserCtxtPtr ctxt;
4914 htmlSAXHandlerPtr oldsax = NULL;
4915
4916 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4917 if (ctxt == NULL) return(NULL);
4918 if (sax != NULL) {
4919 oldsax = ctxt->sax;
4920 ctxt->sax = sax;
4921 ctxt->userData = userData;
4922 }
4923
4924 htmlParseDocument(ctxt);
4925
4926 ret = ctxt->myDoc;
4927 if (sax != NULL) {
4928 ctxt->sax = oldsax;
4929 ctxt->userData = NULL;
4930 }
4931 htmlFreeParserCtxt(ctxt);
4932
4933 return(ret);
4934}
4935
4936/**
4937 * htmlParseFile :
4938 * @filename: the filename
4939 * @encoding: a free form C string describing the HTML document encoding, or NULL
4940 *
4941 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4942 * compressed document is provided by default if found at compile-time.
4943 *
4944 * Returns the resulting document tree
4945 */
4946
4947htmlDocPtr
4948htmlParseFile(const char *filename, const char *encoding) {
4949 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4950}
4951
4952/**
4953 * htmlHandleOmittedElem:
4954 * @val: int 0 or 1
4955 *
4956 * Set and return the previous value for handling HTML omitted tags.
4957 *
4958 * Returns the last value for 0 for no handling, 1 for auto insertion.
4959 */
4960
4961int
4962htmlHandleOmittedElem(int val) {
4963 int old = htmlOmittedDefaultValue;
4964
4965 htmlOmittedDefaultValue = val;
4966 return(old);
4967}
4968
4969#endif /* LIBXML_HTML_ENABLED */