blob: be9b2f94f4d002b2fb39b79dec2bf8079f9822ba [file] [log] [blame]
/*
 * HTMLtree.c : implementation of access functions for an HTML tree.
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
8
9
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
12
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h> /* for memset() only ! */
14
15#ifdef HAVE_CTYPE_H
16#include <ctype.h>
17#endif
18#ifdef HAVE_STDLIB_H
19#include <stdlib.h>
20#endif
21
22#include <libxml/xmlmemory.h>
23#include <libxml/HTMLparser.h>
24#include <libxml/HTMLtree.h>
25#include <libxml/entities.h>
26#include <libxml/valid.h>
27#include <libxml/xmlerror.h>
28#include <libxml/parserInternals.h>
29
30/************************************************************************
31 * *
32 * Getting/Setting encoding meta tags *
33 * *
34 ************************************************************************/
35
36/**
37 * htmlGetMetaEncoding:
38 * @doc: the document
39 *
40 * Encoding definition lookup in the Meta tags
41 *
42 * Returns the current encoding as flagged in the HTML source
43 */
44const xmlChar *
45htmlGetMetaEncoding(htmlDocPtr doc) {
46 htmlNodePtr cur;
47 const xmlChar *content;
48 const xmlChar *encoding;
49
50 if (doc == NULL)
51 return(NULL);
52 cur = doc->children;
53
54 /*
55 * Search the html
56 */
57 while (cur != NULL) {
58 if (cur->name != NULL) {
59 if (xmlStrEqual(cur->name, BAD_CAST"html"))
60 break;
61 if (xmlStrEqual(cur->name, BAD_CAST"head"))
62 goto found_head;
63 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
64 goto found_meta;
65 }
66 cur = cur->next;
67 }
68 if (cur == NULL)
69 return(NULL);
70 cur = cur->children;
71
72 /*
73 * Search the head
74 */
75 while (cur != NULL) {
76 if (cur->name != NULL) {
77 if (xmlStrEqual(cur->name, BAD_CAST"head"))
78 break;
79 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
80 goto found_meta;
81 }
82 cur = cur->next;
83 }
84 if (cur == NULL)
85 return(NULL);
86found_head:
87 cur = cur->children;
88
89 /*
90 * Search the meta elements
91 */
92found_meta:
93 while (cur != NULL) {
94 if (cur->name != NULL) {
95 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
96 xmlAttrPtr attr = cur->properties;
97 int http;
98 const xmlChar *value;
99
100 content = NULL;
101 http = 0;
102 while (attr != NULL) {
103 if ((attr->children != NULL) &&
104 (attr->children->type == XML_TEXT_NODE) &&
105 (attr->children->next == NULL)) {
106#ifndef XML_USE_BUFFER_CONTENT
107 value = attr->children->content;
108#else
109 value = xmlBufferContent(attr->children->content);
110#endif
111 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
112 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
113 http = 1;
114 else if ((value != NULL)
115 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
116 content = value;
117 if ((http != 0) && (content != NULL))
118 goto found_content;
119 }
120 attr = attr->next;
121 }
122 }
123 }
124 cur = cur->next;
125 }
126 return(NULL);
127
128found_content:
129 encoding = xmlStrstr(content, BAD_CAST"charset=");
130 if (encoding == NULL)
131 encoding = xmlStrstr(content, BAD_CAST"Charset=");
132 if (encoding == NULL)
133 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
134 if (encoding != NULL) {
135 encoding += 8;
136 } else {
137 encoding = xmlStrstr(content, BAD_CAST"charset =");
138 if (encoding == NULL)
139 encoding = xmlStrstr(content, BAD_CAST"Charset =");
140 if (encoding == NULL)
141 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
142 if (encoding != NULL)
143 encoding += 9;
144 }
145 if (encoding != NULL) {
146 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
147 }
148 return(encoding);
149}
150
151/**
152 * htmlSetMetaEncoding:
153 * @doc: the document
154 * @encoding: the encoding string
155 *
156 * Sets the current encoding in the Meta tags
157 * NOTE: this will not change the document content encoding, just
158 * the META flag associated.
159 *
160 * Returns 0 in case of success and -1 in case of error
161 */
162int
163htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
164 htmlNodePtr cur, meta;
165 const xmlChar *content;
166 char newcontent[100];
167
168
169 if (doc == NULL)
170 return(-1);
171
172 if (encoding != NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +0000173 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
174 encoding);
Owen Taylor3473f882001-02-23 17:55:21 +0000175 newcontent[sizeof(newcontent) - 1] = 0;
176 }
177
178 cur = doc->children;
179
180 /*
181 * Search the html
182 */
183 while (cur != NULL) {
184 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000185/*
Owen Taylor3473f882001-02-23 17:55:21 +0000186 if (xmlStrEqual(cur->name, BAD_CAST"html"))
187 break;
188 if (xmlStrEqual(cur->name, BAD_CAST"body")) {
189 if (encoding == NULL)
190 return(0);
191 meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
192 xmlAddPrevSibling(cur, meta);
193 cur = meta;
194 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
195 xmlAddChild(cur, meta);
196 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
197 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
198 return(0);
199 }
200 if (xmlStrEqual(cur->name, BAD_CAST"head"))
201 goto found_head;
202 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
203 goto found_meta;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000204*/
205 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
206 break;
207 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
208 goto found_head;
209 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
210 goto found_meta;
Owen Taylor3473f882001-02-23 17:55:21 +0000211 }
212 cur = cur->next;
213 }
214 if (cur == NULL)
215 return(-1);
216 cur = cur->children;
217
218 /*
219 * Search the head
220 */
221 while (cur != NULL) {
222 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000223/*
Owen Taylor3473f882001-02-23 17:55:21 +0000224 if (xmlStrEqual(cur->name, BAD_CAST"head"))
225 break;
226 if (xmlStrEqual(cur->name, BAD_CAST"body")) {
227 if (encoding == NULL)
228 return(0);
229 meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
230 xmlAddPrevSibling(cur, meta);
231 cur = meta;
232 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
233 xmlAddChild(cur, meta);
234 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
235 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
236 return(0);
237 }
238 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
239 goto found_meta;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000240*/
241 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
242 break;
243 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
244 goto found_meta;
Owen Taylor3473f882001-02-23 17:55:21 +0000245 }
246 cur = cur->next;
247 }
248 if (cur == NULL)
249 return(-1);
250found_head:
251 if (cur->children == NULL) {
252 if (encoding == NULL)
253 return(0);
254 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
255 xmlAddChild(cur, meta);
Owen Taylor3473f882001-02-23 17:55:21 +0000256 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000257 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
Owen Taylor3473f882001-02-23 17:55:21 +0000258 return(0);
259 }
260 cur = cur->children;
261
262found_meta:
263 if (encoding != NULL) {
264 /*
265 * Create a new Meta element with the right aatributes
266 */
267
268 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
269 xmlAddPrevSibling(cur, meta);
Owen Taylor3473f882001-02-23 17:55:21 +0000270 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000271 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
Owen Taylor3473f882001-02-23 17:55:21 +0000272 }
273
274 /*
275 * Search and destroy all the remaining the meta elements carrying
276 * encoding informations
277 */
278 while (cur != NULL) {
279 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000280 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000281 xmlAttrPtr attr = cur->properties;
282 int http;
283 const xmlChar *value;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000284 int same_charset;
Owen Taylor3473f882001-02-23 17:55:21 +0000285
286 content = NULL;
287 http = 0;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000288 same_charset = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000289 while (attr != NULL) {
290 if ((attr->children != NULL) &&
291 (attr->children->type == XML_TEXT_NODE) &&
292 (attr->children->next == NULL)) {
293#ifndef XML_USE_BUFFER_CONTENT
294 value = attr->children->content;
295#else
296 value = xmlBufferContent(attr->children->content);
297#endif
298 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
299 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
300 http = 1;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000301 else
302 {
303 if ((value != NULL) &&
304 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
305 content = value;
306 else
307 if ((!xmlStrcasecmp(attr->name, BAD_CAST"charset"))
308 && (!xmlStrcasecmp(value, encoding)))
309 same_charset = 1;
310 }
311 if ((http != 0) && (content != NULL) && (same_charset != 0))
Owen Taylor3473f882001-02-23 17:55:21 +0000312 break;
313 }
314 attr = attr->next;
315 }
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000316 if ((http != 0) && (content != NULL) && (same_charset != 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000317 meta = cur;
318 cur = cur->next;
319 xmlUnlinkNode(meta);
320 xmlFreeNode(meta);
321 continue;
322 }
323
324 }
325 }
326 cur = cur->next;
327 }
328 return(0);
329}
330
331/************************************************************************
332 * *
333 * Dumping HTML tree content to a simple buffer *
334 * *
335 ************************************************************************/
336
337static void
338htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
339
340/**
341 * htmlDtdDump:
342 * @buf: the HTML buffer output
343 * @doc: the document
344 *
345 * Dump the HTML document DTD, if any.
346 */
347static void
348htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
349 xmlDtdPtr cur = doc->intSubset;
350
351 if (cur == NULL) {
352 xmlGenericError(xmlGenericErrorContext,
353 "htmlDtdDump : no internal subset\n");
354 return;
355 }
356 xmlBufferWriteChar(buf, "<!DOCTYPE ");
357 xmlBufferWriteCHAR(buf, cur->name);
358 if (cur->ExternalID != NULL) {
359 xmlBufferWriteChar(buf, " PUBLIC ");
360 xmlBufferWriteQuotedString(buf, cur->ExternalID);
361 if (cur->SystemID != NULL) {
362 xmlBufferWriteChar(buf, " ");
363 xmlBufferWriteQuotedString(buf, cur->SystemID);
364 }
365 } else if (cur->SystemID != NULL) {
366 xmlBufferWriteChar(buf, " SYSTEM ");
367 xmlBufferWriteQuotedString(buf, cur->SystemID);
368 }
369 xmlBufferWriteChar(buf, ">\n");
370}
371
372/**
373 * htmlAttrDump:
374 * @buf: the HTML buffer output
375 * @doc: the document
376 * @cur: the attribute pointer
377 *
378 * Dump an HTML attribute
379 */
380static void
381htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
382 xmlChar *value;
383
384 if (cur == NULL) {
385 xmlGenericError(xmlGenericErrorContext,
386 "htmlAttrDump : property == NULL\n");
387 return;
388 }
389 xmlBufferWriteChar(buf, " ");
390 xmlBufferWriteCHAR(buf, cur->name);
391 if (cur->children != NULL) {
392 value = xmlNodeListGetString(doc, cur->children, 0);
393 if (value) {
394 xmlBufferWriteChar(buf, "=");
395 xmlBufferWriteQuotedString(buf, value);
396 xmlFree(value);
397 } else {
398 xmlBufferWriteChar(buf, "=\"\"");
399 }
400 }
401}
402
403/**
404 * htmlAttrListDump:
405 * @buf: the HTML buffer output
406 * @doc: the document
407 * @cur: the first attribute pointer
408 *
409 * Dump a list of HTML attributes
410 */
411static void
412htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
413 if (cur == NULL) {
414 xmlGenericError(xmlGenericErrorContext,
415 "htmlAttrListDump : property == NULL\n");
416 return;
417 }
418 while (cur != NULL) {
419 htmlAttrDump(buf, doc, cur);
420 cur = cur->next;
421 }
422}
423
424
425void
426htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
427/**
428 * htmlNodeListDump:
429 * @buf: the HTML buffer output
430 * @doc: the document
431 * @cur: the first node
432 *
433 * Dump an HTML node list, recursive behaviour,children are printed too.
434 */
435static void
436htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
437 if (cur == NULL) {
438 xmlGenericError(xmlGenericErrorContext,
439 "htmlNodeListDump : node == NULL\n");
440 return;
441 }
442 while (cur != NULL) {
443 htmlNodeDump(buf, doc, cur);
444 cur = cur->next;
445 }
446}
447
448/**
449 * htmlNodeDump:
450 * @buf: the HTML buffer output
451 * @doc: the document
452 * @cur: the current node
453 *
454 * Dump an HTML node, recursive behaviour,children are printed too.
455 */
456void
457htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
458 htmlElemDescPtr info;
459
460 if (cur == NULL) {
461 xmlGenericError(xmlGenericErrorContext,
462 "htmlNodeDump : node == NULL\n");
463 return;
464 }
465 /*
466 * Special cases.
467 */
468 if (cur->type == XML_DTD_NODE)
469 return;
470 if (cur->type == XML_HTML_DOCUMENT_NODE) {
471 htmlDocContentDump(buf, (xmlDocPtr) cur);
472 return;
473 }
474 if (cur->type == HTML_TEXT_NODE) {
475 if (cur->content != NULL) {
Daniel Veillard6e93c4a2001-06-05 20:57:42 +0000476 if (((cur->name == xmlStringText) ||
477 (cur->name != xmlStringTextNoenc)) &&
478 ((cur->parent == NULL) ||
479 (!xmlStrEqual(cur->parent->name, BAD_CAST "script")))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000480 xmlChar *buffer;
481
482#ifndef XML_USE_BUFFER_CONTENT
483 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
484#else
485 buffer = xmlEncodeEntitiesReentrant(doc,
486 xmlBufferContent(cur->content));
487#endif
488 if (buffer != NULL) {
489 xmlBufferWriteCHAR(buf, buffer);
490 xmlFree(buffer);
491 }
492 } else {
493 xmlBufferWriteCHAR(buf, cur->content);
494 }
495 }
496 return;
497 }
498 if (cur->type == HTML_COMMENT_NODE) {
499 if (cur->content != NULL) {
500 xmlBufferWriteChar(buf, "<!--");
501#ifndef XML_USE_BUFFER_CONTENT
502 xmlBufferWriteCHAR(buf, cur->content);
503#else
504 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
505#endif
506 xmlBufferWriteChar(buf, "-->");
507 }
508 return;
509 }
Daniel Veillard7533cc82001-04-24 15:52:00 +0000510 if (cur->type == HTML_PI_NODE) {
Daniel Veillard5146f202001-04-25 10:29:44 +0000511 if (cur->name == NULL)
512 return;
513 xmlBufferWriteChar(buf, "<?");
514 xmlBufferWriteCHAR(buf, cur->name);
Daniel Veillard7533cc82001-04-24 15:52:00 +0000515 if (cur->content != NULL) {
Daniel Veillard5146f202001-04-25 10:29:44 +0000516 xmlBufferWriteChar(buf, " ");
Daniel Veillard7533cc82001-04-24 15:52:00 +0000517#ifndef XML_USE_BUFFER_CONTENT
518 xmlBufferWriteCHAR(buf, cur->content);
519#else
520 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
521#endif
Daniel Veillard7533cc82001-04-24 15:52:00 +0000522 }
Daniel Veillard5146f202001-04-25 10:29:44 +0000523 xmlBufferWriteChar(buf, ">");
Daniel Veillard7533cc82001-04-24 15:52:00 +0000524 return;
525 }
Owen Taylor3473f882001-02-23 17:55:21 +0000526 if (cur->type == HTML_ENTITY_REF_NODE) {
527 xmlBufferWriteChar(buf, "&");
528 xmlBufferWriteCHAR(buf, cur->name);
529 xmlBufferWriteChar(buf, ";");
530 return;
531 }
Daniel Veillard083c2662001-05-08 08:27:14 +0000532 if (cur->type == HTML_PRESERVE_NODE) {
533 if (cur->content != NULL) {
534#ifndef XML_USE_BUFFER_CONTENT
535 xmlBufferWriteCHAR(buf, cur->content);
536#else
537 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
538#endif
539 }
540 return;
541 }
Owen Taylor3473f882001-02-23 17:55:21 +0000542
543 /*
Daniel Veillard083c2662001-05-08 08:27:14 +0000544 * Get specific HTML info for taht node.
Owen Taylor3473f882001-02-23 17:55:21 +0000545 */
546 info = htmlTagLookup(cur->name);
547
548 xmlBufferWriteChar(buf, "<");
549 xmlBufferWriteCHAR(buf, cur->name);
550 if (cur->properties != NULL)
551 htmlAttrListDump(buf, doc, cur->properties);
552
553 if ((info != NULL) && (info->empty)) {
554 xmlBufferWriteChar(buf, ">");
555 if (cur->next != NULL) {
556 if ((cur->next->type != HTML_TEXT_NODE) &&
557 (cur->next->type != HTML_ENTITY_REF_NODE))
558 xmlBufferWriteChar(buf, "\n");
559 }
560 return;
561 }
562 if ((cur->content == NULL) && (cur->children == NULL)) {
Daniel Veillard083c2662001-05-08 08:27:14 +0000563 if ((info != NULL) && (info->saveEndTag != 0) &&
564 (strcmp(info->name, "html")) && (strcmp(info->name, "body"))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000565 xmlBufferWriteChar(buf, ">");
Daniel Veillard083c2662001-05-08 08:27:14 +0000566 } else {
Owen Taylor3473f882001-02-23 17:55:21 +0000567 xmlBufferWriteChar(buf, "></");
568 xmlBufferWriteCHAR(buf, cur->name);
569 xmlBufferWriteChar(buf, ">");
570 }
571 if (cur->next != NULL) {
572 if ((cur->next->type != HTML_TEXT_NODE) &&
573 (cur->next->type != HTML_ENTITY_REF_NODE))
574 xmlBufferWriteChar(buf, "\n");
575 }
576 return;
577 }
578 xmlBufferWriteChar(buf, ">");
579 if (cur->content != NULL) {
580 xmlChar *buffer;
581
582#ifndef XML_USE_BUFFER_CONTENT
Daniel Veillard083c2662001-05-08 08:27:14 +0000583 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
Owen Taylor3473f882001-02-23 17:55:21 +0000584#else
Daniel Veillard083c2662001-05-08 08:27:14 +0000585 buffer = xmlEncodeEntitiesReentrant(doc,
586 xmlBufferContent(cur->content));
Owen Taylor3473f882001-02-23 17:55:21 +0000587#endif
588 if (buffer != NULL) {
589 xmlBufferWriteCHAR(buf, buffer);
590 xmlFree(buffer);
591 }
592 }
593 if (cur->children != NULL) {
594 if ((cur->children->type != HTML_TEXT_NODE) &&
595 (cur->children->type != HTML_ENTITY_REF_NODE) &&
596 (cur->children != cur->last))
597 xmlBufferWriteChar(buf, "\n");
598 htmlNodeListDump(buf, doc, cur->children);
599 if ((cur->last->type != HTML_TEXT_NODE) &&
600 (cur->last->type != HTML_ENTITY_REF_NODE) &&
601 (cur->children != cur->last))
602 xmlBufferWriteChar(buf, "\n");
603 }
Owen Taylor3473f882001-02-23 17:55:21 +0000604 xmlBufferWriteChar(buf, "</");
605 xmlBufferWriteCHAR(buf, cur->name);
606 xmlBufferWriteChar(buf, ">");
Owen Taylor3473f882001-02-23 17:55:21 +0000607 if (cur->next != NULL) {
608 if ((cur->next->type != HTML_TEXT_NODE) &&
609 (cur->next->type != HTML_ENTITY_REF_NODE))
610 xmlBufferWriteChar(buf, "\n");
611 }
612}
613
614/**
615 * htmlNodeDumpFile:
616 * @out: the FILE pointer
617 * @doc: the document
618 * @cur: the current node
619 *
620 * Dump an HTML node, recursive behaviour,children are printed too.
621 */
622void
623htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
624 xmlBufferPtr buf;
625
626 buf = xmlBufferCreate();
627 if (buf == NULL) return;
628 htmlNodeDump(buf, doc, cur);
629 xmlBufferDump(out, buf);
630 xmlBufferFree(buf);
631}
632
633/**
634 * htmlDocContentDump:
635 * @buf: the HTML buffer output
636 * @cur: the document
637 *
638 * Dump an HTML document.
639 */
640static void
641htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
642 int type;
643
644 /*
645 * force to output the stuff as HTML, especially for entities
646 */
647 type = cur->type;
648 cur->type = XML_HTML_DOCUMENT_NODE;
649 if (cur->intSubset != NULL)
650 htmlDtdDump(buf, cur);
651 else {
652 /* Default to HTML-4.0 transitionnal @@@@ */
653 xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
654
655 }
656 if (cur->children != NULL) {
657 htmlNodeListDump(buf, cur, cur->children);
658 }
659 xmlBufferWriteChar(buf, "\n");
660 cur->type = (xmlElementType) type;
661}
662
663/**
664 * htmlDocDumpMemory:
665 * @cur: the document
666 * @mem: OUT: the memory pointer
Daniel Veillard2d703722001-05-30 18:32:34 +0000667 * @size: OUT: the memory length
Owen Taylor3473f882001-02-23 17:55:21 +0000668 *
669 * Dump an HTML document in memory and return the xmlChar * and it's size.
670 * It's up to the caller to free the memory.
671 */
672void
673htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
Daniel Veillard2d703722001-05-30 18:32:34 +0000674 xmlOutputBufferPtr buf;
675 xmlCharEncodingHandlerPtr handler = NULL;
676 const char *encoding;
Owen Taylor3473f882001-02-23 17:55:21 +0000677
678 if (cur == NULL) {
679#ifdef DEBUG_TREE
680 xmlGenericError(xmlGenericErrorContext,
Daniel Veillard2d703722001-05-30 18:32:34 +0000681 "htmlDocDumpMemory : document == NULL\n");
Owen Taylor3473f882001-02-23 17:55:21 +0000682#endif
683 *mem = NULL;
684 *size = 0;
685 return;
686 }
Daniel Veillard2d703722001-05-30 18:32:34 +0000687
688 encoding = (const char *) htmlGetMetaEncoding(cur);
689
690 if (encoding != NULL) {
691 xmlCharEncoding enc;
692
693 enc = xmlParseCharEncoding(encoding);
694 if (enc != cur->charset) {
695 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
696 /*
697 * Not supported yet
698 */
699 *mem = NULL;
700 *size = 0;
701 return;
702 }
703
704 handler = xmlFindCharEncodingHandler(encoding);
705 if (handler == NULL) {
706 *mem = NULL;
707 *size = 0;
708 return;
709 }
710 }
711 }
712
713 /*
714 * Fallback to HTML or ASCII when the encoding is unspecified
715 */
716 if (handler == NULL)
717 handler = xmlFindCharEncodingHandler("HTML");
718 if (handler == NULL)
719 handler = xmlFindCharEncodingHandler("ascii");
720
721 buf = xmlAllocOutputBuffer(handler);
Owen Taylor3473f882001-02-23 17:55:21 +0000722 if (buf == NULL) {
723 *mem = NULL;
724 *size = 0;
725 return;
726 }
Daniel Veillard2d703722001-05-30 18:32:34 +0000727
728 htmlDocContentDumpOutput(buf, cur, NULL);
729 xmlOutputBufferFlush(buf);
730 if (buf->conv != NULL) {
731 *size = buf->conv->use;
732 *mem = xmlStrndup(buf->conv->content, *size);
733 } else {
734 *size = buf->buffer->use;
735 *mem = xmlStrndup(buf->buffer->content, *size);
736 }
737 (void)xmlOutputBufferClose(buf);
Owen Taylor3473f882001-02-23 17:55:21 +0000738}
739
740
741/************************************************************************
742 * *
743 * Dumping HTML tree content to an I/O output buffer *
744 * *
745 ************************************************************************/
746
747/**
748 * htmlDtdDump:
749 * @buf: the HTML buffer output
750 * @doc: the document
751 * @encoding: the encoding string
752 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000753 * TODO: check whether encoding is needed
754 *
Owen Taylor3473f882001-02-23 17:55:21 +0000755 * Dump the HTML document DTD, if any.
756 */
757static void
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000758htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
Daniel Veillardc86a4fa2001-03-26 16:28:29 +0000759 const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +0000760 xmlDtdPtr cur = doc->intSubset;
761
762 if (cur == NULL) {
763 xmlGenericError(xmlGenericErrorContext,
764 "htmlDtdDump : no internal subset\n");
765 return;
766 }
767 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
768 xmlOutputBufferWriteString(buf, (const char *)cur->name);
769 if (cur->ExternalID != NULL) {
770 xmlOutputBufferWriteString(buf, " PUBLIC ");
771 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
772 if (cur->SystemID != NULL) {
773 xmlOutputBufferWriteString(buf, " ");
774 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
775 }
776 } else if (cur->SystemID != NULL) {
777 xmlOutputBufferWriteString(buf, " SYSTEM ");
778 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
779 }
780 xmlOutputBufferWriteString(buf, ">\n");
781}
782
783/**
784 * htmlAttrDump:
785 * @buf: the HTML buffer output
786 * @doc: the document
787 * @cur: the attribute pointer
788 * @encoding: the encoding string
789 *
790 * Dump an HTML attribute
791 */
792static void
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000793htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
Daniel Veillardc86a4fa2001-03-26 16:28:29 +0000794 const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +0000795 xmlChar *value;
796
797 if (cur == NULL) {
798 xmlGenericError(xmlGenericErrorContext,
799 "htmlAttrDump : property == NULL\n");
800 return;
801 }
802 xmlOutputBufferWriteString(buf, " ");
803 xmlOutputBufferWriteString(buf, (const char *)cur->name);
804 if (cur->children != NULL) {
805 value = xmlNodeListGetString(doc, cur->children, 0);
806 if (value) {
807 xmlOutputBufferWriteString(buf, "=");
808 xmlBufferWriteQuotedString(buf->buffer, value);
809 xmlFree(value);
810 } else {
811 xmlOutputBufferWriteString(buf, "=\"\"");
812 }
813 }
814}
815
816/**
817 * htmlAttrListDump:
818 * @buf: the HTML buffer output
819 * @doc: the document
820 * @cur: the first attribute pointer
821 * @encoding: the encoding string
822 *
823 * Dump a list of HTML attributes
824 */
825static void
826htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
827 if (cur == NULL) {
828 xmlGenericError(xmlGenericErrorContext,
829 "htmlAttrListDump : property == NULL\n");
830 return;
831 }
832 while (cur != NULL) {
833 htmlAttrDumpOutput(buf, doc, cur, encoding);
834 cur = cur->next;
835 }
836}
837
838
839void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
840 xmlNodePtr cur, const char *encoding);
841
842/**
843 * htmlNodeListDump:
844 * @buf: the HTML buffer output
845 * @doc: the document
846 * @cur: the first node
847 * @encoding: the encoding string
848 *
849 * Dump an HTML node list, recursive behaviour,children are printed too.
850 */
851static void
852htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
853 if (cur == NULL) {
854 xmlGenericError(xmlGenericErrorContext,
855 "htmlNodeListDump : node == NULL\n");
856 return;
857 }
858 while (cur != NULL) {
859 htmlNodeDumpOutput(buf, doc, cur, encoding);
860 cur = cur->next;
861 }
862}
863
864/**
865 * htmlNodeDumpOutput:
866 * @buf: the HTML buffer output
867 * @doc: the document
868 * @cur: the current node
869 * @encoding: the encoding string
870 *
871 * Dump an HTML node, recursive behaviour,children are printed too.
872 */
873void
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000874htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
875 xmlNodePtr cur, const char *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +0000876 htmlElemDescPtr info;
877
878 if (cur == NULL) {
879 xmlGenericError(xmlGenericErrorContext,
880 "htmlNodeDump : node == NULL\n");
881 return;
882 }
883 /*
884 * Special cases.
885 */
886 if (cur->type == XML_DTD_NODE)
887 return;
888 if (cur->type == XML_HTML_DOCUMENT_NODE) {
889 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
890 return;
891 }
892 if (cur->type == HTML_TEXT_NODE) {
893 if (cur->content != NULL) {
Daniel Veillard6e93c4a2001-06-05 20:57:42 +0000894 if (((cur->name == xmlStringText) ||
895 (cur->name != xmlStringTextNoenc)) &&
896 ((cur->parent == NULL) ||
897 (!xmlStrEqual(cur->parent->name, BAD_CAST "script")))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000898 xmlChar *buffer;
899
900#ifndef XML_USE_BUFFER_CONTENT
901 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
902#else
903 buffer = xmlEncodeEntitiesReentrant(doc,
904 xmlBufferContent(cur->content));
905#endif
906 if (buffer != NULL) {
907 xmlOutputBufferWriteString(buf, (const char *)buffer);
908 xmlFree(buffer);
909 }
910 } else {
911 xmlOutputBufferWriteString(buf, (const char *)cur->content);
912 }
913 }
914 return;
915 }
916 if (cur->type == HTML_COMMENT_NODE) {
917 if (cur->content != NULL) {
918 xmlOutputBufferWriteString(buf, "<!--");
919#ifndef XML_USE_BUFFER_CONTENT
920 xmlOutputBufferWriteString(buf, (const char *)cur->content);
921#else
922 xmlOutputBufferWriteString(buf, (const char *)
923 xmlBufferContent(cur->content));
924#endif
925 xmlOutputBufferWriteString(buf, "-->");
926 }
927 return;
928 }
Daniel Veillard7533cc82001-04-24 15:52:00 +0000929 if (cur->type == HTML_PI_NODE) {
Daniel Veillard5146f202001-04-25 10:29:44 +0000930 if (cur->name == NULL)
931 return;
932 xmlOutputBufferWriteString(buf, "<?");
933 xmlOutputBufferWriteString(buf, (const char *)cur->name);
Daniel Veillard7533cc82001-04-24 15:52:00 +0000934 if (cur->content != NULL) {
Daniel Veillard5146f202001-04-25 10:29:44 +0000935 xmlOutputBufferWriteString(buf, " ");
Daniel Veillard7533cc82001-04-24 15:52:00 +0000936#ifndef XML_USE_BUFFER_CONTENT
937 xmlOutputBufferWriteString(buf, (const char *)cur->content);
938#else
939 xmlOutputBufferWriteString(buf, (const char *)
940 xmlBufferContent(cur->content));
941#endif
Daniel Veillard7533cc82001-04-24 15:52:00 +0000942 }
Daniel Veillard5146f202001-04-25 10:29:44 +0000943 xmlOutputBufferWriteString(buf, ">");
Daniel Veillard7533cc82001-04-24 15:52:00 +0000944 return;
945 }
Owen Taylor3473f882001-02-23 17:55:21 +0000946 if (cur->type == HTML_ENTITY_REF_NODE) {
947 xmlOutputBufferWriteString(buf, "&");
948 xmlOutputBufferWriteString(buf, (const char *)cur->name);
949 xmlOutputBufferWriteString(buf, ";");
950 return;
951 }
952 if (cur->type == HTML_PRESERVE_NODE) {
953 if (cur->content != NULL) {
954#ifndef XML_USE_BUFFER_CONTENT
955 xmlOutputBufferWriteString(buf, (const char *)cur->content);
956#else
957 xmlOutputBufferWriteString(buf, (const char *)
958 xmlBufferContent(cur->content));
959#endif
960 }
961 return;
962 }
963
964 /*
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000965 * Get specific HTML info for taht node.
Owen Taylor3473f882001-02-23 17:55:21 +0000966 */
967 info = htmlTagLookup(cur->name);
968
969 xmlOutputBufferWriteString(buf, "<");
970 xmlOutputBufferWriteString(buf, (const char *)cur->name);
971 if (cur->properties != NULL)
972 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
973
974 if ((info != NULL) && (info->empty)) {
975 xmlOutputBufferWriteString(buf, ">");
976 if (cur->next != NULL) {
977 if ((cur->next->type != HTML_TEXT_NODE) &&
978 (cur->next->type != HTML_ENTITY_REF_NODE))
979 xmlOutputBufferWriteString(buf, "\n");
980 }
981 return;
982 }
983 if ((cur->content == NULL) && (cur->children == NULL)) {
984 if ((info != NULL) && (info->saveEndTag != 0) &&
985 (strcmp(info->name, "html")) && (strcmp(info->name, "body"))) {
986 xmlOutputBufferWriteString(buf, ">");
987 } else {
988 xmlOutputBufferWriteString(buf, "></");
989 xmlOutputBufferWriteString(buf, (const char *)cur->name);
990 xmlOutputBufferWriteString(buf, ">");
991 }
992 if (cur->next != NULL) {
993 if ((cur->next->type != HTML_TEXT_NODE) &&
994 (cur->next->type != HTML_ENTITY_REF_NODE))
995 xmlOutputBufferWriteString(buf, "\n");
996 }
997 return;
998 }
999 xmlOutputBufferWriteString(buf, ">");
1000 if (cur->content != NULL) {
1001 /*
1002 * Uses the OutputBuffer property to automatically convert
1003 * invalids to charrefs
1004 */
1005
1006#ifndef XML_USE_BUFFER_CONTENT
1007 xmlOutputBufferWriteString(buf, (const char *) cur->content);
1008#else
1009 xmlOutputBufferWriteString(buf,
1010 (const char *) xmlBufferContent(cur->content));
1011#endif
1012 }
1013 if (cur->children != NULL) {
1014 if ((cur->children->type != HTML_TEXT_NODE) &&
1015 (cur->children->type != HTML_ENTITY_REF_NODE) &&
1016 (cur->children != cur->last))
1017 xmlOutputBufferWriteString(buf, "\n");
1018 htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
1019 if ((cur->last->type != HTML_TEXT_NODE) &&
1020 (cur->last->type != HTML_ENTITY_REF_NODE) &&
1021 (cur->children != cur->last))
1022 xmlOutputBufferWriteString(buf, "\n");
1023 }
Owen Taylor3473f882001-02-23 17:55:21 +00001024 xmlOutputBufferWriteString(buf, "</");
1025 xmlOutputBufferWriteString(buf, (const char *)cur->name);
1026 xmlOutputBufferWriteString(buf, ">");
Owen Taylor3473f882001-02-23 17:55:21 +00001027 if (cur->next != NULL) {
1028 if ((cur->next->type != HTML_TEXT_NODE) &&
1029 (cur->next->type != HTML_ENTITY_REF_NODE))
1030 xmlOutputBufferWriteString(buf, "\n");
1031 }
1032}
1033
1034/**
1035 * htmlDocContentDump:
1036 * @buf: the HTML buffer output
1037 * @cur: the document
1038 * @encoding: the encoding string
1039 *
1040 * Dump an HTML document.
1041 */
1042void
1043htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
1044 int type;
1045
1046 /*
1047 * force to output the stuff as HTML, especially for entities
1048 */
1049 type = cur->type;
1050 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillard4dd93462001-04-02 15:16:19 +00001051 if (cur->intSubset != NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00001052 htmlDtdDumpOutput(buf, cur, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001053 }
1054 if (cur->children != NULL) {
1055 htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
1056 }
1057 xmlOutputBufferWriteString(buf, "\n");
1058 cur->type = (xmlElementType) type;
1059}
1060
Owen Taylor3473f882001-02-23 17:55:21 +00001061/************************************************************************
1062 * *
1063 * Saving functions front-ends *
1064 * *
1065 ************************************************************************/
1066
1067/**
1068 * htmlDocDump:
1069 * @f: the FILE*
1070 * @cur: the document
1071 *
1072 * Dump an HTML document to an open FILE.
1073 *
1074 * returns: the number of byte written or -1 in case of failure.
1075 */
1076int
1077htmlDocDump(FILE *f, xmlDocPtr cur) {
1078 xmlOutputBufferPtr buf;
1079 xmlCharEncodingHandlerPtr handler = NULL;
1080 const char *encoding;
1081 int ret;
1082
1083 if (cur == NULL) {
1084#ifdef DEBUG_TREE
1085 xmlGenericError(xmlGenericErrorContext,
1086 "htmlDocDump : document == NULL\n");
1087#endif
1088 return(-1);
1089 }
1090
1091 encoding = (const char *) htmlGetMetaEncoding(cur);
1092
1093 if (encoding != NULL) {
1094 xmlCharEncoding enc;
1095
1096 enc = xmlParseCharEncoding(encoding);
1097 if (enc != cur->charset) {
1098 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1099 /*
1100 * Not supported yet
1101 */
1102 return(-1);
1103 }
1104
1105 handler = xmlFindCharEncodingHandler(encoding);
1106 if (handler == NULL)
1107 return(-1);
1108 }
1109 }
1110
1111 /*
1112 * Fallback to HTML or ASCII when the encoding is unspecified
1113 */
1114 if (handler == NULL)
1115 handler = xmlFindCharEncodingHandler("HTML");
1116 if (handler == NULL)
1117 handler = xmlFindCharEncodingHandler("ascii");
1118
1119 buf = xmlOutputBufferCreateFile(f, handler);
1120 if (buf == NULL) return(-1);
1121 htmlDocContentDumpOutput(buf, cur, NULL);
1122
1123 ret = xmlOutputBufferClose(buf);
1124 return(ret);
1125}
1126
1127/**
1128 * htmlSaveFile:
1129 * @filename: the filename (or URL)
1130 * @cur: the document
1131 *
1132 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1133 * used.
1134 * returns: the number of byte written or -1 in case of failure.
1135 */
1136int
1137htmlSaveFile(const char *filename, xmlDocPtr cur) {
1138 xmlOutputBufferPtr buf;
1139 xmlCharEncodingHandlerPtr handler = NULL;
1140 const char *encoding;
1141 int ret;
1142
1143 encoding = (const char *) htmlGetMetaEncoding(cur);
1144
1145 if (encoding != NULL) {
1146 xmlCharEncoding enc;
1147
1148 enc = xmlParseCharEncoding(encoding);
1149 if (enc != cur->charset) {
1150 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1151 /*
1152 * Not supported yet
1153 */
1154 return(-1);
1155 }
1156
1157 handler = xmlFindCharEncodingHandler(encoding);
1158 if (handler == NULL)
1159 return(-1);
1160 }
1161 }
1162
1163 /*
1164 * Fallback to HTML or ASCII when the encoding is unspecified
1165 */
1166 if (handler == NULL)
1167 handler = xmlFindCharEncodingHandler("HTML");
1168 if (handler == NULL)
1169 handler = xmlFindCharEncodingHandler("ascii");
1170
1171 /*
1172 * save the content to a temp buffer.
1173 */
1174 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1175 if (buf == NULL) return(0);
1176
1177 htmlDocContentDumpOutput(buf, cur, NULL);
1178
1179 ret = xmlOutputBufferClose(buf);
1180 return(ret);
1181}
1182
1183/**
1184 * htmlSaveFileEnc:
1185 * @filename: the filename
1186 * @cur: the document
1187 *
1188 * Dump an HTML document to a file using a given encoding.
1189 *
1190 * returns: the number of byte written or -1 in case of failure.
1191 */
1192int
1193htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1194 xmlOutputBufferPtr buf;
1195 xmlCharEncodingHandlerPtr handler = NULL;
1196 int ret;
1197
1198 if (encoding != NULL) {
1199 xmlCharEncoding enc;
1200
1201 enc = xmlParseCharEncoding(encoding);
1202 if (enc != cur->charset) {
1203 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1204 /*
1205 * Not supported yet
1206 */
1207 return(-1);
1208 }
1209
1210 handler = xmlFindCharEncodingHandler(encoding);
1211 if (handler == NULL)
1212 return(-1);
1213 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1214 }
Daniel Veillard4dd93462001-04-02 15:16:19 +00001215 } else {
1216 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
Owen Taylor3473f882001-02-23 17:55:21 +00001217 }
1218
1219 /*
1220 * Fallback to HTML or ASCII when the encoding is unspecified
1221 */
1222 if (handler == NULL)
1223 handler = xmlFindCharEncodingHandler("HTML");
1224 if (handler == NULL)
1225 handler = xmlFindCharEncodingHandler("ascii");
1226
1227 /*
1228 * save the content to a temp buffer.
1229 */
1230 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1231 if (buf == NULL) return(0);
1232
1233 htmlDocContentDumpOutput(buf, cur, encoding);
1234
1235 ret = xmlOutputBufferClose(buf);
1236 return(ret);
1237}
1238#endif /* LIBXML_HTML_ENABLED */