blob: 51a72f91f1f13a1ef8479b1e631949cfb5969c3a [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * HTMLtree.c : implementation of access functions for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * Daniel.Veillard@w3.org
7 */
8
9
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
12
Owen Taylor3473f882001-02-23 17:55:21 +000013#include <string.h> /* for memset() only ! */
14
15#ifdef HAVE_CTYPE_H
16#include <ctype.h>
17#endif
18#ifdef HAVE_STDLIB_H
19#include <stdlib.h>
20#endif
21
22#include <libxml/xmlmemory.h>
23#include <libxml/HTMLparser.h>
24#include <libxml/HTMLtree.h>
25#include <libxml/entities.h>
26#include <libxml/valid.h>
27#include <libxml/xmlerror.h>
28#include <libxml/parserInternals.h>
29
30/************************************************************************
31 * *
32 * Getting/Setting encoding meta tags *
33 * *
34 ************************************************************************/
35
36/**
37 * htmlGetMetaEncoding:
38 * @doc: the document
39 *
40 * Encoding definition lookup in the Meta tags
41 *
42 * Returns the current encoding as flagged in the HTML source
43 */
44const xmlChar *
45htmlGetMetaEncoding(htmlDocPtr doc) {
46 htmlNodePtr cur;
47 const xmlChar *content;
48 const xmlChar *encoding;
49
50 if (doc == NULL)
51 return(NULL);
52 cur = doc->children;
53
54 /*
55 * Search the html
56 */
57 while (cur != NULL) {
58 if (cur->name != NULL) {
59 if (xmlStrEqual(cur->name, BAD_CAST"html"))
60 break;
61 if (xmlStrEqual(cur->name, BAD_CAST"head"))
62 goto found_head;
63 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
64 goto found_meta;
65 }
66 cur = cur->next;
67 }
68 if (cur == NULL)
69 return(NULL);
70 cur = cur->children;
71
72 /*
73 * Search the head
74 */
75 while (cur != NULL) {
76 if (cur->name != NULL) {
77 if (xmlStrEqual(cur->name, BAD_CAST"head"))
78 break;
79 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
80 goto found_meta;
81 }
82 cur = cur->next;
83 }
84 if (cur == NULL)
85 return(NULL);
86found_head:
87 cur = cur->children;
88
89 /*
90 * Search the meta elements
91 */
92found_meta:
93 while (cur != NULL) {
94 if (cur->name != NULL) {
95 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
96 xmlAttrPtr attr = cur->properties;
97 int http;
98 const xmlChar *value;
99
100 content = NULL;
101 http = 0;
102 while (attr != NULL) {
103 if ((attr->children != NULL) &&
104 (attr->children->type == XML_TEXT_NODE) &&
105 (attr->children->next == NULL)) {
106#ifndef XML_USE_BUFFER_CONTENT
107 value = attr->children->content;
108#else
109 value = xmlBufferContent(attr->children->content);
110#endif
111 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
112 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
113 http = 1;
114 else if ((value != NULL)
115 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
116 content = value;
117 if ((http != 0) && (content != NULL))
118 goto found_content;
119 }
120 attr = attr->next;
121 }
122 }
123 }
124 cur = cur->next;
125 }
126 return(NULL);
127
128found_content:
129 encoding = xmlStrstr(content, BAD_CAST"charset=");
130 if (encoding == NULL)
131 encoding = xmlStrstr(content, BAD_CAST"Charset=");
132 if (encoding == NULL)
133 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
134 if (encoding != NULL) {
135 encoding += 8;
136 } else {
137 encoding = xmlStrstr(content, BAD_CAST"charset =");
138 if (encoding == NULL)
139 encoding = xmlStrstr(content, BAD_CAST"Charset =");
140 if (encoding == NULL)
141 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
142 if (encoding != NULL)
143 encoding += 9;
144 }
145 if (encoding != NULL) {
146 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
147 }
148 return(encoding);
149}
150
151/**
152 * htmlSetMetaEncoding:
153 * @doc: the document
154 * @encoding: the encoding string
155 *
156 * Sets the current encoding in the Meta tags
157 * NOTE: this will not change the document content encoding, just
158 * the META flag associated.
159 *
160 * Returns 0 in case of success and -1 in case of error
161 */
162int
163htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
164 htmlNodePtr cur, meta;
165 const xmlChar *content;
166 char newcontent[100];
167
168
169 if (doc == NULL)
170 return(-1);
171
172 if (encoding != NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +0000173 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
174 encoding);
Owen Taylor3473f882001-02-23 17:55:21 +0000175 newcontent[sizeof(newcontent) - 1] = 0;
176 }
177
178 cur = doc->children;
179
180 /*
181 * Search the html
182 */
183 while (cur != NULL) {
184 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000185/*
Owen Taylor3473f882001-02-23 17:55:21 +0000186 if (xmlStrEqual(cur->name, BAD_CAST"html"))
187 break;
188 if (xmlStrEqual(cur->name, BAD_CAST"body")) {
189 if (encoding == NULL)
190 return(0);
191 meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
192 xmlAddPrevSibling(cur, meta);
193 cur = meta;
194 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
195 xmlAddChild(cur, meta);
196 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
197 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
198 return(0);
199 }
200 if (xmlStrEqual(cur->name, BAD_CAST"head"))
201 goto found_head;
202 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
203 goto found_meta;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000204*/
205 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
206 break;
207 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
208 goto found_head;
209 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
210 goto found_meta;
Owen Taylor3473f882001-02-23 17:55:21 +0000211 }
212 cur = cur->next;
213 }
214 if (cur == NULL)
215 return(-1);
216 cur = cur->children;
217
218 /*
219 * Search the head
220 */
221 while (cur != NULL) {
222 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000223/*
Owen Taylor3473f882001-02-23 17:55:21 +0000224 if (xmlStrEqual(cur->name, BAD_CAST"head"))
225 break;
226 if (xmlStrEqual(cur->name, BAD_CAST"body")) {
227 if (encoding == NULL)
228 return(0);
229 meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
230 xmlAddPrevSibling(cur, meta);
231 cur = meta;
232 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
233 xmlAddChild(cur, meta);
234 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
235 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
236 return(0);
237 }
238 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
239 goto found_meta;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000240*/
241 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
242 break;
243 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
244 goto found_meta;
Owen Taylor3473f882001-02-23 17:55:21 +0000245 }
246 cur = cur->next;
247 }
248 if (cur == NULL)
249 return(-1);
250found_head:
251 if (cur->children == NULL) {
252 if (encoding == NULL)
253 return(0);
254 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
255 xmlAddChild(cur, meta);
Owen Taylor3473f882001-02-23 17:55:21 +0000256 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000257 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
Owen Taylor3473f882001-02-23 17:55:21 +0000258 return(0);
259 }
260 cur = cur->children;
261
262found_meta:
263 if (encoding != NULL) {
264 /*
265 * Create a new Meta element with the right aatributes
266 */
267
268 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
269 xmlAddPrevSibling(cur, meta);
Owen Taylor3473f882001-02-23 17:55:21 +0000270 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000271 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
Owen Taylor3473f882001-02-23 17:55:21 +0000272 }
273
274 /*
275 * Search and destroy all the remaining the meta elements carrying
276 * encoding informations
277 */
278 while (cur != NULL) {
279 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000280 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000281 xmlAttrPtr attr = cur->properties;
282 int http;
283 const xmlChar *value;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000284 int same_charset;
Owen Taylor3473f882001-02-23 17:55:21 +0000285
286 content = NULL;
287 http = 0;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000288 same_charset = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000289 while (attr != NULL) {
290 if ((attr->children != NULL) &&
291 (attr->children->type == XML_TEXT_NODE) &&
292 (attr->children->next == NULL)) {
293#ifndef XML_USE_BUFFER_CONTENT
294 value = attr->children->content;
295#else
296 value = xmlBufferContent(attr->children->content);
297#endif
298 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
299 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
300 http = 1;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000301 else
302 {
303 if ((value != NULL) &&
304 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
305 content = value;
306 else
307 if ((!xmlStrcasecmp(attr->name, BAD_CAST"charset"))
308 && (!xmlStrcasecmp(value, encoding)))
309 same_charset = 1;
310 }
311 if ((http != 0) && (content != NULL) && (same_charset != 0))
Owen Taylor3473f882001-02-23 17:55:21 +0000312 break;
313 }
314 attr = attr->next;
315 }
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000316 if ((http != 0) && (content != NULL) && (same_charset != 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000317 meta = cur;
318 cur = cur->next;
319 xmlUnlinkNode(meta);
320 xmlFreeNode(meta);
321 continue;
322 }
323
324 }
325 }
326 cur = cur->next;
327 }
328 return(0);
329}
330
331/************************************************************************
332 * *
333 * Dumping HTML tree content to a simple buffer *
334 * *
335 ************************************************************************/
336
337static void
338htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
339
340/**
341 * htmlDtdDump:
342 * @buf: the HTML buffer output
343 * @doc: the document
344 *
345 * Dump the HTML document DTD, if any.
346 */
347static void
348htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
349 xmlDtdPtr cur = doc->intSubset;
350
351 if (cur == NULL) {
352 xmlGenericError(xmlGenericErrorContext,
353 "htmlDtdDump : no internal subset\n");
354 return;
355 }
356 xmlBufferWriteChar(buf, "<!DOCTYPE ");
357 xmlBufferWriteCHAR(buf, cur->name);
358 if (cur->ExternalID != NULL) {
359 xmlBufferWriteChar(buf, " PUBLIC ");
360 xmlBufferWriteQuotedString(buf, cur->ExternalID);
361 if (cur->SystemID != NULL) {
362 xmlBufferWriteChar(buf, " ");
363 xmlBufferWriteQuotedString(buf, cur->SystemID);
364 }
365 } else if (cur->SystemID != NULL) {
366 xmlBufferWriteChar(buf, " SYSTEM ");
367 xmlBufferWriteQuotedString(buf, cur->SystemID);
368 }
369 xmlBufferWriteChar(buf, ">\n");
370}
371
372/**
373 * htmlAttrDump:
374 * @buf: the HTML buffer output
375 * @doc: the document
376 * @cur: the attribute pointer
377 *
378 * Dump an HTML attribute
379 */
380static void
381htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
382 xmlChar *value;
383
384 if (cur == NULL) {
385 xmlGenericError(xmlGenericErrorContext,
386 "htmlAttrDump : property == NULL\n");
387 return;
388 }
389 xmlBufferWriteChar(buf, " ");
390 xmlBufferWriteCHAR(buf, cur->name);
391 if (cur->children != NULL) {
392 value = xmlNodeListGetString(doc, cur->children, 0);
393 if (value) {
394 xmlBufferWriteChar(buf, "=");
395 xmlBufferWriteQuotedString(buf, value);
396 xmlFree(value);
397 } else {
398 xmlBufferWriteChar(buf, "=\"\"");
399 }
400 }
401}
402
403/**
404 * htmlAttrListDump:
405 * @buf: the HTML buffer output
406 * @doc: the document
407 * @cur: the first attribute pointer
408 *
409 * Dump a list of HTML attributes
410 */
411static void
412htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
413 if (cur == NULL) {
414 xmlGenericError(xmlGenericErrorContext,
415 "htmlAttrListDump : property == NULL\n");
416 return;
417 }
418 while (cur != NULL) {
419 htmlAttrDump(buf, doc, cur);
420 cur = cur->next;
421 }
422}
423
424
425void
426htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
427/**
428 * htmlNodeListDump:
429 * @buf: the HTML buffer output
430 * @doc: the document
431 * @cur: the first node
432 *
433 * Dump an HTML node list, recursive behaviour,children are printed too.
434 */
435static void
436htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
437 if (cur == NULL) {
438 xmlGenericError(xmlGenericErrorContext,
439 "htmlNodeListDump : node == NULL\n");
440 return;
441 }
442 while (cur != NULL) {
443 htmlNodeDump(buf, doc, cur);
444 cur = cur->next;
445 }
446}
447
448/**
449 * htmlNodeDump:
450 * @buf: the HTML buffer output
451 * @doc: the document
452 * @cur: the current node
453 *
454 * Dump an HTML node, recursive behaviour,children are printed too.
455 */
456void
457htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
458 htmlElemDescPtr info;
459
460 if (cur == NULL) {
461 xmlGenericError(xmlGenericErrorContext,
462 "htmlNodeDump : node == NULL\n");
463 return;
464 }
465 /*
466 * Special cases.
467 */
468 if (cur->type == XML_DTD_NODE)
469 return;
470 if (cur->type == XML_HTML_DOCUMENT_NODE) {
471 htmlDocContentDump(buf, (xmlDocPtr) cur);
472 return;
473 }
474 if (cur->type == HTML_TEXT_NODE) {
475 if (cur->content != NULL) {
476 if ((cur->name == xmlStringText) ||
477 (cur->name != xmlStringTextNoenc)) {
478 xmlChar *buffer;
479
480#ifndef XML_USE_BUFFER_CONTENT
481 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
482#else
483 buffer = xmlEncodeEntitiesReentrant(doc,
484 xmlBufferContent(cur->content));
485#endif
486 if (buffer != NULL) {
487 xmlBufferWriteCHAR(buf, buffer);
488 xmlFree(buffer);
489 }
490 } else {
491 xmlBufferWriteCHAR(buf, cur->content);
492 }
493 }
494 return;
495 }
Daniel Veillardde57c612001-04-23 09:13:36 +0000496 if (cur->type == HTML_PRESERVE_NODE) {
497 if (cur->content != NULL) {
498#ifndef XML_USE_BUFFER_CONTENT
499 xmlBufferWriteCHAR(buf, cur->content);
500#else
501 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
502#endif
503 }
504 return;
505 }
Owen Taylor3473f882001-02-23 17:55:21 +0000506 if (cur->type == HTML_COMMENT_NODE) {
507 if (cur->content != NULL) {
508 xmlBufferWriteChar(buf, "<!--");
509#ifndef XML_USE_BUFFER_CONTENT
510 xmlBufferWriteCHAR(buf, cur->content);
511#else
512 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
513#endif
514 xmlBufferWriteChar(buf, "-->");
515 }
516 return;
517 }
Daniel Veillard7533cc82001-04-24 15:52:00 +0000518 if (cur->type == HTML_PI_NODE) {
519 if (cur->content != NULL) {
520 xmlBufferWriteChar(buf, "<?");
521#ifndef XML_USE_BUFFER_CONTENT
522 xmlBufferWriteCHAR(buf, cur->content);
523#else
524 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
525#endif
526 xmlBufferWriteChar(buf, ">");
527 }
528 return;
529 }
Owen Taylor3473f882001-02-23 17:55:21 +0000530 if (cur->type == HTML_ENTITY_REF_NODE) {
531 xmlBufferWriteChar(buf, "&");
532 xmlBufferWriteCHAR(buf, cur->name);
533 xmlBufferWriteChar(buf, ";");
534 return;
535 }
536
537 /*
538 * Get specific HTmL info for taht node.
539 */
540 info = htmlTagLookup(cur->name);
541
542 xmlBufferWriteChar(buf, "<");
543 xmlBufferWriteCHAR(buf, cur->name);
544 if (cur->properties != NULL)
545 htmlAttrListDump(buf, doc, cur->properties);
546
547 if ((info != NULL) && (info->empty)) {
548 xmlBufferWriteChar(buf, ">");
549 if (cur->next != NULL) {
550 if ((cur->next->type != HTML_TEXT_NODE) &&
551 (cur->next->type != HTML_ENTITY_REF_NODE))
552 xmlBufferWriteChar(buf, "\n");
553 }
554 return;
555 }
556 if ((cur->content == NULL) && (cur->children == NULL)) {
557 if ((info != NULL) && (info->endTag != 0))
558 xmlBufferWriteChar(buf, ">");
559 else {
560 xmlBufferWriteChar(buf, "></");
561 xmlBufferWriteCHAR(buf, cur->name);
562 xmlBufferWriteChar(buf, ">");
563 }
564 if (cur->next != NULL) {
565 if ((cur->next->type != HTML_TEXT_NODE) &&
566 (cur->next->type != HTML_ENTITY_REF_NODE))
567 xmlBufferWriteChar(buf, "\n");
568 }
569 return;
570 }
571 xmlBufferWriteChar(buf, ">");
572 if (cur->content != NULL) {
573 xmlChar *buffer;
574
575#ifndef XML_USE_BUFFER_CONTENT
576 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
577#else
578 buffer = xmlEncodeEntitiesReentrant(doc,
579 xmlBufferContent(cur->content));
580#endif
581 if (buffer != NULL) {
582 xmlBufferWriteCHAR(buf, buffer);
583 xmlFree(buffer);
584 }
585 }
586 if (cur->children != NULL) {
587 if ((cur->children->type != HTML_TEXT_NODE) &&
588 (cur->children->type != HTML_ENTITY_REF_NODE) &&
589 (cur->children != cur->last))
590 xmlBufferWriteChar(buf, "\n");
591 htmlNodeListDump(buf, doc, cur->children);
592 if ((cur->last->type != HTML_TEXT_NODE) &&
593 (cur->last->type != HTML_ENTITY_REF_NODE) &&
594 (cur->children != cur->last))
595 xmlBufferWriteChar(buf, "\n");
596 }
597 if (!htmlIsAutoClosed(doc, cur)) {
598 xmlBufferWriteChar(buf, "</");
599 xmlBufferWriteCHAR(buf, cur->name);
600 xmlBufferWriteChar(buf, ">");
601 }
Owen Taylor3473f882001-02-23 17:55:21 +0000602 xmlBufferWriteChar(buf, "</");
603 xmlBufferWriteCHAR(buf, cur->name);
604 xmlBufferWriteChar(buf, ">");
Owen Taylor3473f882001-02-23 17:55:21 +0000605 if (cur->next != NULL) {
606 if ((cur->next->type != HTML_TEXT_NODE) &&
607 (cur->next->type != HTML_ENTITY_REF_NODE))
608 xmlBufferWriteChar(buf, "\n");
609 }
610}
611
612/**
613 * htmlNodeDumpFile:
614 * @out: the FILE pointer
615 * @doc: the document
616 * @cur: the current node
617 *
618 * Dump an HTML node, recursive behaviour,children are printed too.
619 */
620void
621htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
622 xmlBufferPtr buf;
623
624 buf = xmlBufferCreate();
625 if (buf == NULL) return;
626 htmlNodeDump(buf, doc, cur);
627 xmlBufferDump(out, buf);
628 xmlBufferFree(buf);
629}
630
631/**
632 * htmlDocContentDump:
633 * @buf: the HTML buffer output
634 * @cur: the document
635 *
636 * Dump an HTML document.
637 */
638static void
639htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
640 int type;
641
642 /*
643 * force to output the stuff as HTML, especially for entities
644 */
645 type = cur->type;
646 cur->type = XML_HTML_DOCUMENT_NODE;
647 if (cur->intSubset != NULL)
648 htmlDtdDump(buf, cur);
649 else {
650 /* Default to HTML-4.0 transitionnal @@@@ */
651 xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
652
653 }
654 if (cur->children != NULL) {
655 htmlNodeListDump(buf, cur, cur->children);
656 }
657 xmlBufferWriteChar(buf, "\n");
658 cur->type = (xmlElementType) type;
659}
660
661/**
662 * htmlDocDumpMemory:
663 * @cur: the document
664 * @mem: OUT: the memory pointer
665 * @size: OUT: the memory lenght
666 *
667 * Dump an HTML document in memory and return the xmlChar * and it's size.
668 * It's up to the caller to free the memory.
669 */
670void
671htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
672 xmlBufferPtr buf;
673
674 if (cur == NULL) {
675#ifdef DEBUG_TREE
676 xmlGenericError(xmlGenericErrorContext,
677 "htmlxmlDocDumpMemory : document == NULL\n");
678#endif
679 *mem = NULL;
680 *size = 0;
681 return;
682 }
683 buf = xmlBufferCreate();
684 if (buf == NULL) {
685 *mem = NULL;
686 *size = 0;
687 return;
688 }
689 htmlDocContentDump(buf, cur);
690 *mem = buf->content;
691 *size = buf->use;
Owen Taylor3473f882001-02-23 17:55:21 +0000692 xmlFree(buf);
693}
694
695
696/************************************************************************
697 * *
698 * Dumping HTML tree content to an I/O output buffer *
699 * *
700 ************************************************************************/
701
702/**
703 * htmlDtdDump:
704 * @buf: the HTML buffer output
705 * @doc: the document
706 * @encoding: the encoding string
707 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000708 * TODO: check whether encoding is needed
709 *
Owen Taylor3473f882001-02-23 17:55:21 +0000710 * Dump the HTML document DTD, if any.
711 */
712static void
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000713htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
Daniel Veillardc86a4fa2001-03-26 16:28:29 +0000714 const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +0000715 xmlDtdPtr cur = doc->intSubset;
716
717 if (cur == NULL) {
718 xmlGenericError(xmlGenericErrorContext,
719 "htmlDtdDump : no internal subset\n");
720 return;
721 }
722 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
723 xmlOutputBufferWriteString(buf, (const char *)cur->name);
724 if (cur->ExternalID != NULL) {
725 xmlOutputBufferWriteString(buf, " PUBLIC ");
726 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
727 if (cur->SystemID != NULL) {
728 xmlOutputBufferWriteString(buf, " ");
729 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
730 }
731 } else if (cur->SystemID != NULL) {
732 xmlOutputBufferWriteString(buf, " SYSTEM ");
733 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
734 }
735 xmlOutputBufferWriteString(buf, ">\n");
736}
737
738/**
739 * htmlAttrDump:
740 * @buf: the HTML buffer output
741 * @doc: the document
742 * @cur: the attribute pointer
743 * @encoding: the encoding string
744 *
745 * Dump an HTML attribute
746 */
747static void
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000748htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
Daniel Veillardc86a4fa2001-03-26 16:28:29 +0000749 const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +0000750 xmlChar *value;
751
752 if (cur == NULL) {
753 xmlGenericError(xmlGenericErrorContext,
754 "htmlAttrDump : property == NULL\n");
755 return;
756 }
757 xmlOutputBufferWriteString(buf, " ");
758 xmlOutputBufferWriteString(buf, (const char *)cur->name);
759 if (cur->children != NULL) {
760 value = xmlNodeListGetString(doc, cur->children, 0);
761 if (value) {
762 xmlOutputBufferWriteString(buf, "=");
763 xmlBufferWriteQuotedString(buf->buffer, value);
764 xmlFree(value);
765 } else {
766 xmlOutputBufferWriteString(buf, "=\"\"");
767 }
768 }
769}
770
771/**
772 * htmlAttrListDump:
773 * @buf: the HTML buffer output
774 * @doc: the document
775 * @cur: the first attribute pointer
776 * @encoding: the encoding string
777 *
778 * Dump a list of HTML attributes
779 */
780static void
781htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
782 if (cur == NULL) {
783 xmlGenericError(xmlGenericErrorContext,
784 "htmlAttrListDump : property == NULL\n");
785 return;
786 }
787 while (cur != NULL) {
788 htmlAttrDumpOutput(buf, doc, cur, encoding);
789 cur = cur->next;
790 }
791}
792
793
794void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
795 xmlNodePtr cur, const char *encoding);
796
797/**
798 * htmlNodeListDump:
799 * @buf: the HTML buffer output
800 * @doc: the document
801 * @cur: the first node
802 * @encoding: the encoding string
803 *
804 * Dump an HTML node list, recursive behaviour,children are printed too.
805 */
806static void
807htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
808 if (cur == NULL) {
809 xmlGenericError(xmlGenericErrorContext,
810 "htmlNodeListDump : node == NULL\n");
811 return;
812 }
813 while (cur != NULL) {
814 htmlNodeDumpOutput(buf, doc, cur, encoding);
815 cur = cur->next;
816 }
817}
818
819/**
820 * htmlNodeDumpOutput:
821 * @buf: the HTML buffer output
822 * @doc: the document
823 * @cur: the current node
824 * @encoding: the encoding string
825 *
826 * Dump an HTML node, recursive behaviour,children are printed too.
827 */
828void
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000829htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
830 xmlNodePtr cur, const char *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +0000831 htmlElemDescPtr info;
832
833 if (cur == NULL) {
834 xmlGenericError(xmlGenericErrorContext,
835 "htmlNodeDump : node == NULL\n");
836 return;
837 }
838 /*
839 * Special cases.
840 */
841 if (cur->type == XML_DTD_NODE)
842 return;
843 if (cur->type == XML_HTML_DOCUMENT_NODE) {
844 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
845 return;
846 }
847 if (cur->type == HTML_TEXT_NODE) {
848 if (cur->content != NULL) {
849 if ((cur->name == xmlStringText) ||
850 (cur->name != xmlStringTextNoenc)) {
851 xmlChar *buffer;
852
853#ifndef XML_USE_BUFFER_CONTENT
854 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
855#else
856 buffer = xmlEncodeEntitiesReentrant(doc,
857 xmlBufferContent(cur->content));
858#endif
859 if (buffer != NULL) {
860 xmlOutputBufferWriteString(buf, (const char *)buffer);
861 xmlFree(buffer);
862 }
863 } else {
864 xmlOutputBufferWriteString(buf, (const char *)cur->content);
865 }
866 }
867 return;
868 }
869 if (cur->type == HTML_COMMENT_NODE) {
870 if (cur->content != NULL) {
871 xmlOutputBufferWriteString(buf, "<!--");
872#ifndef XML_USE_BUFFER_CONTENT
873 xmlOutputBufferWriteString(buf, (const char *)cur->content);
874#else
875 xmlOutputBufferWriteString(buf, (const char *)
876 xmlBufferContent(cur->content));
877#endif
878 xmlOutputBufferWriteString(buf, "-->");
879 }
880 return;
881 }
Daniel Veillard7533cc82001-04-24 15:52:00 +0000882 if (cur->type == HTML_PI_NODE) {
883 if (cur->content != NULL) {
884 xmlOutputBufferWriteString(buf, "<?");
885#ifndef XML_USE_BUFFER_CONTENT
886 xmlOutputBufferWriteString(buf, (const char *)cur->content);
887#else
888 xmlOutputBufferWriteString(buf, (const char *)
889 xmlBufferContent(cur->content));
890#endif
891 xmlOutputBufferWriteString(buf, ">");
892 }
893 return;
894 }
Owen Taylor3473f882001-02-23 17:55:21 +0000895 if (cur->type == HTML_ENTITY_REF_NODE) {
896 xmlOutputBufferWriteString(buf, "&");
897 xmlOutputBufferWriteString(buf, (const char *)cur->name);
898 xmlOutputBufferWriteString(buf, ";");
899 return;
900 }
901 if (cur->type == HTML_PRESERVE_NODE) {
902 if (cur->content != NULL) {
903#ifndef XML_USE_BUFFER_CONTENT
904 xmlOutputBufferWriteString(buf, (const char *)cur->content);
905#else
906 xmlOutputBufferWriteString(buf, (const char *)
907 xmlBufferContent(cur->content));
908#endif
909 }
910 return;
911 }
912
913 /*
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000914 * Get specific HTML info for taht node.
Owen Taylor3473f882001-02-23 17:55:21 +0000915 */
916 info = htmlTagLookup(cur->name);
917
918 xmlOutputBufferWriteString(buf, "<");
919 xmlOutputBufferWriteString(buf, (const char *)cur->name);
920 if (cur->properties != NULL)
921 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
922
923 if ((info != NULL) && (info->empty)) {
924 xmlOutputBufferWriteString(buf, ">");
925 if (cur->next != NULL) {
926 if ((cur->next->type != HTML_TEXT_NODE) &&
927 (cur->next->type != HTML_ENTITY_REF_NODE))
928 xmlOutputBufferWriteString(buf, "\n");
929 }
930 return;
931 }
932 if ((cur->content == NULL) && (cur->children == NULL)) {
933 if ((info != NULL) && (info->saveEndTag != 0) &&
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000934/*
935 (xmlStrcasecmp(BAD_CAST info->name, BAD_CAST "html")) &&
936 (xmlStrcasecmp(BAD_CAST info->name, BAD_CAST "body"))) {
937*/
Owen Taylor3473f882001-02-23 17:55:21 +0000938 (strcmp(info->name, "html")) && (strcmp(info->name, "body"))) {
939 xmlOutputBufferWriteString(buf, ">");
940 } else {
941 xmlOutputBufferWriteString(buf, "></");
942 xmlOutputBufferWriteString(buf, (const char *)cur->name);
943 xmlOutputBufferWriteString(buf, ">");
944 }
945 if (cur->next != NULL) {
946 if ((cur->next->type != HTML_TEXT_NODE) &&
947 (cur->next->type != HTML_ENTITY_REF_NODE))
948 xmlOutputBufferWriteString(buf, "\n");
949 }
950 return;
951 }
952 xmlOutputBufferWriteString(buf, ">");
953 if (cur->content != NULL) {
954 /*
955 * Uses the OutputBuffer property to automatically convert
956 * invalids to charrefs
957 */
958
959#ifndef XML_USE_BUFFER_CONTENT
960 xmlOutputBufferWriteString(buf, (const char *) cur->content);
961#else
962 xmlOutputBufferWriteString(buf,
963 (const char *) xmlBufferContent(cur->content));
964#endif
965 }
966 if (cur->children != NULL) {
967 if ((cur->children->type != HTML_TEXT_NODE) &&
968 (cur->children->type != HTML_ENTITY_REF_NODE) &&
969 (cur->children != cur->last))
970 xmlOutputBufferWriteString(buf, "\n");
971 htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
972 if ((cur->last->type != HTML_TEXT_NODE) &&
973 (cur->last->type != HTML_ENTITY_REF_NODE) &&
974 (cur->children != cur->last))
975 xmlOutputBufferWriteString(buf, "\n");
976 }
Owen Taylor3473f882001-02-23 17:55:21 +0000977 xmlOutputBufferWriteString(buf, "</");
978 xmlOutputBufferWriteString(buf, (const char *)cur->name);
979 xmlOutputBufferWriteString(buf, ">");
Owen Taylor3473f882001-02-23 17:55:21 +0000980 if (cur->next != NULL) {
981 if ((cur->next->type != HTML_TEXT_NODE) &&
982 (cur->next->type != HTML_ENTITY_REF_NODE))
983 xmlOutputBufferWriteString(buf, "\n");
984 }
985}
986
987/**
988 * htmlDocContentDump:
989 * @buf: the HTML buffer output
990 * @cur: the document
991 * @encoding: the encoding string
992 *
993 * Dump an HTML document.
994 */
995void
996htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
997 int type;
998
999 /*
1000 * force to output the stuff as HTML, especially for entities
1001 */
1002 type = cur->type;
1003 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillard4dd93462001-04-02 15:16:19 +00001004 if (cur->intSubset != NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00001005 htmlDtdDumpOutput(buf, cur, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001006 }
1007 if (cur->children != NULL) {
1008 htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
1009 }
1010 xmlOutputBufferWriteString(buf, "\n");
1011 cur->type = (xmlElementType) type;
1012}
1013
Owen Taylor3473f882001-02-23 17:55:21 +00001014/************************************************************************
1015 * *
1016 * Saving functions front-ends *
1017 * *
1018 ************************************************************************/
1019
1020/**
1021 * htmlDocDump:
1022 * @f: the FILE*
1023 * @cur: the document
1024 *
1025 * Dump an HTML document to an open FILE.
1026 *
1027 * returns: the number of byte written or -1 in case of failure.
1028 */
1029int
1030htmlDocDump(FILE *f, xmlDocPtr cur) {
1031 xmlOutputBufferPtr buf;
1032 xmlCharEncodingHandlerPtr handler = NULL;
1033 const char *encoding;
1034 int ret;
1035
1036 if (cur == NULL) {
1037#ifdef DEBUG_TREE
1038 xmlGenericError(xmlGenericErrorContext,
1039 "htmlDocDump : document == NULL\n");
1040#endif
1041 return(-1);
1042 }
1043
1044 encoding = (const char *) htmlGetMetaEncoding(cur);
1045
1046 if (encoding != NULL) {
1047 xmlCharEncoding enc;
1048
1049 enc = xmlParseCharEncoding(encoding);
1050 if (enc != cur->charset) {
1051 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1052 /*
1053 * Not supported yet
1054 */
1055 return(-1);
1056 }
1057
1058 handler = xmlFindCharEncodingHandler(encoding);
1059 if (handler == NULL)
1060 return(-1);
1061 }
1062 }
1063
1064 /*
1065 * Fallback to HTML or ASCII when the encoding is unspecified
1066 */
1067 if (handler == NULL)
1068 handler = xmlFindCharEncodingHandler("HTML");
1069 if (handler == NULL)
1070 handler = xmlFindCharEncodingHandler("ascii");
1071
1072 buf = xmlOutputBufferCreateFile(f, handler);
1073 if (buf == NULL) return(-1);
1074 htmlDocContentDumpOutput(buf, cur, NULL);
1075
1076 ret = xmlOutputBufferClose(buf);
1077 return(ret);
1078}
1079
1080/**
1081 * htmlSaveFile:
1082 * @filename: the filename (or URL)
1083 * @cur: the document
1084 *
1085 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1086 * used.
1087 * returns: the number of byte written or -1 in case of failure.
1088 */
1089int
1090htmlSaveFile(const char *filename, xmlDocPtr cur) {
1091 xmlOutputBufferPtr buf;
1092 xmlCharEncodingHandlerPtr handler = NULL;
1093 const char *encoding;
1094 int ret;
1095
1096 encoding = (const char *) htmlGetMetaEncoding(cur);
1097
1098 if (encoding != NULL) {
1099 xmlCharEncoding enc;
1100
1101 enc = xmlParseCharEncoding(encoding);
1102 if (enc != cur->charset) {
1103 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1104 /*
1105 * Not supported yet
1106 */
1107 return(-1);
1108 }
1109
1110 handler = xmlFindCharEncodingHandler(encoding);
1111 if (handler == NULL)
1112 return(-1);
1113 }
1114 }
1115
1116 /*
1117 * Fallback to HTML or ASCII when the encoding is unspecified
1118 */
1119 if (handler == NULL)
1120 handler = xmlFindCharEncodingHandler("HTML");
1121 if (handler == NULL)
1122 handler = xmlFindCharEncodingHandler("ascii");
1123
1124 /*
1125 * save the content to a temp buffer.
1126 */
1127 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1128 if (buf == NULL) return(0);
1129
1130 htmlDocContentDumpOutput(buf, cur, NULL);
1131
1132 ret = xmlOutputBufferClose(buf);
1133 return(ret);
1134}
1135
1136/**
1137 * htmlSaveFileEnc:
1138 * @filename: the filename
1139 * @cur: the document
1140 *
1141 * Dump an HTML document to a file using a given encoding.
1142 *
1143 * returns: the number of byte written or -1 in case of failure.
1144 */
1145int
1146htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1147 xmlOutputBufferPtr buf;
1148 xmlCharEncodingHandlerPtr handler = NULL;
1149 int ret;
1150
1151 if (encoding != NULL) {
1152 xmlCharEncoding enc;
1153
1154 enc = xmlParseCharEncoding(encoding);
1155 if (enc != cur->charset) {
1156 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1157 /*
1158 * Not supported yet
1159 */
1160 return(-1);
1161 }
1162
1163 handler = xmlFindCharEncodingHandler(encoding);
1164 if (handler == NULL)
1165 return(-1);
1166 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1167 }
Daniel Veillard4dd93462001-04-02 15:16:19 +00001168 } else {
1169 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
Owen Taylor3473f882001-02-23 17:55:21 +00001170 }
1171
1172 /*
1173 * Fallback to HTML or ASCII when the encoding is unspecified
1174 */
1175 if (handler == NULL)
1176 handler = xmlFindCharEncodingHandler("HTML");
1177 if (handler == NULL)
1178 handler = xmlFindCharEncodingHandler("ascii");
1179
1180 /*
1181 * save the content to a temp buffer.
1182 */
1183 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1184 if (buf == NULL) return(0);
1185
1186 htmlDocContentDumpOutput(buf, cur, encoding);
1187
1188 ret = xmlOutputBufferClose(buf);
1189 return(ret);
1190}
1191#endif /* LIBXML_HTML_ENABLED */