blob: c10aac18dba82fabead75c23622b1effa0821d59 [file] [log] [blame]
/*
 * HTMLtree.c : implementation of access functions for an HTML tree.
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
8
9
Bjorn Reese70a9da52001-04-21 16:57:29 +000010#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000011#ifdef LIBXML_HTML_ENABLED
12
Owen Taylor3473f882001-02-23 17:55:21 +000013#ifdef HAVE_CTYPE_H
14#include <ctype.h>
15#endif
16#ifdef HAVE_STDLIB_H
17#include <stdlib.h>
18#endif
19
20#include <libxml/xmlmemory.h>
21#include <libxml/HTMLparser.h>
22#include <libxml/HTMLtree.h>
23#include <libxml/entities.h>
24#include <libxml/valid.h>
25#include <libxml/xmlerror.h>
26#include <libxml/parserInternals.h>
27
28/************************************************************************
29 * *
30 * Getting/Setting encoding meta tags *
31 * *
32 ************************************************************************/
33
34/**
35 * htmlGetMetaEncoding:
36 * @doc: the document
37 *
38 * Encoding definition lookup in the Meta tags
39 *
40 * Returns the current encoding as flagged in the HTML source
41 */
42const xmlChar *
43htmlGetMetaEncoding(htmlDocPtr doc) {
44 htmlNodePtr cur;
45 const xmlChar *content;
46 const xmlChar *encoding;
47
48 if (doc == NULL)
49 return(NULL);
50 cur = doc->children;
51
52 /*
53 * Search the html
54 */
55 while (cur != NULL) {
56 if (cur->name != NULL) {
57 if (xmlStrEqual(cur->name, BAD_CAST"html"))
58 break;
59 if (xmlStrEqual(cur->name, BAD_CAST"head"))
60 goto found_head;
61 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
62 goto found_meta;
63 }
64 cur = cur->next;
65 }
66 if (cur == NULL)
67 return(NULL);
68 cur = cur->children;
69
70 /*
71 * Search the head
72 */
73 while (cur != NULL) {
74 if (cur->name != NULL) {
75 if (xmlStrEqual(cur->name, BAD_CAST"head"))
76 break;
77 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
78 goto found_meta;
79 }
80 cur = cur->next;
81 }
82 if (cur == NULL)
83 return(NULL);
84found_head:
85 cur = cur->children;
86
87 /*
88 * Search the meta elements
89 */
90found_meta:
91 while (cur != NULL) {
92 if (cur->name != NULL) {
93 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
94 xmlAttrPtr attr = cur->properties;
95 int http;
96 const xmlChar *value;
97
98 content = NULL;
99 http = 0;
100 while (attr != NULL) {
101 if ((attr->children != NULL) &&
102 (attr->children->type == XML_TEXT_NODE) &&
103 (attr->children->next == NULL)) {
104#ifndef XML_USE_BUFFER_CONTENT
105 value = attr->children->content;
106#else
107 value = xmlBufferContent(attr->children->content);
108#endif
109 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
110 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
111 http = 1;
112 else if ((value != NULL)
113 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
114 content = value;
115 if ((http != 0) && (content != NULL))
116 goto found_content;
117 }
118 attr = attr->next;
119 }
120 }
121 }
122 cur = cur->next;
123 }
124 return(NULL);
125
126found_content:
127 encoding = xmlStrstr(content, BAD_CAST"charset=");
128 if (encoding == NULL)
129 encoding = xmlStrstr(content, BAD_CAST"Charset=");
130 if (encoding == NULL)
131 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
132 if (encoding != NULL) {
133 encoding += 8;
134 } else {
135 encoding = xmlStrstr(content, BAD_CAST"charset =");
136 if (encoding == NULL)
137 encoding = xmlStrstr(content, BAD_CAST"Charset =");
138 if (encoding == NULL)
139 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
140 if (encoding != NULL)
141 encoding += 9;
142 }
143 if (encoding != NULL) {
144 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
145 }
146 return(encoding);
147}
148
149/**
150 * htmlSetMetaEncoding:
151 * @doc: the document
152 * @encoding: the encoding string
153 *
154 * Sets the current encoding in the Meta tags
155 * NOTE: this will not change the document content encoding, just
156 * the META flag associated.
157 *
158 * Returns 0 in case of success and -1 in case of error
159 */
160int
161htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
162 htmlNodePtr cur, meta;
163 const xmlChar *content;
164 char newcontent[100];
165
166
167 if (doc == NULL)
168 return(-1);
169
170 if (encoding != NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +0000171 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
172 encoding);
Owen Taylor3473f882001-02-23 17:55:21 +0000173 newcontent[sizeof(newcontent) - 1] = 0;
174 }
175
176 cur = doc->children;
177
178 /*
179 * Search the html
180 */
181 while (cur != NULL) {
182 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000183 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
184 break;
185 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
186 goto found_head;
187 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
188 goto found_meta;
Owen Taylor3473f882001-02-23 17:55:21 +0000189 }
190 cur = cur->next;
191 }
192 if (cur == NULL)
193 return(-1);
194 cur = cur->children;
195
196 /*
197 * Search the head
198 */
199 while (cur != NULL) {
200 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000201 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
202 break;
203 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
204 goto found_meta;
Owen Taylor3473f882001-02-23 17:55:21 +0000205 }
206 cur = cur->next;
207 }
208 if (cur == NULL)
209 return(-1);
210found_head:
211 if (cur->children == NULL) {
212 if (encoding == NULL)
213 return(0);
214 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
215 xmlAddChild(cur, meta);
Owen Taylor3473f882001-02-23 17:55:21 +0000216 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000217 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
Owen Taylor3473f882001-02-23 17:55:21 +0000218 return(0);
219 }
220 cur = cur->children;
221
222found_meta:
223 if (encoding != NULL) {
224 /*
225 * Create a new Meta element with the right aatributes
226 */
227
228 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
229 xmlAddPrevSibling(cur, meta);
Owen Taylor3473f882001-02-23 17:55:21 +0000230 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000231 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
Owen Taylor3473f882001-02-23 17:55:21 +0000232 }
233
234 /*
235 * Search and destroy all the remaining the meta elements carrying
236 * encoding informations
237 */
238 while (cur != NULL) {
239 if (cur->name != NULL) {
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000240 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
Owen Taylor3473f882001-02-23 17:55:21 +0000241 xmlAttrPtr attr = cur->properties;
242 int http;
243 const xmlChar *value;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000244 int same_charset;
Owen Taylor3473f882001-02-23 17:55:21 +0000245
246 content = NULL;
247 http = 0;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000248 same_charset = 0;
Owen Taylor3473f882001-02-23 17:55:21 +0000249 while (attr != NULL) {
250 if ((attr->children != NULL) &&
251 (attr->children->type == XML_TEXT_NODE) &&
252 (attr->children->next == NULL)) {
253#ifndef XML_USE_BUFFER_CONTENT
254 value = attr->children->content;
255#else
256 value = xmlBufferContent(attr->children->content);
257#endif
258 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
259 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
260 http = 1;
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000261 else
262 {
263 if ((value != NULL) &&
264 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
265 content = value;
266 else
267 if ((!xmlStrcasecmp(attr->name, BAD_CAST"charset"))
268 && (!xmlStrcasecmp(value, encoding)))
269 same_charset = 1;
270 }
271 if ((http != 0) && (content != NULL) && (same_charset != 0))
Owen Taylor3473f882001-02-23 17:55:21 +0000272 break;
273 }
274 attr = attr->next;
275 }
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000276 if ((http != 0) && (content != NULL) && (same_charset != 0)) {
Owen Taylor3473f882001-02-23 17:55:21 +0000277 meta = cur;
278 cur = cur->next;
279 xmlUnlinkNode(meta);
280 xmlFreeNode(meta);
281 continue;
282 }
283
284 }
285 }
286 cur = cur->next;
287 }
288 return(0);
289}
290
291/************************************************************************
292 * *
293 * Dumping HTML tree content to a simple buffer *
294 * *
295 ************************************************************************/
296
297static void
298htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
299
300/**
301 * htmlDtdDump:
302 * @buf: the HTML buffer output
303 * @doc: the document
304 *
305 * Dump the HTML document DTD, if any.
306 */
307static void
308htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
309 xmlDtdPtr cur = doc->intSubset;
310
311 if (cur == NULL) {
312 xmlGenericError(xmlGenericErrorContext,
313 "htmlDtdDump : no internal subset\n");
314 return;
315 }
316 xmlBufferWriteChar(buf, "<!DOCTYPE ");
317 xmlBufferWriteCHAR(buf, cur->name);
318 if (cur->ExternalID != NULL) {
319 xmlBufferWriteChar(buf, " PUBLIC ");
320 xmlBufferWriteQuotedString(buf, cur->ExternalID);
321 if (cur->SystemID != NULL) {
322 xmlBufferWriteChar(buf, " ");
323 xmlBufferWriteQuotedString(buf, cur->SystemID);
324 }
325 } else if (cur->SystemID != NULL) {
326 xmlBufferWriteChar(buf, " SYSTEM ");
327 xmlBufferWriteQuotedString(buf, cur->SystemID);
328 }
329 xmlBufferWriteChar(buf, ">\n");
330}
331
332/**
333 * htmlAttrDump:
334 * @buf: the HTML buffer output
335 * @doc: the document
336 * @cur: the attribute pointer
337 *
338 * Dump an HTML attribute
339 */
340static void
341htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
342 xmlChar *value;
343
Daniel Veillardeca60d02001-06-13 07:45:41 +0000344 /*
345 * TODO: The html output method should not escape a & character
346 * occurring in an attribute value immediately followed by
347 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
348 */
349
Owen Taylor3473f882001-02-23 17:55:21 +0000350 if (cur == NULL) {
351 xmlGenericError(xmlGenericErrorContext,
352 "htmlAttrDump : property == NULL\n");
353 return;
354 }
355 xmlBufferWriteChar(buf, " ");
356 xmlBufferWriteCHAR(buf, cur->name);
357 if (cur->children != NULL) {
358 value = xmlNodeListGetString(doc, cur->children, 0);
359 if (value) {
360 xmlBufferWriteChar(buf, "=");
361 xmlBufferWriteQuotedString(buf, value);
362 xmlFree(value);
363 } else {
364 xmlBufferWriteChar(buf, "=\"\"");
365 }
366 }
367}
368
369/**
370 * htmlAttrListDump:
371 * @buf: the HTML buffer output
372 * @doc: the document
373 * @cur: the first attribute pointer
374 *
375 * Dump a list of HTML attributes
376 */
377static void
378htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
379 if (cur == NULL) {
380 xmlGenericError(xmlGenericErrorContext,
381 "htmlAttrListDump : property == NULL\n");
382 return;
383 }
384 while (cur != NULL) {
385 htmlAttrDump(buf, doc, cur);
386 cur = cur->next;
387 }
388}
389
390
391void
392htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
393/**
394 * htmlNodeListDump:
395 * @buf: the HTML buffer output
396 * @doc: the document
397 * @cur: the first node
398 *
399 * Dump an HTML node list, recursive behaviour,children are printed too.
400 */
401static void
402htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
403 if (cur == NULL) {
404 xmlGenericError(xmlGenericErrorContext,
405 "htmlNodeListDump : node == NULL\n");
406 return;
407 }
408 while (cur != NULL) {
409 htmlNodeDump(buf, doc, cur);
410 cur = cur->next;
411 }
412}
413
414/**
415 * htmlNodeDump:
416 * @buf: the HTML buffer output
417 * @doc: the document
418 * @cur: the current node
419 *
420 * Dump an HTML node, recursive behaviour,children are printed too.
421 */
422void
423htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
424 htmlElemDescPtr info;
425
426 if (cur == NULL) {
427 xmlGenericError(xmlGenericErrorContext,
428 "htmlNodeDump : node == NULL\n");
429 return;
430 }
431 /*
432 * Special cases.
433 */
434 if (cur->type == XML_DTD_NODE)
435 return;
436 if (cur->type == XML_HTML_DOCUMENT_NODE) {
437 htmlDocContentDump(buf, (xmlDocPtr) cur);
438 return;
439 }
440 if (cur->type == HTML_TEXT_NODE) {
441 if (cur->content != NULL) {
Daniel Veillard6e93c4a2001-06-05 20:57:42 +0000442 if (((cur->name == xmlStringText) ||
443 (cur->name != xmlStringTextNoenc)) &&
444 ((cur->parent == NULL) ||
445 (!xmlStrEqual(cur->parent->name, BAD_CAST "script")))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000446 xmlChar *buffer;
447
448#ifndef XML_USE_BUFFER_CONTENT
449 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
450#else
451 buffer = xmlEncodeEntitiesReentrant(doc,
452 xmlBufferContent(cur->content));
453#endif
454 if (buffer != NULL) {
455 xmlBufferWriteCHAR(buf, buffer);
456 xmlFree(buffer);
457 }
458 } else {
459 xmlBufferWriteCHAR(buf, cur->content);
460 }
461 }
462 return;
463 }
464 if (cur->type == HTML_COMMENT_NODE) {
465 if (cur->content != NULL) {
466 xmlBufferWriteChar(buf, "<!--");
467#ifndef XML_USE_BUFFER_CONTENT
468 xmlBufferWriteCHAR(buf, cur->content);
469#else
470 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
471#endif
472 xmlBufferWriteChar(buf, "-->");
473 }
474 return;
475 }
Daniel Veillard7533cc82001-04-24 15:52:00 +0000476 if (cur->type == HTML_PI_NODE) {
Daniel Veillard5146f202001-04-25 10:29:44 +0000477 if (cur->name == NULL)
478 return;
479 xmlBufferWriteChar(buf, "<?");
480 xmlBufferWriteCHAR(buf, cur->name);
Daniel Veillard7533cc82001-04-24 15:52:00 +0000481 if (cur->content != NULL) {
Daniel Veillard5146f202001-04-25 10:29:44 +0000482 xmlBufferWriteChar(buf, " ");
Daniel Veillard7533cc82001-04-24 15:52:00 +0000483#ifndef XML_USE_BUFFER_CONTENT
484 xmlBufferWriteCHAR(buf, cur->content);
485#else
486 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
487#endif
Daniel Veillard7533cc82001-04-24 15:52:00 +0000488 }
Daniel Veillard5146f202001-04-25 10:29:44 +0000489 xmlBufferWriteChar(buf, ">");
Daniel Veillard7533cc82001-04-24 15:52:00 +0000490 return;
491 }
Owen Taylor3473f882001-02-23 17:55:21 +0000492 if (cur->type == HTML_ENTITY_REF_NODE) {
493 xmlBufferWriteChar(buf, "&");
494 xmlBufferWriteCHAR(buf, cur->name);
495 xmlBufferWriteChar(buf, ";");
496 return;
497 }
Daniel Veillard083c2662001-05-08 08:27:14 +0000498 if (cur->type == HTML_PRESERVE_NODE) {
499 if (cur->content != NULL) {
500#ifndef XML_USE_BUFFER_CONTENT
501 xmlBufferWriteCHAR(buf, cur->content);
502#else
503 xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
504#endif
505 }
506 return;
507 }
Owen Taylor3473f882001-02-23 17:55:21 +0000508
509 /*
Daniel Veillard083c2662001-05-08 08:27:14 +0000510 * Get specific HTML info for taht node.
Owen Taylor3473f882001-02-23 17:55:21 +0000511 */
512 info = htmlTagLookup(cur->name);
513
514 xmlBufferWriteChar(buf, "<");
515 xmlBufferWriteCHAR(buf, cur->name);
516 if (cur->properties != NULL)
517 htmlAttrListDump(buf, doc, cur->properties);
518
519 if ((info != NULL) && (info->empty)) {
520 xmlBufferWriteChar(buf, ">");
521 if (cur->next != NULL) {
522 if ((cur->next->type != HTML_TEXT_NODE) &&
523 (cur->next->type != HTML_ENTITY_REF_NODE))
524 xmlBufferWriteChar(buf, "\n");
525 }
526 return;
527 }
528 if ((cur->content == NULL) && (cur->children == NULL)) {
Daniel Veillard083c2662001-05-08 08:27:14 +0000529 if ((info != NULL) && (info->saveEndTag != 0) &&
Daniel Veillardeca60d02001-06-13 07:45:41 +0000530 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
531 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000532 xmlBufferWriteChar(buf, ">");
Daniel Veillard083c2662001-05-08 08:27:14 +0000533 } else {
Owen Taylor3473f882001-02-23 17:55:21 +0000534 xmlBufferWriteChar(buf, "></");
535 xmlBufferWriteCHAR(buf, cur->name);
536 xmlBufferWriteChar(buf, ">");
537 }
538 if (cur->next != NULL) {
539 if ((cur->next->type != HTML_TEXT_NODE) &&
540 (cur->next->type != HTML_ENTITY_REF_NODE))
541 xmlBufferWriteChar(buf, "\n");
542 }
543 return;
544 }
545 xmlBufferWriteChar(buf, ">");
546 if (cur->content != NULL) {
547 xmlChar *buffer;
548
549#ifndef XML_USE_BUFFER_CONTENT
Daniel Veillard083c2662001-05-08 08:27:14 +0000550 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
Owen Taylor3473f882001-02-23 17:55:21 +0000551#else
Daniel Veillard083c2662001-05-08 08:27:14 +0000552 buffer = xmlEncodeEntitiesReentrant(doc,
553 xmlBufferContent(cur->content));
Owen Taylor3473f882001-02-23 17:55:21 +0000554#endif
555 if (buffer != NULL) {
556 xmlBufferWriteCHAR(buf, buffer);
557 xmlFree(buffer);
558 }
559 }
560 if (cur->children != NULL) {
561 if ((cur->children->type != HTML_TEXT_NODE) &&
562 (cur->children->type != HTML_ENTITY_REF_NODE) &&
563 (cur->children != cur->last))
564 xmlBufferWriteChar(buf, "\n");
565 htmlNodeListDump(buf, doc, cur->children);
566 if ((cur->last->type != HTML_TEXT_NODE) &&
567 (cur->last->type != HTML_ENTITY_REF_NODE) &&
568 (cur->children != cur->last))
569 xmlBufferWriteChar(buf, "\n");
570 }
Owen Taylor3473f882001-02-23 17:55:21 +0000571 xmlBufferWriteChar(buf, "</");
572 xmlBufferWriteCHAR(buf, cur->name);
573 xmlBufferWriteChar(buf, ">");
Owen Taylor3473f882001-02-23 17:55:21 +0000574 if (cur->next != NULL) {
575 if ((cur->next->type != HTML_TEXT_NODE) &&
576 (cur->next->type != HTML_ENTITY_REF_NODE))
577 xmlBufferWriteChar(buf, "\n");
578 }
579}
580
581/**
582 * htmlNodeDumpFile:
583 * @out: the FILE pointer
584 * @doc: the document
585 * @cur: the current node
586 *
587 * Dump an HTML node, recursive behaviour,children are printed too.
588 */
589void
590htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
591 xmlBufferPtr buf;
592
593 buf = xmlBufferCreate();
594 if (buf == NULL) return;
595 htmlNodeDump(buf, doc, cur);
596 xmlBufferDump(out, buf);
597 xmlBufferFree(buf);
598}
599
600/**
601 * htmlDocContentDump:
602 * @buf: the HTML buffer output
603 * @cur: the document
604 *
605 * Dump an HTML document.
606 */
607static void
608htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
609 int type;
610
611 /*
612 * force to output the stuff as HTML, especially for entities
613 */
614 type = cur->type;
615 cur->type = XML_HTML_DOCUMENT_NODE;
616 if (cur->intSubset != NULL)
617 htmlDtdDump(buf, cur);
618 else {
619 /* Default to HTML-4.0 transitionnal @@@@ */
620 xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
621
622 }
623 if (cur->children != NULL) {
624 htmlNodeListDump(buf, cur, cur->children);
625 }
626 xmlBufferWriteChar(buf, "\n");
627 cur->type = (xmlElementType) type;
628}
629
630/**
631 * htmlDocDumpMemory:
632 * @cur: the document
633 * @mem: OUT: the memory pointer
Daniel Veillard2d703722001-05-30 18:32:34 +0000634 * @size: OUT: the memory length
Owen Taylor3473f882001-02-23 17:55:21 +0000635 *
636 * Dump an HTML document in memory and return the xmlChar * and it's size.
637 * It's up to the caller to free the memory.
638 */
639void
640htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
Daniel Veillard2d703722001-05-30 18:32:34 +0000641 xmlOutputBufferPtr buf;
642 xmlCharEncodingHandlerPtr handler = NULL;
643 const char *encoding;
Owen Taylor3473f882001-02-23 17:55:21 +0000644
645 if (cur == NULL) {
646#ifdef DEBUG_TREE
647 xmlGenericError(xmlGenericErrorContext,
Daniel Veillard2d703722001-05-30 18:32:34 +0000648 "htmlDocDumpMemory : document == NULL\n");
Owen Taylor3473f882001-02-23 17:55:21 +0000649#endif
650 *mem = NULL;
651 *size = 0;
652 return;
653 }
Daniel Veillard2d703722001-05-30 18:32:34 +0000654
655 encoding = (const char *) htmlGetMetaEncoding(cur);
656
657 if (encoding != NULL) {
658 xmlCharEncoding enc;
659
660 enc = xmlParseCharEncoding(encoding);
661 if (enc != cur->charset) {
662 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
663 /*
664 * Not supported yet
665 */
666 *mem = NULL;
667 *size = 0;
668 return;
669 }
670
671 handler = xmlFindCharEncodingHandler(encoding);
672 if (handler == NULL) {
673 *mem = NULL;
674 *size = 0;
675 return;
676 }
677 }
678 }
679
680 /*
681 * Fallback to HTML or ASCII when the encoding is unspecified
682 */
683 if (handler == NULL)
684 handler = xmlFindCharEncodingHandler("HTML");
685 if (handler == NULL)
686 handler = xmlFindCharEncodingHandler("ascii");
687
688 buf = xmlAllocOutputBuffer(handler);
Owen Taylor3473f882001-02-23 17:55:21 +0000689 if (buf == NULL) {
690 *mem = NULL;
691 *size = 0;
692 return;
693 }
Daniel Veillard2d703722001-05-30 18:32:34 +0000694
695 htmlDocContentDumpOutput(buf, cur, NULL);
696 xmlOutputBufferFlush(buf);
697 if (buf->conv != NULL) {
698 *size = buf->conv->use;
699 *mem = xmlStrndup(buf->conv->content, *size);
700 } else {
701 *size = buf->buffer->use;
702 *mem = xmlStrndup(buf->buffer->content, *size);
703 }
704 (void)xmlOutputBufferClose(buf);
Owen Taylor3473f882001-02-23 17:55:21 +0000705}
706
707
708/************************************************************************
709 * *
710 * Dumping HTML tree content to an I/O output buffer *
711 * *
712 ************************************************************************/
713
714/**
Daniel Veillardeca60d02001-06-13 07:45:41 +0000715 * htmlDtdDumpOutput:
Owen Taylor3473f882001-02-23 17:55:21 +0000716 * @buf: the HTML buffer output
717 * @doc: the document
718 * @encoding: the encoding string
719 *
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000720 * TODO: check whether encoding is needed
721 *
Owen Taylor3473f882001-02-23 17:55:21 +0000722 * Dump the HTML document DTD, if any.
723 */
724static void
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000725htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
Daniel Veillardc86a4fa2001-03-26 16:28:29 +0000726 const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +0000727 xmlDtdPtr cur = doc->intSubset;
728
729 if (cur == NULL) {
730 xmlGenericError(xmlGenericErrorContext,
731 "htmlDtdDump : no internal subset\n");
732 return;
733 }
734 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
735 xmlOutputBufferWriteString(buf, (const char *)cur->name);
736 if (cur->ExternalID != NULL) {
737 xmlOutputBufferWriteString(buf, " PUBLIC ");
738 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
739 if (cur->SystemID != NULL) {
740 xmlOutputBufferWriteString(buf, " ");
741 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
742 }
743 } else if (cur->SystemID != NULL) {
744 xmlOutputBufferWriteString(buf, " SYSTEM ");
745 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
746 }
747 xmlOutputBufferWriteString(buf, ">\n");
748}
749
750/**
Daniel Veillardeca60d02001-06-13 07:45:41 +0000751 * htmlAttrDumpOutput:
Owen Taylor3473f882001-02-23 17:55:21 +0000752 * @buf: the HTML buffer output
753 * @doc: the document
754 * @cur: the attribute pointer
755 * @encoding: the encoding string
756 *
757 * Dump an HTML attribute
758 */
759static void
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000760htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
Daniel Veillardc86a4fa2001-03-26 16:28:29 +0000761 const char *encoding ATTRIBUTE_UNUSED) {
Owen Taylor3473f882001-02-23 17:55:21 +0000762 xmlChar *value;
763
Daniel Veillardeca60d02001-06-13 07:45:41 +0000764 /*
765 * TODO: The html output method should not escape a & character
766 * occurring in an attribute value immediately followed by
767 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
768 */
769
Owen Taylor3473f882001-02-23 17:55:21 +0000770 if (cur == NULL) {
771 xmlGenericError(xmlGenericErrorContext,
772 "htmlAttrDump : property == NULL\n");
773 return;
774 }
775 xmlOutputBufferWriteString(buf, " ");
776 xmlOutputBufferWriteString(buf, (const char *)cur->name);
777 if (cur->children != NULL) {
778 value = xmlNodeListGetString(doc, cur->children, 0);
779 if (value) {
780 xmlOutputBufferWriteString(buf, "=");
781 xmlBufferWriteQuotedString(buf->buffer, value);
782 xmlFree(value);
783 } else {
784 xmlOutputBufferWriteString(buf, "=\"\"");
785 }
786 }
787}
788
789/**
Daniel Veillardeca60d02001-06-13 07:45:41 +0000790 * htmlAttrListDumpOutput:
Owen Taylor3473f882001-02-23 17:55:21 +0000791 * @buf: the HTML buffer output
792 * @doc: the document
793 * @cur: the first attribute pointer
794 * @encoding: the encoding string
795 *
796 * Dump a list of HTML attributes
797 */
798static void
799htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
800 if (cur == NULL) {
801 xmlGenericError(xmlGenericErrorContext,
802 "htmlAttrListDump : property == NULL\n");
803 return;
804 }
805 while (cur != NULL) {
806 htmlAttrDumpOutput(buf, doc, cur, encoding);
807 cur = cur->next;
808 }
809}
810
811
812void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
813 xmlNodePtr cur, const char *encoding);
814
815/**
Daniel Veillardeca60d02001-06-13 07:45:41 +0000816 * htmlNodeListDumpOutput:
Owen Taylor3473f882001-02-23 17:55:21 +0000817 * @buf: the HTML buffer output
818 * @doc: the document
819 * @cur: the first node
820 * @encoding: the encoding string
821 *
822 * Dump an HTML node list, recursive behaviour,children are printed too.
823 */
824static void
825htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
826 if (cur == NULL) {
827 xmlGenericError(xmlGenericErrorContext,
828 "htmlNodeListDump : node == NULL\n");
829 return;
830 }
831 while (cur != NULL) {
832 htmlNodeDumpOutput(buf, doc, cur, encoding);
833 cur = cur->next;
834 }
835}
836
837/**
838 * htmlNodeDumpOutput:
839 * @buf: the HTML buffer output
840 * @doc: the document
841 * @cur: the current node
842 * @encoding: the encoding string
843 *
844 * Dump an HTML node, recursive behaviour,children are printed too.
845 */
846void
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000847htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
848 xmlNodePtr cur, const char *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +0000849 htmlElemDescPtr info;
850
851 if (cur == NULL) {
852 xmlGenericError(xmlGenericErrorContext,
853 "htmlNodeDump : node == NULL\n");
854 return;
855 }
856 /*
857 * Special cases.
858 */
859 if (cur->type == XML_DTD_NODE)
860 return;
861 if (cur->type == XML_HTML_DOCUMENT_NODE) {
862 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
863 return;
864 }
865 if (cur->type == HTML_TEXT_NODE) {
866 if (cur->content != NULL) {
Daniel Veillard6e93c4a2001-06-05 20:57:42 +0000867 if (((cur->name == xmlStringText) ||
868 (cur->name != xmlStringTextNoenc)) &&
869 ((cur->parent == NULL) ||
870 (!xmlStrEqual(cur->parent->name, BAD_CAST "script")))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000871 xmlChar *buffer;
872
873#ifndef XML_USE_BUFFER_CONTENT
874 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
875#else
876 buffer = xmlEncodeEntitiesReentrant(doc,
877 xmlBufferContent(cur->content));
878#endif
879 if (buffer != NULL) {
880 xmlOutputBufferWriteString(buf, (const char *)buffer);
881 xmlFree(buffer);
882 }
883 } else {
884 xmlOutputBufferWriteString(buf, (const char *)cur->content);
885 }
886 }
887 return;
888 }
889 if (cur->type == HTML_COMMENT_NODE) {
890 if (cur->content != NULL) {
891 xmlOutputBufferWriteString(buf, "<!--");
892#ifndef XML_USE_BUFFER_CONTENT
893 xmlOutputBufferWriteString(buf, (const char *)cur->content);
894#else
895 xmlOutputBufferWriteString(buf, (const char *)
896 xmlBufferContent(cur->content));
897#endif
898 xmlOutputBufferWriteString(buf, "-->");
899 }
900 return;
901 }
Daniel Veillard7533cc82001-04-24 15:52:00 +0000902 if (cur->type == HTML_PI_NODE) {
Daniel Veillard5146f202001-04-25 10:29:44 +0000903 if (cur->name == NULL)
904 return;
905 xmlOutputBufferWriteString(buf, "<?");
906 xmlOutputBufferWriteString(buf, (const char *)cur->name);
Daniel Veillard7533cc82001-04-24 15:52:00 +0000907 if (cur->content != NULL) {
Daniel Veillard5146f202001-04-25 10:29:44 +0000908 xmlOutputBufferWriteString(buf, " ");
Daniel Veillard7533cc82001-04-24 15:52:00 +0000909#ifndef XML_USE_BUFFER_CONTENT
910 xmlOutputBufferWriteString(buf, (const char *)cur->content);
911#else
912 xmlOutputBufferWriteString(buf, (const char *)
913 xmlBufferContent(cur->content));
914#endif
Daniel Veillard7533cc82001-04-24 15:52:00 +0000915 }
Daniel Veillard5146f202001-04-25 10:29:44 +0000916 xmlOutputBufferWriteString(buf, ">");
Daniel Veillard7533cc82001-04-24 15:52:00 +0000917 return;
918 }
Owen Taylor3473f882001-02-23 17:55:21 +0000919 if (cur->type == HTML_ENTITY_REF_NODE) {
920 xmlOutputBufferWriteString(buf, "&");
921 xmlOutputBufferWriteString(buf, (const char *)cur->name);
922 xmlOutputBufferWriteString(buf, ";");
923 return;
924 }
925 if (cur->type == HTML_PRESERVE_NODE) {
926 if (cur->content != NULL) {
927#ifndef XML_USE_BUFFER_CONTENT
928 xmlOutputBufferWriteString(buf, (const char *)cur->content);
929#else
930 xmlOutputBufferWriteString(buf, (const char *)
931 xmlBufferContent(cur->content));
932#endif
933 }
934 return;
935 }
936
937 /*
Daniel Veillard1ed3f882001-04-18 09:45:35 +0000938 * Get specific HTML info for taht node.
Owen Taylor3473f882001-02-23 17:55:21 +0000939 */
940 info = htmlTagLookup(cur->name);
941
942 xmlOutputBufferWriteString(buf, "<");
943 xmlOutputBufferWriteString(buf, (const char *)cur->name);
944 if (cur->properties != NULL)
945 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
946
947 if ((info != NULL) && (info->empty)) {
948 xmlOutputBufferWriteString(buf, ">");
949 if (cur->next != NULL) {
950 if ((cur->next->type != HTML_TEXT_NODE) &&
Daniel Veillard8a926292001-06-07 11:20:20 +0000951 (cur->next->type != HTML_ENTITY_REF_NODE) &&
952 (cur->parent != NULL) &&
953 (!xmlStrEqual(cur->parent->name, BAD_CAST "pre")))
Owen Taylor3473f882001-02-23 17:55:21 +0000954 xmlOutputBufferWriteString(buf, "\n");
955 }
956 return;
957 }
958 if ((cur->content == NULL) && (cur->children == NULL)) {
959 if ((info != NULL) && (info->saveEndTag != 0) &&
Daniel Veillardeca60d02001-06-13 07:45:41 +0000960 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
961 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
Owen Taylor3473f882001-02-23 17:55:21 +0000962 xmlOutputBufferWriteString(buf, ">");
963 } else {
964 xmlOutputBufferWriteString(buf, "></");
965 xmlOutputBufferWriteString(buf, (const char *)cur->name);
966 xmlOutputBufferWriteString(buf, ">");
967 }
968 if (cur->next != NULL) {
969 if ((cur->next->type != HTML_TEXT_NODE) &&
Daniel Veillard8a926292001-06-07 11:20:20 +0000970 (cur->next->type != HTML_ENTITY_REF_NODE) &&
971 (cur->parent != NULL) &&
972 (!xmlStrEqual(cur->parent->name, BAD_CAST "pre")))
Owen Taylor3473f882001-02-23 17:55:21 +0000973 xmlOutputBufferWriteString(buf, "\n");
974 }
975 return;
976 }
977 xmlOutputBufferWriteString(buf, ">");
978 if (cur->content != NULL) {
979 /*
980 * Uses the OutputBuffer property to automatically convert
981 * invalids to charrefs
982 */
983
984#ifndef XML_USE_BUFFER_CONTENT
985 xmlOutputBufferWriteString(buf, (const char *) cur->content);
986#else
987 xmlOutputBufferWriteString(buf,
988 (const char *) xmlBufferContent(cur->content));
989#endif
990 }
991 if (cur->children != NULL) {
992 if ((cur->children->type != HTML_TEXT_NODE) &&
993 (cur->children->type != HTML_ENTITY_REF_NODE) &&
Daniel Veillardf0c53762001-06-07 16:07:07 +0000994 (cur->children != cur->last) &&
995 (!xmlStrEqual(cur->name, BAD_CAST "pre")))
Owen Taylor3473f882001-02-23 17:55:21 +0000996 xmlOutputBufferWriteString(buf, "\n");
997 htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
998 if ((cur->last->type != HTML_TEXT_NODE) &&
999 (cur->last->type != HTML_ENTITY_REF_NODE) &&
Daniel Veillardf0c53762001-06-07 16:07:07 +00001000 (cur->children != cur->last) &&
1001 (!xmlStrEqual(cur->name, BAD_CAST "pre")))
Owen Taylor3473f882001-02-23 17:55:21 +00001002 xmlOutputBufferWriteString(buf, "\n");
1003 }
Owen Taylor3473f882001-02-23 17:55:21 +00001004 xmlOutputBufferWriteString(buf, "</");
1005 xmlOutputBufferWriteString(buf, (const char *)cur->name);
1006 xmlOutputBufferWriteString(buf, ">");
Owen Taylor3473f882001-02-23 17:55:21 +00001007 if (cur->next != NULL) {
1008 if ((cur->next->type != HTML_TEXT_NODE) &&
Daniel Veillardf0c53762001-06-07 16:07:07 +00001009 (cur->next->type != HTML_ENTITY_REF_NODE) &&
1010 (cur->parent != NULL) &&
1011 (!xmlStrEqual(cur->parent->name, BAD_CAST "pre")))
Owen Taylor3473f882001-02-23 17:55:21 +00001012 xmlOutputBufferWriteString(buf, "\n");
1013 }
1014}
1015
1016/**
Daniel Veillardeca60d02001-06-13 07:45:41 +00001017 * htmlDocContentDumpOutput:
Owen Taylor3473f882001-02-23 17:55:21 +00001018 * @buf: the HTML buffer output
1019 * @cur: the document
1020 * @encoding: the encoding string
1021 *
1022 * Dump an HTML document.
1023 */
1024void
Daniel Veillardeca60d02001-06-13 07:45:41 +00001025htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1026 const char *encoding) {
Owen Taylor3473f882001-02-23 17:55:21 +00001027 int type;
1028
1029 /*
1030 * force to output the stuff as HTML, especially for entities
1031 */
1032 type = cur->type;
1033 cur->type = XML_HTML_DOCUMENT_NODE;
Daniel Veillard4dd93462001-04-02 15:16:19 +00001034 if (cur->intSubset != NULL) {
Owen Taylor3473f882001-02-23 17:55:21 +00001035 htmlDtdDumpOutput(buf, cur, NULL);
Owen Taylor3473f882001-02-23 17:55:21 +00001036 }
1037 if (cur->children != NULL) {
1038 htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
1039 }
1040 xmlOutputBufferWriteString(buf, "\n");
1041 cur->type = (xmlElementType) type;
1042}
1043
Owen Taylor3473f882001-02-23 17:55:21 +00001044/************************************************************************
1045 * *
1046 * Saving functions front-ends *
1047 * *
1048 ************************************************************************/
1049
1050/**
1051 * htmlDocDump:
1052 * @f: the FILE*
1053 * @cur: the document
1054 *
1055 * Dump an HTML document to an open FILE.
1056 *
1057 * returns: the number of byte written or -1 in case of failure.
1058 */
1059int
1060htmlDocDump(FILE *f, xmlDocPtr cur) {
1061 xmlOutputBufferPtr buf;
1062 xmlCharEncodingHandlerPtr handler = NULL;
1063 const char *encoding;
1064 int ret;
1065
1066 if (cur == NULL) {
1067#ifdef DEBUG_TREE
1068 xmlGenericError(xmlGenericErrorContext,
1069 "htmlDocDump : document == NULL\n");
1070#endif
1071 return(-1);
1072 }
1073
1074 encoding = (const char *) htmlGetMetaEncoding(cur);
1075
1076 if (encoding != NULL) {
1077 xmlCharEncoding enc;
1078
1079 enc = xmlParseCharEncoding(encoding);
1080 if (enc != cur->charset) {
1081 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1082 /*
1083 * Not supported yet
1084 */
1085 return(-1);
1086 }
1087
1088 handler = xmlFindCharEncodingHandler(encoding);
1089 if (handler == NULL)
1090 return(-1);
1091 }
1092 }
1093
1094 /*
1095 * Fallback to HTML or ASCII when the encoding is unspecified
1096 */
1097 if (handler == NULL)
1098 handler = xmlFindCharEncodingHandler("HTML");
1099 if (handler == NULL)
1100 handler = xmlFindCharEncodingHandler("ascii");
1101
1102 buf = xmlOutputBufferCreateFile(f, handler);
1103 if (buf == NULL) return(-1);
1104 htmlDocContentDumpOutput(buf, cur, NULL);
1105
1106 ret = xmlOutputBufferClose(buf);
1107 return(ret);
1108}
1109
1110/**
1111 * htmlSaveFile:
1112 * @filename: the filename (or URL)
1113 * @cur: the document
1114 *
1115 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1116 * used.
1117 * returns: the number of byte written or -1 in case of failure.
1118 */
1119int
1120htmlSaveFile(const char *filename, xmlDocPtr cur) {
1121 xmlOutputBufferPtr buf;
1122 xmlCharEncodingHandlerPtr handler = NULL;
1123 const char *encoding;
1124 int ret;
1125
1126 encoding = (const char *) htmlGetMetaEncoding(cur);
1127
1128 if (encoding != NULL) {
1129 xmlCharEncoding enc;
1130
1131 enc = xmlParseCharEncoding(encoding);
1132 if (enc != cur->charset) {
1133 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1134 /*
1135 * Not supported yet
1136 */
1137 return(-1);
1138 }
1139
1140 handler = xmlFindCharEncodingHandler(encoding);
1141 if (handler == NULL)
1142 return(-1);
1143 }
1144 }
1145
1146 /*
1147 * Fallback to HTML or ASCII when the encoding is unspecified
1148 */
1149 if (handler == NULL)
1150 handler = xmlFindCharEncodingHandler("HTML");
1151 if (handler == NULL)
1152 handler = xmlFindCharEncodingHandler("ascii");
1153
1154 /*
1155 * save the content to a temp buffer.
1156 */
1157 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1158 if (buf == NULL) return(0);
1159
1160 htmlDocContentDumpOutput(buf, cur, NULL);
1161
1162 ret = xmlOutputBufferClose(buf);
1163 return(ret);
1164}
1165
1166/**
1167 * htmlSaveFileEnc:
1168 * @filename: the filename
1169 * @cur: the document
1170 *
1171 * Dump an HTML document to a file using a given encoding.
1172 *
1173 * returns: the number of byte written or -1 in case of failure.
1174 */
1175int
1176htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1177 xmlOutputBufferPtr buf;
1178 xmlCharEncodingHandlerPtr handler = NULL;
1179 int ret;
1180
1181 if (encoding != NULL) {
1182 xmlCharEncoding enc;
1183
1184 enc = xmlParseCharEncoding(encoding);
1185 if (enc != cur->charset) {
1186 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1187 /*
1188 * Not supported yet
1189 */
1190 return(-1);
1191 }
1192
1193 handler = xmlFindCharEncodingHandler(encoding);
1194 if (handler == NULL)
1195 return(-1);
1196 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1197 }
Daniel Veillard4dd93462001-04-02 15:16:19 +00001198 } else {
1199 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
Owen Taylor3473f882001-02-23 17:55:21 +00001200 }
1201
1202 /*
1203 * Fallback to HTML or ASCII when the encoding is unspecified
1204 */
1205 if (handler == NULL)
1206 handler = xmlFindCharEncodingHandler("HTML");
1207 if (handler == NULL)
1208 handler = xmlFindCharEncodingHandler("ascii");
1209
1210 /*
1211 * save the content to a temp buffer.
1212 */
1213 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1214 if (buf == NULL) return(0);
1215
1216 htmlDocContentDumpOutput(buf, cur, encoding);
1217
1218 ret = xmlOutputBufferClose(buf);
1219 return(ret);
1220}
1221#endif /* LIBXML_HTML_ENABLED */