work done on auto-opening of <p> tags and cleanup of SAX output, Daniel.
diff --git a/ChangeLog b/ChangeLog
index ff671b5..f3e161b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+Sat Aug 19 21:02:08 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
+
+ * HTMLparser.c SAX.c tree.c HTMLtree.h result/HTML/*: work
+ done on auto-opening of <p> tags and cleanup of SAX output
+
Sat Aug 19 18:45:40 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
* libxml.4 xmllint.1 Makefile.am libxml.spec.in: added man pages
diff --git a/HTMLparser.c b/HTMLparser.c
index 9f7da5c..75edb10 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -552,6 +552,20 @@
NULL
};
+/*
+ * The list of HTML elements which are not supposed to have
+ * CDATA content and where a p element will be implied
+ *
+ * TODO: extend that list by reading the HTML SGML DTD on
+ * implied paragraph
+ */
+static char *htmlNoContentElements[] = {
+ "html",
+ "head",
+ "body",
+ NULL /* end-of-list sentinel: htmlCheckParagraph() scans until it hits NULL */
+};
+
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
@@ -845,6 +859,49 @@
}
}
+/**
+ * htmlCheckParagraph
+ * @ctxt: an HTML parser context
+ *
+ * Check whether a p element needs to be implied before inserting
+ * characters in the current element (none, or one of htmlNoContentElements).
+ *
+ * Returns 1 if a paragraph has been inserted, 0 if not and -1
+ * in case of error (@ctxt is NULL).
+ */
+
+int
+htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
+ const xmlChar *tag;
+ int i;
+
+ if (ctxt == NULL)
+ return(-1);
+ tag = ctxt->name;
+ if (tag == NULL) { /* no current element name at all: imply a paragraph */
+ htmlAutoClose(ctxt, BAD_CAST"p");
+ htmlCheckImplied(ctxt, BAD_CAST"p");
+ htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p")); /* a fresh copy of "p" is pushed */
+ if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
+ ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); /* no attributes */
+ return(1);
+ }
+ for (i = 0; htmlNoContentElements[i] != NULL; i++) {
+ if (!xmlStrcmp(tag, BAD_CAST htmlNoContentElements[i])) { /* xmlStrcmp() gave 0 */
+#ifdef DEBUG
+ fprintf(stderr,"Implied element paragraph\n");
+#endif
+ /* same implied-paragraph sequence as in the tag == NULL case above */
+ htmlAutoClose(ctxt, BAD_CAST"p");
+ htmlCheckImplied(ctxt, BAD_CAST"p");
+ htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
+ if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
+ ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
+ return(1);
+ }
+ }
+ return(0); /* current element is not listed in htmlNoContentElements */
+}
+
/************************************************************************
* *
* The list of HTML predefined entities *
@@ -1253,7 +1310,8 @@
sizeof(html40EntitiesTable[0]));i++) {
if (html40EntitiesTable[i].value == c) {
#ifdef DEBUG
- fprintf(stderr,"Found entity %s\n", name);
+ fprintf(stderr,"Found entity %s\n",
+ html40EntitiesTable[i].name);
#endif
goto found_ent;
}
@@ -1496,20 +1554,21 @@
/*
* Just handle the content as a set of chars.
*/
+ htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, entity->content, len);
}
/**
- * htmlNewDoc:
+ * htmlNewDocNoDtD:
* @URI: URI for the dtd, or NULL
* @ExternalID: the external ID of the DTD, or NULL
*
- * Returns a new document
+ * Returns a new document, do not initialize the DTD if not provided
*/
htmlDocPtr
-htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
+htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
xmlDocPtr cur;
/*
@@ -1525,12 +1584,8 @@
cur->type = XML_HTML_DOCUMENT_NODE;
cur->version = NULL;
cur->intSubset = NULL;
- if ((ExternalID == NULL) &&
- (URI == NULL))
- xmlCreateIntSubset(cur, BAD_CAST "HTML",
- BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
- BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
- else
+ if ((ExternalID != NULL) ||
+ (URI != NULL))
xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
cur->doc = cur;
cur->name = NULL;
@@ -1548,6 +1603,23 @@
return(cur);
}
+/**
+ * htmlNewDoc:
+ * @URI: URI for the dtd, or NULL
+ * @ExternalID: the external ID of the DTD, or NULL
+ *
+ * Returns a new document; HTML 4.0 Transitional DTD is used if both are NULL
+ */
+htmlDocPtr
+htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
+    if ((URI == NULL) && (ExternalID == NULL)) /* no DTD given: use the default */
+        return(htmlNewDocNoDtD(
+            BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", /* URI */
+            BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); /* ExternalID */
+
+    return(htmlNewDocNoDtD(URI, ExternalID));
+}
+
/************************************************************************
* *
@@ -2062,6 +2134,7 @@
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, nbchar);
} else {
+ htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, nbchar);
}
@@ -2080,6 +2153,7 @@
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
} else {
+ htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, nbchar);
}
@@ -2861,16 +2935,19 @@
}
out[i] = 0;
+ htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, out, i);
} else {
ent = htmlParseEntityRef(ctxt, &name);
if (name == NULL) {
+ htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
return;
}
if ((ent == NULL) || (ent->value <= 0)) {
+ htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
@@ -2895,6 +2972,7 @@
}
out[i] = 0;
+ htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, out, i);
}
@@ -2942,6 +3020,21 @@
}
/*
+ * Sometimes DOCTYPE arrives in the middle of the document
+ */
+ if ((CUR == '<') && (NXT(1) == '!') &&
+ (UPP(2) == 'D') && (UPP(3) == 'O') &&
+ (UPP(4) == 'C') && (UPP(5) == 'T') &&
+ (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+ (UPP(8) == 'E')) {
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "Misplaced DOCTYPE declaration\n");
+ ctxt->wellFormed = 0;
+ htmlParseDocTypeDecl(ctxt);
+ }
+
+ /*
* First case : a comment
*/
if ((CUR == '<') && (NXT(1) == '!') &&
@@ -3185,6 +3278,8 @@
int
htmlParseDocument(htmlParserCtxtPtr ctxt) {
+ xmlDtdPtr dtd;
+
htmlDefaultSAXHandlerInit();
ctxt->html = 1;
@@ -3258,6 +3353,15 @@
*/
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
ctxt->sax->endDocument(ctxt->userData);
+
+ if (ctxt->myDoc != NULL) {
+ dtd = xmlGetIntSubset(ctxt->myDoc);
+ if (dtd == NULL)
+ ctxt->myDoc->intSubset =
+ xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
+ BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
+ BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
+ }
if (! ctxt->wellFormed) return(-1);
return(0);
}
@@ -3848,6 +3952,7 @@
xmlChar chr[2] = { 0 , 0 } ;
chr[0] = (xmlChar) ctxt->token;
+ htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, chr, 1);
ctxt->token = 0;
@@ -3862,6 +3967,7 @@
ctxt->sax->ignorableWhitespace(
ctxt->userData, &cur, 1);
} else {
+ htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(
ctxt->userData, &cur, 1);
@@ -3878,7 +3984,23 @@
cur = in->cur[0];
next = in->cur[1];
cons = ctxt->nbChars;
- if ((cur == '<') && (next == '!') &&
+ /*
+ * Sometimes DOCTYPE arrives in the middle of the document
+ */
+ if ((cur == '<') && (next == '!') &&
+ (UPP(2) == 'D') && (UPP(3) == 'O') &&
+ (UPP(4) == 'C') && (UPP(5) == 'T') &&
+ (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+ (UPP(8) == 'E')) {
+ if ((!terminate) &&
+ (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
+ goto done;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "Misplaced DOCTYPE declaration\n");
+ ctxt->wellFormed = 0;
+ htmlParseDocTypeDecl(ctxt);
+ } else if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
@@ -4040,6 +4162,17 @@
ctxt->sax->endDocument(ctxt->userData);
}
}
+ if ((ctxt->myDoc != NULL) &&
+ ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
+ (ctxt->instate == XML_PARSER_EPILOG))) {
+ xmlDtdPtr dtd;
+ dtd = xmlGetIntSubset(ctxt->myDoc);
+ if (dtd == NULL)
+ ctxt->myDoc->intSubset =
+ xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
+ BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
+ BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
+ }
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: done %d\n", ret);
#endif
diff --git a/HTMLtree.h b/HTMLtree.h
index feff3a4..17043b7 100644
--- a/HTMLtree.h
+++ b/HTMLtree.h
@@ -25,6 +25,8 @@
htmlDocPtr htmlNewDoc (const xmlChar *URI,
const xmlChar *ExternalID);
+htmlDocPtr htmlNewDocNoDtD (const xmlChar *URI,
+ const xmlChar *ExternalID);
const xmlChar * htmlGetMetaEncoding (htmlDocPtr doc);
int htmlSetMetaEncoding (htmlDocPtr doc,
const xmlChar *encoding);
diff --git a/SAX.c b/SAX.c
index ecc5331..fb5e741 100644
--- a/SAX.c
+++ b/SAX.c
@@ -169,6 +169,8 @@
return;
dtd = xmlGetIntSubset(ctxt->myDoc);
if (dtd != NULL) {
+ if (ctxt->html)
+ return;
xmlUnlinkNode((xmlNodePtr) dtd);
xmlFreeDtd(dtd);
ctxt->myDoc->intSubset = NULL;
@@ -605,7 +607,7 @@
if (ctxt->html) {
if (ctxt->myDoc == NULL)
#ifdef LIBXML_HTML_ENABLED
- ctxt->myDoc = htmlNewDoc(NULL, NULL);
+ ctxt->myDoc = htmlNewDocNoDtD(NULL, NULL);
#else
fprintf(stderr, "libxml2 built without HTML support\n");
#endif
diff --git a/include/libxml/HTMLtree.h b/include/libxml/HTMLtree.h
index feff3a4..17043b7 100644
--- a/include/libxml/HTMLtree.h
+++ b/include/libxml/HTMLtree.h
@@ -25,6 +25,8 @@
htmlDocPtr htmlNewDoc (const xmlChar *URI,
const xmlChar *ExternalID);
+htmlDocPtr htmlNewDocNoDtD (const xmlChar *URI,
+ const xmlChar *ExternalID);
const xmlChar * htmlGetMetaEncoding (htmlDocPtr doc);
int htmlSetMetaEncoding (htmlDocPtr doc,
const xmlChar *encoding);
diff --git a/result/HTML/Down.html b/result/HTML/Down.html
index 2eb3e8b..7a004e5 100644
--- a/result/HTML/Down.html
+++ b/result/HTML/Down.html
@@ -3,8 +3,10 @@
<head><title>This service is temporary down</title></head>
<body bgcolor="#FFFFFF">
<h1 align="center">Sorry, this service is temporary down</h1>
+<p>
We are doing our best to get it back on-line,
+</p>
<p>The W3C system administrators</p>
</body>
</html>
diff --git a/result/HTML/Down.html.sax b/result/HTML/Down.html.sax
index ce3052e..6b23930 100644
--- a/result/HTML/Down.html.sax
+++ b/result/HTML/Down.html.sax
@@ -1,36 +1,53 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(p)
SAX.characters(
, 3)
+SAX.endElement(p)
SAX.startElement(title)
SAX.characters(This service is temporary down, 30)
SAX.endElement(title)
+SAX.startElement(p)
SAX.characters(
, 1)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(body, bgcolor='#FFFFFF')
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(h1, align='center')
SAX.characters(Sorry, this service is tempora, 37)
SAX.endElement(h1)
+SAX.startElement(p)
SAX.characters(
We are doing our best to get , 48)
+SAX.endElement(p)
SAX.startElement(p)
SAX.characters(The W3C system administrators, 29)
SAX.endElement(p)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.endElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
, 1)
diff --git a/result/HTML/doc2.htm b/result/HTML/doc2.htm
index 31db496..07fcc3e 100644
--- a/result/HTML/doc2.htm
+++ b/result/HTML/doc2.htm
@@ -8,7 +8,7 @@
function NS_NewOpen(url,nam,atr){return(new NS_NullWindow());}
window.open=NS_NewOpen;
</script>
-<!-- END Naviscope Javascript -->!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><!-- saved from url=(0027)http://www.agents-tech.com/ --><meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
+<!-- END Naviscope Javascript --><!-- saved from url=(0027)http://www.agents-tech.com/ --><meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type">
<meta content="Copernic.com Inc. develops innovative agent technology solutions to efficiently access and manage the overwhelming quantity of information available on the Internet and intranets." name="DESCRIPTION">
<meta content="agent,technology,intranet,extranet,management,filtering,ranking,solution,service,intelligent,intelligence,client,server,architecture,developer,development,information,telecommunication,announcement,press,product,profile,contact,multi-agent,meta-search,metasearch,multi-thread,mobile,wireless,shopping,robot,PCS,Copernic,engine,toolkit,CDK,EDK" name="KEYWORDS">
<meta content="MSHTML 5.00.3103.1000" name="GENERATOR">
diff --git a/result/HTML/doc2.htm.err b/result/HTML/doc2.htm.err
index 9e37bf4..bf46ffa 100644
--- a/result/HTML/doc2.htm.err
+++ b/result/HTML/doc2.htm.err
@@ -1,3 +1,3 @@
-./test/HTML/doc2.htm:10: error: htmlParseStartTag: invalid element name
+./test/HTML/doc2.htm:10: error: Misplaced DOCTYPE declaration
<!-- END Naviscope Javascript --><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr
- ^
+ ^
diff --git a/result/HTML/doc2.htm.sax b/result/HTML/doc2.htm.sax
index ee80794..5eafa0c 100644
--- a/result/HTML/doc2.htm.sax
+++ b/result/HTML/doc2.htm.sax
@@ -8,6 +8,9 @@
SAX.startElement(title)
SAX.characters(Welcome to Copernic.com, 23)
SAX.endElement(title)
+SAX.endElement(head)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
SAX.startElement(script, language='javascript')
@@ -17,8 +20,8 @@
SAX.characters(
, 1)
SAX.comment( END Naviscope Javascript )
-SAX.error: htmlParseStartTag: invalid element name
-SAX.characters(!DOCTYPE HTML PUBLIC "-//W3C//, 61)
+SAX.error: Misplaced DOCTYPE declaration
+SAX.internalSubset(HTML, -//W3C//DTD HTML 4.0 Transitional//EN, )
SAX.comment( saved from url=(0027)http://www.agents-tech.com/ )
SAX.characters(
, 1)
@@ -36,8 +39,7 @@
, 1)
SAX.startElement(meta, content='MSHTML 5.00.3103.1000', name='GENERATOR')
SAX.endElement(meta)
-SAX.endElement(head)
-SAX.startElement(body)
+SAX.error: Unexpected end tag : head
SAX.startElement(frameset, border='false', cols='172,*', frameBorder='0', frameSpacing='0')
SAX.startElement(frame, marginHeight='0', marginWidth='0', name='left', noResize, scrolling='no', src='doc2_files/side.htm', target='rtop')
SAX.endElement(frame)
@@ -52,18 +54,23 @@
, 4)
SAX.startElement(body, bgcolor='#FFFFFF', text='#000000', link='#000080', vlink='#000080', alink='#000080', topmargin='0', leftmargin='0', marginheight='0', marginwidth='0')
+SAX.startElement(p)
SAX.characters(
, 3)
+SAX.endElement(p)
SAX.startElement(p)
SAX.characters(This page uses frames, but you, 61)
SAX.endElement(p)
+SAX.startElement(p)
SAX.characters(
, 3)
+SAX.endElement(p)
SAX.endElement(body)
SAX.characters(
, 3)
SAX.endElement(noframes)
SAX.endElement(frameset)
+SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
diff --git a/result/HTML/doc3.htm b/result/HTML/doc3.htm
index cd7e2b6..483ee52 100644
--- a/result/HTML/doc3.htm
+++ b/result/HTML/doc3.htm
@@ -8,7 +8,7 @@
function NS_NewOpen(url,nam,atr){return(new NS_NullWindow());}
window.open=NS_NewOpen;
</script>
-<!-- END Naviscope Javascript -->!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN"><!--last modified on Tuesday, February 22, 2000 11:47 PM --><meta content="text/html;CHARSET=iso-8859-1" http-equiv="Content-Type">
+<!-- END Naviscope Javascript --><!--last modified on Tuesday, February 22, 2000 11:47 PM --><meta content="text/html;CHARSET=iso-8859-1" http-equiv="Content-Type">
<meta content="Tim" name="Author">
<style type="text/css">A.nav {
COLOR: #003399; TEXT-DECORATION: none
diff --git a/result/HTML/doc3.htm.err b/result/HTML/doc3.htm.err
index aabc200..51266e1 100644
--- a/result/HTML/doc3.htm.err
+++ b/result/HTML/doc3.htm.err
@@ -1,6 +1,6 @@
-./test/HTML/doc3.htm:10: error: htmlParseStartTag: invalid element name
+./test/HTML/doc3.htm:10: error: Misplaced DOCTYPE declaration
<!-- END Naviscope Javascript --><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//E
- ^
+ ^
./test/HTML/doc3.htm:52: error: htmlParseEntityRef: expecting ';'
href="http://ads.gamesquad.net/addclick.exe/adclick.cgi?REGION=game|tech|ent&i
^
diff --git a/result/HTML/doc3.htm.sax b/result/HTML/doc3.htm.sax
index 4cb9ac4..98ef2ac 100644
--- a/result/HTML/doc3.htm.sax
+++ b/result/HTML/doc3.htm.sax
Binary files differ
diff --git a/result/HTML/fp40.htm.sax b/result/HTML/fp40.htm.sax
index f181d62..94c7055 100644
--- a/result/HTML/fp40.htm.sax
+++ b/result/HTML/fp40.htm.sax
@@ -2,30 +2,39 @@
SAX.startDocument()
SAX.internalSubset(html, -//IETF//DTD HTML//EN, )
SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(p)
SAX.characters(
, 1)
SAX.startElement(meta, name='GENERATOR', content='Microsoft FrontPage 4.0')
SAX.endElement(meta)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(title)
SAX.characters(README - Microsoft FrontPage 2, 51)
SAX.endElement(title)
+SAX.startElement(p)
SAX.characters(
, 1)
SAX.startElement(meta, name='Microsoft Theme', content='none')
SAX.endElement(meta)
SAX.characters(
, 1)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
SAX.startElement(font, face='Verdana')
@@ -169,6 +178,7 @@
SAX.endElement(font)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(blockquote)
SAX.characters(
, 3)
@@ -184,6 +194,7 @@
SAX.characters(
, 1)
SAX.endElement(blockquote)
+SAX.startElement(p)
SAX.characters(
, 1)
SAX.startElement(font, face='Verdana')
@@ -454,9 +465,13 @@
SAX.endElement(font)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.endElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
, 1)
diff --git a/result/HTML/liclose.html.sax b/result/HTML/liclose.html.sax
index 8752acd..519688f 100644
--- a/result/HTML/liclose.html.sax
+++ b/result/HTML/liclose.html.sax
@@ -2,21 +2,31 @@
SAX.startDocument()
SAX.internalSubset(HTML, -//W3C//DTD HTML 4.0 Transitional//EN, http://www.w3.org/TR/REC-html40/loose.dtd)
SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(p)
SAX.characters(
, 3)
+SAX.endElement(p)
SAX.startElement(title)
SAX.endElement(title)
+SAX.startElement(p)
SAX.characters(
, 1)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(ul)
SAX.characters(
, 1)
@@ -28,9 +38,12 @@
SAX.characters(Second item, closes the first , 34)
SAX.endElement(li)
SAX.endElement(ul)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
diff --git a/result/HTML/reg1.html.sax b/result/HTML/reg1.html.sax
index 0e52b63..135cb57 100644
--- a/result/HTML/reg1.html.sax
+++ b/result/HTML/reg1.html.sax
@@ -1,35 +1,50 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(title)
SAX.characters(Regression test 1, 17)
SAX.endElement(title)
+SAX.startElement(p)
SAX.characters(
, 1)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(h1)
SAX.characters(Regression test 1, 17)
SAX.endElement(h1)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(p)
SAX.characters(
Ok file no problem
, 20)
SAX.endElement(p)
SAX.endElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
, 1)
diff --git a/result/HTML/reg2.html.sax b/result/HTML/reg2.html.sax
index c824c59..0db1e53 100644
--- a/result/HTML/reg2.html.sax
+++ b/result/HTML/reg2.html.sax
@@ -1,27 +1,39 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(title)
SAX.characters(Regression test 2, 17)
SAX.endElement(title)
+SAX.startElement(p)
SAX.characters(
, 1)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(h1)
SAX.characters(Regression test 2, 17)
SAX.endElement(h1)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(p)
SAX.characters(
Autoclose of tag P
@@ -33,8 +45,11 @@
, 20)
SAX.endElement(p)
SAX.endElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
, 1)
diff --git a/result/HTML/reg3.html.sax b/result/HTML/reg3.html.sax
index 3dd6c02..75cd2cc 100644
--- a/result/HTML/reg3.html.sax
+++ b/result/HTML/reg3.html.sax
@@ -1,27 +1,39 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(title)
SAX.characters(Regression test 3, 17)
SAX.endElement(title)
+SAX.startElement(p)
SAX.characters(
, 1)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(h1)
SAX.characters(Regression test 3, 17)
SAX.endElement(h1)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(p)
SAX.characters(
Autoclose of tag P
@@ -29,16 +41,21 @@
SAX.endElement(p)
SAX.startElement(hr)
SAX.endElement(hr)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(p)
SAX.characters(
Ok file no problem
, 20)
SAX.endElement(p)
SAX.endElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
, 1)
diff --git a/result/HTML/reg4.html.sax b/result/HTML/reg4.html.sax
index 80472e2..832fe69 100644
--- a/result/HTML/reg4.html.sax
+++ b/result/HTML/reg4.html.sax
@@ -1,27 +1,39 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(title)
SAX.characters(Regression test 4, 17)
SAX.endElement(title)
+SAX.startElement(p)
SAX.characters(
, 1)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(h1)
SAX.characters(Regression test 4, 17)
SAX.endElement(h1)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(p)
SAX.characters(
Wrong close of tag P
@@ -29,14 +41,20 @@
SAX.endElement(p)
SAX.startElement(hr)
SAX.endElement(hr)
+SAX.startElement(p)
SAX.characters(
, 1)
-SAX.error: Unexpected end tag : p
+SAX.endElement(p)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.endElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
, 1)
diff --git a/result/HTML/test2.html.sax b/result/HTML/test2.html.sax
index 275ae1b..d457ed0 100644
--- a/result/HTML/test2.html.sax
+++ b/result/HTML/test2.html.sax
@@ -3,17 +3,25 @@
SAX.internalSubset(HTML, -//W3C//DTD HTML 4.0 Transitional//EN, http://www.w3.org/TR/REC-html40/loose.dtd)
SAX.startElement(html)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters( , 1)
+SAX.endElement(p)
SAX.startElement(title)
SAX.characters(Linux Today, 11)
SAX.endElement(title)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(body, bgcolor='White', link='Blue', text='Black', VLINK='Black', ALINK='Red')
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(center)
SAX.characters(
, 1)
@@ -130,15 +138,20 @@
SAX.characters(
, 1)
SAX.endElement(center)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(p)
SAX.characters(
, 1)
SAX.endElement(p)
SAX.endElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
, 1)
diff --git a/result/HTML/test3.html.sax b/result/HTML/test3.html.sax
index 41ee432..8546efd 100644
--- a/result/HTML/test3.html.sax
+++ b/result/HTML/test3.html.sax
@@ -1,19 +1,23 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(head)
+SAX.endElement(head)
+SAX.startElement(p)
SAX.characters(
, 2)
SAX.startElement(base, target='contents')
SAX.endElement(base)
SAX.characters(
, 2)
-SAX.endElement(head)
+SAX.error: Unexpected end tag : head
SAX.characters(
, 2)
-SAX.startElement(body)
SAX.startElement(a, name='ProblemDomain.Package')
SAX.startElement(h2)
SAX.characters(Component Package diagram Prob, 39)
@@ -21,13 +25,16 @@
SAX.characters(
, 2)
SAX.endElement(a)
+SAX.endElement(p)
SAX.startElement(p)
SAX.endElement(p)
SAX.startElement(hr)
SAX.endElement(hr)
SAX.error: Unexpected end tag : p
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(dl)
SAX.characters(
, 2)
@@ -59,15 +66,19 @@
SAX.endElement(dd)
SAX.endElement(dd)
SAX.endElement(dl)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(p)
SAX.endElement(p)
SAX.startElement(hr)
SAX.endElement(hr)
SAX.error: Unexpected end tag : p
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(dl)
SAX.characters(
@@ -163,9 +174,11 @@
SAX.characters(
, 2)
SAX.endElement(dl)
+SAX.startElement(p)
SAX.characters(
, 4)
+SAX.endElement(p)
SAX.startElement(h4)
SAX.startElement(b)
SAX.characters(Links, 5)
@@ -173,8 +186,10 @@
SAX.endElement(b)
SAX.endElement(h4)
SAX.error: Unexpected end tag : b
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(ul)
SAX.startElement(li)
SAX.startElement(b)
@@ -185,12 +200,16 @@
SAX.endElement(a)
SAX.endElement(li)
SAX.endElement(ul)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(dir)
SAX.endElement(dir)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(ul)
SAX.startElement(li)
SAX.startElement(b)
@@ -201,12 +220,16 @@
SAX.endElement(a)
SAX.endElement(li)
SAX.endElement(ul)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(dir)
SAX.endElement(dir)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(ul)
SAX.startElement(li)
SAX.startElement(b)
@@ -217,12 +240,16 @@
SAX.endElement(a)
SAX.endElement(li)
SAX.endElement(ul)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(dir)
SAX.endElement(dir)
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
diff --git a/result/HTML/wired.html.sax b/result/HTML/wired.html.sax
index d38d628..d67e0bb 100644
--- a/result/HTML/wired.html.sax
+++ b/result/HTML/wired.html.sax
@@ -8,9 +8,11 @@
SAX.endElement(title)
SAX.endElement(head)
SAX.startElement(body, bgcolor='#FFFFFF', text='#000000', link='#333399', vlink='#660066', alink='#666699')
+SAX.startElement(p)
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(table, border='0', width='600', cellspacing='0', cellpadding='0')
SAX.characters(
, 3)
@@ -315,6 +317,7 @@
SAX.characters(
, 1)
SAX.endElement(table)
+SAX.startElement(p)
SAX.characters(
, 2)
@@ -329,6 +332,7 @@
SAX.endElement(a)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.startElement(table, border='0', width='600', cellspacing='0', cellpadding='0')
SAX.characters(
@@ -574,6 +578,7 @@
SAX.characters(
, 1)
SAX.endElement(table)
+SAX.startElement(p)
SAX.characters(
, 1)
SAX.comment( end WIRED NEWS header )
@@ -584,6 +589,7 @@
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(table, border='0', cellpadding='3', cellspacing='0', align='LEFT', bgcolor='#FFFFFF')
SAX.characters(
, 3)
@@ -1465,6 +1471,7 @@
, 2)
SAX.endElement(table)
+SAX.startElement(p)
SAX.characters(
, 2)
@@ -1475,6 +1482,7 @@
SAX.characters(
, 2)
+SAX.endElement(p)
SAX.startElement(table, border='0', width='447', cellspacing='0', cellpadding='0', bordercolor='#66FF00')
SAX.characters(
, 2)
@@ -2832,6 +2840,7 @@
SAX.characters(
, 1)
SAX.endElement(table)
+SAX.startElement(p)
SAX.characters(
@@ -2840,9 +2849,14 @@
SAX.endElement(br)
SAX.characters(
, 1)
+SAX.endElement(p)
SAX.endElement(body)
+SAX.startElement(body)
+SAX.startElement(p)
SAX.characters(
, 1)
+SAX.endElement(p)
+SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
diff --git a/tree.c b/tree.c
index bb4d2f4..b5e116c 100644
--- a/tree.c
+++ b/tree.c
@@ -426,10 +426,17 @@
} else {
xmlNodePtr prev;
- prev = doc->last;
- prev->next = (xmlNodePtr) cur;
- cur->prev = prev;
- doc->last = (xmlNodePtr) cur;
+ if (doc->type == XML_HTML_DOCUMENT_NODE) {
+ prev = doc->children;
+ prev->prev = (xmlNodePtr) cur;
+ cur->next = prev;
+ doc->children = (xmlNodePtr) cur;
+ } else {
+ prev = doc->last;
+ prev->next = (xmlNodePtr) cur;
+ cur->prev = prev;
+ doc->last = (xmlNodePtr) cur;
+ }
}
}
return(cur);