Fixed problems with HTML parsing, Daniel.

commit: 2673d3c8564bb7970af29cecb0678a4c16ac23f0 [log] [tgz]
author: Daniel Veillard <veillard@src.gnome.org> Fri Oct 08 14:37:09 1999 +0000
committer: Daniel Veillard <veillard@src.gnome.org> Fri Oct 08 14:37:09 1999 +0000
tree: 68cdae9b183bfb309fb1cabf2add6c688eb77d5f
parent: 00fdf370d37d016629b8e27326eab426c6bcb2ee [diff]
diff --git a/ChangeLog b/ChangeLog
index 522f60e..3ead3d8 100644
--- a/ChangeLog
+++ b/ChangeLog

@@ -1,3 +1,8 @@
+Fri Oct  8 16:35:37 CEST 1999 Daniel Veillard <Daniel.Veillard@w3.org>
+
+	* HTMLparser.c parser.h : Fixed problems with HTML parsing
+	    reported by Kristian Hogsberg Kristensen <hogsberg@daimi.au.dk>
+
 Fri Oct  8 11:37:11 CEST 1999 Daniel Veillard <Daniel.Veillard@w3.org>
 
 	* tree.c : Raph patch for initialization of CORBA fields

diff --git a/HTMLparser.c b/HTMLparser.c
index 74f350f..6df2172 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c

@@ -87,6 +87,7 @@
 }									\
 
 PUSH_AND_POP(xmlNodePtr, node)
+PUSH_AND_POP(xmlChar*, name)
 
 /*
  * Macros for accessing the content. Those should be used only by the parser,
@@ -300,7 +301,7 @@
 "TITLE",	"P", NULL,
 "BODY",		"HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
 "LI",		"P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
-		"PRE", "LISTING", "XMP", "HEAD", NULL,
+		"PRE", "LISTING", "XMP", "HEAD", "LI", NULL,
 "HR",		"P", "HEAD", NULL,
 "H1",		"P", "HEAD", NULL,
 "H2",		"P", "HEAD", NULL,
@@ -443,14 +444,18 @@
  */
 void
 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
+    xmlChar *oldname;
 
-    while ((ctxt->node != NULL) && 
-           (htmlCheckAutoClose(new, ctxt->node->name))) {
+    while ((ctxt->name != NULL) && 
+           (htmlCheckAutoClose(new, ctxt->name))) {
 #ifdef DEBUG
-	printf("htmlAutoClose: %s closes %s\n", new, ctxt->node->name);
+	printf("htmlAutoClose: %s closes %s\n", new, ctxt->name);
 #endif
 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
-	    ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
+	    ctxt->sax->endElement(ctxt->userData, ctxt->name);
+	oldname = ctxt->name;
+	htmlnamePop(ctxt);
+	xmlFree(oldname);
     }
 }
 
@@ -464,16 +469,20 @@
 void
 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
     htmlElemDescPtr info;
+    xmlChar *oldname;
 
-    while ((ctxt->node != NULL) && 
-           (xmlStrcmp(new, ctxt->node->name))) {
-	info = htmlTagLookup(ctxt->node->name);
+    while ((ctxt->name != NULL) && 
+           (xmlStrcmp(new, ctxt->name))) {
+	info = htmlTagLookup(ctxt->name);
 	if ((info == NULL) || (info->endTag == 1)) {
 #ifdef DEBUG
-	    printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->node->name);
+	    printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
 #endif
 	    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
-		ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
+		ctxt->sax->endElement(ctxt->userData, ctxt->name);
+	    oldname = ctxt->name;
+	    htmlnamePop(ctxt);
+	    xmlFree(oldname);
         } else
 	    break;
     }
@@ -2000,6 +2009,7 @@
     /*
      * SAX: Start of Element !
      */
+    htmlnamePush(ctxt, xmlStrdup(name));
     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
         ctxt->sax->startElement(ctxt->userData, name, atts);
 
@@ -2027,6 +2037,7 @@
 void
 htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
     xmlChar *name;
+    xmlChar *oldname;
     int i;
 
     if ((CUR != '<') || (NXT(1) != '/')) {
@@ -2054,9 +2065,9 @@
      * Check that we are not closing an already closed tag,
      * <p><b>...</p></b> is a really common error !
      */
-    for (i = ctxt->nodeNr - 1;i >= 0;i--) {
-        if ((ctxt->nodeTab[i] != NULL) &&
-	    (!xmlStrcmp(tagname, ctxt->nodeTab[i]->name)))
+    for (i = ctxt->nameNr - 1;i >= 0;i--) {
+        if ((ctxt->nameTab[i] != NULL) &&
+	    (!xmlStrcmp(tagname, ctxt->nameTab[i])))
 	    break;
     }
     if (i < 0) {
@@ -2080,12 +2091,12 @@
      * of the stack.
      */
     if (xmlStrcmp(name, tagname)) {
-        if ((ctxt->node != NULL) && 
-	    (xmlStrcmp(ctxt->node->name, name))) {
+        if ((ctxt->name != NULL) && 
+	    (xmlStrcmp(ctxt->name, name))) {
 	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
 		ctxt->sax->error(ctxt->userData,
 		 "Opening and ending tag mismatch: %s and %s\n",
-		                 name, ctxt->node->name);
+		                 name, ctxt->name);
 	    ctxt->wellFormed = 0;
         }
     }
@@ -2095,6 +2106,9 @@
      */
     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
         ctxt->sax->endElement(ctxt->userData, name);
+    oldname = ctxt->name;
+    htmlnamePop(ctxt);
+    xmlFree(oldname);
 
     if (name != NULL)
 	xmlFree(name);
@@ -2157,9 +2171,9 @@
 
 void
 htmlParseContent(htmlParserCtxtPtr ctxt, const xmlChar *name) {
-    htmlNodePtr currentNode;
+    xmlChar *currentNode;
 
-    currentNode = ctxt->node;
+    currentNode = ctxt->name;
     while ((CUR != '<') || (NXT(1) != '/')) {
 	const xmlChar *test = CUR_PTR;
 
@@ -2167,7 +2181,7 @@
 	 * Has this node been popped out during parsing of
 	 * the next element
 	 */
-        if (currentNode != ctxt->node) return;
+        if (currentNode != ctxt->name) return;
 
 	/*
 	 * First case :  a comment
@@ -2230,7 +2244,8 @@
 htmlParseElement(htmlParserCtxtPtr ctxt) {
     const xmlChar *openTag = CUR_PTR;
     xmlChar *name;
-    htmlNodePtr currentNode;
+    xmlChar *oldname;
+    xmlChar *currentNode;
     htmlElemDescPtr info;
     htmlParserNodeInfo node_info;
 
@@ -2245,7 +2260,6 @@
     if (name == NULL) {
         return;
     }
-    currentNode = ctxt->node;
 
     /*
      * Lookup the info for that element.
@@ -2271,6 +2285,9 @@
         SKIP(2);
 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 	    ctxt->sax->endElement(ctxt->userData, name);
+	oldname = ctxt->name;
+	htmlnamePop(ctxt);
+	xmlFree(oldname);
 	xmlFree(name);
 	return;
     }
@@ -2288,6 +2305,9 @@
 	 */
 	nodePop(ctxt);
 	xmlFree(name);
+	oldname = ctxt->name;
+	htmlnamePop(ctxt);
+	xmlFree(oldname);
 
 	/*
 	 * Capture end position and add node
@@ -2296,7 +2316,7 @@
 	   node_info.end_pos = ctxt->input->consumed +
 			      (CUR_PTR - ctxt->input->base);
 	   node_info.end_line = ctxt->input->line;
-	   node_info.node = currentNode;
+	   node_info.node = ctxt->node;
 	   xmlParserAddNodeInfo(ctxt, &node_info);
 	}
 	return;
@@ -2309,20 +2329,23 @@
 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 	    ctxt->sax->endElement(ctxt->userData, name);
 	xmlFree(name);
+	oldname = ctxt->name;
+	htmlnamePop(ctxt);
+	xmlFree(oldname);
 	return;
     }
 
     /*
      * Parse the content of the element:
      */
-    currentNode = ctxt->node;
+    currentNode = ctxt->name;
     htmlParseContent(ctxt, name);
 
     /*
      * check whether the element get popped due to auto closure
      * on start tag
      */
-    if (currentNode != ctxt->node) {
+    if (currentNode != ctxt->name) {
 	xmlFree(name);
         return;
     }
@@ -2338,6 +2361,9 @@
 	 */
 	nodePop(ctxt);
 	xmlFree(name);
+	oldname = ctxt->name;
+	htmlnamePop(ctxt);
+	xmlFree(oldname);
 	return;
     }
 
@@ -2350,7 +2376,7 @@
        node_info.end_pos = ctxt->input->consumed +
                           (CUR_PTR - ctxt->input->base);
        node_info.end_line = ctxt->input->line;
-       node_info.node = currentNode;
+       node_info.node = ctxt->node;
        xmlParserAddNodeInfo(ctxt, &node_info);
     }
 }
@@ -2469,6 +2495,12 @@
     ctxt->nodeMax = 10;
     ctxt->node = NULL;
 
+    /* Allocate the Name stack */
+    ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
+    ctxt->nameNr = 0;
+    ctxt->nameMax = 10;
+    ctxt->name = NULL;
+
     if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
     else {
         ctxt->sax = sax;
@@ -2495,6 +2527,7 @@
 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
 {
     htmlParserInputPtr input;
+    xmlChar *oldname;
 
     if (ctxt == NULL) return;
 
@@ -2503,6 +2536,11 @@
     }
 
     if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
+    while ((oldname = ctxt->name) != NULL) {
+        htmlnamePop(ctxt);
+	xmlFree(oldname);
+    }
+    if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
     if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
     if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
     if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))

diff --git a/include/libxml/parser.h b/include/libxml/parser.h
index 397be04..9ffab60 100644
--- a/include/libxml/parser.h
+++ b/include/libxml/parser.h

@@ -141,6 +141,13 @@
     int                 token;        /* next char look-ahead */    
 
     char           *directory;        /* the data directory */
+
+    /* Node name stack only used for HTML parsing */
+    xmlChar           *name;          /* Name of the current parsed node */
+    int                nameNr;        /* Depth of the parsing stack */
+    int                nameMax;       /* Max depth of the parsing stack */
+    xmlChar *         *nameTab;       /* array of node names */
+
 } _xmlParserCtxt;
 typedef _xmlParserCtxt xmlParserCtxt;
 typedef xmlParserCtxt *xmlParserCtxtPtr;

diff --git a/parser.h b/parser.h
index 397be04..9ffab60 100644
--- a/parser.h
+++ b/parser.h

@@ -141,6 +141,13 @@
     int                 token;        /* next char look-ahead */    
 
     char           *directory;        /* the data directory */
+
+    /* Node name stack only used for HTML parsing */
+    xmlChar           *name;          /* Name of the current parsed node */
+    int                nameNr;        /* Depth of the parsing stack */
+    int                nameMax;       /* Max depth of the parsing stack */
+    xmlChar *         *nameTab;       /* array of node names */
+
 } _xmlParserCtxt;
 typedef _xmlParserCtxt xmlParserCtxt;
 typedef xmlParserCtxt *xmlParserCtxtPtr;
commit	2673d3c8564bb7970af29cecb0678a4c16ac23f0	[log] [tgz]
author	Daniel Veillard <veillard@src.gnome.org>	Fri Oct 08 14:37:09 1999 +0000
committer	Daniel Veillard <veillard@src.gnome.org>	Fri Oct 08 14:37:09 1999 +0000
tree	68cdae9b183bfb309fb1cabf2add6c688eb77d5f
parent	00fdf370d37d016629b8e27326eab426c6bcb2ee [diff]