added support for HTML PIs #156087 added specific tests Daniel

* HTMLparser.c: added support for HTML PIs #156087
* test/HTML/python.html result/HTML/python.html*: added specific tests
Daniel
diff --git a/ChangeLog b/ChangeLog
index 6860448..91cdd3a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+Fri Oct 22 16:36:50 CEST 2004 Daniel Veillard <daniel@veillard.com>
+
+	* HTMLparser.c: added support for HTML PIs #156087
+	* test/HTML/python.html result/HTML/python.html*: added specific tests
+
 Fri Oct 22 15:20:23 CEST 2004 Daniel Veillard <daniel@veillard.com>
 
 	* threads.c: fixed nasty bug #156087
diff --git a/HTMLparser.c b/HTMLparser.c
index 72a0870..947e4aa 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2809,6 +2809,117 @@
 }
 
 /**
+ * htmlParsePI:
+ * @ctxt:  an HTML parser context
+ *
+ * Parse an HTML Processing Instruction. Unlike XML, the PI ends at '>'.
+ *
+ * HTML PI ::= '<?' PITarget (S (Char* - (Char* '>' Char*)))? '>'
+ */
+static void
+htmlParsePI(htmlParserCtxtPtr ctxt) {
+    xmlChar *buf = NULL;
+    int len = 0;
+    int size = HTML_PARSER_BUFFER_SIZE;
+    int cur, l;
+    const xmlChar *target;
+    xmlParserInputState state;
+    int count = 0;
+
+    if ((RAW == '<') && (NXT(1) == '?')) {
+	state = ctxt->instate;
+        ctxt->instate = XML_PARSER_PI;
+	/*
+	 * This is a Processing Instruction: skip the leading "<?".
+	 */
+	SKIP(2);
+	SHRINK;
+
+	/*
+	 * Parse the target name; a PI without a valid name is reported
+	 * as XML_ERR_PI_NOT_STARTED below.
+	 */
+        target = htmlParseName(ctxt);
+	if (target != NULL) {
+	    if (RAW == '>') {
+		SKIP(1);
+
+		/*
+		 * SAX: PI with a target but no data, e.g. "<?target>".
+		 */
+		if ((ctxt->sax) && (!ctxt->disableSAX) &&
+		    (ctxt->sax->processingInstruction != NULL))
+		    ctxt->sax->processingInstruction(ctxt->userData,
+		                                     target, NULL);
+		ctxt->instate = state;
+		return;
+	    }
+	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
+	    if (buf == NULL) {
+		htmlErrMemory(ctxt, NULL);
+		ctxt->instate = state;
+		return;
+	    }
+	    cur = CUR;
+	    if (!IS_BLANK(cur)) {
+		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
+			  "ParsePI: PI %s space expected\n", target, NULL);
+	    }
+            SKIP_BLANKS;
+	    cur = CUR_CHAR(l);
+	    while (IS_CHAR(cur) && (cur != '>')) {
+		if (len + 5 >= size) {
+		    xmlChar *tmp;
+
+		    size *= 2;
+		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
+		    if (tmp == NULL) {
+			htmlErrMemory(ctxt, NULL);
+			xmlFree(buf);
+			ctxt->instate = state;
+			return;
+		    }
+		    buf = tmp;
+		}
+		count++;
+		if (count > 50) {
+		    GROW;
+		    count = 0;
+		}
+		COPY_BUF(l,buf,len,cur);
+		NEXTL(l);
+		cur = CUR_CHAR(l);
+		if (cur == 0) {
+		    SHRINK;
+		    GROW;
+		    cur = CUR_CHAR(l);
+		}
+	    }
+	    buf[len] = 0;
+	    if (cur != '>') {
+		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
+		      "ParsePI: PI %s never end ...\n", target, NULL);
+	    } else {
+		SKIP(1);
+
+		/*
+		 * SAX: PI complete, report the target and its data.
+		 */
+		if ((ctxt->sax) && (!ctxt->disableSAX) &&
+		    (ctxt->sax->processingInstruction != NULL))
+		    ctxt->sax->processingInstruction(ctxt->userData,
+		                                     target, buf);
+	    }
+	    xmlFree(buf);
+	} else {
+	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 
+                         "PI is not started correctly", NULL, NULL);
+	}
+	ctxt->instate = state;
+    }
+}
+
+/**
  * htmlParseComment:
  * @ctxt:  an HTML parser context
  *
@@ -3643,14 +3754,21 @@
 	    }
 
 	    /*
-	     * Second case :  a sub-element.
+	     * Second case : a Processing Instruction.
+	     */
+	    else if ((CUR == '<') && (NXT(1) == '?')) {
+		htmlParsePI(ctxt);
+	    }
+
+	    /*
+	     * Third case :  a sub-element.
 	     */
 	    else if (CUR == '<') {
 		htmlParseElement(ctxt);
 	    }
 
 	    /*
-	     * Third case : a reference. If if has not been resolved,
+	     * Fourth case : a reference. If it has not been resolved,
 	     *    parsing returns it's Name, create the node 
 	     */
 	    else if (CUR == '&') {
@@ -3658,7 +3776,7 @@
 	    }
 
 	    /*
-	     * Fourth : end of the resource
+	     * Fifth case : end of the resource
 	     */
 	    else if (CUR == 0) {
 		htmlAutoCloseOnEnd(ctxt);
@@ -3852,11 +3970,13 @@
 
 
     /*
-     * Parse possible comments before any content
+     * Parse possible comments and PIs before any content
      */
-    while ((CUR == '<') && (NXT(1) == '!') &&
-           (NXT(2) == '-') && (NXT(3) == '-')) {
+    while (((CUR == '<') && (NXT(1) == '!') &&
+            (NXT(2) == '-') && (NXT(3) == '-')) ||
+	   ((CUR == '<') && (NXT(1) == '?'))) {
         htmlParseComment(ctxt);	   
+        htmlParsePI(ctxt);	   
 	SKIP_BLANKS;
     }	   
 
@@ -3875,11 +3995,13 @@
     SKIP_BLANKS;
 
     /*
-     * Parse possible comments before any content
+     * Parse possible comments and PIs before any content
      */
-    while ((CUR == '<') && (NXT(1) == '!') &&
-           (NXT(2) == '-') && (NXT(3) == '-')) {
+    while (((CUR == '<') && (NXT(1) == '!') &&
+            (NXT(2) == '-') && (NXT(3) == '-')) ||
+	   ((CUR == '<') && (NXT(1) == '?'))) {
         htmlParseComment(ctxt);	   
+        htmlParsePI(ctxt);	   
 	SKIP_BLANKS;
     }	   
 
@@ -4444,6 +4566,16 @@
 #endif
 		    htmlParseComment(ctxt);
 		    ctxt->instate = XML_PARSER_MISC;
+	        } else if ((cur == '<') && (next == '?')) {
+		    if ((!terminate) &&
+		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
+			goto done;
+#ifdef DEBUG_PUSH
+		    xmlGenericError(xmlGenericErrorContext,
+			    "HPP: Parsing PI\n");
+#endif
+		    htmlParsePI(ctxt);
+		    ctxt->instate = XML_PARSER_MISC;
 		} else if ((cur == '<') && (next == '!') &&
 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
@@ -4494,6 +4626,16 @@
 #endif
 		    htmlParseComment(ctxt);
 		    ctxt->instate = XML_PARSER_PROLOG;
+	        } else if ((cur == '<') && (next == '?')) {
+		    if ((!terminate) &&
+		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
+			goto done;
+#ifdef DEBUG_PUSH
+		    xmlGenericError(xmlGenericErrorContext,
+			    "HPP: Parsing PI\n");
+#endif
+		    htmlParsePI(ctxt);
+		    ctxt->instate = XML_PARSER_PROLOG;
 		} else if ((cur == '<') && (next == '!') &&
 		           (avail < 4)) {
 		    goto done;
@@ -4531,6 +4673,16 @@
 #endif
 		    htmlParseComment(ctxt);
 		    ctxt->instate = XML_PARSER_EPILOG;
+	        } else if ((cur == '<') && (next == '?')) {
+		    if ((!terminate) &&
+		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
+			goto done;
+#ifdef DEBUG_PUSH
+		    xmlGenericError(xmlGenericErrorContext,
+			    "HPP: Parsing PI\n");
+#endif
+		    htmlParsePI(ctxt);
+		    ctxt->instate = XML_PARSER_EPILOG;
 		} else if ((cur == '<') && (next == '!') &&
 		           (avail < 4)) {
 		    goto done;
@@ -4737,6 +4889,16 @@
 #endif
 			htmlParseComment(ctxt);
 			ctxt->instate = XML_PARSER_CONTENT;
+		    } else if ((cur == '<') && (next == '?')) {
+			if ((!terminate) &&
+			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
+			    goto done;
+#ifdef DEBUG_PUSH
+			xmlGenericError(xmlGenericErrorContext,
+				"HPP: Parsing PI\n");
+#endif
+			htmlParsePI(ctxt);
+			ctxt->instate = XML_PARSER_CONTENT;
 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
 			goto done;
 		    } else if ((cur == '<') && (next == '/')) {
diff --git a/SAX.c b/SAX.c
index 0ff2017..08e8588 100644
--- a/SAX.c
+++ b/SAX.c
@@ -115,6 +115,6 @@
     hdlr->characters = xmlSAX2Characters;
     hdlr->cdataBlock = xmlSAX2CDataBlock;
     hdlr->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
-    hdlr->processingInstruction = NULL;
+    hdlr->processingInstruction = xmlSAX2ProcessingInstruction;
     hdlr->comment = xmlSAX2Comment;
     hdlr->warning = xmlParserWarning;
diff --git a/SAX2.c b/SAX2.c
index c41af00..ddb13e9 100644
--- a/SAX2.c
+++ b/SAX2.c
@@ -2635,7 +2635,7 @@
     hdlr->characters = xmlSAX2Characters;
     hdlr->cdataBlock = xmlSAX2CDataBlock;
     hdlr->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
-    hdlr->processingInstruction = NULL;
+    hdlr->processingInstruction = xmlSAX2ProcessingInstruction;
     hdlr->comment = xmlSAX2Comment;
     hdlr->warning = xmlParserWarning;
     hdlr->error = xmlParserError;
diff --git a/globals.c b/globals.c
index feb19cc..e64461f 100644
--- a/globals.c
+++ b/globals.c
@@ -430,7 +430,7 @@
     NULL,
     xmlSAX2Characters,
     xmlSAX2IgnorableWhitespace,
-    NULL,
+    xmlSAX2ProcessingInstruction,
     xmlSAX2Comment,
     xmlParserWarning,
     xmlParserError,
diff --git a/result/HTML/python.html b/result/HTML/python.html
new file mode 100644
index 0000000..e5f9d56
--- /dev/null
+++ b/result/HTML/python.html
@@ -0,0 +1,5 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<?xml-stylesheet href="./css/ht2html.css" type="text/css"?><html>
+<!-- THIS PAGE IS AUTOMATICALLY GENERATED.  DO NOT EDIT. --><head><title>Python Programming Language</title></head>
+<body></body>
+</html>
diff --git a/result/HTML/python.html.err b/result/HTML/python.html.err
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/result/HTML/python.html.err
diff --git a/result/HTML/python.html.sax b/result/HTML/python.html.sax
new file mode 100644
index 0000000..4f96e8a
--- /dev/null
+++ b/result/HTML/python.html.sax
@@ -0,0 +1,29 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.internalSubset(html, -//W3C//DTD HTML 4.01 Transitional//EN, http://www.w3.org/TR/html4/loose.dtd)
+SAX.processingInstruction(xml-stylesheet, href="./css/ht2html.css" type="text/css"?)
+SAX.startElement(html)
+SAX.ignorableWhitespace(
+, 1)
+SAX.comment( THIS PAGE IS AUTOMATICALLY GENERATED.  DO NOT EDIT. )
+SAX.ignorableWhitespace(
+, 1)
+SAX.startElement(head)
+SAX.ignorableWhitespace(
+, 1)
+SAX.startElement(title)
+SAX.characters(Python Programming Language, 27)
+SAX.endElement(title)
+SAX.ignorableWhitespace(
+, 1)
+SAX.endElement(head)
+SAX.ignorableWhitespace(
+, 1)
+SAX.startElement(body)
+SAX.ignorableWhitespace(
+, 1)
+SAX.endElement(body)
+SAX.endElement(html)
+SAX.ignorableWhitespace(
+, 1)
+SAX.endDocument()
diff --git a/test/HTML/python.html b/test/HTML/python.html
new file mode 100644
index 0000000..51a6394
--- /dev/null
+++ b/test/HTML/python.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+                      "http://www.w3.org/TR/html4/loose.dtd" >
+<?xml-stylesheet href="./css/ht2html.css" type="text/css"?>
+<html>
+<!-- THIS PAGE IS AUTOMATICALLY GENERATED.  DO NOT EDIT. -->
+<head>
+<title>Python Programming Language</title>
+</head>
+<body>
+</body></html>