made configuring with regexps/automata/unicode the default but without

* Makefile.am configure.in include/libxml/xmlversion.h.in:
  made configuring with regexps/automata/unicode the default
  but without schemas ATM
* testRegexp.c valid.c xmlregexp.c include/libxml/xmlregexp.h:
  fixed the regexp based DTD validation performance and memory
  problem by switching to a compact form for determinist regexps
  and detecting the determinism property in the process. Seems
  as fast as the old DTD validation specific engine :-) despite
  the regexp built and compaction process.
Daniel
diff --git a/ChangeLog b/ChangeLog
index 01ca5b8..954f831 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+Thu Sep 19 21:46:53 CEST 2002 Daniel Veillard <daniel@veillard.com>
+
+	* Makefile.am configure.in include/libxml/xmlversion.h.in:
+	  made configuring with regexps/automata/unicode the default
+	  but without schemas ATM
+	* testRegexp.c valid.c xmlregexp.c include/libxml/xmlregexp.h:
+	  fixed the regexp based DTD validation performance and memory
+	  problem by switching to a compact form for determinist regexps
+	  and detecting the determinism property in the process. Seems
+	  as fast as the old DTD validation specific engine :-) despite
+	  the regexp built and compaction process.
+
 Wed Sep 18 18:27:26 CEST 2002 Daniel Veillard <daniel@veillard.com>
 
 	* valid.c: determinism is debugged, new DTD checking code now works
diff --git a/Makefile.am b/Makefile.am
index f9d9426..7269a9e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -106,7 +106,7 @@
 
 testall : tests SVGtests SAXtests
 
-tests: XMLtests XMLenttests HTMLtests Validtests URItests XPathtests XPtrtests XIncludetests C14Ntests Scripttests Catatests @TEST_SCHEMAS@ @TEST_THREADS@
+tests: XMLtests XMLenttests HTMLtests Validtests URItests XPathtests XPtrtests XIncludetests C14Ntests Scripttests Catatests @TEST_REGEXPS@ @TEST_SCHEMAS@ @TEST_THREADS@
 	@(cd python ; $(MAKE) tests)
 
 valgrind:
diff --git a/configure.in b/configure.in
index 55e395d..26268f7 100644
--- a/configure.in
+++ b/configure.in
@@ -525,7 +525,8 @@
 if test "$with_schemas" = "yes" ; then
     echo Enabling Schemas support
     WITH_SCHEMAS=1
-    TEST_SCHEMAS="Regexptests Automatatests Schemastests"
+    TEST_SCHEMAS="Schemastests"
+    with_regexps=yes
 else    
     WITH_SCHEMAS=0
     TEST_SCHEMAS=
@@ -533,6 +534,18 @@
 AC_SUBST(WITH_SCHEMAS)
 AC_SUBST(TEST_SCHEMAS)
 
+AC_ARG_WITH(regexps, [  --with-regexps              Add Regular Expressions support (on)])
+if test "$with_regexps" = "no" ; then
+    echo Disabling Regexps support
+    WITH_REGEXPS=0
+    TEST_REGEXPS=
+else    
+    WITH_REGEXPS=1
+    TEST_REGEXPS="Regexptests Automatatests"
+fi
+AC_SUBST(WITH_REGEXPS)
+AC_SUBST(TEST_REGEXPS)
+
 AC_ARG_WITH(debug, [  --with-debug            Add the debugging module (on)])
 if test "$with_debug" = "no" ; then
     echo Disabling DEBUG support
diff --git a/include/libxml/xmlregexp.h b/include/libxml/xmlregexp.h
index b66d4d9..48d1aa2 100644
--- a/include/libxml/xmlregexp.h
+++ b/include/libxml/xmlregexp.h
@@ -55,6 +55,7 @@
 					 const xmlChar *value);
 void			xmlRegexpPrint	(FILE *output,
 					 xmlRegexpPtr regexp);
+int			xmlRegexpIsDeterminist(xmlRegexpPtr comp);
 
 /*
  * Callback function when doing a transition in the automata
diff --git a/include/libxml/xmlversion.h.in b/include/libxml/xmlversion.h.in
index 777e9f7..94def18 100644
--- a/include/libxml/xmlversion.h.in
+++ b/include/libxml/xmlversion.h.in
@@ -194,7 +194,7 @@
  *
  * Whether the Unicode related interfaces are compiled in
  */
-#if @WITH_SCHEMAS@
+#if @WITH_REGEXPS@
 #define LIBXML_UNICODE_ENABLED
 #endif
 
@@ -203,7 +203,7 @@
  *
  * Whether the regular expressions interfaces are compiled in
  */
-#if @WITH_SCHEMAS@
+#if @WITH_REGEXPS@
 #define LIBXML_REGEXP_ENABLED
 #endif
 
@@ -212,7 +212,7 @@
  *
  * Whether the automata interfaces are compiled in
  */
-#if @WITH_SCHEMAS@
+#if @WITH_REGEXPS@
 #define LIBXML_AUTOMATA_ENABLED
 #endif
 
diff --git a/testRegexp.c b/testRegexp.c
index a1d0d27..c0a5cbc 100644
--- a/testRegexp.c
+++ b/testRegexp.c
@@ -140,11 +140,12 @@
 		}
 	    }
 	}
+	xmlMemoryDump();
 	if (comp != NULL)
 	    xmlRegFreeRegexp(comp);
     }
     xmlCleanupParser();
-    xmlMemoryDump();
+    /* xmlMemoryDump(); */
     return(0);
 }
 
diff --git a/valid.c b/valid.c
index 5a9a614..fbdc766 100644
--- a/valid.c
+++ b/valid.c
@@ -553,7 +553,7 @@
     xmlValidBuildAContentModel(elem->content, ctxt, elem->name);
     xmlAutomataSetFinalState(ctxt->am, ctxt->state);
     elem->contModel = xmlAutomataCompile(ctxt->am);
-    if (!xmlAutomataIsDeterminist(ctxt->am)) {
+    if (!xmlRegexpIsDeterminist(elem->contModel)) {
 	char expr[5000];
 	expr[0] = 0;
 	xmlSnprintfElementContent(expr, 5000, elem->content, 1);
@@ -3849,6 +3849,7 @@
     return(child);
 }
 
+#ifndef  LIBXML_REGEXP_ENABLED
 /**
  * xmlValidateElementType:
  * @ctxt:  the validation context
@@ -4217,6 +4218,7 @@
     }
     return(determinist);
 }
+#endif
 
 /**
  * xmlSnprintfElements:
@@ -4319,7 +4321,10 @@
 xmlValidateElementContent(xmlValidCtxtPtr ctxt, xmlNodePtr child,
        xmlElementPtr elemDecl, int warn, xmlNodePtr parent) {
     int ret = 1;
-    xmlNodePtr repl = NULL, last = NULL, cur, tmp;
+#ifndef  LIBXML_REGEXP_ENABLED
+    xmlNodePtr last = NULL;
+#endif
+    xmlNodePtr repl = NULL, cur, tmp;
     xmlElementContentPtr cont;
     const xmlChar *name;
 
@@ -4561,7 +4566,9 @@
     if (ret == -3)
 	ret = 1;
 
+#ifndef  LIBXML_REGEXP_ENABLED
 done:
+#endif
     /*
      * Deallocate the copy if done, and free up the validation stack
      */
diff --git a/xmlregexp.c b/xmlregexp.c
index 8ff7385..c4cf681 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -30,6 +30,7 @@
 /* #define DEBUG_REGEXP_GRAPH  */
 /* #define DEBUG_REGEXP_EXEC */
 /* #define DEBUG_PUSH */
+/* #define DEBUG_COMPACTION */
 
 #define ERROR(str) ctxt->error = 1;					\
     xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur)
@@ -193,6 +194,7 @@
 struct _xmlAutomataState {
     xmlRegStateType type;
     xmlRegMarkedType mark;
+    xmlRegMarkedType reached;
     int no;
 
     int maxTrans;
@@ -240,6 +242,13 @@
     int nbCounters;
     xmlRegCounter *counters;
     int determinist;
+    /*
+     * That's the compact form for determinists automatas
+     */
+    int nbstates;
+    int *compact;
+    int nbstrings;
+    xmlChar **stringMap;
 };
 
 typedef struct _xmlRegExecRollback xmlRegExecRollback;
@@ -299,6 +308,8 @@
 #define REGEXP_ALL_LAX_COUNTER	0x123457
 
 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
+static void xmlRegFreeState(xmlRegStatePtr state);
+static void xmlRegFreeAtom(xmlRegAtomPtr atom);
 
 /************************************************************************
  * 									*
@@ -306,6 +317,7 @@
  * 									*
  ************************************************************************/
 
+static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
 /**
  * xmlRegEpxFromParse:
  * @ctxt:  the parser context used to build it
@@ -337,6 +349,166 @@
     ret->counters = ctxt->counters;
     ctxt->counters = NULL;
     ret->determinist = ctxt->determinist;
+
+    if ((ret->determinist != 0) &&
+	(ret->nbCounters == 0) &&
+	(ret->atoms[0] != NULL) &&
+	(ret->atoms[0]->type == XML_REGEXP_STRING)) {
+	int i, j, nbstates = 0, nbatoms = 0;
+	int *stateRemap;
+	int *stringRemap;
+	int *transitions;
+	xmlChar **stringMap;
+        xmlChar *value;
+
+	/*
+	 * Switch to a compact representation
+	 * 1/ counting the effective number of states left
+	 * 2/ conting the unique number of atoms, and check that
+	 *    they are all of the string type
+	 * 3/ build a table state x atom for the transitions
+	 */
+
+	stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
+	for (i = 0;i < ret->nbStates;i++) {
+	    if (ret->states[i] != NULL) {
+		stateRemap[i] = nbstates;
+		nbstates++;
+	    } else {
+		stateRemap[i] = -1;
+	    }
+	}
+#ifdef DEBUG_COMPACTION
+	printf("Final: %d states\n", nbstates);
+#endif
+	stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
+	stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
+	for (i = 0;i < ret->nbAtoms;i++) {
+	    if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
+		(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
+		value = ret->atoms[i]->valuep;
+                for (j = 0;j < nbatoms;j++) {
+		    if (xmlStrEqual(stringMap[j], value)) {
+			stringRemap[i] = j;
+			break;
+		    }
+		}
+		if (j >= nbatoms) {
+		    stringRemap[i] = nbatoms;
+		    stringMap[nbatoms] = xmlStrdup(value);
+		    nbatoms++;
+		}
+	    } else {
+		xmlFree(stateRemap);
+		xmlFree(stringRemap);
+		for (i = 0;i < nbatoms;i++)
+		    xmlFree(stringMap[i]);
+		xmlFree(stringMap);
+		goto fail_compact;
+	    }
+	}
+#ifdef DEBUG_COMPACTION
+	printf("Final: %d atoms\n", nbatoms);
+#endif
+
+	/*
+	 * Allocate the transition table. The first entry for each
+	 * state correspond to the state type.
+	 */
+	transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int));
+	memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int));
+
+	for (i = 0;i < ret->nbStates;i++) {
+	    int stateno, atomno, targetno, prev;
+	    xmlRegStatePtr state;
+	    xmlRegTransPtr trans;
+
+	    stateno = stateRemap[i];
+	    if (stateno == -1)
+		continue;
+	    state = ret->states[i];
+
+	    transitions[stateno * (nbatoms + 1)] = state->type;
+
+	    for (j = 0;j < state->nbTrans;j++) {
+		trans = &(state->trans[j]);
+		if ((trans->to == -1) || (trans->atom == NULL))
+		    continue;
+                atomno = stringRemap[trans->atom->no];
+		targetno = stateRemap[trans->to];
+		/*
+		 * if the same atome can generate transition to 2 different
+		 * states then it means the automata is not determinist and
+		 * the compact form can't be used !
+		 */
+		prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
+		if (prev != 0) {
+		    if (prev != targetno + 1) {
+			printf("not determinist\n");
+			ret->determinist = 0;
+#ifdef DEBUG_COMPACTION
+			printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
+			       i, j, trans->atom->no, trans->to, atomno, targetno);
+			printf("       previous to is %d\n", prev);
+#endif
+			ret->determinist = 0;
+			xmlFree(transitions);
+			xmlFree(stateRemap);
+			xmlFree(stringRemap);
+			for (i = 0;i < nbatoms;i++)
+			    xmlFree(stringMap[i]);
+			xmlFree(stringMap);
+			goto fail_compact;
+		    }
+		} else {
+#if 0
+		    printf("State %d trans %d: atom %d to %d : %d to %d\n",
+			   i, j, trans->atom->no, trans->to, atomno, targetno);
+#endif
+		    transitions[stateno * (nbatoms + 1) + atomno + 1] =
+			targetno + 1;; /* to avoid 0 */
+		}
+	    }
+	}
+	ret->determinist = 1;
+#ifdef DEBUG_COMPACTION
+	/*
+	 * Debug
+	 */
+	for (i = 0;i < nbstates;i++) {
+	    for (j = 0;j < nbatoms + 1;j++) {
+                printf("%02d ", transitions[i * (nbatoms + 1) + j]);
+	    }
+	    printf("\n");
+	}
+	printf("\n");
+#endif
+	/*
+	 * Cleanup of the old data
+	 */
+	if (ret->states != NULL) {
+	    for (i = 0;i < ret->nbStates;i++)
+		xmlRegFreeState(ret->states[i]);
+	    xmlFree(ret->states);
+	}
+	ret->states = NULL;
+	ret->nbStates = 0;
+	if (ret->atoms != NULL) {
+	    for (i = 0;i < ret->nbAtoms;i++)
+		xmlRegFreeAtom(ret->atoms[i]);
+	    xmlFree(ret->atoms);
+	}
+	ret->atoms = NULL;
+	ret->nbAtoms = 0;
+
+	ret->compact = transitions;
+	ret->stringMap = stringMap;
+	ret->nbstrings = nbatoms;
+	ret->nbstates = nbstates;
+	xmlFree(stateRemap);
+	xmlFree(stringRemap);
+    }
+fail_compact:
     return(ret);
 }
 
@@ -741,6 +913,7 @@
     }
 }
 
+#ifdef DEBUG_REGEXP_GRAPH
 static void
 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
     int i;
@@ -780,6 +953,7 @@
 		                                ctxt->counters[i].max);
     }
 }
+#endif
 
 /************************************************************************
  * 									*
@@ -1228,7 +1402,6 @@
 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 
 				    ctxt->states[newto], counter, -1);
 	    }
-
 	}
     }
     to->mark = XML_REGEXP_MARK_NORMAL;
@@ -1296,6 +1469,63 @@
 	    }
 	}
     }
+
+    /*
+     * Use this pass to detect unreachable states too
+     */
+    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+	state = ctxt->states[statenr];
+	if (state != NULL)
+	    state->reached = 0;
+    }
+    state = ctxt->states[0];
+    if (state != NULL)
+	state->reached = 1;
+    while (state != NULL) {
+	xmlRegStatePtr target = NULL;
+	state->reached = 2;
+	/*
+	 * Mark all state reachable from the current reachable state
+	 */
+	for (transnr = 0;transnr < state->nbTrans;transnr++) {
+	    if ((state->trans[transnr].to >= 0) &&
+		((state->trans[transnr].atom != NULL) ||
+		 (state->trans[transnr].count >= 0))) {
+		int newto = state->trans[transnr].to;
+
+		if (ctxt->states[newto] == NULL)
+		    continue;
+		if (ctxt->states[newto]->reached == 0) {
+		    ctxt->states[newto]->reached = 1;
+		    target = ctxt->states[newto];
+		}
+	    }
+	}
+	/*
+	 * find the next accessible state not explored
+	 */
+	if (target == NULL) {
+	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
+		state = ctxt->states[statenr];
+		if ((state != NULL) && (state->reached == 1)) {
+		    target = state;
+		    break;
+		}
+	    }
+	}
+	state = target;
+    }
+    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+	state = ctxt->states[statenr];
+	if ((state != NULL) && (state->reached == 0)) {
+#ifdef DEBUG_REGEXP_GRAPH
+	    printf("Removed unreachable state %d\n", statenr);
+#endif
+	    xmlRegFreeState(state);
+	    ctxt->states[statenr] = NULL;
+	}
+    }
+
 }
 
 /**
@@ -2041,7 +2271,8 @@
     exec->rollbacks = NULL;
     exec->status = 0;
     exec->comp = comp;
-    exec->state = comp->states[0];
+    if (comp->compact == NULL)
+	exec->state = comp->states[0];
     exec->transno = 0;
     exec->transcount = 0;
     exec->callback = callback;
@@ -2131,6 +2362,73 @@
 
 
 /**
+ * xmlRegCompactPushString:
+ * @exec: a regexp execution context
+ * @comp:  the precompiled exec with a compact table
+ * @value: a string token input
+ * @data: data associated to the token to reuse in callbacks
+ *
+ * Push one input token in the execution context
+ *
+ * Returns: 1 if the regexp reached a final state, 0 if non-final, and
+ *     a negative value in case of error.
+ */
+static int
+xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
+	                xmlRegexpPtr comp,
+	                const xmlChar *value,
+	                void *data) {
+    int state = exec->index;
+    int i, target;
+
+    if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
+	return(-1);
+    
+    if (value == NULL) {
+	/*
+	 * are we at a final state ?
+	 */
+	if (comp->compact[state * (comp->nbstrings + 1)] ==
+            XML_REGEXP_FINAL_STATE)
+	    return(1);
+	return(0);
+    }
+
+#ifdef DEBUG_PUSH
+    printf("value pushed: %s\n", value);
+#endif
+
+    /*
+     * Examine all outside transition from current state
+     */
+    for (i = 0;i < comp->nbstrings;i++) {
+	target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
+	if ((target > 0) && (target <= comp->nbstates)) {
+	    target--; /* to avoid 0 */
+	    if (xmlStrEqual(comp->stringMap[i], value)) {
+		exec->index = target;
+#ifdef DEBUG_PUSH
+		printf("entering state %d\n", target);
+#endif
+		if (comp->compact[target * (comp->nbstrings + 1)] ==
+		    XML_REGEXP_FINAL_STATE)
+		    return(1);
+		return(0);
+	    }
+	}
+    }
+    /*
+     * Failed to find an exit transition out from current state for the
+     * current token
+     */
+#ifdef DEBUG_PUSH
+    printf("failed to find a transition for %s on state %d\n", value, state);
+#endif
+    exec->status = -1;
+    return(-1);
+}
+
+/**
  * xmlRegExecPushString:
  * @exec: a regexp execution context
  * @value: a string token input
@@ -2151,9 +2449,14 @@
 
     if (exec == NULL)
 	return(-1);
+    if (exec->comp == NULL)
+	return(-1);
     if (exec->status != 0)
 	return(exec->status);
 
+    if (exec->comp->compact != NULL)
+	return(xmlRegCompactPushString(exec, exec->comp, value, data));
+
     if (value == NULL) {
         if (exec->state->type == XML_REGEXP_FINAL_STATE)
 	    return(1);
@@ -3510,6 +3813,37 @@
 }
 
 /**
+ * xmlRegexpIsDeterminist:
+ * @comp:  the compiled regular expression
+ *
+ * Check if the regular expression is determinist
+ *
+ * Returns 1 if it yes, 0 if not and a negativa value in case of error
+ */
+int
+xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
+    xmlAutomataPtr am;
+    int ret;
+
+    if (comp == NULL)
+	return(-1);
+    if (comp->determinist != -1)
+	return(comp->determinist);
+
+    am = xmlNewAutomata();
+    am->nbAtoms = comp->nbAtoms;
+    am->atoms = comp->atoms;
+    am->nbStates = comp->nbStates;
+    am->states = comp->states;
+    am->determinist = -1;
+    ret = xmlFAComputesDeterminism(am);
+    am->atoms = NULL;
+    am->states = NULL;
+    xmlFreeAutomata(am);
+    return(ret);
+}
+
+/**
  * xmlRegFreeRegexp:
  * @regexp:  the regexp
  *
@@ -3535,6 +3869,14 @@
     }
     if (regexp->counters != NULL)
 	xmlFree(regexp->counters);
+    if (regexp->compact != NULL)
+	xmlFree(regexp->compact);
+    if (regexp->stringMap != NULL) {
+	for (i = 0; i < regexp->nbstrings;i++)
+	    xmlFree(regexp->stringMap[i]);
+	xmlFree(regexp->stringMap);
+    }
+
     xmlFree(regexp);
 }
 
@@ -3910,7 +4252,7 @@
     xmlRegexpPtr ret;
 
     xmlFAEliminateEpsilonTransitions(am);
-    xmlFAComputesDeterminism(am);
+    /* xmlFAComputesDeterminism(am); */
     ret = xmlRegEpxFromParse(am);
 
     return(ret);