made configuring with regexps/automata/unicode the default but without

* Makefile.am configure.in include/libxml/xmlversion.h.in:
  made configuring with regexps/automata/unicode the default
  but without schemas ATM
* testRegexp.c valid.c xmlregexp.c include/libxml/xmlregexp.h:
  fixed the regexp based DTD validation performance and memory
  problem by switching to a compact form for determinist regexps
  and detecting the determinism property in the process. Seems
  as fast as the old DTD validation specific engine :-) despite
  the regexp built and compaction process.
Daniel
diff --git a/xmlregexp.c b/xmlregexp.c
index 8ff7385..c4cf681 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -30,6 +30,7 @@
 /* #define DEBUG_REGEXP_GRAPH  */
 /* #define DEBUG_REGEXP_EXEC */
 /* #define DEBUG_PUSH */
+/* #define DEBUG_COMPACTION */
 
 #define ERROR(str) ctxt->error = 1;					\
     xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur)
@@ -193,6 +194,7 @@
 struct _xmlAutomataState {
     xmlRegStateType type;
     xmlRegMarkedType mark;
+    xmlRegMarkedType reached;
     int no;
 
     int maxTrans;
@@ -240,6 +242,13 @@
     int nbCounters;
     xmlRegCounter *counters;
     int determinist;
+    /*
+     * That's the compact form for determinists automatas
+     */
+    int nbstates;
+    int *compact;
+    int nbstrings;
+    xmlChar **stringMap;
 };
 
 typedef struct _xmlRegExecRollback xmlRegExecRollback;
@@ -299,6 +308,8 @@
 #define REGEXP_ALL_LAX_COUNTER	0x123457
 
 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
+static void xmlRegFreeState(xmlRegStatePtr state);
+static void xmlRegFreeAtom(xmlRegAtomPtr atom);
 
 /************************************************************************
  * 									*
@@ -306,6 +317,7 @@
  * 									*
  ************************************************************************/
 
+static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
 /**
  * xmlRegEpxFromParse:
  * @ctxt:  the parser context used to build it
@@ -337,6 +349,166 @@
     ret->counters = ctxt->counters;
     ctxt->counters = NULL;
     ret->determinist = ctxt->determinist;
+
+    if ((ret->determinist != 0) &&
+	(ret->nbCounters == 0) &&
+	(ret->atoms[0] != NULL) &&
+	(ret->atoms[0]->type == XML_REGEXP_STRING)) {
+	int i, j, nbstates = 0, nbatoms = 0;
+	int *stateRemap;
+	int *stringRemap;
+	int *transitions;
+	xmlChar **stringMap;
+        xmlChar *value;
+
+	/*
+	 * Switch to a compact representation
+	 * 1/ counting the effective number of states left
+	 * 2/ conting the unique number of atoms, and check that
+	 *    they are all of the string type
+	 * 3/ build a table state x atom for the transitions
+	 */
+
+	stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
+	for (i = 0;i < ret->nbStates;i++) {
+	    if (ret->states[i] != NULL) {
+		stateRemap[i] = nbstates;
+		nbstates++;
+	    } else {
+		stateRemap[i] = -1;
+	    }
+	}
+#ifdef DEBUG_COMPACTION
+	printf("Final: %d states\n", nbstates);
+#endif
+	stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
+	stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
+	for (i = 0;i < ret->nbAtoms;i++) {
+	    if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
+		(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
+		value = ret->atoms[i]->valuep;
+                for (j = 0;j < nbatoms;j++) {
+		    if (xmlStrEqual(stringMap[j], value)) {
+			stringRemap[i] = j;
+			break;
+		    }
+		}
+		if (j >= nbatoms) {
+		    stringRemap[i] = nbatoms;
+		    stringMap[nbatoms] = xmlStrdup(value);
+		    nbatoms++;
+		}
+	    } else {
+		xmlFree(stateRemap);
+		xmlFree(stringRemap);
+		for (i = 0;i < nbatoms;i++)
+		    xmlFree(stringMap[i]);
+		xmlFree(stringMap);
+		goto fail_compact;
+	    }
+	}
+#ifdef DEBUG_COMPACTION
+	printf("Final: %d atoms\n", nbatoms);
+#endif
+
+	/*
+	 * Allocate the transition table. The first entry for each
+	 * state correspond to the state type.
+	 */
+	transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int));
+	memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int));
+
+	for (i = 0;i < ret->nbStates;i++) {
+	    int stateno, atomno, targetno, prev;
+	    xmlRegStatePtr state;
+	    xmlRegTransPtr trans;
+
+	    stateno = stateRemap[i];
+	    if (stateno == -1)
+		continue;
+	    state = ret->states[i];
+
+	    transitions[stateno * (nbatoms + 1)] = state->type;
+
+	    for (j = 0;j < state->nbTrans;j++) {
+		trans = &(state->trans[j]);
+		if ((trans->to == -1) || (trans->atom == NULL))
+		    continue;
+                atomno = stringRemap[trans->atom->no];
+		targetno = stateRemap[trans->to];
+		/*
+		 * if the same atome can generate transition to 2 different
+		 * states then it means the automata is not determinist and
+		 * the compact form can't be used !
+		 */
+		prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
+		if (prev != 0) {
+		    if (prev != targetno + 1) {
+			printf("not determinist\n");
+			ret->determinist = 0;
+#ifdef DEBUG_COMPACTION
+			printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
+			       i, j, trans->atom->no, trans->to, atomno, targetno);
+			printf("       previous to is %d\n", prev);
+#endif
+			ret->determinist = 0;
+			xmlFree(transitions);
+			xmlFree(stateRemap);
+			xmlFree(stringRemap);
+			for (i = 0;i < nbatoms;i++)
+			    xmlFree(stringMap[i]);
+			xmlFree(stringMap);
+			goto fail_compact;
+		    }
+		} else {
+#if 0
+		    printf("State %d trans %d: atom %d to %d : %d to %d\n",
+			   i, j, trans->atom->no, trans->to, atomno, targetno);
+#endif
+		    transitions[stateno * (nbatoms + 1) + atomno + 1] =
+			targetno + 1;; /* to avoid 0 */
+		}
+	    }
+	}
+	ret->determinist = 1;
+#ifdef DEBUG_COMPACTION
+	/*
+	 * Debug
+	 */
+	for (i = 0;i < nbstates;i++) {
+	    for (j = 0;j < nbatoms + 1;j++) {
+                printf("%02d ", transitions[i * (nbatoms + 1) + j]);
+	    }
+	    printf("\n");
+	}
+	printf("\n");
+#endif
+	/*
+	 * Cleanup of the old data
+	 */
+	if (ret->states != NULL) {
+	    for (i = 0;i < ret->nbStates;i++)
+		xmlRegFreeState(ret->states[i]);
+	    xmlFree(ret->states);
+	}
+	ret->states = NULL;
+	ret->nbStates = 0;
+	if (ret->atoms != NULL) {
+	    for (i = 0;i < ret->nbAtoms;i++)
+		xmlRegFreeAtom(ret->atoms[i]);
+	    xmlFree(ret->atoms);
+	}
+	ret->atoms = NULL;
+	ret->nbAtoms = 0;
+
+	ret->compact = transitions;
+	ret->stringMap = stringMap;
+	ret->nbstrings = nbatoms;
+	ret->nbstates = nbstates;
+	xmlFree(stateRemap);
+	xmlFree(stringRemap);
+    }
+fail_compact:
     return(ret);
 }
 
@@ -741,6 +913,7 @@
     }
 }
 
+#ifdef DEBUG_REGEXP_GRAPH
 static void
 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
     int i;
@@ -780,6 +953,7 @@
 		                                ctxt->counters[i].max);
     }
 }
+#endif
 
 /************************************************************************
  * 									*
@@ -1228,7 +1402,6 @@
 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 
 				    ctxt->states[newto], counter, -1);
 	    }
-
 	}
     }
     to->mark = XML_REGEXP_MARK_NORMAL;
@@ -1296,6 +1469,63 @@
 	    }
 	}
     }
+
+    /*
+     * Use this pass to detect unreachable states too
+     */
+    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+	state = ctxt->states[statenr];
+	if (state != NULL)
+	    state->reached = 0;
+    }
+    state = ctxt->states[0];
+    if (state != NULL)
+	state->reached = 1;
+    while (state != NULL) {
+	xmlRegStatePtr target = NULL;
+	state->reached = 2;
+	/*
+	 * Mark all state reachable from the current reachable state
+	 */
+	for (transnr = 0;transnr < state->nbTrans;transnr++) {
+	    if ((state->trans[transnr].to >= 0) &&
+		((state->trans[transnr].atom != NULL) ||
+		 (state->trans[transnr].count >= 0))) {
+		int newto = state->trans[transnr].to;
+
+		if (ctxt->states[newto] == NULL)
+		    continue;
+		if (ctxt->states[newto]->reached == 0) {
+		    ctxt->states[newto]->reached = 1;
+		    target = ctxt->states[newto];
+		}
+	    }
+	}
+	/*
+	 * find the next accessible state not explored
+	 */
+	if (target == NULL) {
+	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
+		state = ctxt->states[statenr];
+		if ((state != NULL) && (state->reached == 1)) {
+		    target = state;
+		    break;
+		}
+	    }
+	}
+	state = target;
+    }
+    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+	state = ctxt->states[statenr];
+	if ((state != NULL) && (state->reached == 0)) {
+#ifdef DEBUG_REGEXP_GRAPH
+	    printf("Removed unreachable state %d\n", statenr);
+#endif
+	    xmlRegFreeState(state);
+	    ctxt->states[statenr] = NULL;
+	}
+    }
+
 }
 
 /**
@@ -2041,7 +2271,8 @@
     exec->rollbacks = NULL;
     exec->status = 0;
     exec->comp = comp;
-    exec->state = comp->states[0];
+    if (comp->compact == NULL)
+	exec->state = comp->states[0];
     exec->transno = 0;
     exec->transcount = 0;
     exec->callback = callback;
@@ -2131,6 +2362,73 @@
 
 
 /**
+ * xmlRegCompactPushString:
+ * @exec: a regexp execution context
+ * @comp:  the precompiled exec with a compact table
+ * @value: a string token input
+ * @data: data associated to the token to reuse in callbacks
+ *
+ * Push one input token in the execution context
+ *
+ * Returns: 1 if the regexp reached a final state, 0 if non-final, and
+ *     a negative value in case of error.
+ */
+static int
+xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
+	                xmlRegexpPtr comp,
+	                const xmlChar *value,
+	                void *data) {
+    int state = exec->index;
+    int i, target;
+
+    if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
+	return(-1);
+    
+    if (value == NULL) {
+	/*
+	 * are we at a final state ?
+	 */
+	if (comp->compact[state * (comp->nbstrings + 1)] ==
+            XML_REGEXP_FINAL_STATE)
+	    return(1);
+	return(0);
+    }
+
+#ifdef DEBUG_PUSH
+    printf("value pushed: %s\n", value);
+#endif
+
+    /*
+     * Examine all outside transition from current state
+     */
+    for (i = 0;i < comp->nbstrings;i++) {
+	target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
+	if ((target > 0) && (target <= comp->nbstates)) {
+	    target--; /* to avoid 0 */
+	    if (xmlStrEqual(comp->stringMap[i], value)) {
+		exec->index = target;
+#ifdef DEBUG_PUSH
+		printf("entering state %d\n", target);
+#endif
+		if (comp->compact[target * (comp->nbstrings + 1)] ==
+		    XML_REGEXP_FINAL_STATE)
+		    return(1);
+		return(0);
+	    }
+	}
+    }
+    /*
+     * Failed to find an exit transition out from current state for the
+     * current token
+     */
+#ifdef DEBUG_PUSH
+    printf("failed to find a transition for %s on state %d\n", value, state);
+#endif
+    exec->status = -1;
+    return(-1);
+}
+
+/**
  * xmlRegExecPushString:
  * @exec: a regexp execution context
  * @value: a string token input
@@ -2151,9 +2449,14 @@
 
     if (exec == NULL)
 	return(-1);
+    if (exec->comp == NULL)
+	return(-1);
     if (exec->status != 0)
 	return(exec->status);
 
+    if (exec->comp->compact != NULL)
+	return(xmlRegCompactPushString(exec, exec->comp, value, data));
+
     if (value == NULL) {
         if (exec->state->type == XML_REGEXP_FINAL_STATE)
 	    return(1);
@@ -3510,6 +3813,37 @@
 }
 
 /**
+ * xmlRegexpIsDeterminist:
+ * @comp:  the compiled regular expression
+ *
+ * Check if the regular expression is determinist
+ *
+ * Returns 1 if it yes, 0 if not and a negativa value in case of error
+ */
+int
+xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
+    xmlAutomataPtr am;
+    int ret;
+
+    if (comp == NULL)
+	return(-1);
+    if (comp->determinist != -1)
+	return(comp->determinist);
+
+    am = xmlNewAutomata();
+    am->nbAtoms = comp->nbAtoms;
+    am->atoms = comp->atoms;
+    am->nbStates = comp->nbStates;
+    am->states = comp->states;
+    am->determinist = -1;
+    ret = xmlFAComputesDeterminism(am);
+    am->atoms = NULL;
+    am->states = NULL;
+    xmlFreeAutomata(am);
+    return(ret);
+}
+
+/**
  * xmlRegFreeRegexp:
  * @regexp:  the regexp
  *
@@ -3535,6 +3869,14 @@
     }
     if (regexp->counters != NULL)
 	xmlFree(regexp->counters);
+    if (regexp->compact != NULL)
+	xmlFree(regexp->compact);
+    if (regexp->stringMap != NULL) {
+	for (i = 0; i < regexp->nbstrings;i++)
+	    xmlFree(regexp->stringMap[i]);
+	xmlFree(regexp->stringMap);
+    }
+
     xmlFree(regexp);
 }
 
@@ -3910,7 +4252,7 @@
     xmlRegexpPtr ret;
 
     xmlFAEliminateEpsilonTransitions(am);
-    xmlFAComputesDeterminism(am);
+    /* xmlFAComputesDeterminism(am); */
     ret = xmlRegEpxFromParse(am);
 
     return(ret);