blob: 21091f39cf50a39f31ca87485ec5b3bc576108cc [file] [log] [blame]
/*
 * regexp.c: generic and extensible Regular Expression engine
 *
 * Basically designed with the purpose of compiling regexps for
 * the variety of validation/schemas mechanisms now available in
 * XML related specifications; these include:
 *    - XML-1.0 DTD validation
 *    - XML Schemas structure part 1
 *    - XML Schemas Datatypes part 2 especially Appendix F
 *    - RELAX-NG/TREX i.e. the counter proposal
 *
 * See Copyright for the status of this software.
 *
 * Daniel Veillard <veillard@redhat.com>
 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
Daniel Veillardcee2b3a2005-01-25 00:22:52 +000022/* #define DEBUG_ERR */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +000023
Daniel Veillard4255d502002-04-16 15:50:10 +000024#include <stdio.h>
25#include <string.h>
Daniel Veillardebe48c62003-12-03 12:12:27 +000026#ifdef HAVE_LIMITS_H
27#include <limits.h>
28#endif
29
Daniel Veillard4255d502002-04-16 15:50:10 +000030#include <libxml/tree.h>
31#include <libxml/parserInternals.h>
32#include <libxml/xmlregexp.h>
33#include <libxml/xmlautomata.h>
34#include <libxml/xmlunicode.h>
35
Daniel Veillardebe48c62003-12-03 12:12:27 +000036#ifndef INT_MAX
37#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
38#endif
39
Daniel Veillardc0826a72004-08-10 14:17:33 +000040/* #define DEBUG_REGEXP_GRAPH */
Daniel Veillard10752282005-08-08 13:05:13 +000041/* #define DEBUG_REGEXP_EXEC */
Daniel Veillard4255d502002-04-16 15:50:10 +000042/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000043/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000044
Daniel Veillard567a45b2005-10-18 19:11:55 +000045#define MAX_PUSH 10000000
Daniel Veillard94cc1032005-09-15 13:09:00 +000046
/*
 * Parser convenience macros. Every one of them expects a variable named
 * "ctxt" (the regexp parser context) to be in scope where it is expanded.
 */

/*
 * ERROR: report a compilation failure at the current parsing position.
 * xmlRegexpErrCompile() records XML_REGEXP_COMPILE_ERROR in ctxt->error
 * itself, so the macro expands to that single call and therefore stays
 * fully guarded by an unbraced "if". The trailing semicolon is kept so
 * that existing call sites expand exactly as before.
 */
#define ERROR(str)							\
    xmlRegexpErrCompile(ctxt, str);
/* advance the cursor by one byte */
#define NEXT ctxt->cur++
/* the byte under the cursor */
#define CUR (*(ctxt->cur))
/* the byte "index" positions after the cursor */
#define NXT(index) (ctxt->cur[index])

/* decode the character at s through xmlStringCurrentChar(), length in l */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
/* advance the cursor by l bytes (the macro carries its own semicolon) */
#define NEXTL(l) ctxt->cur += l;
#define XML_REG_STRING_SEPARATOR '|'
Daniel Veillard4255d502002-04-16 15:50:10 +000057
/**
 * TODO:
 *
 * macro to flag unimplemented blocks: reports the source file and line
 * of the expansion site through xmlGenericError(). The expansion carries
 * its own trailing semicolon.
 */
#define TODO								\
    xmlGenericError(xmlGenericErrorContext,				\
                    "Unimplemented block at %s:%d\n",			\
                    __FILE__, __LINE__);
67
Daniel Veillard4255d502002-04-16 15:50:10 +000068/************************************************************************
69 * *
70 * Datatypes and structures *
71 * *
72 ************************************************************************/
73
/*
 * Kinds of atoms and ranges.
 *
 * Note: the order of the enums below is significant, do not shuffle.
 * The first group is numbered from 1; XML_REGEXP_LETTER is pinned to 100
 * and the remaining values follow it consecutively.
 */
typedef enum {
    XML_REGEXP_EPSILON = 1,
    XML_REGEXP_CHARVAL,
    XML_REGEXP_RANGES,
    XML_REGEXP_SUBREG,  /* used for () sub regexps */
    XML_REGEXP_STRING,
    XML_REGEXP_ANYCHAR, /* . */
    XML_REGEXP_ANYSPACE, /* \s */
    XML_REGEXP_NOTSPACE, /* \S */
    XML_REGEXP_INITNAME, /* \l */
    XML_REGEXP_NOTINITNAME, /* \L */
    XML_REGEXP_NAMECHAR, /* \c */
    XML_REGEXP_NOTNAMECHAR, /* \C */
    XML_REGEXP_DECIMAL, /* \d */
    XML_REGEXP_NOTDECIMAL, /* \D */
    XML_REGEXP_REALCHAR, /* \w */
    XML_REGEXP_NOTREALCHAR, /* \W */
    XML_REGEXP_LETTER = 100,
    XML_REGEXP_LETTER_UPPERCASE,
    XML_REGEXP_LETTER_LOWERCASE,
    XML_REGEXP_LETTER_TITLECASE,
    XML_REGEXP_LETTER_MODIFIER,
    XML_REGEXP_LETTER_OTHERS,
    XML_REGEXP_MARK,
    XML_REGEXP_MARK_NONSPACING,
    XML_REGEXP_MARK_SPACECOMBINING,
    XML_REGEXP_MARK_ENCLOSING,
    XML_REGEXP_NUMBER,
    XML_REGEXP_NUMBER_DECIMAL,
    XML_REGEXP_NUMBER_LETTER,
    XML_REGEXP_NUMBER_OTHERS,
    XML_REGEXP_PUNCT,
    XML_REGEXP_PUNCT_CONNECTOR,
    XML_REGEXP_PUNCT_DASH,
    XML_REGEXP_PUNCT_OPEN,
    XML_REGEXP_PUNCT_CLOSE,
    XML_REGEXP_PUNCT_INITQUOTE,
    XML_REGEXP_PUNCT_FINQUOTE,
    XML_REGEXP_PUNCT_OTHERS,
    XML_REGEXP_SEPAR,
    XML_REGEXP_SEPAR_SPACE,
    XML_REGEXP_SEPAR_LINE,
    XML_REGEXP_SEPAR_PARA,
    XML_REGEXP_SYMBOL,
    XML_REGEXP_SYMBOL_MATH,
    XML_REGEXP_SYMBOL_CURRENCY,
    XML_REGEXP_SYMBOL_MODIFIER,
    XML_REGEXP_SYMBOL_OTHERS,
    XML_REGEXP_OTHER,
    XML_REGEXP_OTHER_CONTROL,
    XML_REGEXP_OTHER_FORMAT,
    XML_REGEXP_OTHER_PRIVATE,
    XML_REGEXP_OTHER_NA,
    XML_REGEXP_BLOCK_NAME
} xmlRegAtomType;
132
/*
 * Quantifier attached to an atom; numbered consecutively from 1.
 * XML_REGEXP_QUANT_ONCE is what xmlRegNewAtom() gives a fresh atom.
 */
typedef enum {
    XML_REGEXP_QUANT_EPSILON = 1,
    XML_REGEXP_QUANT_ONCE,
    XML_REGEXP_QUANT_OPT,
    XML_REGEXP_QUANT_MULT,
    XML_REGEXP_QUANT_PLUS,
    XML_REGEXP_QUANT_ONCEONLY,
    XML_REGEXP_QUANT_ALL,
    XML_REGEXP_QUANT_RANGE
} xmlRegQuantType;
143
/*
 * Role of a state inside the automaton; numbered consecutively from 1.
 * XML_REGEXP_TRANS_STATE is what xmlRegNewState() gives a fresh state.
 * The numeric value is also what gets stored in the first column of the
 * compact transition table built by xmlRegEpxFromParse().
 */
typedef enum {
    XML_REGEXP_START_STATE = 1,
    XML_REGEXP_FINAL_STATE,
    XML_REGEXP_TRANS_STATE,
    XML_REGEXP_SINK_STATE
} xmlRegStateType;
150
/*
 * Marker kept on a state; XML_REGEXP_MARK_NORMAL (0) is the value a
 * fresh state gets from xmlRegNewState().
 */
typedef enum {
    XML_REGEXP_MARK_NORMAL = 0,
    XML_REGEXP_MARK_START,
    XML_REGEXP_MARK_VISITED
} xmlRegMarkedType;
156
157typedef struct _xmlRegRange xmlRegRange;
158typedef xmlRegRange *xmlRegRangePtr;
159
160struct _xmlRegRange {
Daniel Veillardf8b9de32003-11-24 14:27:26 +0000161 int neg; /* 0 normal, 1 not, 2 exclude */
Daniel Veillard4255d502002-04-16 15:50:10 +0000162 xmlRegAtomType type;
163 int start;
164 int end;
165 xmlChar *blockName;
166};
167
168typedef struct _xmlRegAtom xmlRegAtom;
169typedef xmlRegAtom *xmlRegAtomPtr;
170
171typedef struct _xmlAutomataState xmlRegState;
172typedef xmlRegState *xmlRegStatePtr;
173
174struct _xmlRegAtom {
175 int no;
176 xmlRegAtomType type;
177 xmlRegQuantType quant;
178 int min;
179 int max;
180
181 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000182 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000183 int neg;
184 int codepoint;
185 xmlRegStatePtr start;
186 xmlRegStatePtr stop;
187 int maxRanges;
188 int nbRanges;
189 xmlRegRangePtr *ranges;
190 void *data;
191};
192
typedef struct _xmlRegCounter xmlRegCounter;
typedef xmlRegCounter *xmlRegCounterPtr;

/*
 * A counter of the automaton, described by a pair of bounds.
 */
struct _xmlRegCounter {
    int min;
    int max;
};
200
201typedef struct _xmlRegTrans xmlRegTrans;
202typedef xmlRegTrans *xmlRegTransPtr;
203
204struct _xmlRegTrans {
205 xmlRegAtomPtr atom;
206 int to;
207 int counter;
208 int count;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000209 int nd;
Daniel Veillard4255d502002-04-16 15:50:10 +0000210};
211
212struct _xmlAutomataState {
213 xmlRegStateType type;
214 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000215 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000216 int no;
Daniel Veillard4255d502002-04-16 15:50:10 +0000217 int maxTrans;
218 int nbTrans;
219 xmlRegTrans *trans;
Daniel Veillarddb68b742005-07-30 13:18:24 +0000220 /* knowing states ponting to us can speed things up */
221 int maxTransTo;
222 int nbTransTo;
223 int *transTo;
Daniel Veillard4255d502002-04-16 15:50:10 +0000224};
225
226typedef struct _xmlAutomata xmlRegParserCtxt;
227typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
228
229struct _xmlAutomata {
230 xmlChar *string;
231 xmlChar *cur;
232
233 int error;
234 int neg;
235
236 xmlRegStatePtr start;
237 xmlRegStatePtr end;
238 xmlRegStatePtr state;
239
240 xmlRegAtomPtr atom;
241
242 int maxAtoms;
243 int nbAtoms;
244 xmlRegAtomPtr *atoms;
245
246 int maxStates;
247 int nbStates;
248 xmlRegStatePtr *states;
249
250 int maxCounters;
251 int nbCounters;
252 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000253
254 int determinist;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000255 int negs;
Daniel Veillard4255d502002-04-16 15:50:10 +0000256};
257
258struct _xmlRegexp {
259 xmlChar *string;
260 int nbStates;
261 xmlRegStatePtr *states;
262 int nbAtoms;
263 xmlRegAtomPtr *atoms;
264 int nbCounters;
265 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000266 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000267 /*
268 * That's the compact form for determinists automatas
269 */
270 int nbstates;
271 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000272 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000273 int nbstrings;
274 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000275};
276
277typedef struct _xmlRegExecRollback xmlRegExecRollback;
278typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
279
280struct _xmlRegExecRollback {
281 xmlRegStatePtr state;/* the current state */
282 int index; /* the index in the input stack */
283 int nextbranch; /* the next transition to explore in that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000284 int *counts; /* save the automata state if it has some */
Daniel Veillard4255d502002-04-16 15:50:10 +0000285};
286
287typedef struct _xmlRegInputToken xmlRegInputToken;
288typedef xmlRegInputToken *xmlRegInputTokenPtr;
289
290struct _xmlRegInputToken {
291 xmlChar *value;
292 void *data;
293};
294
295struct _xmlRegExecCtxt {
296 int status; /* execution status != 0 indicate an error */
William M. Brackddf71d62004-05-06 04:17:26 +0000297 int determinist; /* did we find an indeterministic behaviour */
Daniel Veillard4255d502002-04-16 15:50:10 +0000298 xmlRegexpPtr comp; /* the compiled regexp */
299 xmlRegExecCallbacks callback;
300 void *data;
301
302 xmlRegStatePtr state;/* the current state */
303 int transno; /* the current transition on that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000304 int transcount; /* the number of chars in char counted transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +0000305
306 /*
307 * A stack of rollback states
308 */
309 int maxRollbacks;
310 int nbRollbacks;
311 xmlRegExecRollback *rollbacks;
312
313 /*
314 * The state of the automata if any
315 */
316 int *counts;
317
318 /*
319 * The input stack
320 */
321 int inputStackMax;
322 int inputStackNr;
323 int index;
324 int *charStack;
325 const xmlChar *inputString; /* when operating on characters */
326 xmlRegInputTokenPtr inputStack;/* when operating on strings */
327
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +0000328 /*
329 * error handling
330 */
331 int errStateNo; /* the error state number */
332 xmlRegStatePtr errState; /* the error state */
333 xmlChar *errString; /* the string raising the error */
334 int *errCounts; /* counters at the error state */
Daniel Veillard94cc1032005-09-15 13:09:00 +0000335 int nbPush;
Daniel Veillard4255d502002-04-16 15:50:10 +0000336};
337
Daniel Veillard441bc322002-04-20 17:38:48 +0000338#define REGEXP_ALL_COUNTER 0x123456
339#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000340
Daniel Veillard4255d502002-04-16 15:50:10 +0000341static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000342static void xmlRegFreeState(xmlRegStatePtr state);
343static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard9efc4762005-07-19 14:33:55 +0000344static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
Daniel Veillard567a45b2005-10-18 19:11:55 +0000345static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
346static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
347 int neg, int start, int end, const xmlChar *blockName);
Daniel Veillard4255d502002-04-16 15:50:10 +0000348
349/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000350 * *
351 * Regexp memory error handler *
352 * *
353 ************************************************************************/
354/**
355 * xmlRegexpErrMemory:
William M. Brackddf71d62004-05-06 04:17:26 +0000356 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000357 *
358 * Handle an out of memory condition
359 */
360static void
361xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
362{
363 const char *regexp = NULL;
364 if (ctxt != NULL) {
365 regexp = (const char *) ctxt->string;
366 ctxt->error = XML_ERR_NO_MEMORY;
367 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000368 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000369 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
370 regexp, NULL, 0, 0,
371 "Memory allocation failed : %s\n", extra);
372}
373
374/**
375 * xmlRegexpErrCompile:
William M. Brackddf71d62004-05-06 04:17:26 +0000376 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000377 *
William M. Brackddf71d62004-05-06 04:17:26 +0000378 * Handle a compilation failure
Daniel Veillardff46a042003-10-08 08:53:17 +0000379 */
380static void
381xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
382{
383 const char *regexp = NULL;
384 int idx = 0;
385
386 if (ctxt != NULL) {
387 regexp = (const char *) ctxt->string;
388 idx = ctxt->cur - ctxt->string;
389 ctxt->error = XML_REGEXP_COMPILE_ERROR;
390 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000391 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000392 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
393 regexp, NULL, idx, 0,
394 "failed to compile: %s\n", extra);
395}
396
397/************************************************************************
Daniel Veillard4255d502002-04-16 15:50:10 +0000398 * *
399 * Allocation/Deallocation *
400 * *
401 ************************************************************************/
402
Daniel Veillard23e73572002-09-19 19:56:43 +0000403static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000404/**
405 * xmlRegEpxFromParse:
406 * @ctxt: the parser context used to build it
407 *
William M. Brackddf71d62004-05-06 04:17:26 +0000408 * Allocate a new regexp and fill it with the result from the parser
Daniel Veillard4255d502002-04-16 15:50:10 +0000409 *
410 * Returns the new regexp or NULL in case of error
411 */
412static xmlRegexpPtr
413xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
414 xmlRegexpPtr ret;
415
416 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000417 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000418 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000419 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000420 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000421 memset(ret, 0, sizeof(xmlRegexp));
422 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000423 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000424 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000425 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000426 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000427 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000428 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000429 ret->determinist = ctxt->determinist;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000430 if (ret->determinist == -1) {
431 xmlRegexpIsDeterminist(ret);
432 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000433
434 if ((ret->determinist != 0) &&
435 (ret->nbCounters == 0) &&
Daniel Veillard6e65e152005-08-09 11:09:52 +0000436 (ctxt->negs == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000437 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000438 (ret->atoms[0] != NULL) &&
439 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
440 int i, j, nbstates = 0, nbatoms = 0;
441 int *stateRemap;
442 int *stringRemap;
443 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000444 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000445 xmlChar **stringMap;
446 xmlChar *value;
447
448 /*
449 * Switch to a compact representation
450 * 1/ counting the effective number of states left
William M. Brackddf71d62004-05-06 04:17:26 +0000451 * 2/ counting the unique number of atoms, and check that
Daniel Veillard23e73572002-09-19 19:56:43 +0000452 * they are all of the string type
453 * 3/ build a table state x atom for the transitions
454 */
455
456 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000457 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000458 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000459 xmlFree(ret);
460 return(NULL);
461 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000462 for (i = 0;i < ret->nbStates;i++) {
463 if (ret->states[i] != NULL) {
464 stateRemap[i] = nbstates;
465 nbstates++;
466 } else {
467 stateRemap[i] = -1;
468 }
469 }
470#ifdef DEBUG_COMPACTION
471 printf("Final: %d states\n", nbstates);
472#endif
473 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000474 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000475 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000476 xmlFree(stateRemap);
477 xmlFree(ret);
478 return(NULL);
479 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000480 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000481 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000482 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000483 xmlFree(stringMap);
484 xmlFree(stateRemap);
485 xmlFree(ret);
486 return(NULL);
487 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000488 for (i = 0;i < ret->nbAtoms;i++) {
489 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
490 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
491 value = ret->atoms[i]->valuep;
492 for (j = 0;j < nbatoms;j++) {
493 if (xmlStrEqual(stringMap[j], value)) {
494 stringRemap[i] = j;
495 break;
496 }
497 }
498 if (j >= nbatoms) {
499 stringRemap[i] = nbatoms;
500 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000501 if (stringMap[nbatoms] == NULL) {
502 for (i = 0;i < nbatoms;i++)
503 xmlFree(stringMap[i]);
504 xmlFree(stringRemap);
505 xmlFree(stringMap);
506 xmlFree(stateRemap);
507 xmlFree(ret);
508 return(NULL);
509 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000510 nbatoms++;
511 }
512 } else {
513 xmlFree(stateRemap);
514 xmlFree(stringRemap);
515 for (i = 0;i < nbatoms;i++)
516 xmlFree(stringMap[i]);
517 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000518 xmlFree(ret);
519 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000520 }
521 }
522#ifdef DEBUG_COMPACTION
523 printf("Final: %d atoms\n", nbatoms);
524#endif
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000525 transitions = (int *) xmlMalloc((nbstates + 1) *
526 (nbatoms + 1) * sizeof(int));
527 if (transitions == NULL) {
528 xmlFree(stateRemap);
529 xmlFree(stringRemap);
530 xmlFree(stringMap);
531 xmlFree(ret);
532 return(NULL);
533 }
534 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
Daniel Veillard23e73572002-09-19 19:56:43 +0000535
536 /*
537 * Allocate the transition table. The first entry for each
William M. Brackddf71d62004-05-06 04:17:26 +0000538 * state corresponds to the state type.
Daniel Veillard23e73572002-09-19 19:56:43 +0000539 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000540 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000541
542 for (i = 0;i < ret->nbStates;i++) {
543 int stateno, atomno, targetno, prev;
544 xmlRegStatePtr state;
545 xmlRegTransPtr trans;
546
547 stateno = stateRemap[i];
548 if (stateno == -1)
549 continue;
550 state = ret->states[i];
551
552 transitions[stateno * (nbatoms + 1)] = state->type;
553
554 for (j = 0;j < state->nbTrans;j++) {
555 trans = &(state->trans[j]);
556 if ((trans->to == -1) || (trans->atom == NULL))
557 continue;
558 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000559 if ((trans->atom->data != NULL) && (transdata == NULL)) {
560 transdata = (void **) xmlMalloc(nbstates * nbatoms *
561 sizeof(void *));
562 if (transdata != NULL)
563 memset(transdata, 0,
564 nbstates * nbatoms * sizeof(void *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000565 else {
Daniel Veillardff46a042003-10-08 08:53:17 +0000566 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000567 break;
568 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000569 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000570 targetno = stateRemap[trans->to];
571 /*
William M. Brackddf71d62004-05-06 04:17:26 +0000572 * if the same atom can generate transitions to 2 different
Daniel Veillard23e73572002-09-19 19:56:43 +0000573 * states then it means the automata is not determinist and
574 * the compact form can't be used !
575 */
576 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
577 if (prev != 0) {
578 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000579 ret->determinist = 0;
580#ifdef DEBUG_COMPACTION
581 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
582 i, j, trans->atom->no, trans->to, atomno, targetno);
583 printf(" previous to is %d\n", prev);
584#endif
Daniel Veillard118aed72002-09-24 14:13:13 +0000585 if (transdata != NULL)
586 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000587 xmlFree(transitions);
588 xmlFree(stateRemap);
589 xmlFree(stringRemap);
590 for (i = 0;i < nbatoms;i++)
591 xmlFree(stringMap[i]);
592 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000593 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000594 }
595 } else {
596#if 0
597 printf("State %d trans %d: atom %d to %d : %d to %d\n",
598 i, j, trans->atom->no, trans->to, atomno, targetno);
599#endif
600 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000601 targetno + 1; /* to avoid 0 */
602 if (transdata != NULL)
603 transdata[stateno * nbatoms + atomno] =
604 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000605 }
606 }
607 }
608 ret->determinist = 1;
609#ifdef DEBUG_COMPACTION
610 /*
611 * Debug
612 */
613 for (i = 0;i < nbstates;i++) {
614 for (j = 0;j < nbatoms + 1;j++) {
615 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
616 }
617 printf("\n");
618 }
619 printf("\n");
620#endif
621 /*
622 * Cleanup of the old data
623 */
624 if (ret->states != NULL) {
625 for (i = 0;i < ret->nbStates;i++)
626 xmlRegFreeState(ret->states[i]);
627 xmlFree(ret->states);
628 }
629 ret->states = NULL;
630 ret->nbStates = 0;
631 if (ret->atoms != NULL) {
632 for (i = 0;i < ret->nbAtoms;i++)
633 xmlRegFreeAtom(ret->atoms[i]);
634 xmlFree(ret->atoms);
635 }
636 ret->atoms = NULL;
637 ret->nbAtoms = 0;
638
639 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000640 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000641 ret->stringMap = stringMap;
642 ret->nbstrings = nbatoms;
643 ret->nbstates = nbstates;
644 xmlFree(stateRemap);
645 xmlFree(stringRemap);
646 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000647not_determ:
648 ctxt->string = NULL;
649 ctxt->nbStates = 0;
650 ctxt->states = NULL;
651 ctxt->nbAtoms = 0;
652 ctxt->atoms = NULL;
653 ctxt->nbCounters = 0;
654 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000655 return(ret);
656}
657
658/**
659 * xmlRegNewParserCtxt:
660 * @string: the string to parse
661 *
662 * Allocate a new regexp parser context
663 *
664 * Returns the new context or NULL in case of error
665 */
666static xmlRegParserCtxtPtr
667xmlRegNewParserCtxt(const xmlChar *string) {
668 xmlRegParserCtxtPtr ret;
669
670 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
671 if (ret == NULL)
672 return(NULL);
673 memset(ret, 0, sizeof(xmlRegParserCtxt));
674 if (string != NULL)
675 ret->string = xmlStrdup(string);
676 ret->cur = ret->string;
677 ret->neg = 0;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000678 ret->negs = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +0000679 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000680 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000681 return(ret);
682}
683
684/**
685 * xmlRegNewRange:
686 * @ctxt: the regexp parser context
687 * @neg: is that negative
688 * @type: the type of range
689 * @start: the start codepoint
690 * @end: the end codepoint
691 *
692 * Allocate a new regexp range
693 *
694 * Returns the new range or NULL in case of error
695 */
696static xmlRegRangePtr
697xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
698 int neg, xmlRegAtomType type, int start, int end) {
699 xmlRegRangePtr ret;
700
701 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
702 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000703 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000704 return(NULL);
705 }
706 ret->neg = neg;
707 ret->type = type;
708 ret->start = start;
709 ret->end = end;
710 return(ret);
711}
712
713/**
714 * xmlRegFreeRange:
715 * @range: the regexp range
716 *
717 * Free a regexp range
718 */
719static void
720xmlRegFreeRange(xmlRegRangePtr range) {
721 if (range == NULL)
722 return;
723
724 if (range->blockName != NULL)
725 xmlFree(range->blockName);
726 xmlFree(range);
727}
728
729/**
730 * xmlRegNewAtom:
731 * @ctxt: the regexp parser context
732 * @type: the type of atom
733 *
734 * Allocate a new regexp range
735 *
736 * Returns the new atom or NULL in case of error
737 */
738static xmlRegAtomPtr
739xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
740 xmlRegAtomPtr ret;
741
742 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
743 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000744 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000745 return(NULL);
746 }
747 memset(ret, 0, sizeof(xmlRegAtom));
748 ret->type = type;
749 ret->quant = XML_REGEXP_QUANT_ONCE;
750 ret->min = 0;
751 ret->max = 0;
752 return(ret);
753}
754
755/**
756 * xmlRegFreeAtom:
757 * @atom: the regexp atom
758 *
759 * Free a regexp atom
760 */
761static void
762xmlRegFreeAtom(xmlRegAtomPtr atom) {
763 int i;
764
765 if (atom == NULL)
766 return;
767
768 for (i = 0;i < atom->nbRanges;i++)
769 xmlRegFreeRange(atom->ranges[i]);
770 if (atom->ranges != NULL)
771 xmlFree(atom->ranges);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000772 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
773 xmlFree(atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +0000774 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
775 xmlFree(atom->valuep2);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000776 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +0000777 xmlFree(atom->valuep);
778 xmlFree(atom);
779}
780
781static xmlRegStatePtr
782xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
783 xmlRegStatePtr ret;
784
785 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
786 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000787 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000788 return(NULL);
789 }
790 memset(ret, 0, sizeof(xmlRegState));
791 ret->type = XML_REGEXP_TRANS_STATE;
792 ret->mark = XML_REGEXP_MARK_NORMAL;
793 return(ret);
794}
795
796/**
797 * xmlRegFreeState:
798 * @state: the regexp state
799 *
800 * Free a regexp state
801 */
802static void
803xmlRegFreeState(xmlRegStatePtr state) {
804 if (state == NULL)
805 return;
806
807 if (state->trans != NULL)
808 xmlFree(state->trans);
Daniel Veillarddb68b742005-07-30 13:18:24 +0000809 if (state->transTo != NULL)
810 xmlFree(state->transTo);
Daniel Veillard4255d502002-04-16 15:50:10 +0000811 xmlFree(state);
812}
813
814/**
815 * xmlRegFreeParserCtxt:
816 * @ctxt: the regexp parser context
817 *
818 * Free a regexp parser context
819 */
820static void
821xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
822 int i;
823 if (ctxt == NULL)
824 return;
825
826 if (ctxt->string != NULL)
827 xmlFree(ctxt->string);
828 if (ctxt->states != NULL) {
829 for (i = 0;i < ctxt->nbStates;i++)
830 xmlRegFreeState(ctxt->states[i]);
831 xmlFree(ctxt->states);
832 }
833 if (ctxt->atoms != NULL) {
834 for (i = 0;i < ctxt->nbAtoms;i++)
835 xmlRegFreeAtom(ctxt->atoms[i]);
836 xmlFree(ctxt->atoms);
837 }
838 if (ctxt->counters != NULL)
839 xmlFree(ctxt->counters);
840 xmlFree(ctxt);
841}
842
843/************************************************************************
844 * *
845 * Display of Data structures *
846 * *
847 ************************************************************************/
848
849static void
850xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
851 switch (type) {
852 case XML_REGEXP_EPSILON:
853 fprintf(output, "epsilon "); break;
854 case XML_REGEXP_CHARVAL:
855 fprintf(output, "charval "); break;
856 case XML_REGEXP_RANGES:
857 fprintf(output, "ranges "); break;
858 case XML_REGEXP_SUBREG:
859 fprintf(output, "subexpr "); break;
860 case XML_REGEXP_STRING:
861 fprintf(output, "string "); break;
862 case XML_REGEXP_ANYCHAR:
863 fprintf(output, "anychar "); break;
864 case XML_REGEXP_ANYSPACE:
865 fprintf(output, "anyspace "); break;
866 case XML_REGEXP_NOTSPACE:
867 fprintf(output, "notspace "); break;
868 case XML_REGEXP_INITNAME:
869 fprintf(output, "initname "); break;
870 case XML_REGEXP_NOTINITNAME:
871 fprintf(output, "notinitname "); break;
872 case XML_REGEXP_NAMECHAR:
873 fprintf(output, "namechar "); break;
874 case XML_REGEXP_NOTNAMECHAR:
875 fprintf(output, "notnamechar "); break;
876 case XML_REGEXP_DECIMAL:
877 fprintf(output, "decimal "); break;
878 case XML_REGEXP_NOTDECIMAL:
879 fprintf(output, "notdecimal "); break;
880 case XML_REGEXP_REALCHAR:
881 fprintf(output, "realchar "); break;
882 case XML_REGEXP_NOTREALCHAR:
883 fprintf(output, "notrealchar "); break;
884 case XML_REGEXP_LETTER:
885 fprintf(output, "LETTER "); break;
886 case XML_REGEXP_LETTER_UPPERCASE:
887 fprintf(output, "LETTER_UPPERCASE "); break;
888 case XML_REGEXP_LETTER_LOWERCASE:
889 fprintf(output, "LETTER_LOWERCASE "); break;
890 case XML_REGEXP_LETTER_TITLECASE:
891 fprintf(output, "LETTER_TITLECASE "); break;
892 case XML_REGEXP_LETTER_MODIFIER:
893 fprintf(output, "LETTER_MODIFIER "); break;
894 case XML_REGEXP_LETTER_OTHERS:
895 fprintf(output, "LETTER_OTHERS "); break;
896 case XML_REGEXP_MARK:
897 fprintf(output, "MARK "); break;
898 case XML_REGEXP_MARK_NONSPACING:
899 fprintf(output, "MARK_NONSPACING "); break;
900 case XML_REGEXP_MARK_SPACECOMBINING:
901 fprintf(output, "MARK_SPACECOMBINING "); break;
902 case XML_REGEXP_MARK_ENCLOSING:
903 fprintf(output, "MARK_ENCLOSING "); break;
904 case XML_REGEXP_NUMBER:
905 fprintf(output, "NUMBER "); break;
906 case XML_REGEXP_NUMBER_DECIMAL:
907 fprintf(output, "NUMBER_DECIMAL "); break;
908 case XML_REGEXP_NUMBER_LETTER:
909 fprintf(output, "NUMBER_LETTER "); break;
910 case XML_REGEXP_NUMBER_OTHERS:
911 fprintf(output, "NUMBER_OTHERS "); break;
912 case XML_REGEXP_PUNCT:
913 fprintf(output, "PUNCT "); break;
914 case XML_REGEXP_PUNCT_CONNECTOR:
915 fprintf(output, "PUNCT_CONNECTOR "); break;
916 case XML_REGEXP_PUNCT_DASH:
917 fprintf(output, "PUNCT_DASH "); break;
918 case XML_REGEXP_PUNCT_OPEN:
919 fprintf(output, "PUNCT_OPEN "); break;
920 case XML_REGEXP_PUNCT_CLOSE:
921 fprintf(output, "PUNCT_CLOSE "); break;
922 case XML_REGEXP_PUNCT_INITQUOTE:
923 fprintf(output, "PUNCT_INITQUOTE "); break;
924 case XML_REGEXP_PUNCT_FINQUOTE:
925 fprintf(output, "PUNCT_FINQUOTE "); break;
926 case XML_REGEXP_PUNCT_OTHERS:
927 fprintf(output, "PUNCT_OTHERS "); break;
928 case XML_REGEXP_SEPAR:
929 fprintf(output, "SEPAR "); break;
930 case XML_REGEXP_SEPAR_SPACE:
931 fprintf(output, "SEPAR_SPACE "); break;
932 case XML_REGEXP_SEPAR_LINE:
933 fprintf(output, "SEPAR_LINE "); break;
934 case XML_REGEXP_SEPAR_PARA:
935 fprintf(output, "SEPAR_PARA "); break;
936 case XML_REGEXP_SYMBOL:
937 fprintf(output, "SYMBOL "); break;
938 case XML_REGEXP_SYMBOL_MATH:
939 fprintf(output, "SYMBOL_MATH "); break;
940 case XML_REGEXP_SYMBOL_CURRENCY:
941 fprintf(output, "SYMBOL_CURRENCY "); break;
942 case XML_REGEXP_SYMBOL_MODIFIER:
943 fprintf(output, "SYMBOL_MODIFIER "); break;
944 case XML_REGEXP_SYMBOL_OTHERS:
945 fprintf(output, "SYMBOL_OTHERS "); break;
946 case XML_REGEXP_OTHER:
947 fprintf(output, "OTHER "); break;
948 case XML_REGEXP_OTHER_CONTROL:
949 fprintf(output, "OTHER_CONTROL "); break;
950 case XML_REGEXP_OTHER_FORMAT:
951 fprintf(output, "OTHER_FORMAT "); break;
952 case XML_REGEXP_OTHER_PRIVATE:
953 fprintf(output, "OTHER_PRIVATE "); break;
954 case XML_REGEXP_OTHER_NA:
955 fprintf(output, "OTHER_NA "); break;
956 case XML_REGEXP_BLOCK_NAME:
957 fprintf(output, "BLOCK "); break;
958 }
959}
960
961static void
962xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
963 switch (type) {
964 case XML_REGEXP_QUANT_EPSILON:
965 fprintf(output, "epsilon "); break;
966 case XML_REGEXP_QUANT_ONCE:
967 fprintf(output, "once "); break;
968 case XML_REGEXP_QUANT_OPT:
969 fprintf(output, "? "); break;
970 case XML_REGEXP_QUANT_MULT:
971 fprintf(output, "* "); break;
972 case XML_REGEXP_QUANT_PLUS:
973 fprintf(output, "+ "); break;
974 case XML_REGEXP_QUANT_RANGE:
975 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000976 case XML_REGEXP_QUANT_ONCEONLY:
977 fprintf(output, "onceonly "); break;
978 case XML_REGEXP_QUANT_ALL:
979 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000980 }
981}
982static void
983xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
984 fprintf(output, " range: ");
985 if (range->neg)
986 fprintf(output, "negative ");
987 xmlRegPrintAtomType(output, range->type);
988 fprintf(output, "%c - %c\n", range->start, range->end);
989}
990
991static void
992xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
993 fprintf(output, " atom: ");
994 if (atom == NULL) {
995 fprintf(output, "NULL\n");
996 return;
997 }
Daniel Veillard9efc4762005-07-19 14:33:55 +0000998 if (atom->neg)
999 fprintf(output, "not ");
Daniel Veillard4255d502002-04-16 15:50:10 +00001000 xmlRegPrintAtomType(output, atom->type);
1001 xmlRegPrintQuantType(output, atom->quant);
1002 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1003 fprintf(output, "%d-%d ", atom->min, atom->max);
1004 if (atom->type == XML_REGEXP_STRING)
1005 fprintf(output, "'%s' ", (char *) atom->valuep);
1006 if (atom->type == XML_REGEXP_CHARVAL)
1007 fprintf(output, "char %c\n", atom->codepoint);
1008 else if (atom->type == XML_REGEXP_RANGES) {
1009 int i;
1010 fprintf(output, "%d entries\n", atom->nbRanges);
1011 for (i = 0; i < atom->nbRanges;i++)
1012 xmlRegPrintRange(output, atom->ranges[i]);
1013 } else if (atom->type == XML_REGEXP_SUBREG) {
1014 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1015 } else {
1016 fprintf(output, "\n");
1017 }
1018}
1019
1020static void
1021xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1022 fprintf(output, " trans: ");
1023 if (trans == NULL) {
1024 fprintf(output, "NULL\n");
1025 return;
1026 }
1027 if (trans->to < 0) {
1028 fprintf(output, "removed\n");
1029 return;
1030 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001031 if (trans->nd != 0) {
1032 if (trans->nd == 2)
1033 fprintf(output, "last not determinist, ");
1034 else
1035 fprintf(output, "not determinist, ");
1036 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001037 if (trans->counter >= 0) {
1038 fprintf(output, "counted %d, ", trans->counter);
1039 }
Daniel Veillard8a001f62002-04-20 07:24:11 +00001040 if (trans->count == REGEXP_ALL_COUNTER) {
1041 fprintf(output, "all transition, ");
1042 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001043 fprintf(output, "count based %d, ", trans->count);
1044 }
1045 if (trans->atom == NULL) {
1046 fprintf(output, "epsilon to %d\n", trans->to);
1047 return;
1048 }
1049 if (trans->atom->type == XML_REGEXP_CHARVAL)
1050 fprintf(output, "char %c ", trans->atom->codepoint);
1051 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1052}
1053
1054static void
1055xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1056 int i;
1057
1058 fprintf(output, " state: ");
1059 if (state == NULL) {
1060 fprintf(output, "NULL\n");
1061 return;
1062 }
1063 if (state->type == XML_REGEXP_START_STATE)
1064 fprintf(output, "START ");
1065 if (state->type == XML_REGEXP_FINAL_STATE)
1066 fprintf(output, "FINAL ");
1067
1068 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1069 for (i = 0;i < state->nbTrans; i++) {
1070 xmlRegPrintTrans(output, &(state->trans[i]));
1071 }
1072}
1073
#ifdef DEBUG_REGEXP_GRAPH
/**
 * xmlRegPrintCtxt:
 * @output: the stream to write to
 * @ctxt: the regexp parser context to dump, may be NULL
 *
 * Debug helper, compiled only when DEBUG_REGEXP_GRAPH is defined.
 * Dumps the expression string, the error/neg flags, every registered
 * atom, the current atom, every state (with the start and end state
 * numbers when set) and every counter of @ctxt.
 */
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
    int idx;

    fprintf(output, " ctxt: ");
    if (ctxt == NULL) {
        fprintf(output, "NULL\n");
        return;
    }

    fprintf(output, "'%s' ", ctxt->string);
    if (ctxt->error)
        fprintf(output, "error ");
    if (ctxt->neg)
        fprintf(output, "neg ");
    fprintf(output, "\n");

    /* atoms */
    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
    for (idx = 0; idx < ctxt->nbAtoms; idx++) {
        fprintf(output, " %02d ", idx);
        xmlRegPrintAtom(output, ctxt->atoms[idx]);
    }
    if (ctxt->atom != NULL) {
        fprintf(output, "current atom:\n");
        xmlRegPrintAtom(output, ctxt->atom);
    }

    /* states */
    fprintf(output, "%d states:", ctxt->nbStates);
    if (ctxt->start != NULL)
        fprintf(output, " start: %d", ctxt->start->no);
    if (ctxt->end != NULL)
        fprintf(output, " end: %d", ctxt->end->no);
    fprintf(output, "\n");
    for (idx = 0; idx < ctxt->nbStates; idx++)
        xmlRegPrintState(output, ctxt->states[idx]);

    /* counters */
    fprintf(output, "%d counters:\n", ctxt->nbCounters);
    for (idx = 0; idx < ctxt->nbCounters; idx++)
        fprintf(output, " %d: min %d max %d\n", idx,
                ctxt->counters[idx].min, ctxt->counters[idx].max);
}
#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001115
/************************************************************************
 *                                                                      *
 *              Finite Automata structures manipulations                *
 *                                                                      *
 ************************************************************************/
1121
1122static void
1123xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1124 int neg, xmlRegAtomType type, int start, int end,
1125 xmlChar *blockName) {
1126 xmlRegRangePtr range;
1127
1128 if (atom == NULL) {
1129 ERROR("add range: atom is NULL");
1130 return;
1131 }
1132 if (atom->type != XML_REGEXP_RANGES) {
1133 ERROR("add range: atom is not ranges");
1134 return;
1135 }
1136 if (atom->maxRanges == 0) {
1137 atom->maxRanges = 4;
1138 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1139 sizeof(xmlRegRangePtr));
1140 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001141 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001142 atom->maxRanges = 0;
1143 return;
1144 }
1145 } else if (atom->nbRanges >= atom->maxRanges) {
1146 xmlRegRangePtr *tmp;
1147 atom->maxRanges *= 2;
1148 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1149 sizeof(xmlRegRangePtr));
1150 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001151 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001152 atom->maxRanges /= 2;
1153 return;
1154 }
1155 atom->ranges = tmp;
1156 }
1157 range = xmlRegNewRange(ctxt, neg, type, start, end);
1158 if (range == NULL)
1159 return;
1160 range->blockName = blockName;
1161 atom->ranges[atom->nbRanges++] = range;
1162
1163}
1164
1165static int
1166xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1167 if (ctxt->maxCounters == 0) {
1168 ctxt->maxCounters = 4;
1169 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1170 sizeof(xmlRegCounter));
1171 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001172 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001173 ctxt->maxCounters = 0;
1174 return(-1);
1175 }
1176 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1177 xmlRegCounter *tmp;
1178 ctxt->maxCounters *= 2;
1179 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1180 sizeof(xmlRegCounter));
1181 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001182 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001183 ctxt->maxCounters /= 2;
1184 return(-1);
1185 }
1186 ctxt->counters = tmp;
1187 }
1188 ctxt->counters[ctxt->nbCounters].min = -1;
1189 ctxt->counters[ctxt->nbCounters].max = -1;
1190 return(ctxt->nbCounters++);
1191}
1192
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001193static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001194xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1195 if (atom == NULL) {
1196 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001197 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001198 }
1199 if (ctxt->maxAtoms == 0) {
1200 ctxt->maxAtoms = 4;
1201 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1202 sizeof(xmlRegAtomPtr));
1203 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001204 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001205 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001206 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001207 }
1208 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1209 xmlRegAtomPtr *tmp;
1210 ctxt->maxAtoms *= 2;
1211 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1212 sizeof(xmlRegAtomPtr));
1213 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001214 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001215 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001216 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001217 }
1218 ctxt->atoms = tmp;
1219 }
1220 atom->no = ctxt->nbAtoms;
1221 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001222 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001223}
1224
1225static void
Daniel Veillarddb68b742005-07-30 13:18:24 +00001226xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1227 int from) {
1228 if (target->maxTransTo == 0) {
1229 target->maxTransTo = 8;
1230 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1231 sizeof(int));
1232 if (target->transTo == NULL) {
1233 xmlRegexpErrMemory(ctxt, "adding transition");
1234 target->maxTransTo = 0;
1235 return;
1236 }
1237 } else if (target->nbTransTo >= target->maxTransTo) {
1238 int *tmp;
1239 target->maxTransTo *= 2;
1240 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1241 sizeof(int));
1242 if (tmp == NULL) {
1243 xmlRegexpErrMemory(ctxt, "adding transition");
1244 target->maxTransTo /= 2;
1245 return;
1246 }
1247 target->transTo = tmp;
1248 }
1249 target->transTo[target->nbTransTo] = from;
1250 target->nbTransTo++;
1251}
1252
1253static void
Daniel Veillard4255d502002-04-16 15:50:10 +00001254xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1255 xmlRegAtomPtr atom, xmlRegStatePtr target,
Daniel Veillard5de09382005-09-26 17:18:17 +00001256 int counter, int count) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001257
1258 int nrtrans;
1259
Daniel Veillard4255d502002-04-16 15:50:10 +00001260 if (state == NULL) {
1261 ERROR("add state: state is NULL");
1262 return;
1263 }
1264 if (target == NULL) {
1265 ERROR("add state: target is NULL");
1266 return;
1267 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001268 /*
1269 * Other routines follow the philosophy 'When in doubt, add a transition'
1270 * so we check here whether such a transition is already present and, if
1271 * so, silently ignore this request.
1272 */
1273
Daniel Veillard5de09382005-09-26 17:18:17 +00001274 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1275 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1276 if ((trans->atom == atom) &&
1277 (trans->to == target->no) &&
1278 (trans->counter == counter) &&
1279 (trans->count == count)) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001280#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard5de09382005-09-26 17:18:17 +00001281 printf("Ignoring duplicate transition from %d to %d\n",
1282 state->no, target->no);
William M. Brackf9b5fa22004-05-10 07:52:15 +00001283#endif
Daniel Veillard5de09382005-09-26 17:18:17 +00001284 return;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001285 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001286 }
1287
Daniel Veillard4255d502002-04-16 15:50:10 +00001288 if (state->maxTrans == 0) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001289 state->maxTrans = 8;
Daniel Veillard4255d502002-04-16 15:50:10 +00001290 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1291 sizeof(xmlRegTrans));
1292 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001293 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001294 state->maxTrans = 0;
1295 return;
1296 }
1297 } else if (state->nbTrans >= state->maxTrans) {
1298 xmlRegTrans *tmp;
1299 state->maxTrans *= 2;
1300 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1301 sizeof(xmlRegTrans));
1302 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001303 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001304 state->maxTrans /= 2;
1305 return;
1306 }
1307 state->trans = tmp;
1308 }
1309#ifdef DEBUG_REGEXP_GRAPH
1310 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001311 if (count == REGEXP_ALL_COUNTER)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001312 printf("all transition\n");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001313 else if (count >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001314 printf("count based %d\n", count);
Daniel Veillard4255d502002-04-16 15:50:10 +00001315 else if (counter >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001316 printf("counted %d\n", counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001317 else if (atom == NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001318 printf("epsilon transition\n");
1319 else if (atom != NULL)
1320 xmlRegPrintAtom(stdout, atom);
Daniel Veillard4255d502002-04-16 15:50:10 +00001321#endif
1322
1323 state->trans[state->nbTrans].atom = atom;
1324 state->trans[state->nbTrans].to = target->no;
1325 state->trans[state->nbTrans].counter = counter;
1326 state->trans[state->nbTrans].count = count;
Daniel Veillard567a45b2005-10-18 19:11:55 +00001327 state->trans[state->nbTrans].nd = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00001328 state->nbTrans++;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001329 xmlRegStateAddTransTo(ctxt, target, state->no);
Daniel Veillard4255d502002-04-16 15:50:10 +00001330}
1331
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001332static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001333xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001334 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001335 if (ctxt->maxStates == 0) {
1336 ctxt->maxStates = 4;
1337 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1338 sizeof(xmlRegStatePtr));
1339 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001340 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001341 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001342 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001343 }
1344 } else if (ctxt->nbStates >= ctxt->maxStates) {
1345 xmlRegStatePtr *tmp;
1346 ctxt->maxStates *= 2;
1347 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1348 sizeof(xmlRegStatePtr));
1349 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001350 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001351 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001352 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001353 }
1354 ctxt->states = tmp;
1355 }
1356 state->no = ctxt->nbStates;
1357 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001358 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001359}
1360
1361/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001362 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001363 * @ctxt: a regexp parser context
1364 * @from: the from state
1365 * @to: the target state or NULL for building a new one
1366 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001367 *
1368 */
1369static void
1370xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001371 xmlRegStatePtr from, xmlRegStatePtr to,
1372 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001373 if (to == NULL) {
1374 to = xmlRegNewState(ctxt);
1375 xmlRegStatePush(ctxt, to);
1376 ctxt->state = to;
1377 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001378 if (lax)
Daniel Veillard5de09382005-09-26 17:18:17 +00001379 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
Daniel Veillard441bc322002-04-20 17:38:48 +00001380 else
Daniel Veillard5de09382005-09-26 17:18:17 +00001381 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001382}
1383
1384/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001385 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001386 * @ctxt: a regexp parser context
1387 * @from: the from state
1388 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001389 *
1390 */
1391static void
1392xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1393 xmlRegStatePtr from, xmlRegStatePtr to) {
1394 if (to == NULL) {
1395 to = xmlRegNewState(ctxt);
1396 xmlRegStatePush(ctxt, to);
1397 ctxt->state = to;
1398 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001399 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001400}
1401
1402/**
1403 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001404 * @ctxt: a regexp parser context
1405 * @from: the from state
1406 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001407 * counter: the counter for that transition
1408 *
1409 */
1410static void
1411xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1412 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1413 if (to == NULL) {
1414 to = xmlRegNewState(ctxt);
1415 xmlRegStatePush(ctxt, to);
1416 ctxt->state = to;
1417 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001418 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001419}
1420
1421/**
1422 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001423 * @ctxt: a regexp parser context
1424 * @from: the from state
1425 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001426 * counter: the counter for that transition
1427 *
1428 */
1429static void
1430xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1431 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1432 if (to == NULL) {
1433 to = xmlRegNewState(ctxt);
1434 xmlRegStatePush(ctxt, to);
1435 ctxt->state = to;
1436 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001437 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001438}
1439
1440/**
1441 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001442 * @ctxt: a regexp parser context
1443 * @from: the from state
1444 * @to: the target state or NULL for building a new one
1445 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001446 *
William M. Brackddf71d62004-05-06 04:17:26 +00001447 * Returns 0 if success and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001448 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001449static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001450xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1451 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1452 if (atom == NULL) {
1453 ERROR("genrate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001454 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001455 }
1456 if (atom->type == XML_REGEXP_SUBREG) {
1457 /*
1458 * this is a subexpression handling one should not need to
William M. Brackddf71d62004-05-06 04:17:26 +00001459 * create a new node except for XML_REGEXP_QUANT_RANGE.
Daniel Veillard4255d502002-04-16 15:50:10 +00001460 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001461 if (xmlRegAtomPush(ctxt, atom) < 0) {
1462 return(-1);
1463 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001464 if ((to != NULL) && (atom->stop != to) &&
1465 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1466 /*
1467 * Generate an epsilon transition to link to the target
1468 */
1469 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
Daniel Veillardaa622012005-10-20 15:55:25 +00001470#ifdef DV
1471 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1472 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1473 to = xmlRegNewState(ctxt);
1474 xmlRegStatePush(ctxt, to);
1475 ctxt->state = to;
1476 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1477#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001478 }
1479 switch (atom->quant) {
1480 case XML_REGEXP_QUANT_OPT:
1481 atom->quant = XML_REGEXP_QUANT_ONCE;
1482 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1483 break;
1484 case XML_REGEXP_QUANT_MULT:
1485 atom->quant = XML_REGEXP_QUANT_ONCE;
1486 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1487 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1488 break;
1489 case XML_REGEXP_QUANT_PLUS:
1490 atom->quant = XML_REGEXP_QUANT_ONCE;
1491 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1492 break;
1493 case XML_REGEXP_QUANT_RANGE: {
1494 int counter;
1495 xmlRegStatePtr newstate;
1496
1497 /*
1498 * This one is nasty:
William M. Brackddf71d62004-05-06 04:17:26 +00001499 * 1/ if range has minOccurs == 0, create a new state
1500 * and create epsilon transitions from atom->start
1501 * to atom->stop, as well as atom->start to the new
1502 * state
1503 * 2/ register a new counter
1504 * 3/ register an epsilon transition associated to
Daniel Veillard4255d502002-04-16 15:50:10 +00001505 * this counter going from atom->stop to atom->start
William M. Brackddf71d62004-05-06 04:17:26 +00001506 * 4/ create a new state
1507 * 5/ generate a counted transition from atom->stop to
Daniel Veillard4255d502002-04-16 15:50:10 +00001508 * that state
1509 */
William M. Brackddf71d62004-05-06 04:17:26 +00001510 if (atom->min == 0) {
1511 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1512 atom->stop);
1513 newstate = xmlRegNewState(ctxt);
1514 xmlRegStatePush(ctxt, newstate);
1515 ctxt->state = newstate;
1516 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1517 newstate);
1518 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001519 counter = xmlRegGetCounter(ctxt);
1520 ctxt->counters[counter].min = atom->min - 1;
1521 ctxt->counters[counter].max = atom->max - 1;
1522 atom->min = 0;
1523 atom->max = 0;
1524 atom->quant = XML_REGEXP_QUANT_ONCE;
1525 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1526 atom->start, counter);
1527 if (to != NULL) {
1528 newstate = to;
1529 } else {
1530 newstate = xmlRegNewState(ctxt);
1531 xmlRegStatePush(ctxt, newstate);
Daniel Veillard4255d502002-04-16 15:50:10 +00001532 }
Daniel Veillard9a00fd22005-11-09 08:56:26 +00001533 ctxt->state = newstate;
Daniel Veillard4255d502002-04-16 15:50:10 +00001534 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1535 newstate, counter);
1536 }
1537 default:
1538 break;
1539 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001540 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00001541 }
1542 if ((atom->min == 0) && (atom->max == 0) &&
Daniel Veillard99c394d2005-07-14 12:58:49 +00001543 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1544 /*
1545 * we can discard the atom and generate an epsilon transition instead
1546 */
1547 if (to == NULL) {
1548 to = xmlRegNewState(ctxt);
1549 if (to != NULL)
1550 xmlRegStatePush(ctxt, to);
1551 else {
1552 return(-1);
1553 }
1554 }
1555 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1556 ctxt->state = to;
1557 xmlRegFreeAtom(atom);
1558 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00001559 }
1560 if (to == NULL) {
1561 to = xmlRegNewState(ctxt);
1562 if (to != NULL)
1563 xmlRegStatePush(ctxt, to);
1564 else {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001565 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001566 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001567 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001568 if (xmlRegAtomPush(ctxt, atom) < 0) {
1569 return(-1);
1570 }
1571 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1572 ctxt->state = to;
Daniel Veillard4255d502002-04-16 15:50:10 +00001573 switch (atom->quant) {
1574 case XML_REGEXP_QUANT_OPT:
1575 atom->quant = XML_REGEXP_QUANT_ONCE;
1576 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1577 break;
1578 case XML_REGEXP_QUANT_MULT:
1579 atom->quant = XML_REGEXP_QUANT_ONCE;
1580 xmlFAGenerateEpsilonTransition(ctxt, from, to);
Daniel Veillard5de09382005-09-26 17:18:17 +00001581 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001582 break;
1583 case XML_REGEXP_QUANT_PLUS:
1584 atom->quant = XML_REGEXP_QUANT_ONCE;
Daniel Veillard5de09382005-09-26 17:18:17 +00001585 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001586 break;
1587 default:
1588 break;
1589 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001590 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001591}
1592
1593/**
1594 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001595 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001596 * @fromnr: the from state
1597 * @tonr: the to state
William M. Brackddf71d62004-05-06 04:17:26 +00001598 * @counter: should that transition be associated to a counted
Daniel Veillard4255d502002-04-16 15:50:10 +00001599 *
1600 */
1601static void
1602xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1603 int tonr, int counter) {
1604 int transnr;
1605 xmlRegStatePtr from;
1606 xmlRegStatePtr to;
1607
1608#ifdef DEBUG_REGEXP_GRAPH
1609 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1610#endif
1611 from = ctxt->states[fromnr];
1612 if (from == NULL)
1613 return;
1614 to = ctxt->states[tonr];
1615 if (to == NULL)
1616 return;
1617 if ((to->mark == XML_REGEXP_MARK_START) ||
1618 (to->mark == XML_REGEXP_MARK_VISITED))
1619 return;
1620
1621 to->mark = XML_REGEXP_MARK_VISITED;
1622 if (to->type == XML_REGEXP_FINAL_STATE) {
1623#ifdef DEBUG_REGEXP_GRAPH
1624 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1625#endif
1626 from->type = XML_REGEXP_FINAL_STATE;
1627 }
1628 for (transnr = 0;transnr < to->nbTrans;transnr++) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001629 if (to->trans[transnr].to < 0)
1630 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00001631 if (to->trans[transnr].atom == NULL) {
1632 /*
1633 * Don't remove counted transitions
1634 * Don't loop either
1635 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001636 if (to->trans[transnr].to != fromnr) {
1637 if (to->trans[transnr].count >= 0) {
1638 int newto = to->trans[transnr].to;
1639
1640 xmlRegStateAddTrans(ctxt, from, NULL,
1641 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001642 -1, to->trans[transnr].count);
Daniel Veillardb509f152002-04-17 16:28:10 +00001643 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001644#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001645 printf("Found epsilon trans %d from %d to %d\n",
1646 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001647#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001648 if (to->trans[transnr].counter >= 0) {
1649 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1650 to->trans[transnr].to,
1651 to->trans[transnr].counter);
1652 } else {
1653 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1654 to->trans[transnr].to,
1655 counter);
1656 }
1657 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001658 }
1659 } else {
1660 int newto = to->trans[transnr].to;
1661
Daniel Veillardb509f152002-04-17 16:28:10 +00001662 if (to->trans[transnr].counter >= 0) {
1663 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1664 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001665 to->trans[transnr].counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001666 } else {
1667 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
Daniel Veillard5de09382005-09-26 17:18:17 +00001668 ctxt->states[newto], counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001669 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001670 }
1671 }
1672 to->mark = XML_REGEXP_MARK_NORMAL;
1673}
1674
1675/**
Daniel Veillarddb68b742005-07-30 13:18:24 +00001676 * xmlFAEliminateSimpleEpsilonTransitions:
1677 * @ctxt: a regexp parser context
1678 *
1679 * Eliminating general epsilon transitions can get costly in the general
1680 * algorithm due to the large amount of generated new transitions and
1681 * associated comparisons. However for simple epsilon transition used just
1682 * to separate building blocks when generating the automata this can be
1683 * reduced to state elimination:
1684 * - if there exists an epsilon from X to Y
1685 * - if there is no other transition from X
1686 * then X and Y are semantically equivalent and X can be eliminated
1687 * If X is the start state then make Y the start state, else replace the
1688 * target of all transitions to X by transitions to Y.
1689 */
1690static void
1691xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1692 int statenr, i, j, newto;
1693 xmlRegStatePtr state, tmp;
1694
1695 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1696 state = ctxt->states[statenr];
1697 if (state == NULL)
1698 continue;
1699 if (state->nbTrans != 1)
1700 continue;
1701 /* is the only transition out a basic transition */
1702 if ((state->trans[0].atom == NULL) &&
1703 (state->trans[0].to >= 0) &&
1704 (state->trans[0].to != statenr) &&
1705 (state->trans[0].counter < 0) &&
1706 (state->trans[0].count < 0)) {
1707 newto = state->trans[0].to;
1708
1709 if (state->type == XML_REGEXP_START_STATE) {
1710#ifdef DEBUG_REGEXP_GRAPH
1711 printf("Found simple epsilon trans from start %d to %d\n",
1712 statenr, newto);
1713#endif
1714 } else {
1715#ifdef DEBUG_REGEXP_GRAPH
1716 printf("Found simple epsilon trans from %d to %d\n",
1717 statenr, newto);
1718#endif
1719 for (i = 0;i < state->nbTransTo;i++) {
1720 tmp = ctxt->states[state->transTo[i]];
1721 for (j = 0;j < tmp->nbTrans;j++) {
1722 if (tmp->trans[j].to == statenr) {
1723 tmp->trans[j].to = newto;
1724#ifdef DEBUG_REGEXP_GRAPH
1725 printf("Changed transition %d on %d to go to %d\n",
1726 j, tmp->no, newto);
1727#endif
1728 xmlRegStateAddTransTo(ctxt, ctxt->states[newto],
1729 tmp->no);
1730 }
1731 }
1732 }
1733#if 0
1734 for (i = 0;i < ctxt->nbStates;i++) {
1735 tmp = ctxt->states[i];
1736 for (j = 0;j < tmp->nbTrans;j++) {
1737 if (tmp->trans[j].to == statenr) {
1738 tmp->trans[j].to = newto;
1739#ifdef DEBUG_REGEXP_GRAPH
1740 printf("Changed transition %d on %d to go to %d\n",
1741 j, tmp->no, newto);
1742#endif
1743 }
1744 }
1745 }
1746#endif
1747 if (state->type == XML_REGEXP_FINAL_STATE)
1748 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1749 /* eliminate the transition completely */
1750 state->nbTrans = 0;
1751
1752
1753 }
1754
1755 }
1756 }
1757}
1758/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001759 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001760 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001761 *
1762 */
1763static void
1764xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1765 int statenr, transnr;
1766 xmlRegStatePtr state;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001767 int has_epsilon;
Daniel Veillard4255d502002-04-16 15:50:10 +00001768
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001769 if (ctxt->states == NULL) return;
1770
Daniel Veillarddb68b742005-07-30 13:18:24 +00001771 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1772
1773 has_epsilon = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001774
Daniel Veillard4255d502002-04-16 15:50:10 +00001775 /*
1776 * build the completed transitions bypassing the epsilons
1777 * Use a marking algorithm to avoid loops
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001778 * mark sink states too.
Daniel Veillard4255d502002-04-16 15:50:10 +00001779 */
1780 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1781 state = ctxt->states[statenr];
1782 if (state == NULL)
1783 continue;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001784 if ((state->nbTrans == 0) &&
1785 (state->type != XML_REGEXP_FINAL_STATE)) {
1786 state->type = XML_REGEXP_SINK_STATE;
1787 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001788 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1789 if ((state->trans[transnr].atom == NULL) &&
1790 (state->trans[transnr].to >= 0)) {
1791 if (state->trans[transnr].to == statenr) {
1792 state->trans[transnr].to = -1;
1793#ifdef DEBUG_REGEXP_GRAPH
1794 printf("Removed loopback epsilon trans %d on %d\n",
1795 transnr, statenr);
1796#endif
1797 } else if (state->trans[transnr].count < 0) {
1798 int newto = state->trans[transnr].to;
1799
1800#ifdef DEBUG_REGEXP_GRAPH
1801 printf("Found epsilon trans %d from %d to %d\n",
1802 transnr, statenr, newto);
1803#endif
1804 state->mark = XML_REGEXP_MARK_START;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001805 has_epsilon = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00001806 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1807 newto, state->trans[transnr].counter);
1808 state->mark = XML_REGEXP_MARK_NORMAL;
1809#ifdef DEBUG_REGEXP_GRAPH
1810 } else {
1811 printf("Found counted transition %d on %d\n",
1812 transnr, statenr);
1813#endif
1814 }
1815 }
1816 }
1817 }
1818 /*
1819 * Eliminate the epsilon transitions
1820 */
Daniel Veillarddb68b742005-07-30 13:18:24 +00001821 if (has_epsilon) {
1822 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1823 state = ctxt->states[statenr];
1824 if (state == NULL)
1825 continue;
1826 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1827 xmlRegTransPtr trans = &(state->trans[transnr]);
1828 if ((trans->atom == NULL) &&
1829 (trans->count < 0) &&
1830 (trans->to >= 0)) {
1831 trans->to = -1;
1832 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001833 }
1834 }
1835 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001836
1837 /*
1838 * Use this pass to detect unreachable states too
1839 */
1840 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1841 state = ctxt->states[statenr];
1842 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001843 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00001844 }
1845 state = ctxt->states[0];
1846 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001847 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001848 while (state != NULL) {
1849 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00001850 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00001851 /*
William M. Brackddf71d62004-05-06 04:17:26 +00001852 * Mark all states reachable from the current reachable state
Daniel Veillard23e73572002-09-19 19:56:43 +00001853 */
1854 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1855 if ((state->trans[transnr].to >= 0) &&
1856 ((state->trans[transnr].atom != NULL) ||
1857 (state->trans[transnr].count >= 0))) {
1858 int newto = state->trans[transnr].to;
1859
1860 if (ctxt->states[newto] == NULL)
1861 continue;
William M. Brack779af002003-08-01 15:55:39 +00001862 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1863 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001864 target = ctxt->states[newto];
1865 }
1866 }
1867 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001868
Daniel Veillard23e73572002-09-19 19:56:43 +00001869 /*
1870 * find the next accessible state not explored
1871 */
1872 if (target == NULL) {
1873 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1874 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001875 if ((state != NULL) && (state->reached ==
1876 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001877 target = state;
1878 break;
1879 }
1880 }
1881 }
1882 state = target;
1883 }
1884 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1885 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001886 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001887#ifdef DEBUG_REGEXP_GRAPH
1888 printf("Removed unreachable state %d\n", statenr);
1889#endif
1890 xmlRegFreeState(state);
1891 ctxt->states[statenr] = NULL;
1892 }
1893 }
1894
Daniel Veillard4255d502002-04-16 15:50:10 +00001895}
1896
Daniel Veillard567a45b2005-10-18 19:11:55 +00001897static int
1898xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
1899 int ret = 0;
1900
1901 if ((range1->type == XML_REGEXP_RANGES) ||
1902 (range2->type == XML_REGEXP_RANGES) ||
1903 (range2->type == XML_REGEXP_SUBREG) ||
1904 (range1->type == XML_REGEXP_SUBREG) ||
1905 (range1->type == XML_REGEXP_STRING) ||
1906 (range2->type == XML_REGEXP_STRING))
1907 return(-1);
1908
1909 /* put them in order */
1910 if (range1->type > range2->type) {
1911 xmlRegRangePtr tmp;
1912
1913 tmp = range1;
1914 range1 = range2;
1915 range2 = tmp;
1916 }
1917 if ((range1->type == XML_REGEXP_ANYCHAR) ||
1918 (range2->type == XML_REGEXP_ANYCHAR)) {
1919 ret = 1;
1920 } else if ((range1->type == XML_REGEXP_EPSILON) ||
1921 (range2->type == XML_REGEXP_EPSILON)) {
1922 return(0);
1923 } else if (range1->type == range2->type) {
1924 if ((range1->type != XML_REGEXP_CHARVAL) ||
1925 (range1->end < range2->start) ||
1926 (range2->end < range1->start))
1927 ret = 1;
1928 else
1929 ret = 0;
1930 } else if (range1->type == XML_REGEXP_CHARVAL) {
1931 int codepoint;
1932 int neg = 0;
1933
1934 /*
1935 * just check all codepoints in the range for acceptance,
1936 * this is usually way cheaper since done only once at
1937 * compilation than testing over and over at runtime or
1938 * pushing too many states when evaluating.
1939 */
1940 if (((range1->neg == 0) && (range2->neg != 0)) ||
1941 ((range1->neg != 0) && (range2->neg == 0)))
1942 neg = 1;
1943
1944 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
1945 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
1946 0, range2->start, range2->end,
1947 range2->blockName);
1948 if (ret < 0)
1949 return(-1);
1950 if (((neg == 1) && (ret == 0)) ||
1951 ((neg == 0) && (ret == 1)))
1952 return(1);
1953 }
1954 return(0);
1955 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
1956 (range2->type == XML_REGEXP_BLOCK_NAME)) {
1957 if (range1->type == range2->type) {
1958 ret = xmlStrEqual(range1->blockName, range2->blockName);
1959 } else {
1960 /*
1961 * comparing a block range with anything else is way
1962 * too costly, and maintining the table is like too much
1963 * memory too, so let's force the automata to save state
1964 * here.
1965 */
1966 return(1);
1967 }
1968 } else if ((range1->type < XML_REGEXP_LETTER) ||
1969 (range2->type < XML_REGEXP_LETTER)) {
1970 if ((range1->type == XML_REGEXP_ANYSPACE) &&
1971 (range2->type == XML_REGEXP_NOTSPACE))
1972 ret = 0;
1973 else if ((range1->type == XML_REGEXP_INITNAME) &&
1974 (range2->type == XML_REGEXP_NOTINITNAME))
1975 ret = 0;
1976 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
1977 (range2->type == XML_REGEXP_NOTNAMECHAR))
1978 ret = 0;
1979 else if ((range1->type == XML_REGEXP_DECIMAL) &&
1980 (range2->type == XML_REGEXP_NOTDECIMAL))
1981 ret = 0;
1982 else if ((range1->type == XML_REGEXP_REALCHAR) &&
1983 (range2->type == XML_REGEXP_NOTREALCHAR))
1984 ret = 0;
1985 else {
1986 /* same thing to limit complexity */
1987 return(1);
1988 }
1989 } else {
1990 ret = 0;
1991 /* range1->type < range2->type here */
1992 switch (range1->type) {
1993 case XML_REGEXP_LETTER:
1994 /* all disjoint except in the subgroups */
1995 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
1996 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
1997 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
1998 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
1999 (range2->type == XML_REGEXP_LETTER_OTHERS))
2000 ret = 1;
2001 break;
2002 case XML_REGEXP_MARK:
2003 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2004 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2005 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2006 ret = 1;
2007 break;
2008 case XML_REGEXP_NUMBER:
2009 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2010 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2011 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2012 ret = 1;
2013 break;
2014 case XML_REGEXP_PUNCT:
2015 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2016 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2017 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2018 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2019 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2020 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2021 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2022 ret = 1;
2023 break;
2024 case XML_REGEXP_SEPAR:
2025 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2026 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2027 (range2->type == XML_REGEXP_SEPAR_PARA))
2028 ret = 1;
2029 break;
2030 case XML_REGEXP_SYMBOL:
2031 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2032 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2033 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2034 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2035 ret = 1;
2036 break;
2037 case XML_REGEXP_OTHER:
2038 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2039 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2040 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2041 ret = 1;
2042 break;
2043 default:
2044 if ((range2->type >= XML_REGEXP_LETTER) &&
2045 (range2->type < XML_REGEXP_BLOCK_NAME))
2046 ret = 0;
2047 else {
2048 /* safety net ! */
2049 return(1);
2050 }
2051 }
2052 }
2053 if (((range1->neg == 0) && (range2->neg != 0)) ||
2054 ((range1->neg != 0) && (range2->neg == 0)))
2055 ret = !ret;
2056 return(1);
2057}
2058
Daniel Veillarde19fc232002-04-22 16:01:24 +00002059/**
Daniel Veillardfc011b72006-02-12 19:14:15 +00002060 * xmlFACompareAtomTypes:
2061 * @type1: an atom type
2062 * @type2: an atom type
2063 *
2064 * Compares two atoms type to check whether they intersect in some ways,
2065 * this is used by xmlFACompareAtoms only
2066 *
2067 * Returns 1 if they may intersect and 0 otherwise
2068 */
2069static int
2070xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2071 if ((type1 == XML_REGEXP_EPSILON) ||
2072 (type1 == XML_REGEXP_CHARVAL) ||
2073 (type1 == XML_REGEXP_RANGES) ||
2074 (type1 == XML_REGEXP_SUBREG) ||
2075 (type1 == XML_REGEXP_STRING) ||
2076 (type1 == XML_REGEXP_ANYCHAR))
2077 return(1);
2078 if ((type2 == XML_REGEXP_EPSILON) ||
2079 (type2 == XML_REGEXP_CHARVAL) ||
2080 (type2 == XML_REGEXP_RANGES) ||
2081 (type2 == XML_REGEXP_SUBREG) ||
2082 (type2 == XML_REGEXP_STRING) ||
2083 (type2 == XML_REGEXP_ANYCHAR))
2084 return(1);
2085
2086 if (type1 == type2) return(1);
2087
2088 /* simplify subsequent compares by making sure type1 < type2 */
2089 if (type1 > type2) {
2090 xmlRegAtomType tmp = type1;
2091 type1 = type2;
2092 type2 = tmp;
2093 }
2094 switch (type1) {
2095 case XML_REGEXP_ANYSPACE: /* \s */
2096 /* can't be a letter, number, mark, pontuation, symbol */
2097 if ((type2 == XML_REGEXP_NOTSPACE) ||
2098 ((type2 >= XML_REGEXP_LETTER) &&
2099 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2100 ((type2 >= XML_REGEXP_NUMBER) &&
2101 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2102 ((type2 >= XML_REGEXP_MARK) &&
2103 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2104 ((type2 >= XML_REGEXP_PUNCT) &&
2105 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2106 ((type2 >= XML_REGEXP_SYMBOL) &&
2107 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2108 ) return(0);
2109 break;
2110 case XML_REGEXP_NOTSPACE: /* \S */
2111 break;
2112 case XML_REGEXP_INITNAME: /* \l */
2113 /* can't be a number, mark, separator, pontuation, symbol or other */
2114 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2115 ((type2 >= XML_REGEXP_NUMBER) &&
2116 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2117 ((type2 >= XML_REGEXP_MARK) &&
2118 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2119 ((type2 >= XML_REGEXP_SEPAR) &&
2120 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2121 ((type2 >= XML_REGEXP_PUNCT) &&
2122 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2123 ((type2 >= XML_REGEXP_SYMBOL) &&
2124 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2125 ((type2 >= XML_REGEXP_OTHER) &&
2126 (type2 <= XML_REGEXP_OTHER_NA))
2127 ) return(0);
2128 break;
2129 case XML_REGEXP_NOTINITNAME: /* \L */
2130 break;
2131 case XML_REGEXP_NAMECHAR: /* \c */
2132 /* can't be a mark, separator, pontuation, symbol or other */
2133 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2134 ((type2 >= XML_REGEXP_MARK) &&
2135 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2136 ((type2 >= XML_REGEXP_PUNCT) &&
2137 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2138 ((type2 >= XML_REGEXP_SEPAR) &&
2139 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2140 ((type2 >= XML_REGEXP_SYMBOL) &&
2141 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2142 ((type2 >= XML_REGEXP_OTHER) &&
2143 (type2 <= XML_REGEXP_OTHER_NA))
2144 ) return(0);
2145 break;
2146 case XML_REGEXP_NOTNAMECHAR: /* \C */
2147 break;
2148 case XML_REGEXP_DECIMAL: /* \d */
2149 /* can't be a letter, mark, separator, pontuation, symbol or other */
2150 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2151 (type2 == XML_REGEXP_REALCHAR) ||
2152 ((type2 >= XML_REGEXP_LETTER) &&
2153 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2154 ((type2 >= XML_REGEXP_MARK) &&
2155 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2156 ((type2 >= XML_REGEXP_PUNCT) &&
2157 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2158 ((type2 >= XML_REGEXP_SEPAR) &&
2159 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2160 ((type2 >= XML_REGEXP_SYMBOL) &&
2161 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2162 ((type2 >= XML_REGEXP_OTHER) &&
2163 (type2 <= XML_REGEXP_OTHER_NA))
2164 )return(0);
2165 break;
2166 case XML_REGEXP_NOTDECIMAL: /* \D */
2167 break;
2168 case XML_REGEXP_REALCHAR: /* \w */
2169 /* can't be a mark, separator, pontuation, symbol or other */
2170 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2171 ((type2 >= XML_REGEXP_MARK) &&
2172 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2173 ((type2 >= XML_REGEXP_PUNCT) &&
2174 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2175 ((type2 >= XML_REGEXP_SEPAR) &&
2176 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2177 ((type2 >= XML_REGEXP_SYMBOL) &&
2178 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2179 ((type2 >= XML_REGEXP_OTHER) &&
2180 (type2 <= XML_REGEXP_OTHER_NA))
2181 )return(0);
2182 break;
2183 case XML_REGEXP_NOTREALCHAR: /* \W */
2184 break;
2185 /*
2186 * at that point we know both type 1 and type2 are from
2187 * character categories are ordered and are different,
2188 * it becomes simple because this is a partition
2189 */
2190 case XML_REGEXP_LETTER:
2191 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2192 return(1);
2193 return(0);
2194 case XML_REGEXP_LETTER_UPPERCASE:
2195 case XML_REGEXP_LETTER_LOWERCASE:
2196 case XML_REGEXP_LETTER_TITLECASE:
2197 case XML_REGEXP_LETTER_MODIFIER:
2198 case XML_REGEXP_LETTER_OTHERS:
2199 return(0);
2200 case XML_REGEXP_MARK:
2201 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2202 return(1);
2203 return(0);
2204 case XML_REGEXP_MARK_NONSPACING:
2205 case XML_REGEXP_MARK_SPACECOMBINING:
2206 case XML_REGEXP_MARK_ENCLOSING:
2207 return(0);
2208 case XML_REGEXP_NUMBER:
2209 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2210 return(1);
2211 return(0);
2212 case XML_REGEXP_NUMBER_DECIMAL:
2213 case XML_REGEXP_NUMBER_LETTER:
2214 case XML_REGEXP_NUMBER_OTHERS:
2215 return(0);
2216 case XML_REGEXP_PUNCT:
2217 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2218 return(1);
2219 return(0);
2220 case XML_REGEXP_PUNCT_CONNECTOR:
2221 case XML_REGEXP_PUNCT_DASH:
2222 case XML_REGEXP_PUNCT_OPEN:
2223 case XML_REGEXP_PUNCT_CLOSE:
2224 case XML_REGEXP_PUNCT_INITQUOTE:
2225 case XML_REGEXP_PUNCT_FINQUOTE:
2226 case XML_REGEXP_PUNCT_OTHERS:
2227 return(0);
2228 case XML_REGEXP_SEPAR:
2229 if (type2 <= XML_REGEXP_SEPAR_PARA)
2230 return(1);
2231 return(0);
2232 case XML_REGEXP_SEPAR_SPACE:
2233 case XML_REGEXP_SEPAR_LINE:
2234 case XML_REGEXP_SEPAR_PARA:
2235 return(0);
2236 case XML_REGEXP_SYMBOL:
2237 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2238 return(1);
2239 return(0);
2240 case XML_REGEXP_SYMBOL_MATH:
2241 case XML_REGEXP_SYMBOL_CURRENCY:
2242 case XML_REGEXP_SYMBOL_MODIFIER:
2243 case XML_REGEXP_SYMBOL_OTHERS:
2244 return(0);
2245 case XML_REGEXP_OTHER:
2246 if (type2 <= XML_REGEXP_OTHER_NA)
2247 return(1);
2248 return(0);
2249 case XML_REGEXP_OTHER_CONTROL:
2250 case XML_REGEXP_OTHER_FORMAT:
2251 case XML_REGEXP_OTHER_PRIVATE:
2252 case XML_REGEXP_OTHER_NA:
2253 return(0);
2254 default:
2255 break;
2256 }
2257 return(1);
2258}
2259
2260/**
2261 * xmlFAEqualAtoms:
Daniel Veillarde19fc232002-04-22 16:01:24 +00002262 * @atom1: an atom
2263 * @atom2: an atom
2264 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002265 * Compares two atoms to check whether they are the same exactly
2266 * this is used to remove equivalent transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002267 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002268 * Returns 1 if same and 0 otherwise
Daniel Veillarde19fc232002-04-22 16:01:24 +00002269 */
2270static int
Daniel Veillardfc011b72006-02-12 19:14:15 +00002271xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
2272 int ret = 0;
Daniel Veillard9efc4762005-07-19 14:33:55 +00002273
Daniel Veillarde19fc232002-04-22 16:01:24 +00002274 if (atom1 == atom2)
2275 return(1);
2276 if ((atom1 == NULL) || (atom2 == NULL))
2277 return(0);
2278
Daniel Veillardfc011b72006-02-12 19:14:15 +00002279 if (atom1->type != atom2->type)
2280 return(0);
2281 switch (atom1->type) {
2282 case XML_REGEXP_EPSILON:
2283 ret = 0;
2284 break;
2285 case XML_REGEXP_STRING:
2286 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2287 (xmlChar *)atom2->valuep);
2288 break;
2289 case XML_REGEXP_CHARVAL:
2290 ret = (atom1->codepoint == atom2->codepoint);
2291 break;
2292 case XML_REGEXP_RANGES:
2293 /* too hard to do in the general case */
2294 ret = 0;
2295 default:
2296 break;
2297 }
2298 return(ret);
2299}
2300
2301/**
2302 * xmlFACompareAtoms:
2303 * @atom1: an atom
2304 * @atom2: an atom
2305 *
2306 * Compares two atoms to check whether they intersect in some ways,
2307 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2308 *
2309 * Returns 1 if yes and 0 otherwise
2310 */
2311static int
2312xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
2313 int ret = 1;
2314
2315 if (atom1 == atom2)
2316 return(1);
2317 if ((atom1 == NULL) || (atom2 == NULL))
2318 return(0);
2319
2320 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2321 (atom2->type == XML_REGEXP_ANYCHAR))
2322 return(1);
2323
2324 if (atom1->type > atom2->type) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002325 xmlRegAtomPtr tmp;
2326 tmp = atom1;
2327 atom1 = atom2;
2328 atom2 = tmp;
Daniel Veillardfc011b72006-02-12 19:14:15 +00002329 }
2330 if (atom1->type != atom2->type) {
2331 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2332 /* if they can't intersect at the type level break now */
2333 if (ret == 0)
2334 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002335 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002336 switch (atom1->type) {
2337 case XML_REGEXP_STRING:
Daniel Veillard9efc4762005-07-19 14:33:55 +00002338 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
2339 (xmlChar *)atom2->valuep);
2340 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002341 case XML_REGEXP_EPSILON:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002342 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002343 case XML_REGEXP_CHARVAL:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002344 if (atom2->type == XML_REGEXP_CHARVAL) {
2345 ret = (atom1->codepoint == atom2->codepoint);
2346 } else {
2347 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2348 if (ret < 0)
2349 ret = 1;
2350 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00002351 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002352 case XML_REGEXP_RANGES:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002353 if (atom2->type == XML_REGEXP_RANGES) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002354 int i, j, res;
2355 xmlRegRangePtr r1, r2;
2356
2357 /*
2358 * need to check that none of the ranges eventually matches
2359 */
2360 for (i = 0;i < atom1->nbRanges;i++) {
2361 for (j = 0;j < atom2->nbRanges;j++) {
2362 r1 = atom1->ranges[i];
2363 r2 = atom2->ranges[j];
2364 res = xmlFACompareRanges(r1, r2);
2365 if (res == 1) {
2366 ret = 1;
2367 goto done;
2368 }
2369 }
2370 }
2371 ret = 0;
2372 }
2373 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002374 default:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002375 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002376 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002377done:
Daniel Veillard6e65e152005-08-09 11:09:52 +00002378 if (atom1->neg != atom2->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00002379 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00002380 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002381 if (ret == 0)
2382 return(0);
2383not_determinist:
2384 return(1);
Daniel Veillarde19fc232002-04-22 16:01:24 +00002385}
2386
2387/**
2388 * xmlFARecurseDeterminism:
2389 * @ctxt: a regexp parser context
2390 *
2391 * Check whether the associated regexp is determinist,
2392 * should be called after xmlFAEliminateEpsilonTransitions()
2393 *
2394 */
2395static int
2396xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2397 int to, xmlRegAtomPtr atom) {
2398 int ret = 1;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002399 int res;
Daniel Veillard5de09382005-09-26 17:18:17 +00002400 int transnr, nbTrans;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002401 xmlRegTransPtr t1;
2402
2403 if (state == NULL)
2404 return(ret);
Daniel Veillard5de09382005-09-26 17:18:17 +00002405 /*
2406 * don't recurse on transitions potentially added in the course of
2407 * the elimination.
2408 */
2409 nbTrans = state->nbTrans;
2410 for (transnr = 0;transnr < nbTrans;transnr++) {
Daniel Veillarde19fc232002-04-22 16:01:24 +00002411 t1 = &(state->trans[transnr]);
2412 /*
2413 * check transitions conflicting with the one looked at
2414 */
2415 if (t1->atom == NULL) {
2416 if (t1->to == -1)
2417 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002418 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
Daniel Veillarde19fc232002-04-22 16:01:24 +00002419 to, atom);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002420 if (res == 0) {
2421 ret = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00002422 /* t1->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002423 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002424 continue;
2425 }
2426 if (t1->to != to)
2427 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002428 if (xmlFACompareAtoms(t1->atom, atom)) {
2429 ret = 0;
2430 /* mark the transition as non-deterministic */
2431 t1->nd = 1;
2432 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002433 }
2434 return(ret);
2435}
2436
2437/**
2438 * xmlFAComputesDeterminism:
2439 * @ctxt: a regexp parser context
2440 *
2441 * Check whether the associated regexp is determinist,
2442 * should be called after xmlFAEliminateEpsilonTransitions()
2443 *
2444 */
2445static int
2446xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2447 int statenr, transnr;
2448 xmlRegStatePtr state;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002449 xmlRegTransPtr t1, t2, last;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002450 int i;
2451 int ret = 1;
2452
Daniel Veillard4402ab42002-09-12 16:02:56 +00002453#ifdef DEBUG_REGEXP_GRAPH
2454 printf("xmlFAComputesDeterminism\n");
2455 xmlRegPrintCtxt(stdout, ctxt);
2456#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00002457 if (ctxt->determinist != -1)
2458 return(ctxt->determinist);
2459
2460 /*
Daniel Veillard567a45b2005-10-18 19:11:55 +00002461 * First cleanup the automata removing cancelled transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002462 */
2463 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2464 state = ctxt->states[statenr];
2465 if (state == NULL)
2466 continue;
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00002467 if (state->nbTrans < 2)
2468 continue;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002469 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2470 t1 = &(state->trans[transnr]);
2471 /*
2472 * Determinism checks in case of counted or all transitions
2473 * will have to be handled separately
2474 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002475 if (t1->atom == NULL) {
Daniel Veillardaa622012005-10-20 15:55:25 +00002476 /* t1->nd = 1; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002477 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002478 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002479 if (t1->to == -1) /* eliminated */
2480 continue;
2481 for (i = 0;i < transnr;i++) {
2482 t2 = &(state->trans[i]);
2483 if (t2->to == -1) /* eliminated */
2484 continue;
2485 if (t2->atom != NULL) {
2486 if (t1->to == t2->to) {
Daniel Veillardfc011b72006-02-12 19:14:15 +00002487 if (xmlFAEqualAtoms(t1->atom, t2->atom))
William M. Brackddf71d62004-05-06 04:17:26 +00002488 t2->to = -1; /* eliminated */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002489 }
2490 }
2491 }
2492 }
2493 }
2494
2495 /*
2496 * Check for all states that there aren't 2 transitions
2497 * with the same atom and a different target.
2498 */
2499 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2500 state = ctxt->states[statenr];
2501 if (state == NULL)
2502 continue;
2503 if (state->nbTrans < 2)
2504 continue;
2505 last = NULL;
2506 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2507 t1 = &(state->trans[transnr]);
2508 /*
2509 * Determinism checks in case of counted or all transitions
2510 * will have to be handled separately
2511 */
2512 if (t1->atom == NULL) {
2513 continue;
2514 }
2515 if (t1->to == -1) /* eliminated */
2516 continue;
2517 for (i = 0;i < transnr;i++) {
2518 t2 = &(state->trans[i]);
2519 if (t2->to == -1) /* eliminated */
2520 continue;
2521 if (t2->atom != NULL) {
2522 /* not determinist ! */
2523 if (xmlFACompareAtoms(t1->atom, t2->atom)) {
2524 ret = 0;
2525 /* mark the transitions as non-deterministic ones */
2526 t1->nd = 1;
2527 t2->nd = 1;
2528 last = t1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002529 }
2530 } else if (t1->to != -1) {
2531 /*
2532 * do the closure in case of remaining specific
2533 * epsilon transitions like choices or all
2534 */
2535 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2536 t2->to, t2->atom);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002537 /* don't shortcut the computation so all non deterministic
2538 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002539 if (ret == 0)
Daniel Veillardaa622012005-10-20 15:55:25 +00002540 return(0);
2541 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002542 if (ret == 0) {
2543 t1->nd = 1;
Daniel Veillardaa622012005-10-20 15:55:25 +00002544 /* t2->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002545 last = t1;
2546 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002547 }
2548 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002549 /* don't shortcut the computation so all non deterministic
2550 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002551 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002552 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002553 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002554
2555 /*
2556 * mark specifically the last non-deterministic transition
2557 * from a state since there is no need to set-up rollback
2558 * from it
2559 */
2560 if (last != NULL) {
2561 last->nd = 2;
2562 }
2563
2564 /* don't shortcut the computation so all non deterministic
2565 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002566 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002567 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002568 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002569
Daniel Veillarde19fc232002-04-22 16:01:24 +00002570 ctxt->determinist = ret;
2571 return(ret);
2572}
2573
Daniel Veillard4255d502002-04-16 15:50:10 +00002574/************************************************************************
2575 * *
2576 * Routines to check input against transition atoms *
2577 * *
2578 ************************************************************************/
2579
2580static int
2581xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2582 int start, int end, const xmlChar *blockName) {
2583 int ret = 0;
2584
2585 switch (type) {
2586 case XML_REGEXP_STRING:
2587 case XML_REGEXP_SUBREG:
2588 case XML_REGEXP_RANGES:
2589 case XML_REGEXP_EPSILON:
2590 return(-1);
2591 case XML_REGEXP_ANYCHAR:
2592 ret = ((codepoint != '\n') && (codepoint != '\r'));
2593 break;
2594 case XML_REGEXP_CHARVAL:
2595 ret = ((codepoint >= start) && (codepoint <= end));
2596 break;
2597 case XML_REGEXP_NOTSPACE:
2598 neg = !neg;
2599 case XML_REGEXP_ANYSPACE:
2600 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2601 (codepoint == '\t') || (codepoint == ' '));
2602 break;
2603 case XML_REGEXP_NOTINITNAME:
2604 neg = !neg;
2605 case XML_REGEXP_INITNAME:
William M. Brack871611b2003-10-18 04:53:14 +00002606 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002607 (codepoint == '_') || (codepoint == ':'));
2608 break;
2609 case XML_REGEXP_NOTNAMECHAR:
2610 neg = !neg;
2611 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00002612 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002613 (codepoint == '.') || (codepoint == '-') ||
2614 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00002615 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00002616 break;
2617 case XML_REGEXP_NOTDECIMAL:
2618 neg = !neg;
2619 case XML_REGEXP_DECIMAL:
2620 ret = xmlUCSIsCatNd(codepoint);
2621 break;
2622 case XML_REGEXP_REALCHAR:
2623 neg = !neg;
2624 case XML_REGEXP_NOTREALCHAR:
2625 ret = xmlUCSIsCatP(codepoint);
2626 if (ret == 0)
2627 ret = xmlUCSIsCatZ(codepoint);
2628 if (ret == 0)
2629 ret = xmlUCSIsCatC(codepoint);
2630 break;
2631 case XML_REGEXP_LETTER:
2632 ret = xmlUCSIsCatL(codepoint);
2633 break;
2634 case XML_REGEXP_LETTER_UPPERCASE:
2635 ret = xmlUCSIsCatLu(codepoint);
2636 break;
2637 case XML_REGEXP_LETTER_LOWERCASE:
2638 ret = xmlUCSIsCatLl(codepoint);
2639 break;
2640 case XML_REGEXP_LETTER_TITLECASE:
2641 ret = xmlUCSIsCatLt(codepoint);
2642 break;
2643 case XML_REGEXP_LETTER_MODIFIER:
2644 ret = xmlUCSIsCatLm(codepoint);
2645 break;
2646 case XML_REGEXP_LETTER_OTHERS:
2647 ret = xmlUCSIsCatLo(codepoint);
2648 break;
2649 case XML_REGEXP_MARK:
2650 ret = xmlUCSIsCatM(codepoint);
2651 break;
2652 case XML_REGEXP_MARK_NONSPACING:
2653 ret = xmlUCSIsCatMn(codepoint);
2654 break;
2655 case XML_REGEXP_MARK_SPACECOMBINING:
2656 ret = xmlUCSIsCatMc(codepoint);
2657 break;
2658 case XML_REGEXP_MARK_ENCLOSING:
2659 ret = xmlUCSIsCatMe(codepoint);
2660 break;
2661 case XML_REGEXP_NUMBER:
2662 ret = xmlUCSIsCatN(codepoint);
2663 break;
2664 case XML_REGEXP_NUMBER_DECIMAL:
2665 ret = xmlUCSIsCatNd(codepoint);
2666 break;
2667 case XML_REGEXP_NUMBER_LETTER:
2668 ret = xmlUCSIsCatNl(codepoint);
2669 break;
2670 case XML_REGEXP_NUMBER_OTHERS:
2671 ret = xmlUCSIsCatNo(codepoint);
2672 break;
2673 case XML_REGEXP_PUNCT:
2674 ret = xmlUCSIsCatP(codepoint);
2675 break;
2676 case XML_REGEXP_PUNCT_CONNECTOR:
2677 ret = xmlUCSIsCatPc(codepoint);
2678 break;
2679 case XML_REGEXP_PUNCT_DASH:
2680 ret = xmlUCSIsCatPd(codepoint);
2681 break;
2682 case XML_REGEXP_PUNCT_OPEN:
2683 ret = xmlUCSIsCatPs(codepoint);
2684 break;
2685 case XML_REGEXP_PUNCT_CLOSE:
2686 ret = xmlUCSIsCatPe(codepoint);
2687 break;
2688 case XML_REGEXP_PUNCT_INITQUOTE:
2689 ret = xmlUCSIsCatPi(codepoint);
2690 break;
2691 case XML_REGEXP_PUNCT_FINQUOTE:
2692 ret = xmlUCSIsCatPf(codepoint);
2693 break;
2694 case XML_REGEXP_PUNCT_OTHERS:
2695 ret = xmlUCSIsCatPo(codepoint);
2696 break;
2697 case XML_REGEXP_SEPAR:
2698 ret = xmlUCSIsCatZ(codepoint);
2699 break;
2700 case XML_REGEXP_SEPAR_SPACE:
2701 ret = xmlUCSIsCatZs(codepoint);
2702 break;
2703 case XML_REGEXP_SEPAR_LINE:
2704 ret = xmlUCSIsCatZl(codepoint);
2705 break;
2706 case XML_REGEXP_SEPAR_PARA:
2707 ret = xmlUCSIsCatZp(codepoint);
2708 break;
2709 case XML_REGEXP_SYMBOL:
2710 ret = xmlUCSIsCatS(codepoint);
2711 break;
2712 case XML_REGEXP_SYMBOL_MATH:
2713 ret = xmlUCSIsCatSm(codepoint);
2714 break;
2715 case XML_REGEXP_SYMBOL_CURRENCY:
2716 ret = xmlUCSIsCatSc(codepoint);
2717 break;
2718 case XML_REGEXP_SYMBOL_MODIFIER:
2719 ret = xmlUCSIsCatSk(codepoint);
2720 break;
2721 case XML_REGEXP_SYMBOL_OTHERS:
2722 ret = xmlUCSIsCatSo(codepoint);
2723 break;
2724 case XML_REGEXP_OTHER:
2725 ret = xmlUCSIsCatC(codepoint);
2726 break;
2727 case XML_REGEXP_OTHER_CONTROL:
2728 ret = xmlUCSIsCatCc(codepoint);
2729 break;
2730 case XML_REGEXP_OTHER_FORMAT:
2731 ret = xmlUCSIsCatCf(codepoint);
2732 break;
2733 case XML_REGEXP_OTHER_PRIVATE:
2734 ret = xmlUCSIsCatCo(codepoint);
2735 break;
2736 case XML_REGEXP_OTHER_NA:
2737 /* ret = xmlUCSIsCatCn(codepoint); */
2738 /* Seems it doesn't exist anymore in recent Unicode releases */
2739 ret = 0;
2740 break;
2741 case XML_REGEXP_BLOCK_NAME:
2742 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2743 break;
2744 }
2745 if (neg)
2746 return(!ret);
2747 return(ret);
2748}
2749
2750static int
2751xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2752 int i, ret = 0;
2753 xmlRegRangePtr range;
2754
William M. Brack871611b2003-10-18 04:53:14 +00002755 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00002756 return(-1);
2757
2758 switch (atom->type) {
2759 case XML_REGEXP_SUBREG:
2760 case XML_REGEXP_EPSILON:
2761 return(-1);
2762 case XML_REGEXP_CHARVAL:
2763 return(codepoint == atom->codepoint);
2764 case XML_REGEXP_RANGES: {
2765 int accept = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002766
Daniel Veillard4255d502002-04-16 15:50:10 +00002767 for (i = 0;i < atom->nbRanges;i++) {
2768 range = atom->ranges[i];
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002769 if (range->neg == 2) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002770 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2771 0, range->start, range->end,
2772 range->blockName);
2773 if (ret != 0)
2774 return(0); /* excluded char */
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002775 } else if (range->neg) {
2776 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2777 0, range->start, range->end,
2778 range->blockName);
2779 if (ret == 0)
Daniel Veillardf2a12832003-11-24 13:04:35 +00002780 accept = 1;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002781 else
2782 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00002783 } else {
2784 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2785 0, range->start, range->end,
2786 range->blockName);
2787 if (ret != 0)
2788 accept = 1; /* might still be excluded */
2789 }
2790 }
2791 return(accept);
2792 }
2793 case XML_REGEXP_STRING:
2794 printf("TODO: XML_REGEXP_STRING\n");
2795 return(-1);
2796 case XML_REGEXP_ANYCHAR:
2797 case XML_REGEXP_ANYSPACE:
2798 case XML_REGEXP_NOTSPACE:
2799 case XML_REGEXP_INITNAME:
2800 case XML_REGEXP_NOTINITNAME:
2801 case XML_REGEXP_NAMECHAR:
2802 case XML_REGEXP_NOTNAMECHAR:
2803 case XML_REGEXP_DECIMAL:
2804 case XML_REGEXP_NOTDECIMAL:
2805 case XML_REGEXP_REALCHAR:
2806 case XML_REGEXP_NOTREALCHAR:
2807 case XML_REGEXP_LETTER:
2808 case XML_REGEXP_LETTER_UPPERCASE:
2809 case XML_REGEXP_LETTER_LOWERCASE:
2810 case XML_REGEXP_LETTER_TITLECASE:
2811 case XML_REGEXP_LETTER_MODIFIER:
2812 case XML_REGEXP_LETTER_OTHERS:
2813 case XML_REGEXP_MARK:
2814 case XML_REGEXP_MARK_NONSPACING:
2815 case XML_REGEXP_MARK_SPACECOMBINING:
2816 case XML_REGEXP_MARK_ENCLOSING:
2817 case XML_REGEXP_NUMBER:
2818 case XML_REGEXP_NUMBER_DECIMAL:
2819 case XML_REGEXP_NUMBER_LETTER:
2820 case XML_REGEXP_NUMBER_OTHERS:
2821 case XML_REGEXP_PUNCT:
2822 case XML_REGEXP_PUNCT_CONNECTOR:
2823 case XML_REGEXP_PUNCT_DASH:
2824 case XML_REGEXP_PUNCT_OPEN:
2825 case XML_REGEXP_PUNCT_CLOSE:
2826 case XML_REGEXP_PUNCT_INITQUOTE:
2827 case XML_REGEXP_PUNCT_FINQUOTE:
2828 case XML_REGEXP_PUNCT_OTHERS:
2829 case XML_REGEXP_SEPAR:
2830 case XML_REGEXP_SEPAR_SPACE:
2831 case XML_REGEXP_SEPAR_LINE:
2832 case XML_REGEXP_SEPAR_PARA:
2833 case XML_REGEXP_SYMBOL:
2834 case XML_REGEXP_SYMBOL_MATH:
2835 case XML_REGEXP_SYMBOL_CURRENCY:
2836 case XML_REGEXP_SYMBOL_MODIFIER:
2837 case XML_REGEXP_SYMBOL_OTHERS:
2838 case XML_REGEXP_OTHER:
2839 case XML_REGEXP_OTHER_CONTROL:
2840 case XML_REGEXP_OTHER_FORMAT:
2841 case XML_REGEXP_OTHER_PRIVATE:
2842 case XML_REGEXP_OTHER_NA:
2843 case XML_REGEXP_BLOCK_NAME:
2844 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
2845 (const xmlChar *)atom->valuep);
2846 if (atom->neg)
2847 ret = !ret;
2848 break;
2849 }
2850 return(ret);
2851}
2852
2853/************************************************************************
2854 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002855 * Saving and restoring state of an execution context *
Daniel Veillard4255d502002-04-16 15:50:10 +00002856 * *
2857 ************************************************************************/
2858
#ifdef DEBUG_REGEXP_EXEC
/*
 * xmlFARegDebugExec:
 * @exec:  the execution context to dump
 *
 * Debug helper, only built when DEBUG_REGEXP_EXEC is defined.  Prints
 * the current state number, transition number and input index, followed
 * either by up to three of the most recently pushed input tokens (when
 * an input stack exists) or by the remaining part of the input string.
 */
static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
    if (exec->inputStack != NULL) {
        int i;
        printf(": ");
        for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) {
            /*
             * inputStack holds token structures, the string is in their
             * value field; passing the structure itself to %s is invalid.
             */
            const xmlChar *tok =
                exec->inputStack[exec->inputStackNr - (i + 1)].value;

            printf("%s ", (tok != NULL) ? (const char *) tok : "(null)");
        }
    } else if (exec->inputString != NULL) {
        printf(": %s", (const char *) &(exec->inputString[exec->index]));
    }
    printf("\n");
}
#endif
2874
2875static void
2876xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
2877#ifdef DEBUG_REGEXP_EXEC
2878 printf("saving ");
2879 exec->transno++;
2880 xmlFARegDebugExec(exec);
2881 exec->transno--;
2882#endif
Daniel Veillard94cc1032005-09-15 13:09:00 +00002883#ifdef MAX_PUSH
2884 if (exec->nbPush > MAX_PUSH) {
2885 return;
2886 }
2887 exec->nbPush++;
2888#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00002889
2890 if (exec->maxRollbacks == 0) {
2891 exec->maxRollbacks = 4;
2892 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2893 sizeof(xmlRegExecRollback));
2894 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002895 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002896 exec->maxRollbacks = 0;
2897 return;
2898 }
2899 memset(exec->rollbacks, 0,
2900 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2901 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2902 xmlRegExecRollback *tmp;
2903 int len = exec->maxRollbacks;
2904
2905 exec->maxRollbacks *= 2;
2906 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2907 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2908 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002909 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002910 exec->maxRollbacks /= 2;
2911 return;
2912 }
2913 exec->rollbacks = tmp;
2914 tmp = &exec->rollbacks[len];
2915 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2916 }
2917 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2918 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2919 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2920 if (exec->comp->nbCounters > 0) {
2921 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2922 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2923 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2924 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002925 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002926 exec->status = -5;
2927 return;
2928 }
2929 }
2930 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2931 exec->comp->nbCounters * sizeof(int));
2932 }
2933 exec->nbRollbacks++;
2934}
2935
2936static void
2937xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2938 if (exec->nbRollbacks <= 0) {
2939 exec->status = -1;
2940#ifdef DEBUG_REGEXP_EXEC
2941 printf("rollback failed on empty stack\n");
2942#endif
2943 return;
2944 }
2945 exec->nbRollbacks--;
2946 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2947 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2948 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2949 if (exec->comp->nbCounters > 0) {
2950 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2951 fprintf(stderr, "exec save: allocation failed");
2952 exec->status = -6;
2953 return;
2954 }
2955 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2956 exec->comp->nbCounters * sizeof(int));
2957 }
2958
2959#ifdef DEBUG_REGEXP_EXEC
2960 printf("restored ");
2961 xmlFARegDebugExec(exec);
2962#endif
2963}
2964
2965/************************************************************************
2966 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002967 * Verifier, running an input against a compiled regexp *
Daniel Veillard4255d502002-04-16 15:50:10 +00002968 * *
2969 ************************************************************************/
2970
2971static int
2972xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2973 xmlRegExecCtxt execval;
2974 xmlRegExecCtxtPtr exec = &execval;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002975 int ret, codepoint = 0, len, deter;
Daniel Veillard4255d502002-04-16 15:50:10 +00002976
2977 exec->inputString = content;
2978 exec->index = 0;
Daniel Veillard94cc1032005-09-15 13:09:00 +00002979 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00002980 exec->determinist = 1;
2981 exec->maxRollbacks = 0;
2982 exec->nbRollbacks = 0;
2983 exec->rollbacks = NULL;
2984 exec->status = 0;
2985 exec->comp = comp;
2986 exec->state = comp->states[0];
2987 exec->transno = 0;
2988 exec->transcount = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002989 exec->inputStack = NULL;
2990 exec->inputStackMax = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00002991 if (comp->nbCounters > 0) {
2992 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00002993 if (exec->counts == NULL) {
2994 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002995 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00002996 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002997 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2998 } else
2999 exec->counts = NULL;
3000 while ((exec->status == 0) &&
3001 ((exec->inputString[exec->index] != 0) ||
3002 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
3003 xmlRegTransPtr trans;
3004 xmlRegAtomPtr atom;
3005
3006 /*
William M. Brack0e00b282004-04-26 15:40:47 +00003007 * If end of input on non-terminal state, rollback, however we may
Daniel Veillard4255d502002-04-16 15:50:10 +00003008 * still have epsilon like transition for counted transitions
William M. Brack0e00b282004-04-26 15:40:47 +00003009 * on counters, in that case don't break too early. Additionally,
3010 * if we are working on a range like "AB{0,2}", where B is not present,
3011 * we don't want to break.
Daniel Veillard4255d502002-04-16 15:50:10 +00003012 */
William M. Brack0e00b282004-04-26 15:40:47 +00003013 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
William M. Brackddf71d62004-05-06 04:17:26 +00003014 /*
3015 * if there is a transition, we must check if
3016 * atom allows minOccurs of 0
3017 */
3018 if (exec->transno < exec->state->nbTrans) {
William M. Brack0e00b282004-04-26 15:40:47 +00003019 trans = &exec->state->trans[exec->transno];
3020 if (trans->to >=0) {
3021 atom = trans->atom;
3022 if (!((atom->min == 0) && (atom->max > 0)))
3023 goto rollback;
3024 }
3025 } else
3026 goto rollback;
3027 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003028
3029 exec->transcount = 0;
3030 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3031 trans = &exec->state->trans[exec->transno];
3032 if (trans->to < 0)
3033 continue;
3034 atom = trans->atom;
3035 ret = 0;
Daniel Veillard567a45b2005-10-18 19:11:55 +00003036 deter = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003037 if (trans->count >= 0) {
3038 int count;
3039 xmlRegCounterPtr counter;
3040
3041 /*
3042 * A counted transition.
3043 */
3044
3045 count = exec->counts[trans->count];
3046 counter = &exec->comp->counters[trans->count];
3047#ifdef DEBUG_REGEXP_EXEC
3048 printf("testing count %d: val %d, min %d, max %d\n",
3049 trans->count, count, counter->min, counter->max);
3050#endif
3051 ret = ((count >= counter->min) && (count <= counter->max));
Daniel Veillard567a45b2005-10-18 19:11:55 +00003052 if ((ret) && (counter->min != counter->max))
3053 deter = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003054 } else if (atom == NULL) {
3055 fprintf(stderr, "epsilon transition left at runtime\n");
3056 exec->status = -2;
3057 break;
3058 } else if (exec->inputString[exec->index] != 0) {
3059 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3060 ret = xmlRegCheckCharacter(atom, codepoint);
William M. Brack0e00b282004-04-26 15:40:47 +00003061 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003062 xmlRegStatePtr to = comp->states[trans->to];
3063
3064 /*
3065 * this is a multiple input sequence
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003066 * If there is a counter associated increment it now.
3067 * before potentially saving and rollback
Daniel Veillard4255d502002-04-16 15:50:10 +00003068 */
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003069 if (trans->counter >= 0) {
3070#ifdef DEBUG_REGEXP_EXEC
3071 printf("Increasing count %d\n", trans->counter);
3072#endif
3073 exec->counts[trans->counter]++;
3074 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003075 if (exec->state->nbTrans > exec->transno + 1) {
3076 xmlFARegExecSave(exec);
3077 }
3078 exec->transcount = 1;
3079 do {
3080 /*
3081 * Try to progress as much as possible on the input
3082 */
3083 if (exec->transcount == atom->max) {
3084 break;
3085 }
3086 exec->index += len;
3087 /*
3088 * End of input: stop here
3089 */
3090 if (exec->inputString[exec->index] == 0) {
3091 exec->index -= len;
3092 break;
3093 }
3094 if (exec->transcount >= atom->min) {
3095 int transno = exec->transno;
3096 xmlRegStatePtr state = exec->state;
3097
3098 /*
3099 * The transition is acceptable save it
3100 */
3101 exec->transno = -1; /* trick */
3102 exec->state = to;
3103 xmlFARegExecSave(exec);
3104 exec->transno = transno;
3105 exec->state = state;
3106 }
3107 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3108 len);
3109 ret = xmlRegCheckCharacter(atom, codepoint);
3110 exec->transcount++;
3111 } while (ret == 1);
3112 if (exec->transcount < atom->min)
3113 ret = 0;
3114
3115 /*
3116 * If the last check failed but one transition was found
3117 * possible, rollback
3118 */
3119 if (ret < 0)
3120 ret = 0;
3121 if (ret == 0) {
3122 goto rollback;
3123 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003124 if (trans->counter >= 0) {
3125#ifdef DEBUG_REGEXP_EXEC
3126 printf("Decreasing count %d\n", trans->counter);
3127#endif
3128 exec->counts[trans->counter]--;
3129 }
William M. Brack0e00b282004-04-26 15:40:47 +00003130 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3131 /*
3132 * we don't match on the codepoint, but minOccurs of 0
3133 * says that's ok. Setting len to 0 inhibits stepping
3134 * over the codepoint.
3135 */
3136 exec->transcount = 1;
3137 len = 0;
3138 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003139 }
William M. Brack0e00b282004-04-26 15:40:47 +00003140 } else if ((atom->min == 0) && (atom->max > 0)) {
3141 /* another spot to match when minOccurs is 0 */
3142 exec->transcount = 1;
3143 len = 0;
3144 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003145 }
3146 if (ret == 1) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00003147 if ((trans->nd == 1) ||
3148 ((trans->count >= 0) && (deter == 0) &&
3149 (exec->state->nbTrans > exec->transno + 1))) {
Daniel Veillardaa622012005-10-20 15:55:25 +00003150#ifdef DEBUG_REGEXP_EXEC
3151 if (trans->nd == 1)
3152 printf("Saving on nd transition atom %d for %c at %d\n",
3153 trans->atom->no, codepoint, exec->index);
3154 else
3155 printf("Saving on counted transition count %d for %c at %d\n",
3156 trans->count, codepoint, exec->index);
3157#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003158 xmlFARegExecSave(exec);
3159 }
3160 if (trans->counter >= 0) {
3161#ifdef DEBUG_REGEXP_EXEC
3162 printf("Increasing count %d\n", trans->counter);
3163#endif
3164 exec->counts[trans->counter]++;
3165 }
Daniel Veillard10752282005-08-08 13:05:13 +00003166 if ((trans->count >= 0) &&
3167 (trans->count < REGEXP_ALL_COUNTER)) {
3168#ifdef DEBUG_REGEXP_EXEC
3169 printf("resetting count %d on transition\n",
3170 trans->count);
3171#endif
3172 exec->counts[trans->count] = 0;
3173 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003174#ifdef DEBUG_REGEXP_EXEC
3175 printf("entering state %d\n", trans->to);
3176#endif
3177 exec->state = comp->states[trans->to];
3178 exec->transno = 0;
3179 if (trans->atom != NULL) {
3180 exec->index += len;
3181 }
3182 goto progress;
3183 } else if (ret < 0) {
3184 exec->status = -4;
3185 break;
3186 }
3187 }
3188 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3189rollback:
3190 /*
3191 * Failed to find a way out
3192 */
3193 exec->determinist = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00003194#ifdef DEBUG_REGEXP_EXEC
3195 printf("rollback from state %d on %d:%c\n", exec->state->no,
3196 codepoint,codepoint);
3197#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003198 xmlFARegExecRollBack(exec);
3199 }
3200progress:
3201 continue;
3202 }
3203 if (exec->rollbacks != NULL) {
3204 if (exec->counts != NULL) {
3205 int i;
3206
3207 for (i = 0;i < exec->maxRollbacks;i++)
3208 if (exec->rollbacks[i].counts != NULL)
3209 xmlFree(exec->rollbacks[i].counts);
3210 }
3211 xmlFree(exec->rollbacks);
3212 }
3213 if (exec->counts != NULL)
3214 xmlFree(exec->counts);
3215 if (exec->status == 0)
3216 return(1);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003217 if (exec->status == -1) {
3218 if (exec->nbPush > MAX_PUSH)
3219 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003220 return(0);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003221 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003222 return(exec->status);
3223}
3224
3225/************************************************************************
3226 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003227 * Progressive interface to the verifier one atom at a time *
Daniel Veillard4255d502002-04-16 15:50:10 +00003228 * *
3229 ************************************************************************/
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003230#ifdef DEBUG_ERR
3231static void testerr(xmlRegExecCtxtPtr exec);
3232#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003233
3234/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003235 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00003236 * @comp: a precompiled regular expression
3237 * @callback: a callback function used for handling progresses in the
3238 * automata matching phase
3239 * @data: the context data associated to the callback in this context
3240 *
3241 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00003242 *
3243 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00003244 */
3245xmlRegExecCtxtPtr
3246xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3247 xmlRegExecCtxtPtr exec;
3248
3249 if (comp == NULL)
3250 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003251 if ((comp->compact == NULL) && (comp->states == NULL))
3252 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00003253 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3254 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003255 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003256 return(NULL);
3257 }
3258 memset(exec, 0, sizeof(xmlRegExecCtxt));
3259 exec->inputString = NULL;
3260 exec->index = 0;
3261 exec->determinist = 1;
3262 exec->maxRollbacks = 0;
3263 exec->nbRollbacks = 0;
3264 exec->rollbacks = NULL;
3265 exec->status = 0;
3266 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00003267 if (comp->compact == NULL)
3268 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00003269 exec->transno = 0;
3270 exec->transcount = 0;
3271 exec->callback = callback;
3272 exec->data = data;
3273 if (comp->nbCounters > 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003274 /*
3275 * For error handling, exec->counts is allocated twice the size
3276 * the second half is used to store the data in case of rollback
3277 */
3278 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3279 * 2);
Daniel Veillard4255d502002-04-16 15:50:10 +00003280 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003281 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003282 xmlFree(exec);
3283 return(NULL);
3284 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003285 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3286 exec->errCounts = &exec->counts[comp->nbCounters];
3287 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00003288 exec->counts = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003289 exec->errCounts = NULL;
3290 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003291 exec->inputStackMax = 0;
3292 exec->inputStackNr = 0;
3293 exec->inputStack = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003294 exec->errStateNo = -1;
3295 exec->errString = NULL;
Daniel Veillard94cc1032005-09-15 13:09:00 +00003296 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003297 return(exec);
3298}
3299
3300/**
3301 * xmlRegFreeExecCtxt:
3302 * @exec: a regular expression evaulation context
3303 *
3304 * Free the structures associated to a regular expression evaulation context.
3305 */
3306void
3307xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3308 if (exec == NULL)
3309 return;
3310
3311 if (exec->rollbacks != NULL) {
3312 if (exec->counts != NULL) {
3313 int i;
3314
3315 for (i = 0;i < exec->maxRollbacks;i++)
3316 if (exec->rollbacks[i].counts != NULL)
3317 xmlFree(exec->rollbacks[i].counts);
3318 }
3319 xmlFree(exec->rollbacks);
3320 }
3321 if (exec->counts != NULL)
3322 xmlFree(exec->counts);
3323 if (exec->inputStack != NULL) {
3324 int i;
3325
Daniel Veillard32370232002-10-16 14:08:14 +00003326 for (i = 0;i < exec->inputStackNr;i++) {
3327 if (exec->inputStack[i].value != NULL)
3328 xmlFree(exec->inputStack[i].value);
3329 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003330 xmlFree(exec->inputStack);
3331 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003332 if (exec->errString != NULL)
3333 xmlFree(exec->errString);
Daniel Veillard4255d502002-04-16 15:50:10 +00003334 xmlFree(exec);
3335}
3336
3337static void
3338xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3339 void *data) {
3340#ifdef DEBUG_PUSH
3341 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3342#endif
3343 if (exec->inputStackMax == 0) {
3344 exec->inputStackMax = 4;
3345 exec->inputStack = (xmlRegInputTokenPtr)
3346 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3347 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003348 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003349 exec->inputStackMax = 0;
3350 return;
3351 }
3352 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3353 xmlRegInputTokenPtr tmp;
3354
3355 exec->inputStackMax *= 2;
3356 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3357 exec->inputStackMax * sizeof(xmlRegInputToken));
3358 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003359 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003360 exec->inputStackMax /= 2;
3361 return;
3362 }
3363 exec->inputStack = tmp;
3364 }
3365 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3366 exec->inputStack[exec->inputStackNr].data = data;
3367 exec->inputStackNr++;
3368 exec->inputStack[exec->inputStackNr].value = NULL;
3369 exec->inputStack[exec->inputStackNr].data = NULL;
3370}
3371
Daniel Veillardc0826a72004-08-10 14:17:33 +00003372/**
3373 * xmlRegStrEqualWildcard:
3374 * @expStr: the string to be evaluated
3375 * @valStr: the validation string
3376 *
3377 * Checks if both strings are equal or have the same content. "*"
3378 * can be used as a wildcard in @valStr; "|" is used as a seperator of
3379 * substrings in both @expStr and @valStr.
3380 *
3381 * Returns 1 if the comparison is satisfied and the number of substrings
3382 * is equal, 0 otherwise.
3383 */
3384
3385static int
3386xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3387 if (expStr == valStr) return(1);
3388 if (expStr == NULL) return(0);
3389 if (valStr == NULL) return(0);
3390 do {
3391 /*
3392 * Eval if we have a wildcard for the current item.
3393 */
3394 if (*expStr != *valStr) {
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00003395 /* if one of them starts with a wildcard make valStr be it */
3396 if (*valStr == '*') {
3397 const xmlChar *tmp;
3398
3399 tmp = valStr;
3400 valStr = expStr;
3401 expStr = tmp;
3402 }
Daniel Veillardc0826a72004-08-10 14:17:33 +00003403 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3404 do {
3405 if (*valStr == XML_REG_STRING_SEPARATOR)
3406 break;
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00003407 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00003408 } while (*valStr != 0);
3409 continue;
3410 } else
3411 return(0);
3412 }
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00003413 expStr++;
3414 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00003415 } while (*valStr != 0);
3416 if (*expStr != 0)
3417 return (0);
3418 else
3419 return (1);
3420}
Daniel Veillard4255d502002-04-16 15:50:10 +00003421
3422/**
Daniel Veillard23e73572002-09-19 19:56:43 +00003423 * xmlRegCompactPushString:
3424 * @exec: a regexp execution context
3425 * @comp: the precompiled exec with a compact table
3426 * @value: a string token input
3427 * @data: data associated to the token to reuse in callbacks
3428 *
3429 * Push one input token in the execution context
3430 *
3431 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3432 * a negative value in case of error.
3433 */
3434static int
3435xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3436 xmlRegexpPtr comp,
3437 const xmlChar *value,
3438 void *data) {
3439 int state = exec->index;
3440 int i, target;
3441
3442 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3443 return(-1);
3444
3445 if (value == NULL) {
3446 /*
3447 * are we at a final state ?
3448 */
3449 if (comp->compact[state * (comp->nbstrings + 1)] ==
3450 XML_REGEXP_FINAL_STATE)
3451 return(1);
3452 return(0);
3453 }
3454
3455#ifdef DEBUG_PUSH
3456 printf("value pushed: %s\n", value);
3457#endif
3458
3459 /*
William M. Brackddf71d62004-05-06 04:17:26 +00003460 * Examine all outside transitions from current state
Daniel Veillard23e73572002-09-19 19:56:43 +00003461 */
3462 for (i = 0;i < comp->nbstrings;i++) {
3463 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3464 if ((target > 0) && (target <= comp->nbstates)) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00003465 target--; /* to avoid 0 */
3466 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3467 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00003468 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3469 exec->callback(exec->data, value,
3470 comp->transdata[state * comp->nbstrings + i], data);
3471 }
Daniel Veillard23e73572002-09-19 19:56:43 +00003472#ifdef DEBUG_PUSH
3473 printf("entering state %d\n", target);
3474#endif
3475 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003476 XML_REGEXP_SINK_STATE)
3477 goto error;
3478
3479 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillard23e73572002-09-19 19:56:43 +00003480 XML_REGEXP_FINAL_STATE)
3481 return(1);
3482 return(0);
3483 }
3484 }
3485 }
3486 /*
3487 * Failed to find an exit transition out from current state for the
3488 * current token
3489 */
3490#ifdef DEBUG_PUSH
3491 printf("failed to find a transition for %s on state %d\n", value, state);
3492#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003493error:
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003494 if (exec->errString != NULL)
3495 xmlFree(exec->errString);
3496 exec->errString = xmlStrdup(value);
3497 exec->errStateNo = state;
Daniel Veillard23e73572002-09-19 19:56:43 +00003498 exec->status = -1;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003499#ifdef DEBUG_ERR
3500 testerr(exec);
3501#endif
Daniel Veillard23e73572002-09-19 19:56:43 +00003502 return(-1);
3503}
3504
3505/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00003506 * xmlRegExecPushStringInternal:
Daniel Veillardea7751d2002-12-20 00:16:24 +00003507 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00003508 * @value: a string token input
3509 * @data: data associated to the token to reuse in callbacks
Daniel Veillard6e65e152005-08-09 11:09:52 +00003510 * @compound: value was assembled from 2 strings
Daniel Veillard4255d502002-04-16 15:50:10 +00003511 *
3512 * Push one input token in the execution context
3513 *
3514 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3515 * a negative value in case of error.
3516 */
Daniel Veillard6e65e152005-08-09 11:09:52 +00003517static int
3518xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3519 void *data, int compound) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003520 xmlRegTransPtr trans;
3521 xmlRegAtomPtr atom;
3522 int ret;
3523 int final = 0;
Daniel Veillard90700152005-01-08 22:05:09 +00003524 int progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003525
3526 if (exec == NULL)
3527 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00003528 if (exec->comp == NULL)
3529 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003530 if (exec->status != 0)
3531 return(exec->status);
3532
Daniel Veillard23e73572002-09-19 19:56:43 +00003533 if (exec->comp->compact != NULL)
3534 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3535
Daniel Veillard4255d502002-04-16 15:50:10 +00003536 if (value == NULL) {
3537 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3538 return(1);
3539 final = 1;
3540 }
3541
3542#ifdef DEBUG_PUSH
3543 printf("value pushed: %s\n", value);
3544#endif
3545 /*
3546 * If we have an active rollback stack push the new value there
3547 * and get back to where we were left
3548 */
3549 if ((value != NULL) && (exec->inputStackNr > 0)) {
3550 xmlFARegExecSaveInputString(exec, value, data);
3551 value = exec->inputStack[exec->index].value;
3552 data = exec->inputStack[exec->index].data;
3553#ifdef DEBUG_PUSH
3554 printf("value loaded: %s\n", value);
3555#endif
3556 }
3557
3558 while ((exec->status == 0) &&
3559 ((value != NULL) ||
3560 ((final == 1) &&
3561 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3562
3563 /*
3564 * End of input on non-terminal state, rollback, however we may
3565 * still have epsilon like transition for counted transitions
3566 * on counters, in that case don't break too early.
3567 */
Daniel Veillardb509f152002-04-17 16:28:10 +00003568 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00003569 goto rollback;
3570
3571 exec->transcount = 0;
3572 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3573 trans = &exec->state->trans[exec->transno];
3574 if (trans->to < 0)
3575 continue;
3576 atom = trans->atom;
3577 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00003578 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3579 int i;
3580 int count;
3581 xmlRegTransPtr t;
3582 xmlRegCounterPtr counter;
3583
3584 ret = 0;
3585
3586#ifdef DEBUG_PUSH
3587 printf("testing all lax %d\n", trans->count);
3588#endif
3589 /*
3590 * Check all counted transitions from the current state
3591 */
3592 if ((value == NULL) && (final)) {
3593 ret = 1;
3594 } else if (value != NULL) {
3595 for (i = 0;i < exec->state->nbTrans;i++) {
3596 t = &exec->state->trans[i];
3597 if ((t->counter < 0) || (t == trans))
3598 continue;
3599 counter = &exec->comp->counters[t->counter];
3600 count = exec->counts[t->counter];
3601 if ((count < counter->max) &&
3602 (t->atom != NULL) &&
3603 (xmlStrEqual(value, t->atom->valuep))) {
3604 ret = 0;
3605 break;
3606 }
3607 if ((count >= counter->min) &&
3608 (count < counter->max) &&
3609 (xmlStrEqual(value, t->atom->valuep))) {
3610 ret = 1;
3611 break;
3612 }
3613 }
3614 }
3615 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00003616 int i;
3617 int count;
3618 xmlRegTransPtr t;
3619 xmlRegCounterPtr counter;
3620
3621 ret = 1;
3622
3623#ifdef DEBUG_PUSH
3624 printf("testing all %d\n", trans->count);
3625#endif
3626 /*
3627 * Check all counted transitions from the current state
3628 */
3629 for (i = 0;i < exec->state->nbTrans;i++) {
3630 t = &exec->state->trans[i];
3631 if ((t->counter < 0) || (t == trans))
3632 continue;
3633 counter = &exec->comp->counters[t->counter];
3634 count = exec->counts[t->counter];
3635 if ((count < counter->min) || (count > counter->max)) {
3636 ret = 0;
3637 break;
3638 }
3639 }
3640 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003641 int count;
3642 xmlRegCounterPtr counter;
3643
3644 /*
3645 * A counted transition.
3646 */
3647
3648 count = exec->counts[trans->count];
3649 counter = &exec->comp->counters[trans->count];
3650#ifdef DEBUG_PUSH
3651 printf("testing count %d: val %d, min %d, max %d\n",
3652 trans->count, count, counter->min, counter->max);
3653#endif
3654 ret = ((count >= counter->min) && (count <= counter->max));
3655 } else if (atom == NULL) {
3656 fprintf(stderr, "epsilon transition left at runtime\n");
3657 exec->status = -2;
3658 break;
3659 } else if (value != NULL) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00003660 ret = xmlRegStrEqualWildcard(atom->valuep, value);
Daniel Veillard6e65e152005-08-09 11:09:52 +00003661 if (atom->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00003662 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00003663 if (!compound)
3664 ret = 0;
3665 }
Daniel Veillard441bc322002-04-20 17:38:48 +00003666 if ((ret == 1) && (trans->counter >= 0)) {
3667 xmlRegCounterPtr counter;
3668 int count;
3669
3670 count = exec->counts[trans->counter];
3671 counter = &exec->comp->counters[trans->counter];
3672 if (count >= counter->max)
3673 ret = 0;
3674 }
3675
Daniel Veillard4255d502002-04-16 15:50:10 +00003676 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3677 xmlRegStatePtr to = exec->comp->states[trans->to];
3678
3679 /*
3680 * this is a multiple input sequence
3681 */
3682 if (exec->state->nbTrans > exec->transno + 1) {
3683 if (exec->inputStackNr <= 0) {
3684 xmlFARegExecSaveInputString(exec, value, data);
3685 }
3686 xmlFARegExecSave(exec);
3687 }
3688 exec->transcount = 1;
3689 do {
3690 /*
3691 * Try to progress as much as possible on the input
3692 */
3693 if (exec->transcount == atom->max) {
3694 break;
3695 }
3696 exec->index++;
3697 value = exec->inputStack[exec->index].value;
3698 data = exec->inputStack[exec->index].data;
3699#ifdef DEBUG_PUSH
3700 printf("value loaded: %s\n", value);
3701#endif
3702
3703 /*
3704 * End of input: stop here
3705 */
3706 if (value == NULL) {
3707 exec->index --;
3708 break;
3709 }
3710 if (exec->transcount >= atom->min) {
3711 int transno = exec->transno;
3712 xmlRegStatePtr state = exec->state;
3713
3714 /*
3715 * The transition is acceptable save it
3716 */
3717 exec->transno = -1; /* trick */
3718 exec->state = to;
3719 if (exec->inputStackNr <= 0) {
3720 xmlFARegExecSaveInputString(exec, value, data);
3721 }
3722 xmlFARegExecSave(exec);
3723 exec->transno = transno;
3724 exec->state = state;
3725 }
3726 ret = xmlStrEqual(value, atom->valuep);
3727 exec->transcount++;
3728 } while (ret == 1);
3729 if (exec->transcount < atom->min)
3730 ret = 0;
3731
3732 /*
3733 * If the last check failed but one transition was found
3734 * possible, rollback
3735 */
3736 if (ret < 0)
3737 ret = 0;
3738 if (ret == 0) {
3739 goto rollback;
3740 }
3741 }
3742 }
3743 if (ret == 1) {
William M. Brack98873952003-12-26 06:03:14 +00003744 if ((exec->callback != NULL) && (atom != NULL) &&
3745 (data != NULL)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003746 exec->callback(exec->data, atom->valuep,
3747 atom->data, data);
3748 }
3749 if (exec->state->nbTrans > exec->transno + 1) {
3750 if (exec->inputStackNr <= 0) {
3751 xmlFARegExecSaveInputString(exec, value, data);
3752 }
3753 xmlFARegExecSave(exec);
3754 }
3755 if (trans->counter >= 0) {
3756#ifdef DEBUG_PUSH
3757 printf("Increasing count %d\n", trans->counter);
3758#endif
3759 exec->counts[trans->counter]++;
3760 }
Daniel Veillard10752282005-08-08 13:05:13 +00003761 if ((trans->count >= 0) &&
3762 (trans->count < REGEXP_ALL_COUNTER)) {
3763#ifdef DEBUG_REGEXP_EXEC
3764 printf("resetting count %d on transition\n",
3765 trans->count);
3766#endif
3767 exec->counts[trans->count] = 0;
3768 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003769#ifdef DEBUG_PUSH
3770 printf("entering state %d\n", trans->to);
3771#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003772 if ((exec->comp->states[trans->to] != NULL) &&
3773 (exec->comp->states[trans->to]->type ==
3774 XML_REGEXP_SINK_STATE)) {
3775 /*
3776 * entering a sink state, save the current state as error
3777 * state.
3778 */
3779 if (exec->errString != NULL)
3780 xmlFree(exec->errString);
3781 exec->errString = xmlStrdup(value);
3782 exec->errState = exec->state;
3783 memcpy(exec->errCounts, exec->counts,
3784 exec->comp->nbCounters * sizeof(int));
3785 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003786 exec->state = exec->comp->states[trans->to];
3787 exec->transno = 0;
3788 if (trans->atom != NULL) {
3789 if (exec->inputStack != NULL) {
3790 exec->index++;
3791 if (exec->index < exec->inputStackNr) {
3792 value = exec->inputStack[exec->index].value;
3793 data = exec->inputStack[exec->index].data;
3794#ifdef DEBUG_PUSH
3795 printf("value loaded: %s\n", value);
3796#endif
3797 } else {
3798 value = NULL;
3799 data = NULL;
3800#ifdef DEBUG_PUSH
3801 printf("end of input\n");
3802#endif
3803 }
3804 } else {
3805 value = NULL;
3806 data = NULL;
3807#ifdef DEBUG_PUSH
3808 printf("end of input\n");
3809#endif
3810 }
3811 }
3812 goto progress;
3813 } else if (ret < 0) {
3814 exec->status = -4;
3815 break;
3816 }
3817 }
3818 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3819rollback:
Daniel Veillard90700152005-01-08 22:05:09 +00003820 /*
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003821 * if we didn't yet rollback on the current input
3822 * store the current state as the error state.
Daniel Veillard90700152005-01-08 22:05:09 +00003823 */
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003824 if ((progress) && (exec->state != NULL) &&
3825 (exec->state->type != XML_REGEXP_SINK_STATE)) {
Daniel Veillard90700152005-01-08 22:05:09 +00003826 progress = 0;
3827 if (exec->errString != NULL)
3828 xmlFree(exec->errString);
3829 exec->errString = xmlStrdup(value);
3830 exec->errState = exec->state;
3831 memcpy(exec->errCounts, exec->counts,
3832 exec->comp->nbCounters * sizeof(int));
3833 }
3834
Daniel Veillard4255d502002-04-16 15:50:10 +00003835 /*
3836 * Failed to find a way out
3837 */
3838 exec->determinist = 0;
3839 xmlFARegExecRollBack(exec);
3840 if (exec->status == 0) {
3841 value = exec->inputStack[exec->index].value;
3842 data = exec->inputStack[exec->index].data;
3843#ifdef DEBUG_PUSH
3844 printf("value loaded: %s\n", value);
3845#endif
3846 }
3847 }
Daniel Veillard90700152005-01-08 22:05:09 +00003848 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00003849progress:
Daniel Veillard90700152005-01-08 22:05:09 +00003850 progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003851 continue;
3852 }
3853 if (exec->status == 0) {
3854 return(exec->state->type == XML_REGEXP_FINAL_STATE);
3855 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003856#ifdef DEBUG_ERR
Daniel Veillard90700152005-01-08 22:05:09 +00003857 if (exec->status < 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003858 testerr(exec);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003859 }
Daniel Veillard90700152005-01-08 22:05:09 +00003860#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003861 return(exec->status);
3862}
3863
Daniel Veillard52b48c72003-04-13 19:53:42 +00003864/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00003865 * xmlRegExecPushString:
3866 * @exec: a regexp execution context or NULL to indicate the end
3867 * @value: a string token input
3868 * @data: data associated to the token to reuse in callbacks
3869 *
3870 * Push one input token in the execution context
3871 *
3872 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3873 * a negative value in case of error.
3874 */
3875int
3876xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3877 void *data) {
3878 return(xmlRegExecPushStringInternal(exec, value, data, 0));
3879}
3880
3881/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00003882 * xmlRegExecPushString2:
3883 * @exec: a regexp execution context or NULL to indicate the end
3884 * @value: the first string token input
3885 * @value2: the second string token input
3886 * @data: data associated to the token to reuse in callbacks
3887 *
3888 * Push one input token in the execution context
3889 *
3890 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3891 * a negative value in case of error.
3892 */
3893int
3894xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
3895 const xmlChar *value2, void *data) {
3896 xmlChar buf[150];
3897 int lenn, lenp, ret;
3898 xmlChar *str;
3899
3900 if (exec == NULL)
3901 return(-1);
3902 if (exec->comp == NULL)
3903 return(-1);
3904 if (exec->status != 0)
3905 return(exec->status);
3906
3907 if (value2 == NULL)
3908 return(xmlRegExecPushString(exec, value, data));
3909
3910 lenn = strlen((char *) value2);
3911 lenp = strlen((char *) value);
3912
3913 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003914 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003915 if (str == NULL) {
3916 exec->status = -1;
3917 return(-1);
3918 }
3919 } else {
3920 str = buf;
3921 }
3922 memcpy(&str[0], value, lenp);
Daniel Veillardc0826a72004-08-10 14:17:33 +00003923 str[lenp] = XML_REG_STRING_SEPARATOR;
Daniel Veillard52b48c72003-04-13 19:53:42 +00003924 memcpy(&str[lenp + 1], value2, lenn);
3925 str[lenn + lenp + 1] = 0;
3926
3927 if (exec->comp->compact != NULL)
3928 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
3929 else
Daniel Veillard6e65e152005-08-09 11:09:52 +00003930 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003931
3932 if (str != buf)
Daniel Veillard0b1ff142005-12-28 21:13:33 +00003933 xmlFree(str);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003934 return(ret);
3935}
3936
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003937/**
Daniel Veillard77005e62005-07-19 16:26:18 +00003938 * xmlRegExecGetValues:
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003939 * @exec: a regexp execution context
3940 * @err: error extraction or normal one
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003941 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003942 * @nbneg: return number of negative transitions
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003943 * @values: pointer to the array of acceptable values
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003944 * @terminal: return value if this was a terminal state
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003945 *
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003946 * Extract informations from the regexp execution, internal routine to
3947 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003948 *
3949 * Returns: 0 in case of success or -1 in case of error.
3950 */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003951static int
3952xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003953 int *nbval, int *nbneg,
3954 xmlChar **values, int *terminal) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003955 int maxval;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003956 int nb = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003957
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003958 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
3959 (values == NULL) || (*nbval <= 0))
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003960 return(-1);
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003961
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003962 maxval = *nbval;
3963 *nbval = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003964 *nbneg = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003965 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
3966 xmlRegexpPtr comp;
3967 int target, i, state;
3968
3969 comp = exec->comp;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003970
3971 if (err) {
3972 if (exec->errStateNo == -1) return(-1);
3973 state = exec->errStateNo;
3974 } else {
3975 state = exec->index;
3976 }
3977 if (terminal != NULL) {
3978 if (comp->compact[state * (comp->nbstrings + 1)] ==
3979 XML_REGEXP_FINAL_STATE)
3980 *terminal = 1;
3981 else
3982 *terminal = 0;
3983 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003984 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003985 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003986 if ((target > 0) && (target <= comp->nbstates) &&
3987 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
3988 XML_REGEXP_SINK_STATE)) {
3989 values[nb++] = comp->stringMap[i];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003990 (*nbval)++;
3991 }
3992 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003993 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
3994 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3995 if ((target > 0) && (target <= comp->nbstates) &&
3996 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
3997 XML_REGEXP_SINK_STATE)) {
3998 values[nb++] = comp->stringMap[i];
3999 (*nbneg)++;
4000 }
4001 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004002 } else {
4003 int transno;
4004 xmlRegTransPtr trans;
4005 xmlRegAtomPtr atom;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004006 xmlRegStatePtr state;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004007
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004008 if (terminal != NULL) {
4009 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4010 *terminal = 1;
4011 else
4012 *terminal = 0;
4013 }
4014
4015 if (err) {
4016 if (exec->errState == NULL) return(-1);
4017 state = exec->errState;
4018 } else {
4019 if (exec->state == NULL) return(-1);
4020 state = exec->state;
4021 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004022 for (transno = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004023 (transno < state->nbTrans) && (nb < maxval);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004024 transno++) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004025 trans = &state->trans[transno];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004026 if (trans->to < 0)
4027 continue;
4028 atom = trans->atom;
4029 if ((atom == NULL) || (atom->valuep == NULL))
4030 continue;
4031 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004032 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004033 TODO;
4034 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004035 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004036 TODO;
4037 } else if (trans->counter >= 0) {
4038 xmlRegCounterPtr counter;
4039 int count;
4040
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004041 if (err)
4042 count = exec->errCounts[trans->counter];
4043 else
4044 count = exec->counts[trans->counter];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004045 counter = &exec->comp->counters[trans->counter];
4046 if (count < counter->max) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004047 if (atom->neg)
4048 values[nb++] = (xmlChar *) atom->valuep2;
4049 else
4050 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004051 (*nbval)++;
4052 }
4053 } else {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004054 if ((exec->comp->states[trans->to] != NULL) &&
4055 (exec->comp->states[trans->to]->type !=
4056 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004057 if (atom->neg)
4058 values[nb++] = (xmlChar *) atom->valuep2;
4059 else
4060 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004061 (*nbval)++;
4062 }
4063 }
4064 }
4065 for (transno = 0;
4066 (transno < state->nbTrans) && (nb < maxval);
4067 transno++) {
4068 trans = &state->trans[transno];
4069 if (trans->to < 0)
4070 continue;
4071 atom = trans->atom;
4072 if ((atom == NULL) || (atom->valuep == NULL))
4073 continue;
4074 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4075 continue;
4076 } else if (trans->count == REGEXP_ALL_COUNTER) {
4077 continue;
4078 } else if (trans->counter >= 0) {
4079 continue;
4080 } else {
4081 if ((exec->comp->states[trans->to] != NULL) &&
4082 (exec->comp->states[trans->to]->type ==
4083 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004084 if (atom->neg)
4085 values[nb++] = (xmlChar *) atom->valuep2;
4086 else
4087 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004088 (*nbneg)++;
4089 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004090 }
4091 }
4092 }
4093 return(0);
4094}
4095
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004096/**
4097 * xmlRegExecNextValues:
4098 * @exec: a regexp execution context
4099 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004100 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004101 * @values: pointer to the array of acceptable values
4102 * @terminal: return value if this was a terminal state
4103 *
4104 * Extract informations from the regexp execution,
4105 * the parameter @values must point to an array of @nbval string pointers
4106 * on return nbval will contain the number of possible strings in that
4107 * state and the @values array will be updated with them. The string values
4108 * returned will be freed with the @exec context and don't need to be
4109 * deallocated.
4110 *
4111 * Returns: 0 in case of success or -1 in case of error.
4112 */
4113int
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004114xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4115 xmlChar **values, int *terminal) {
4116 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004117}
4118
4119/**
4120 * xmlRegExecErrInfo:
4121 * @exec: a regexp execution context generating an error
4122 * @string: return value for the error string
4123 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004124 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004125 * @values: pointer to the array of acceptable values
4126 * @terminal: return value if this was a terminal state
4127 *
4128 * Extract error informations from the regexp execution, the parameter
4129 * @string will be updated with the value pushed and not accepted,
4130 * the parameter @values must point to an array of @nbval string pointers
4131 * on return nbval will contain the number of possible strings in that
4132 * state and the @values array will be updated with them. The string values
4133 * returned will be freed with the @exec context and don't need to be
4134 * deallocated.
4135 *
4136 * Returns: 0 in case of success or -1 in case of error.
4137 */
4138int
4139xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004140 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004141 if (exec == NULL)
4142 return(-1);
4143 if (string != NULL) {
4144 if (exec->status != 0)
4145 *string = exec->errString;
4146 else
4147 *string = NULL;
4148 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004149 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004150}
4151
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004152#ifdef DEBUG_ERR
4153static void testerr(xmlRegExecCtxtPtr exec) {
4154 const xmlChar *string;
Daniel Veillardcee2b3a2005-01-25 00:22:52 +00004155 xmlChar *values[5];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004156 int nb = 5;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004157 int nbneg;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004158 int terminal;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004159 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004160}
4161#endif
4162
#if 0
/*
 * xmlRegExecPushChar:
 * @exec: a regexp execution context
 * @UCS: a character value (not used by the body below, which reads
 *       its input from exec->inputString)
 *
 * Character based counterpart of the string push routine; this code is
 * compiled out by the enclosing #if 0.
 *
 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
 *     a negative value in case of error.
 */
static int
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
    xmlRegTransPtr trans;
    xmlRegAtomPtr atom;
    int ret;
    int codepoint, len;

    if (exec == NULL)
        return(-1);
    if (exec->status != 0)
        return(exec->status);

    while ((exec->status == 0) &&
           ((exec->inputString[exec->index] != 0) ||
            (exec->state->type != XML_REGEXP_FINAL_STATE))) {

        /*
         * End of input on non-terminal state, rollback, however we may
         * still have epsilon like transition for counted transitions
         * on counters, in that case don't break too early.
         */
        if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
            goto rollback;

        exec->transcount = 0;
        for (;exec->transno < exec->state->nbTrans;exec->transno++) {
            trans = &exec->state->trans[exec->transno];
            if (trans->to < 0)
                continue;
            atom = trans->atom;
            ret = 0;
            if (trans->count >= 0) {
                int count;
                xmlRegCounterPtr counter;

                /*
                 * A counted transition.
                 */

                count = exec->counts[trans->count];
                counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
                printf("testing count %d: val %d, min %d, max %d\n",
                       trans->count, count, counter->min, counter->max);
#endif
                ret = ((count >= counter->min) && (count <= counter->max));
            } else if (atom == NULL) {
                fprintf(stderr, "epsilon transition left at runtime\n");
                exec->status = -2;
                break;
            } else if (exec->inputString[exec->index] != 0) {
                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
                ret = xmlRegCheckCharacter(atom, codepoint);
                if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
                    xmlRegStatePtr to = exec->comp->states[trans->to];

                    /*
                     * this is a multiple input sequence
                     */
                    if (exec->state->nbTrans > exec->transno + 1) {
                        xmlFARegExecSave(exec);
                    }
                    exec->transcount = 1;
                    do {
                        /*
                         * Try to progress as much as possible on the input
                         */
                        if (exec->transcount == atom->max) {
                            break;
                        }
                        exec->index += len;
                        /*
                         * End of input: stop here
                         */
                        if (exec->inputString[exec->index] == 0) {
                            exec->index -= len;
                            break;
                        }
                        if (exec->transcount >= atom->min) {
                            int transno = exec->transno;
                            xmlRegStatePtr state = exec->state;

                            /*
                             * The transition is acceptable save it
                             */
                            exec->transno = -1; /* trick */
                            exec->state = to;
                            xmlFARegExecSave(exec);
                            exec->transno = transno;
                            exec->state = state;
                        }
                        codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
                                              len);
                        ret = xmlRegCheckCharacter(atom, codepoint);
                        exec->transcount++;
                    } while (ret == 1);
                    if (exec->transcount < atom->min)
                        ret = 0;

                    /*
                     * If the last check failed but one transition was found
                     * possible, rollback
                     */
                    if (ret < 0)
                        ret = 0;
                    if (ret == 0) {
                        goto rollback;
                    }
                }
            }
            if (ret == 1) {
                if (exec->state->nbTrans > exec->transno + 1) {
                    xmlFARegExecSave(exec);
                }
                if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Increasing count %d\n", trans->counter);
#endif
                    exec->counts[trans->counter]++;
                }
#ifdef DEBUG_REGEXP_EXEC
                printf("entering state %d\n", trans->to);
#endif
                exec->state = exec->comp->states[trans->to];
                exec->transno = 0;
                if (trans->atom != NULL) {
                    exec->index += len;
                }
                goto progress;
            } else if (ret < 0) {
                exec->status = -4;
                break;
            }
        }
        if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
            /*
             * Failed to find a way out
             */
            exec->determinist = 0;
            xmlFARegExecRollBack(exec);
        }
progress:
        continue;
    }
    /*
     * The function is declared as returning int: report the outcome
     * instead of falling off the end without a value.
     */
    if (exec->status == 0)
        return(exec->state->type == XML_REGEXP_FINAL_STATE);
    return(exec->status);
}
#endif
4311/************************************************************************
4312 * *
William M. Brackddf71d62004-05-06 04:17:26 +00004313 * Parser for the Schemas Datatype Regular Expressions *
Daniel Veillard4255d502002-04-16 15:50:10 +00004314 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4315 * *
4316 ************************************************************************/
4317
4318/**
4319 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00004320 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004321 *
4322 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4323 */
4324static int
4325xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4326 int cur;
4327 int len;
4328
4329 cur = CUR_SCHAR(ctxt->cur, len);
4330 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4331 (cur == '*') || (cur == '+') || (cur == '(') ||
4332 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4333 (cur == 0x5D) || (cur == 0))
4334 return(-1);
4335 return(cur);
4336}
4337
4338/**
4339 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00004340 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004341 *
4342 * [27] charProp ::= IsCategory | IsBlock
4343 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4344 * Separators | Symbols | Others
4345 * [29] Letters ::= 'L' [ultmo]?
4346 * [30] Marks ::= 'M' [nce]?
4347 * [31] Numbers ::= 'N' [dlo]?
4348 * [32] Punctuation ::= 'P' [cdseifo]?
4349 * [33] Separators ::= 'Z' [slp]?
4350 * [34] Symbols ::= 'S' [mcko]?
4351 * [35] Others ::= 'C' [cfon]?
4352 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4353 */
4354static void
4355xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4356 int cur;
William M. Brack779af002003-08-01 15:55:39 +00004357 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00004358 xmlChar *blockName = NULL;
4359
4360 cur = CUR;
4361 if (cur == 'L') {
4362 NEXT;
4363 cur = CUR;
4364 if (cur == 'u') {
4365 NEXT;
4366 type = XML_REGEXP_LETTER_UPPERCASE;
4367 } else if (cur == 'l') {
4368 NEXT;
4369 type = XML_REGEXP_LETTER_LOWERCASE;
4370 } else if (cur == 't') {
4371 NEXT;
4372 type = XML_REGEXP_LETTER_TITLECASE;
4373 } else if (cur == 'm') {
4374 NEXT;
4375 type = XML_REGEXP_LETTER_MODIFIER;
4376 } else if (cur == 'o') {
4377 NEXT;
4378 type = XML_REGEXP_LETTER_OTHERS;
4379 } else {
4380 type = XML_REGEXP_LETTER;
4381 }
4382 } else if (cur == 'M') {
4383 NEXT;
4384 cur = CUR;
4385 if (cur == 'n') {
4386 NEXT;
4387 /* nonspacing */
4388 type = XML_REGEXP_MARK_NONSPACING;
4389 } else if (cur == 'c') {
4390 NEXT;
4391 /* spacing combining */
4392 type = XML_REGEXP_MARK_SPACECOMBINING;
4393 } else if (cur == 'e') {
4394 NEXT;
4395 /* enclosing */
4396 type = XML_REGEXP_MARK_ENCLOSING;
4397 } else {
4398 /* all marks */
4399 type = XML_REGEXP_MARK;
4400 }
4401 } else if (cur == 'N') {
4402 NEXT;
4403 cur = CUR;
4404 if (cur == 'd') {
4405 NEXT;
4406 /* digital */
4407 type = XML_REGEXP_NUMBER_DECIMAL;
4408 } else if (cur == 'l') {
4409 NEXT;
4410 /* letter */
4411 type = XML_REGEXP_NUMBER_LETTER;
4412 } else if (cur == 'o') {
4413 NEXT;
4414 /* other */
4415 type = XML_REGEXP_NUMBER_OTHERS;
4416 } else {
4417 /* all numbers */
4418 type = XML_REGEXP_NUMBER;
4419 }
4420 } else if (cur == 'P') {
4421 NEXT;
4422 cur = CUR;
4423 if (cur == 'c') {
4424 NEXT;
4425 /* connector */
4426 type = XML_REGEXP_PUNCT_CONNECTOR;
4427 } else if (cur == 'd') {
4428 NEXT;
4429 /* dash */
4430 type = XML_REGEXP_PUNCT_DASH;
4431 } else if (cur == 's') {
4432 NEXT;
4433 /* open */
4434 type = XML_REGEXP_PUNCT_OPEN;
4435 } else if (cur == 'e') {
4436 NEXT;
4437 /* close */
4438 type = XML_REGEXP_PUNCT_CLOSE;
4439 } else if (cur == 'i') {
4440 NEXT;
4441 /* initial quote */
4442 type = XML_REGEXP_PUNCT_INITQUOTE;
4443 } else if (cur == 'f') {
4444 NEXT;
4445 /* final quote */
4446 type = XML_REGEXP_PUNCT_FINQUOTE;
4447 } else if (cur == 'o') {
4448 NEXT;
4449 /* other */
4450 type = XML_REGEXP_PUNCT_OTHERS;
4451 } else {
4452 /* all punctuation */
4453 type = XML_REGEXP_PUNCT;
4454 }
4455 } else if (cur == 'Z') {
4456 NEXT;
4457 cur = CUR;
4458 if (cur == 's') {
4459 NEXT;
4460 /* space */
4461 type = XML_REGEXP_SEPAR_SPACE;
4462 } else if (cur == 'l') {
4463 NEXT;
4464 /* line */
4465 type = XML_REGEXP_SEPAR_LINE;
4466 } else if (cur == 'p') {
4467 NEXT;
4468 /* paragraph */
4469 type = XML_REGEXP_SEPAR_PARA;
4470 } else {
4471 /* all separators */
4472 type = XML_REGEXP_SEPAR;
4473 }
4474 } else if (cur == 'S') {
4475 NEXT;
4476 cur = CUR;
4477 if (cur == 'm') {
4478 NEXT;
4479 type = XML_REGEXP_SYMBOL_MATH;
4480 /* math */
4481 } else if (cur == 'c') {
4482 NEXT;
4483 type = XML_REGEXP_SYMBOL_CURRENCY;
4484 /* currency */
4485 } else if (cur == 'k') {
4486 NEXT;
4487 type = XML_REGEXP_SYMBOL_MODIFIER;
4488 /* modifiers */
4489 } else if (cur == 'o') {
4490 NEXT;
4491 type = XML_REGEXP_SYMBOL_OTHERS;
4492 /* other */
4493 } else {
4494 /* all symbols */
4495 type = XML_REGEXP_SYMBOL;
4496 }
4497 } else if (cur == 'C') {
4498 NEXT;
4499 cur = CUR;
4500 if (cur == 'c') {
4501 NEXT;
4502 /* control */
4503 type = XML_REGEXP_OTHER_CONTROL;
4504 } else if (cur == 'f') {
4505 NEXT;
4506 /* format */
4507 type = XML_REGEXP_OTHER_FORMAT;
4508 } else if (cur == 'o') {
4509 NEXT;
4510 /* private use */
4511 type = XML_REGEXP_OTHER_PRIVATE;
4512 } else if (cur == 'n') {
4513 NEXT;
4514 /* not assigned */
4515 type = XML_REGEXP_OTHER_NA;
4516 } else {
4517 /* all others */
4518 type = XML_REGEXP_OTHER;
4519 }
4520 } else if (cur == 'I') {
4521 const xmlChar *start;
4522 NEXT;
4523 cur = CUR;
4524 if (cur != 's') {
4525 ERROR("IsXXXX expected");
4526 return;
4527 }
4528 NEXT;
4529 start = ctxt->cur;
4530 cur = CUR;
4531 if (((cur >= 'a') && (cur <= 'z')) ||
4532 ((cur >= 'A') && (cur <= 'Z')) ||
4533 ((cur >= '0') && (cur <= '9')) ||
4534 (cur == 0x2D)) {
4535 NEXT;
4536 cur = CUR;
4537 while (((cur >= 'a') && (cur <= 'z')) ||
4538 ((cur >= 'A') && (cur <= 'Z')) ||
4539 ((cur >= '0') && (cur <= '9')) ||
4540 (cur == 0x2D)) {
4541 NEXT;
4542 cur = CUR;
4543 }
4544 }
4545 type = XML_REGEXP_BLOCK_NAME;
4546 blockName = xmlStrndup(start, ctxt->cur - start);
4547 } else {
4548 ERROR("Unknown char property");
4549 return;
4550 }
4551 if (ctxt->atom == NULL) {
4552 ctxt->atom = xmlRegNewAtom(ctxt, type);
4553 if (ctxt->atom != NULL)
4554 ctxt->atom->valuep = blockName;
4555 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4556 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4557 type, 0, 0, blockName);
4558 }
4559}
4560
4561/**
4562 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00004563 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004564 *
4565 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4566 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4567 * [25] catEsc ::= '\p{' charProp '}'
4568 * [26] complEsc ::= '\P{' charProp '}'
4569 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4570 */
4571static void
4572xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4573 int cur;
4574
4575 if (CUR == '.') {
4576 if (ctxt->atom == NULL) {
4577 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4578 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4579 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4580 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4581 }
4582 NEXT;
4583 return;
4584 }
4585 if (CUR != '\\') {
4586 ERROR("Escaped sequence: expecting \\");
4587 return;
4588 }
4589 NEXT;
4590 cur = CUR;
4591 if (cur == 'p') {
4592 NEXT;
4593 if (CUR != '{') {
4594 ERROR("Expecting '{'");
4595 return;
4596 }
4597 NEXT;
4598 xmlFAParseCharProp(ctxt);
4599 if (CUR != '}') {
4600 ERROR("Expecting '}'");
4601 return;
4602 }
4603 NEXT;
4604 } else if (cur == 'P') {
4605 NEXT;
4606 if (CUR != '{') {
4607 ERROR("Expecting '{'");
4608 return;
4609 }
4610 NEXT;
4611 xmlFAParseCharProp(ctxt);
4612 ctxt->atom->neg = 1;
4613 if (CUR != '}') {
4614 ERROR("Expecting '}'");
4615 return;
4616 }
4617 NEXT;
4618 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4619 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4620 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4621 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4622 (cur == 0x5E)) {
4623 if (ctxt->atom == NULL) {
4624 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
Daniel Veillard99c394d2005-07-14 12:58:49 +00004625 if (ctxt->atom != NULL) {
4626 switch (cur) {
4627 case 'n':
4628 ctxt->atom->codepoint = '\n';
4629 break;
4630 case 'r':
4631 ctxt->atom->codepoint = '\r';
4632 break;
4633 case 't':
4634 ctxt->atom->codepoint = '\t';
4635 break;
4636 default:
4637 ctxt->atom->codepoint = cur;
4638 }
4639 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004640 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4641 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4642 XML_REGEXP_CHARVAL, cur, cur, NULL);
4643 }
4644 NEXT;
4645 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4646 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4647 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00004648 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00004649
4650 switch (cur) {
4651 case 's':
4652 type = XML_REGEXP_ANYSPACE;
4653 break;
4654 case 'S':
4655 type = XML_REGEXP_NOTSPACE;
4656 break;
4657 case 'i':
4658 type = XML_REGEXP_INITNAME;
4659 break;
4660 case 'I':
4661 type = XML_REGEXP_NOTINITNAME;
4662 break;
4663 case 'c':
4664 type = XML_REGEXP_NAMECHAR;
4665 break;
4666 case 'C':
4667 type = XML_REGEXP_NOTNAMECHAR;
4668 break;
4669 case 'd':
4670 type = XML_REGEXP_DECIMAL;
4671 break;
4672 case 'D':
4673 type = XML_REGEXP_NOTDECIMAL;
4674 break;
4675 case 'w':
4676 type = XML_REGEXP_REALCHAR;
4677 break;
4678 case 'W':
4679 type = XML_REGEXP_NOTREALCHAR;
4680 break;
4681 }
4682 NEXT;
4683 if (ctxt->atom == NULL) {
4684 ctxt->atom = xmlRegNewAtom(ctxt, type);
4685 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4686 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4687 type, 0, 0, NULL);
4688 }
4689 }
4690}
4691
4692/**
4693 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00004694 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004695 *
4696 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
4697 */
4698static int
4699xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
4700 int ret = 0, cur;
4701
4702 if ((CUR != '&') || (NXT(1) != '#'))
4703 return(-1);
4704 NEXT;
4705 NEXT;
4706 cur = CUR;
4707 if (cur == 'x') {
4708 NEXT;
4709 cur = CUR;
4710 if (((cur >= '0') && (cur <= '9')) ||
4711 ((cur >= 'a') && (cur <= 'f')) ||
4712 ((cur >= 'A') && (cur <= 'F'))) {
4713 while (((cur >= '0') && (cur <= '9')) ||
4714 ((cur >= 'A') && (cur <= 'F'))) {
4715 if ((cur >= '0') && (cur <= '9'))
4716 ret = ret * 16 + cur - '0';
4717 else if ((cur >= 'a') && (cur <= 'f'))
4718 ret = ret * 16 + 10 + (cur - 'a');
4719 else
4720 ret = ret * 16 + 10 + (cur - 'A');
4721 NEXT;
4722 cur = CUR;
4723 }
4724 } else {
4725 ERROR("Char ref: expecting [0-9A-F]");
4726 return(-1);
4727 }
4728 } else {
4729 if ((cur >= '0') && (cur <= '9')) {
4730 while ((cur >= '0') && (cur <= '9')) {
4731 ret = ret * 10 + cur - '0';
4732 NEXT;
4733 cur = CUR;
4734 }
4735 } else {
4736 ERROR("Char ref: expecting [0-9]");
4737 return(-1);
4738 }
4739 }
4740 if (cur != ';') {
4741 ERROR("Char ref: expecting ';'");
4742 return(-1);
4743 } else {
4744 NEXT;
4745 }
4746 return(ret);
4747}
4748
4749/**
4750 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00004751 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004752 *
4753 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
4754 * [18] seRange ::= charOrEsc '-' charOrEsc
4755 * [20] charOrEsc ::= XmlChar | SingleCharEsc
4756 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
4757 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
4758 */
4759static void
4760xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
William M. Brackdc99df92003-12-27 01:54:25 +00004761 int cur, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00004762 int start = -1;
4763 int end = -1;
4764
4765 if ((CUR == '&') && (NXT(1) == '#')) {
4766 end = start = xmlFAParseCharRef(ctxt);
4767 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4768 XML_REGEXP_CHARVAL, start, end, NULL);
4769 return;
4770 }
4771 cur = CUR;
4772 if (cur == '\\') {
4773 NEXT;
4774 cur = CUR;
4775 switch (cur) {
4776 case 'n': start = 0xA; break;
4777 case 'r': start = 0xD; break;
4778 case 't': start = 0x9; break;
4779 case '\\': case '|': case '.': case '-': case '^': case '?':
4780 case '*': case '+': case '{': case '}': case '(': case ')':
4781 case '[': case ']':
4782 start = cur; break;
4783 default:
4784 ERROR("Invalid escape value");
4785 return;
4786 }
4787 end = start;
William M. Brackdc99df92003-12-27 01:54:25 +00004788 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004789 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00004790 end = start = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004791 } else {
4792 ERROR("Expecting a char range");
4793 return;
4794 }
William M. Brackdc99df92003-12-27 01:54:25 +00004795 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004796 if (start == '-') {
4797 return;
4798 }
4799 cur = CUR;
William M. Brack10f1ef42004-03-20 14:51:25 +00004800 if ((cur != '-') || (NXT(1) == ']')) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004801 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4802 XML_REGEXP_CHARVAL, start, end, NULL);
4803 return;
4804 }
4805 NEXT;
4806 cur = CUR;
4807 if (cur == '\\') {
4808 NEXT;
4809 cur = CUR;
4810 switch (cur) {
4811 case 'n': end = 0xA; break;
4812 case 'r': end = 0xD; break;
4813 case 't': end = 0x9; break;
4814 case '\\': case '|': case '.': case '-': case '^': case '?':
4815 case '*': case '+': case '{': case '}': case '(': case ')':
4816 case '[': case ']':
4817 end = cur; break;
4818 default:
4819 ERROR("Invalid escape value");
4820 return;
4821 }
William M. Brackdc99df92003-12-27 01:54:25 +00004822 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004823 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00004824 end = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004825 } else {
4826 ERROR("Expecting the end of a char range");
4827 return;
4828 }
William M. Brackdc99df92003-12-27 01:54:25 +00004829 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004830 /* TODO check that the values are acceptable character ranges for XML */
4831 if (end < start) {
4832 ERROR("End of range is before start of range");
4833 } else {
4834 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4835 XML_REGEXP_CHARVAL, start, end, NULL);
4836 }
4837 return;
4838}
4839
4840/**
4841 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00004842 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004843 *
4844 * [14] posCharGroup ::= ( charRange | charClassEsc )+
4845 */
4846static void
4847xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
4848 do {
4849 if ((CUR == '\\') || (CUR == '.')) {
4850 xmlFAParseCharClassEsc(ctxt);
4851 } else {
4852 xmlFAParseCharRange(ctxt);
4853 }
4854 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
4855 (ctxt->error == 0));
4856}
4857
4858/**
4859 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00004860 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004861 *
4862 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
4863 * [15] negCharGroup ::= '^' posCharGroup
4864 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
4865 * [12] charClassExpr ::= '[' charGroup ']'
4866 */
4867static void
4868xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
4869 int n = ctxt->neg;
4870 while ((CUR != ']') && (ctxt->error == 0)) {
4871 if (CUR == '^') {
4872 int neg = ctxt->neg;
4873
4874 NEXT;
4875 ctxt->neg = !ctxt->neg;
4876 xmlFAParsePosCharGroup(ctxt);
4877 ctxt->neg = neg;
William M. Brack10f1ef42004-03-20 14:51:25 +00004878 } else if ((CUR == '-') && (NXT(1) == '[')) {
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004879 int neg = ctxt->neg;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004880 ctxt->neg = 2;
William M. Brack10f1ef42004-03-20 14:51:25 +00004881 NEXT; /* eat the '-' */
4882 NEXT; /* eat the '[' */
Daniel Veillard4255d502002-04-16 15:50:10 +00004883 xmlFAParseCharGroup(ctxt);
4884 if (CUR == ']') {
4885 NEXT;
4886 } else {
4887 ERROR("charClassExpr: ']' expected");
4888 break;
4889 }
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004890 ctxt->neg = neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00004891 break;
4892 } else if (CUR != ']') {
4893 xmlFAParsePosCharGroup(ctxt);
4894 }
4895 }
4896 ctxt->neg = n;
4897}
4898
4899/**
4900 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00004901 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004902 *
4903 * [11] charClass ::= charClassEsc | charClassExpr
4904 * [12] charClassExpr ::= '[' charGroup ']'
4905 */
4906static void
4907xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
4908 if (CUR == '[') {
4909 NEXT;
4910 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
4911 if (ctxt->atom == NULL)
4912 return;
4913 xmlFAParseCharGroup(ctxt);
4914 if (CUR == ']') {
4915 NEXT;
4916 } else {
4917 ERROR("xmlFAParseCharClass: ']' expected");
4918 }
4919 } else {
4920 xmlFAParseCharClassEsc(ctxt);
4921 }
4922}
4923
4924/**
4925 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00004926 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004927 *
4928 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004929 *
4930 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00004931 */
4932static int
4933xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
4934 int ret = 0;
4935 int ok = 0;
4936
4937 while ((CUR >= '0') && (CUR <= '9')) {
4938 ret = ret * 10 + (CUR - '0');
4939 ok = 1;
4940 NEXT;
4941 }
4942 if (ok != 1) {
4943 return(-1);
4944 }
4945 return(ret);
4946}
4947
4948/**
4949 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00004950 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004951 *
4952 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
4953 * [5] quantity ::= quantRange | quantMin | QuantExact
4954 * [6] quantRange ::= QuantExact ',' QuantExact
4955 * [7] quantMin ::= QuantExact ','
4956 * [8] QuantExact ::= [0-9]+
4957 */
4958static int
4959xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
4960 int cur;
4961
4962 cur = CUR;
4963 if ((cur == '?') || (cur == '*') || (cur == '+')) {
4964 if (ctxt->atom != NULL) {
4965 if (cur == '?')
4966 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
4967 else if (cur == '*')
4968 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
4969 else if (cur == '+')
4970 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
4971 }
4972 NEXT;
4973 return(1);
4974 }
4975 if (cur == '{') {
4976 int min = 0, max = 0;
4977
4978 NEXT;
4979 cur = xmlFAParseQuantExact(ctxt);
4980 if (cur >= 0)
4981 min = cur;
4982 if (CUR == ',') {
4983 NEXT;
Daniel Veillardebe48c62003-12-03 12:12:27 +00004984 if (CUR == '}')
4985 max = INT_MAX;
4986 else {
4987 cur = xmlFAParseQuantExact(ctxt);
4988 if (cur >= 0)
4989 max = cur;
4990 else {
4991 ERROR("Improper quantifier");
4992 }
4993 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004994 }
4995 if (CUR == '}') {
4996 NEXT;
4997 } else {
4998 ERROR("Unterminated quantifier");
4999 }
5000 if (max == 0)
5001 max = min;
5002 if (ctxt->atom != NULL) {
5003 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5004 ctxt->atom->min = min;
5005 ctxt->atom->max = max;
5006 }
5007 return(1);
5008 }
5009 return(0);
5010}
5011
5012/**
5013 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00005014 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005015 *
5016 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5017 */
5018static int
5019xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5020 int codepoint, len;
5021
5022 codepoint = xmlFAIsChar(ctxt);
5023 if (codepoint > 0) {
5024 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5025 if (ctxt->atom == NULL)
5026 return(-1);
5027 codepoint = CUR_SCHAR(ctxt->cur, len);
5028 ctxt->atom->codepoint = codepoint;
5029 NEXTL(len);
5030 return(1);
5031 } else if (CUR == '|') {
5032 return(0);
5033 } else if (CUR == 0) {
5034 return(0);
5035 } else if (CUR == ')') {
5036 return(0);
5037 } else if (CUR == '(') {
5038 xmlRegStatePtr start, oldend;
5039
5040 NEXT;
5041 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5042 start = ctxt->state;
5043 oldend = ctxt->end;
5044 ctxt->end = NULL;
5045 ctxt->atom = NULL;
5046 xmlFAParseRegExp(ctxt, 0);
5047 if (CUR == ')') {
5048 NEXT;
5049 } else {
5050 ERROR("xmlFAParseAtom: expecting ')'");
5051 }
5052 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5053 if (ctxt->atom == NULL)
5054 return(-1);
5055 ctxt->atom->start = start;
5056 ctxt->atom->stop = ctxt->state;
5057 ctxt->end = oldend;
5058 return(1);
5059 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5060 xmlFAParseCharClass(ctxt);
5061 return(1);
5062 }
5063 return(0);
5064}
5065
5066/**
5067 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00005068 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005069 *
5070 * [3] piece ::= atom quantifier?
5071 */
5072static int
5073xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5074 int ret;
5075
5076 ctxt->atom = NULL;
5077 ret = xmlFAParseAtom(ctxt);
5078 if (ret == 0)
5079 return(0);
5080 if (ctxt->atom == NULL) {
5081 ERROR("internal: no atom generated");
5082 }
5083 xmlFAParseQuantifier(ctxt);
5084 return(1);
5085}
5086
5087/**
5088 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00005089 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005090 *
5091 * [2] branch ::= piece*
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005092 8
Daniel Veillard4255d502002-04-16 15:50:10 +00005093 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005094static int
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005095xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005096 xmlRegStatePtr previous;
Daniel Veillard4255d502002-04-16 15:50:10 +00005097 int ret;
5098
5099 previous = ctxt->state;
5100 ret = xmlFAParsePiece(ctxt);
5101 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005102 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0)
5103 return(-1);
5104 previous = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00005105 ctxt->atom = NULL;
5106 }
5107 while ((ret != 0) && (ctxt->error == 0)) {
5108 ret = xmlFAParsePiece(ctxt);
5109 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005110 if (xmlFAGenerateTransitions(ctxt, previous, NULL,
5111 ctxt->atom) < 0)
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005112 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00005113 previous = ctxt->state;
5114 ctxt->atom = NULL;
5115 }
5116 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005117 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00005118}
5119
5120/**
5121 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00005122 * @ctxt: a regexp parser context
William M. Brackddf71d62004-05-06 04:17:26 +00005123 * @top: is this the top-level expression ?
Daniel Veillard4255d502002-04-16 15:50:10 +00005124 *
5125 * [1] regExp ::= branch ( '|' branch )*
5126 */
5127static void
5128xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
Daniel Veillardc7e3cc42004-09-28 12:33:52 +00005129 xmlRegStatePtr start, end;
Daniel Veillard4255d502002-04-16 15:50:10 +00005130
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005131 /* if not top start should have been generated by an epsilon trans */
Daniel Veillard4255d502002-04-16 15:50:10 +00005132 start = ctxt->state;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005133 ctxt->end = NULL;
5134 xmlFAParseBranch(ctxt);
5135 if (top) {
5136#ifdef DEBUG_REGEXP_GRAPH
5137 printf("State %d is final\n", ctxt->state->no);
5138#endif
5139 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5140 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005141 if (CUR != '|') {
5142 ctxt->end = ctxt->state;
5143 return;
5144 }
5145 end = ctxt->state;
5146 while ((CUR == '|') && (ctxt->error == 0)) {
5147 NEXT;
5148 ctxt->state = start;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005149 ctxt->end = NULL;
5150 xmlFAParseBranch(ctxt);
5151 if (top) {
5152 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5153#ifdef DEBUG_REGEXP_GRAPH
5154 printf("State %d is final\n", ctxt->state->no);
5155#endif
5156 } else {
5157 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end);
5158 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005159 }
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005160 if (!top) {
5161 ctxt->state = end;
5162 ctxt->end = end;
5163 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005164}
5165
5166/************************************************************************
5167 * *
5168 * The basic API *
5169 * *
5170 ************************************************************************/
5171
5172/**
5173 * xmlRegexpPrint:
5174 * @output: the file for the output debug
5175 * @regexp: the compiled regexp
5176 *
5177 * Print the content of the compiled regular expression
5178 */
5179void
5180xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5181 int i;
5182
Daniel Veillarda82b1822004-11-08 16:24:57 +00005183 if (output == NULL)
5184 return;
Daniel Veillard4255d502002-04-16 15:50:10 +00005185 fprintf(output, " regexp: ");
5186 if (regexp == NULL) {
5187 fprintf(output, "NULL\n");
5188 return;
5189 }
5190 fprintf(output, "'%s' ", regexp->string);
5191 fprintf(output, "\n");
5192 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5193 for (i = 0;i < regexp->nbAtoms; i++) {
5194 fprintf(output, " %02d ", i);
5195 xmlRegPrintAtom(output, regexp->atoms[i]);
5196 }
5197 fprintf(output, "%d states:", regexp->nbStates);
5198 fprintf(output, "\n");
5199 for (i = 0;i < regexp->nbStates; i++) {
5200 xmlRegPrintState(output, regexp->states[i]);
5201 }
5202 fprintf(output, "%d counters:\n", regexp->nbCounters);
5203 for (i = 0;i < regexp->nbCounters; i++) {
5204 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5205 regexp->counters[i].max);
5206 }
5207}
5208
5209/**
5210 * xmlRegexpCompile:
5211 * @regexp: a regular expression string
5212 *
5213 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
William M. Brackddf71d62004-05-06 04:17:26 +00005214 * Appendix F and builds an automata suitable for testing strings against
Daniel Veillard4255d502002-04-16 15:50:10 +00005215 * that regular expression
5216 *
5217 * Returns the compiled expression or NULL in case of error
5218 */
5219xmlRegexpPtr
5220xmlRegexpCompile(const xmlChar *regexp) {
5221 xmlRegexpPtr ret;
5222 xmlRegParserCtxtPtr ctxt;
5223
5224 ctxt = xmlRegNewParserCtxt(regexp);
5225 if (ctxt == NULL)
5226 return(NULL);
5227
5228 /* initialize the parser */
5229 ctxt->end = NULL;
5230 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5231 xmlRegStatePush(ctxt, ctxt->start);
5232
5233 /* parse the expression building an automata */
5234 xmlFAParseRegExp(ctxt, 1);
5235 if (CUR != 0) {
5236 ERROR("xmlFAParseRegExp: extra characters");
5237 }
5238 ctxt->end = ctxt->state;
5239 ctxt->start->type = XML_REGEXP_START_STATE;
5240 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5241
5242 /* remove the Epsilon except for counted transitions */
5243 xmlFAEliminateEpsilonTransitions(ctxt);
5244
5245
5246 if (ctxt->error != 0) {
5247 xmlRegFreeParserCtxt(ctxt);
5248 return(NULL);
5249 }
5250 ret = xmlRegEpxFromParse(ctxt);
5251 xmlRegFreeParserCtxt(ctxt);
5252 return(ret);
5253}
5254
5255/**
5256 * xmlRegexpExec:
5257 * @comp: the compiled regular expression
5258 * @content: the value to check against the regular expression
5259 *
William M. Brackddf71d62004-05-06 04:17:26 +00005260 * Check if the regular expression generates the value
Daniel Veillard4255d502002-04-16 15:50:10 +00005261 *
William M. Brackddf71d62004-05-06 04:17:26 +00005262 * Returns 1 if it matches, 0 if not and a negative value in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00005263 */
5264int
5265xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5266 if ((comp == NULL) || (content == NULL))
5267 return(-1);
5268 return(xmlFARegExec(comp, content));
5269}
5270
5271/**
Daniel Veillard23e73572002-09-19 19:56:43 +00005272 * xmlRegexpIsDeterminist:
5273 * @comp: the compiled regular expression
5274 *
5275 * Check if the regular expression is determinist
5276 *
William M. Brackddf71d62004-05-06 04:17:26 +00005277 * Returns 1 if it yes, 0 if not and a negative value in case of error
Daniel Veillard23e73572002-09-19 19:56:43 +00005278 */
5279int
5280xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5281 xmlAutomataPtr am;
5282 int ret;
5283
5284 if (comp == NULL)
5285 return(-1);
5286 if (comp->determinist != -1)
5287 return(comp->determinist);
5288
5289 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00005290 if (am->states != NULL) {
5291 int i;
5292
5293 for (i = 0;i < am->nbStates;i++)
5294 xmlRegFreeState(am->states[i]);
5295 xmlFree(am->states);
5296 }
Daniel Veillard23e73572002-09-19 19:56:43 +00005297 am->nbAtoms = comp->nbAtoms;
5298 am->atoms = comp->atoms;
5299 am->nbStates = comp->nbStates;
5300 am->states = comp->states;
5301 am->determinist = -1;
5302 ret = xmlFAComputesDeterminism(am);
5303 am->atoms = NULL;
5304 am->states = NULL;
5305 xmlFreeAutomata(am);
5306 return(ret);
5307}
5308
5309/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005310 * xmlRegFreeRegexp:
5311 * @regexp: the regexp
5312 *
5313 * Free a regexp
5314 */
5315void
5316xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5317 int i;
5318 if (regexp == NULL)
5319 return;
5320
5321 if (regexp->string != NULL)
5322 xmlFree(regexp->string);
5323 if (regexp->states != NULL) {
5324 for (i = 0;i < regexp->nbStates;i++)
5325 xmlRegFreeState(regexp->states[i]);
5326 xmlFree(regexp->states);
5327 }
5328 if (regexp->atoms != NULL) {
5329 for (i = 0;i < regexp->nbAtoms;i++)
5330 xmlRegFreeAtom(regexp->atoms[i]);
5331 xmlFree(regexp->atoms);
5332 }
5333 if (regexp->counters != NULL)
5334 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00005335 if (regexp->compact != NULL)
5336 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00005337 if (regexp->transdata != NULL)
5338 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00005339 if (regexp->stringMap != NULL) {
5340 for (i = 0; i < regexp->nbstrings;i++)
5341 xmlFree(regexp->stringMap[i]);
5342 xmlFree(regexp->stringMap);
5343 }
5344
Daniel Veillard4255d502002-04-16 15:50:10 +00005345 xmlFree(regexp);
5346}
5347
5348#ifdef LIBXML_AUTOMATA_ENABLED
5349/************************************************************************
5350 * *
5351 * The Automata interface *
5352 * *
5353 ************************************************************************/
5354
5355/**
5356 * xmlNewAutomata:
5357 *
5358 * Create a new automata
5359 *
5360 * Returns the new object or NULL in case of failure
5361 */
5362xmlAutomataPtr
5363xmlNewAutomata(void) {
5364 xmlAutomataPtr ctxt;
5365
5366 ctxt = xmlRegNewParserCtxt(NULL);
5367 if (ctxt == NULL)
5368 return(NULL);
5369
5370 /* initialize the parser */
5371 ctxt->end = NULL;
5372 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005373 if (ctxt->start == NULL) {
5374 xmlFreeAutomata(ctxt);
5375 return(NULL);
5376 }
Daniel Veillardd0271472006-01-02 10:22:02 +00005377 ctxt->start->type = XML_REGEXP_START_STATE;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005378 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5379 xmlRegFreeState(ctxt->start);
5380 xmlFreeAutomata(ctxt);
5381 return(NULL);
5382 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005383
5384 return(ctxt);
5385}
5386
5387/**
5388 * xmlFreeAutomata:
5389 * @am: an automata
5390 *
5391 * Free an automata
5392 */
5393void
5394xmlFreeAutomata(xmlAutomataPtr am) {
5395 if (am == NULL)
5396 return;
5397 xmlRegFreeParserCtxt(am);
5398}
5399
5400/**
5401 * xmlAutomataGetInitState:
5402 * @am: an automata
5403 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005404 * Initial state lookup
5405 *
Daniel Veillard4255d502002-04-16 15:50:10 +00005406 * Returns the initial state of the automata
5407 */
5408xmlAutomataStatePtr
5409xmlAutomataGetInitState(xmlAutomataPtr am) {
5410 if (am == NULL)
5411 return(NULL);
5412 return(am->start);
5413}
5414
5415/**
5416 * xmlAutomataSetFinalState:
5417 * @am: an automata
5418 * @state: a state in this automata
5419 *
5420 * Makes that state a final state
5421 *
5422 * Returns 0 or -1 in case of error
5423 */
5424int
5425xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5426 if ((am == NULL) || (state == NULL))
5427 return(-1);
5428 state->type = XML_REGEXP_FINAL_STATE;
5429 return(0);
5430}
5431
5432/**
5433 * xmlAutomataNewTransition:
5434 * @am: an automata
5435 * @from: the starting point of the transition
5436 * @to: the target point of the transition or NULL
5437 * @token: the input string associated to that transition
5438 * @data: data passed to the callback function if the transition is activated
5439 *
William M. Brackddf71d62004-05-06 04:17:26 +00005440 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00005441 * and then adds a transition from the @from state to the target state
5442 * activated by the value of @token
5443 *
5444 * Returns the target state or NULL in case of error
5445 */
5446xmlAutomataStatePtr
5447xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5448 xmlAutomataStatePtr to, const xmlChar *token,
5449 void *data) {
5450 xmlRegAtomPtr atom;
5451
5452 if ((am == NULL) || (from == NULL) || (token == NULL))
5453 return(NULL);
5454 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005455 if (atom == NULL)
5456 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00005457 atom->data = data;
5458 if (atom == NULL)
5459 return(NULL);
5460 atom->valuep = xmlStrdup(token);
5461
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005462 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5463 xmlRegFreeAtom(atom);
5464 return(NULL);
5465 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005466 if (to == NULL)
5467 return(am->state);
5468 return(to);
5469}
5470
5471/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00005472 * xmlAutomataNewTransition2:
5473 * @am: an automata
5474 * @from: the starting point of the transition
5475 * @to: the target point of the transition or NULL
5476 * @token: the first input string associated to that transition
5477 * @token2: the second input string associated to that transition
5478 * @data: data passed to the callback function if the transition is activated
5479 *
William M. Brackddf71d62004-05-06 04:17:26 +00005480 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard52b48c72003-04-13 19:53:42 +00005481 * and then adds a transition from the @from state to the target state
5482 * activated by the value of @token
5483 *
5484 * Returns the target state or NULL in case of error
5485 */
5486xmlAutomataStatePtr
5487xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5488 xmlAutomataStatePtr to, const xmlChar *token,
5489 const xmlChar *token2, void *data) {
5490 xmlRegAtomPtr atom;
5491
5492 if ((am == NULL) || (from == NULL) || (token == NULL))
5493 return(NULL);
5494 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5495 atom->data = data;
5496 if (atom == NULL)
5497 return(NULL);
5498 if ((token2 == NULL) || (*token2 == 0)) {
5499 atom->valuep = xmlStrdup(token);
5500 } else {
5501 int lenn, lenp;
5502 xmlChar *str;
5503
5504 lenn = strlen((char *) token2);
5505 lenp = strlen((char *) token);
5506
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005507 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005508 if (str == NULL) {
5509 xmlRegFreeAtom(atom);
5510 return(NULL);
5511 }
5512 memcpy(&str[0], token, lenp);
5513 str[lenp] = '|';
5514 memcpy(&str[lenp + 1], token2, lenn);
5515 str[lenn + lenp + 1] = 0;
5516
5517 atom->valuep = str;
5518 }
5519
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005520 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5521 xmlRegFreeAtom(atom);
5522 return(NULL);
5523 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00005524 if (to == NULL)
5525 return(am->state);
5526 return(to);
5527}
5528
5529/**
Daniel Veillard9efc4762005-07-19 14:33:55 +00005530 * xmlAutomataNewNegTrans:
5531 * @am: an automata
5532 * @from: the starting point of the transition
5533 * @to: the target point of the transition or NULL
5534 * @token: the first input string associated to that transition
5535 * @token2: the second input string associated to that transition
5536 * @data: data passed to the callback function if the transition is activated
5537 *
5538 * If @to is NULL, this creates first a new target state in the automata
5539 * and then adds a transition from the @from state to the target state
5540 * activated by any value except (@token,@token2)
Daniel Veillard6e65e152005-08-09 11:09:52 +00005541 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5542 # the semantic of XSD ##other
Daniel Veillard9efc4762005-07-19 14:33:55 +00005543 *
5544 * Returns the target state or NULL in case of error
5545 */
5546xmlAutomataStatePtr
5547xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5548 xmlAutomataStatePtr to, const xmlChar *token,
5549 const xmlChar *token2, void *data) {
5550 xmlRegAtomPtr atom;
Daniel Veillard77005e62005-07-19 16:26:18 +00005551 xmlChar err_msg[200];
Daniel Veillard9efc4762005-07-19 14:33:55 +00005552
5553 if ((am == NULL) || (from == NULL) || (token == NULL))
5554 return(NULL);
5555 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5556 if (atom == NULL)
5557 return(NULL);
5558 atom->data = data;
5559 atom->neg = 1;
5560 if ((token2 == NULL) || (*token2 == 0)) {
5561 atom->valuep = xmlStrdup(token);
5562 } else {
5563 int lenn, lenp;
5564 xmlChar *str;
5565
5566 lenn = strlen((char *) token2);
5567 lenp = strlen((char *) token);
5568
5569 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5570 if (str == NULL) {
5571 xmlRegFreeAtom(atom);
5572 return(NULL);
5573 }
5574 memcpy(&str[0], token, lenp);
5575 str[lenp] = '|';
5576 memcpy(&str[lenp + 1], token2, lenn);
5577 str[lenn + lenp + 1] = 0;
5578
5579 atom->valuep = str;
5580 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00005581 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +00005582 err_msg[199] = 0;
5583 atom->valuep2 = xmlStrdup(err_msg);
Daniel Veillard9efc4762005-07-19 14:33:55 +00005584
5585 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5586 xmlRegFreeAtom(atom);
5587 return(NULL);
5588 }
Daniel Veillard6e65e152005-08-09 11:09:52 +00005589 am->negs++;
Daniel Veillard9efc4762005-07-19 14:33:55 +00005590 if (to == NULL)
5591 return(am->state);
5592 return(to);
5593}
5594
5595/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005596 * xmlAutomataNewCountTrans2:
5597 * @am: an automata
5598 * @from: the starting point of the transition
5599 * @to: the target point of the transition or NULL
5600 * @token: the input string associated to that transition
5601 * @token2: the second input string associated to that transition
5602 * @min: the minimum successive occurences of token
5603 * @max: the maximum successive occurences of token
5604 * @data: data associated to the transition
5605 *
5606 * If @to is NULL, this creates first a new target state in the automata
5607 * and then adds a transition from the @from state to the target state
5608 * activated by a succession of input of value @token and @token2 and
5609 * whose number is between @min and @max
5610 *
5611 * Returns the target state or NULL in case of error
5612 */
5613xmlAutomataStatePtr
5614xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5615 xmlAutomataStatePtr to, const xmlChar *token,
5616 const xmlChar *token2,
5617 int min, int max, void *data) {
5618 xmlRegAtomPtr atom;
5619 int counter;
5620
5621 if ((am == NULL) || (from == NULL) || (token == NULL))
5622 return(NULL);
5623 if (min < 0)
5624 return(NULL);
5625 if ((max < min) || (max < 1))
5626 return(NULL);
5627 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5628 if (atom == NULL)
5629 return(NULL);
5630 if ((token2 == NULL) || (*token2 == 0)) {
5631 atom->valuep = xmlStrdup(token);
5632 } else {
5633 int lenn, lenp;
5634 xmlChar *str;
5635
5636 lenn = strlen((char *) token2);
5637 lenp = strlen((char *) token);
5638
5639 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5640 if (str == NULL) {
5641 xmlRegFreeAtom(atom);
5642 return(NULL);
5643 }
5644 memcpy(&str[0], token, lenp);
5645 str[lenp] = '|';
5646 memcpy(&str[lenp + 1], token2, lenn);
5647 str[lenn + lenp + 1] = 0;
5648
5649 atom->valuep = str;
5650 }
5651 atom->data = data;
5652 if (min == 0)
5653 atom->min = 1;
5654 else
5655 atom->min = min;
5656 atom->max = max;
5657
5658 /*
5659 * associate a counter to the transition.
5660 */
5661 counter = xmlRegGetCounter(am);
5662 am->counters[counter].min = min;
5663 am->counters[counter].max = max;
5664
5665 /* xmlFAGenerateTransitions(am, from, to, atom); */
5666 if (to == NULL) {
5667 to = xmlRegNewState(am);
5668 xmlRegStatePush(am, to);
5669 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005670 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005671 xmlRegAtomPush(am, atom);
5672 am->state = to;
5673
5674 if (to == NULL)
5675 to = am->state;
5676 if (to == NULL)
5677 return(NULL);
5678 if (min == 0)
5679 xmlFAGenerateEpsilonTransition(am, from, to);
5680 return(to);
5681}
5682
5683/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005684 * xmlAutomataNewCountTrans:
5685 * @am: an automata
5686 * @from: the starting point of the transition
5687 * @to: the target point of the transition or NULL
5688 * @token: the input string associated to that transition
5689 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005690 * @max: the maximum successive occurences of token
5691 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00005692 *
William M. Brackddf71d62004-05-06 04:17:26 +00005693 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00005694 * and then adds a transition from the @from state to the target state
5695 * activated by a succession of input of value @token and whose number
5696 * is between @min and @max
5697 *
5698 * Returns the target state or NULL in case of error
5699 */
5700xmlAutomataStatePtr
5701xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5702 xmlAutomataStatePtr to, const xmlChar *token,
5703 int min, int max, void *data) {
5704 xmlRegAtomPtr atom;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00005705 int counter;
Daniel Veillard4255d502002-04-16 15:50:10 +00005706
5707 if ((am == NULL) || (from == NULL) || (token == NULL))
5708 return(NULL);
5709 if (min < 0)
5710 return(NULL);
5711 if ((max < min) || (max < 1))
5712 return(NULL);
5713 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5714 if (atom == NULL)
5715 return(NULL);
5716 atom->valuep = xmlStrdup(token);
5717 atom->data = data;
5718 if (min == 0)
5719 atom->min = 1;
5720 else
5721 atom->min = min;
5722 atom->max = max;
5723
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00005724 /*
5725 * associate a counter to the transition.
5726 */
5727 counter = xmlRegGetCounter(am);
5728 am->counters[counter].min = min;
5729 am->counters[counter].max = max;
5730
5731 /* xmlFAGenerateTransitions(am, from, to, atom); */
5732 if (to == NULL) {
5733 to = xmlRegNewState(am);
5734 xmlRegStatePush(am, to);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005735 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005736 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00005737 xmlRegAtomPush(am, atom);
5738 am->state = to;
5739
Daniel Veillard4255d502002-04-16 15:50:10 +00005740 if (to == NULL)
5741 to = am->state;
5742 if (to == NULL)
5743 return(NULL);
5744 if (min == 0)
5745 xmlFAGenerateEpsilonTransition(am, from, to);
5746 return(to);
5747}
5748
5749/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005750 * xmlAutomataNewOnceTrans2:
5751 * @am: an automata
5752 * @from: the starting point of the transition
5753 * @to: the target point of the transition or NULL
5754 * @token: the input string associated to that transition
5755 * @token2: the second input string associated to that transition
5756 * @min: the minimum successive occurences of token
5757 * @max: the maximum successive occurences of token
5758 * @data: data associated to the transition
5759 *
5760 * If @to is NULL, this creates first a new target state in the automata
5761 * and then adds a transition from the @from state to the target state
5762 * activated by a succession of input of value @token and @token2 and whose
5763 * number is between @min and @max, moreover that transition can only be
5764 * crossed once.
5765 *
5766 * Returns the target state or NULL in case of error
5767 */
5768xmlAutomataStatePtr
5769xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5770 xmlAutomataStatePtr to, const xmlChar *token,
5771 const xmlChar *token2,
5772 int min, int max, void *data) {
5773 xmlRegAtomPtr atom;
5774 int counter;
5775
5776 if ((am == NULL) || (from == NULL) || (token == NULL))
5777 return(NULL);
5778 if (min < 1)
5779 return(NULL);
5780 if ((max < min) || (max < 1))
5781 return(NULL);
5782 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5783 if (atom == NULL)
5784 return(NULL);
5785 if ((token2 == NULL) || (*token2 == 0)) {
5786 atom->valuep = xmlStrdup(token);
5787 } else {
5788 int lenn, lenp;
5789 xmlChar *str;
5790
5791 lenn = strlen((char *) token2);
5792 lenp = strlen((char *) token);
5793
5794 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5795 if (str == NULL) {
5796 xmlRegFreeAtom(atom);
5797 return(NULL);
5798 }
5799 memcpy(&str[0], token, lenp);
5800 str[lenp] = '|';
5801 memcpy(&str[lenp + 1], token2, lenn);
5802 str[lenn + lenp + 1] = 0;
5803
5804 atom->valuep = str;
5805 }
5806 atom->data = data;
5807 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
5808 if (min == 0)
5809 atom->min = 1;
5810 else
5811 atom->min = min;
5812 atom->max = max;
5813 /*
5814 * associate a counter to the transition.
5815 */
5816 counter = xmlRegGetCounter(am);
5817 am->counters[counter].min = 1;
5818 am->counters[counter].max = 1;
5819
5820 /* xmlFAGenerateTransitions(am, from, to, atom); */
5821 if (to == NULL) {
5822 to = xmlRegNewState(am);
5823 xmlRegStatePush(am, to);
5824 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005825 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005826 xmlRegAtomPush(am, atom);
5827 am->state = to;
5828 return(to);
5829}
5830
5831
5832
5833/**
Daniel Veillard7646b182002-04-20 06:41:40 +00005834 * xmlAutomataNewOnceTrans:
5835 * @am: an automata
5836 * @from: the starting point of the transition
5837 * @to: the target point of the transition or NULL
5838 * @token: the input string associated to that transition
5839 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005840 * @max: the maximum successive occurences of token
5841 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00005842 *
William M. Brackddf71d62004-05-06 04:17:26 +00005843 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00005844 * and then adds a transition from the @from state to the target state
5845 * activated by a succession of input of value @token and whose number
William M. Brackddf71d62004-05-06 04:17:26 +00005846 * is between @min and @max, moreover that transition can only be crossed
Daniel Veillard7646b182002-04-20 06:41:40 +00005847 * once.
5848 *
5849 * Returns the target state or NULL in case of error
5850 */
5851xmlAutomataStatePtr
5852xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5853 xmlAutomataStatePtr to, const xmlChar *token,
5854 int min, int max, void *data) {
5855 xmlRegAtomPtr atom;
5856 int counter;
5857
5858 if ((am == NULL) || (from == NULL) || (token == NULL))
5859 return(NULL);
5860 if (min < 1)
5861 return(NULL);
5862 if ((max < min) || (max < 1))
5863 return(NULL);
5864 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5865 if (atom == NULL)
5866 return(NULL);
5867 atom->valuep = xmlStrdup(token);
5868 atom->data = data;
5869 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
5870 if (min == 0)
5871 atom->min = 1;
5872 else
5873 atom->min = min;
5874 atom->max = max;
5875 /*
5876 * associate a counter to the transition.
5877 */
5878 counter = xmlRegGetCounter(am);
5879 am->counters[counter].min = 1;
5880 am->counters[counter].max = 1;
5881
5882 /* xmlFAGenerateTransitions(am, from, to, atom); */
5883 if (to == NULL) {
5884 to = xmlRegNewState(am);
5885 xmlRegStatePush(am, to);
5886 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005887 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard7646b182002-04-20 06:41:40 +00005888 xmlRegAtomPush(am, atom);
5889 am->state = to;
Daniel Veillard7646b182002-04-20 06:41:40 +00005890 return(to);
5891}
5892
5893/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005894 * xmlAutomataNewState:
5895 * @am: an automata
5896 *
5897 * Create a new disconnected state in the automata
5898 *
5899 * Returns the new state or NULL in case of error
5900 */
5901xmlAutomataStatePtr
5902xmlAutomataNewState(xmlAutomataPtr am) {
5903 xmlAutomataStatePtr to;
5904
5905 if (am == NULL)
5906 return(NULL);
5907 to = xmlRegNewState(am);
5908 xmlRegStatePush(am, to);
5909 return(to);
5910}
5911
5912/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005913 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00005914 * @am: an automata
5915 * @from: the starting point of the transition
5916 * @to: the target point of the transition or NULL
5917 *
William M. Brackddf71d62004-05-06 04:17:26 +00005918 * If @to is NULL, this creates first a new target state in the automata
5919 * and then adds an epsilon transition from the @from state to the
Daniel Veillard4255d502002-04-16 15:50:10 +00005920 * target state
5921 *
5922 * Returns the target state or NULL in case of error
5923 */
5924xmlAutomataStatePtr
5925xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
5926 xmlAutomataStatePtr to) {
5927 if ((am == NULL) || (from == NULL))
5928 return(NULL);
5929 xmlFAGenerateEpsilonTransition(am, from, to);
5930 if (to == NULL)
5931 return(am->state);
5932 return(to);
5933}
5934
Daniel Veillardb509f152002-04-17 16:28:10 +00005935/**
Daniel Veillard7646b182002-04-20 06:41:40 +00005936 * xmlAutomataNewAllTrans:
5937 * @am: an automata
5938 * @from: the starting point of the transition
5939 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005940 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00005941 *
William M. Brackddf71d62004-05-06 04:17:26 +00005942 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00005943 * and then adds a an ALL transition from the @from state to the
5944 * target state. That transition is an epsilon transition allowed only when
5945 * all transitions from the @from node have been activated.
5946 *
5947 * Returns the target state or NULL in case of error
5948 */
5949xmlAutomataStatePtr
5950xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00005951 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00005952 if ((am == NULL) || (from == NULL))
5953 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00005954 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00005955 if (to == NULL)
5956 return(am->state);
5957 return(to);
5958}
5959
5960/**
Daniel Veillardb509f152002-04-17 16:28:10 +00005961 * xmlAutomataNewCounter:
5962 * @am: an automata
5963 * @min: the minimal value on the counter
5964 * @max: the maximal value on the counter
5965 *
5966 * Create a new counter
5967 *
5968 * Returns the counter number or -1 in case of error
5969 */
5970int
5971xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
5972 int ret;
5973
5974 if (am == NULL)
5975 return(-1);
5976
5977 ret = xmlRegGetCounter(am);
5978 if (ret < 0)
5979 return(-1);
5980 am->counters[ret].min = min;
5981 am->counters[ret].max = max;
5982 return(ret);
5983}
5984
5985/**
5986 * xmlAutomataNewCountedTrans:
5987 * @am: an automata
5988 * @from: the starting point of the transition
5989 * @to: the target point of the transition or NULL
5990 * @counter: the counter associated to that transition
5991 *
William M. Brackddf71d62004-05-06 04:17:26 +00005992 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00005993 * and then adds an epsilon transition from the @from state to the target state
5994 * which will increment the counter provided
5995 *
5996 * Returns the target state or NULL in case of error
5997 */
5998xmlAutomataStatePtr
5999xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6000 xmlAutomataStatePtr to, int counter) {
6001 if ((am == NULL) || (from == NULL) || (counter < 0))
6002 return(NULL);
6003 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6004 if (to == NULL)
6005 return(am->state);
6006 return(to);
6007}
6008
6009/**
6010 * xmlAutomataNewCounterTrans:
6011 * @am: an automata
6012 * @from: the starting point of the transition
6013 * @to: the target point of the transition or NULL
6014 * @counter: the counter associated to that transition
6015 *
William M. Brackddf71d62004-05-06 04:17:26 +00006016 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006017 * and then adds an epsilon transition from the @from state to the target state
6018 * which will be allowed only if the counter is within the right range.
6019 *
6020 * Returns the target state or NULL in case of error
6021 */
6022xmlAutomataStatePtr
6023xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6024 xmlAutomataStatePtr to, int counter) {
6025 if ((am == NULL) || (from == NULL) || (counter < 0))
6026 return(NULL);
6027 xmlFAGenerateCountedTransition(am, from, to, counter);
6028 if (to == NULL)
6029 return(am->state);
6030 return(to);
6031}
Daniel Veillard4255d502002-04-16 15:50:10 +00006032
6033/**
6034 * xmlAutomataCompile:
6035 * @am: an automata
6036 *
6037 * Compile the automata into a Reg Exp ready for being executed.
6038 * The automata should be free after this point.
6039 *
6040 * Returns the compiled regexp or NULL in case of error
6041 */
6042xmlRegexpPtr
6043xmlAutomataCompile(xmlAutomataPtr am) {
6044 xmlRegexpPtr ret;
6045
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00006046 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00006047 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00006048 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00006049 ret = xmlRegEpxFromParse(am);
6050
6051 return(ret);
6052}
Daniel Veillarde19fc232002-04-22 16:01:24 +00006053
6054/**
6055 * xmlAutomataIsDeterminist:
6056 * @am: an automata
6057 *
6058 * Checks if an automata is determinist.
6059 *
6060 * Returns 1 if true, 0 if not, and -1 in case of error
6061 */
6062int
6063xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6064 int ret;
6065
6066 if (am == NULL)
6067 return(-1);
6068
6069 ret = xmlFAComputesDeterminism(am);
6070 return(ret);
6071}
Daniel Veillard4255d502002-04-16 15:50:10 +00006072#endif /* LIBXML_AUTOMATA_ENABLED */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006073
6074#ifdef LIBXML_EXPR_ENABLED
6075/************************************************************************
6076 * *
6077 * Formal Expression handling code *
6078 * *
6079 ************************************************************************/
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006080/************************************************************************
6081 * *
6082 * Expression handling context *
6083 * *
6084 ************************************************************************/
6085
6086struct _xmlExpCtxt {
6087 xmlDictPtr dict;
6088 xmlExpNodePtr *table;
6089 int size;
6090 int nbElems;
6091 int nb_nodes;
6092 const char *expr;
6093 const char *cur;
6094 int nb_cons;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006095 int tabSize;
6096};
6097
6098/**
6099 * xmlExpNewCtxt:
6100 * @maxNodes: the maximum number of nodes
6101 * @dict: optional dictionnary to use internally
6102 *
6103 * Creates a new context for manipulating expressions
6104 *
6105 * Returns the context or NULL in case of error
6106 */
6107xmlExpCtxtPtr
6108xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6109 xmlExpCtxtPtr ret;
6110 int size = 256;
6111
6112 if (maxNodes <= 4096)
6113 maxNodes = 4096;
6114
6115 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6116 if (ret == NULL)
6117 return(NULL);
6118 memset(ret, 0, sizeof(xmlExpCtxt));
6119 ret->size = size;
6120 ret->nbElems = 0;
6121 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6122 if (ret->table == NULL) {
6123 xmlFree(ret);
6124 return(NULL);
6125 }
6126 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6127 if (dict == NULL) {
6128 ret->dict = xmlDictCreate();
6129 if (ret->dict == NULL) {
6130 xmlFree(ret->table);
6131 xmlFree(ret);
6132 return(NULL);
6133 }
6134 } else {
6135 ret->dict = dict;
6136 xmlDictReference(ret->dict);
6137 }
6138 return(ret);
6139}
6140
6141/**
6142 * xmlExpFreeCtxt:
6143 * @ctxt: an expression context
6144 *
6145 * Free an expression context
6146 */
6147void
6148xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6149 if (ctxt == NULL)
6150 return;
6151 xmlDictFree(ctxt->dict);
6152 if (ctxt->table != NULL)
6153 xmlFree(ctxt->table);
6154 xmlFree(ctxt);
6155}
6156
6157/************************************************************************
6158 * *
6159 * Structure associated to an expression node *
6160 * *
6161 ************************************************************************/
/* xmlExpNewNode() refuses to allocate once ctxt->nb_nodes has reached this */
#define MAX_NODES 10000
6163
6164/* #define DEBUG_DERIV */
6165
6166/*
6167 * TODO:
6168 * - Wildcards
6169 * - public API for creation
6170 *
6171 * Started
6172 * - regression testing
6173 *
6174 * Done
6175 * - split into module and test tool
6176 * - memleaks
6177 */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006178
/*
 * Bit flags stored in the info member of an expression node.
 * Note the spelling: the flag is XML_EXP_NILABLE, the test macro IS_NILLABLE.
 */
typedef enum {
    XML_EXP_NILABLE = (1 << 0)
} xmlExpNodeInfo;

/* non-zero when the XML_EXP_NILABLE flag is set on @node */
#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6184
6185struct _xmlExpNode {
6186 unsigned char type;/* xmlExpNodeType */
6187 unsigned char info;/* OR of xmlExpNodeInfo */
6188 unsigned short key; /* the hash key */
6189 unsigned int ref; /* The number of references */
6190 int c_max; /* the maximum length it can consume */
6191 xmlExpNodePtr exp_left;
6192 xmlExpNodePtr next;/* the next node in the hash table or free list */
6193 union {
6194 struct {
6195 int f_min;
6196 int f_max;
6197 } count;
6198 struct {
6199 xmlExpNodePtr f_right;
6200 } children;
6201 const xmlChar *f_str;
6202 } field;
6203};
6204
6205#define exp_min field.count.f_min
6206#define exp_max field.count.f_max
6207/* #define exp_left field.children.f_left */
6208#define exp_right field.children.f_right
6209#define exp_str field.f_str
6210
6211static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6212static xmlExpNode forbiddenExpNode = {
6213 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6214};
6215xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6216static xmlExpNode emptyExpNode = {
6217 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6218};
6219xmlExpNodePtr emptyExp = &emptyExpNode;
6220
6221/************************************************************************
6222 * *
6223 * The custom hash table for unicity and canonicalization *
6224 * of sub-expressions pointers *
6225 * *
6226 ************************************************************************/
6227/*
6228 * xmlExpHashNameComputeKey:
6229 * Calculate the hash key for a token
6230 */
6231static unsigned short
6232xmlExpHashNameComputeKey(const xmlChar *name) {
6233 unsigned short value = 0L;
6234 char ch;
6235
6236 if (name != NULL) {
6237 value += 30 * (*name);
6238 while ((ch = *name++) != 0) {
6239 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6240 }
6241 }
6242 return (value);
6243}
6244
6245/*
6246 * xmlExpHashComputeKey:
6247 * Calculate the hash key for a compound expression
6248 */
6249static unsigned short
6250xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6251 xmlExpNodePtr right) {
6252 unsigned long value;
6253 unsigned short ret;
6254
6255 switch (type) {
6256 case XML_EXP_SEQ:
6257 value = left->key;
6258 value += right->key;
6259 value *= 3;
6260 ret = (unsigned short) value;
6261 break;
6262 case XML_EXP_OR:
6263 value = left->key;
6264 value += right->key;
6265 value *= 7;
6266 ret = (unsigned short) value;
6267 break;
6268 case XML_EXP_COUNT:
6269 value = left->key;
6270 value += right->key;
6271 ret = (unsigned short) value;
6272 break;
6273 default:
6274 ret = 0;
6275 }
6276 return(ret);
6277}
6278
6279
6280static xmlExpNodePtr
6281xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6282 xmlExpNodePtr ret;
6283
6284 if (ctxt->nb_nodes >= MAX_NODES)
6285 return(NULL);
6286 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6287 if (ret == NULL)
6288 return(NULL);
6289 memset(ret, 0, sizeof(xmlExpNode));
6290 ret->type = type;
6291 ret->next = NULL;
6292 ctxt->nb_nodes++;
6293 ctxt->nb_cons++;
6294 return(ret);
6295}
6296
6297/**
6298 * xmlExpHashGetEntry:
6299 * @table: the hash table
6300 *
6301 * Get the unique entry from the hash table. The entry is created if
6302 * needed. @left and @right are consumed, i.e. their ref count will
6303 * be decremented by the operation.
6304 *
6305 * Returns the pointer or NULL in case of error
6306 */
6307static xmlExpNodePtr
6308xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6309 xmlExpNodePtr left, xmlExpNodePtr right,
6310 const xmlChar *name, int min, int max) {
6311 unsigned short kbase, key;
6312 xmlExpNodePtr entry;
6313 xmlExpNodePtr insert;
6314
6315 if (ctxt == NULL)
6316 return(NULL);
6317
6318 /*
6319 * Check for duplicate and insertion location.
6320 */
6321 if (type == XML_EXP_ATOM) {
6322 kbase = xmlExpHashNameComputeKey(name);
6323 } else if (type == XML_EXP_COUNT) {
6324 /* COUNT reduction rule 1 */
6325 /* a{1} -> a */
6326 if (min == max) {
6327 if (min == 1) {
6328 return(left);
6329 }
6330 if (min == 0) {
6331 xmlExpFree(ctxt, left);
6332 return(emptyExp);
6333 }
6334 }
6335 if (min < 0) {
6336 xmlExpFree(ctxt, left);
6337 return(forbiddenExp);
6338 }
6339 if (max == -1)
6340 kbase = min + 79;
6341 else
6342 kbase = max - min;
6343 kbase += left->key;
6344 } else if (type == XML_EXP_OR) {
6345 /* Forbid reduction rules */
6346 if (left->type == XML_EXP_FORBID) {
6347 xmlExpFree(ctxt, left);
6348 return(right);
6349 }
6350 if (right->type == XML_EXP_FORBID) {
6351 xmlExpFree(ctxt, right);
6352 return(left);
6353 }
6354
6355 /* OR reduction rule 1 */
6356 /* a | a reduced to a */
6357 if (left == right) {
6358 left->ref--;
6359 return(left);
6360 }
6361 /* OR canonicalization rule 1 */
6362 /* linearize (a | b) | c into a | (b | c) */
6363 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6364 xmlExpNodePtr tmp = left;
6365 left = right;
6366 right = tmp;
6367 }
6368 /* OR reduction rule 2 */
6369 /* a | (a | b) and b | (a | b) are reduced to a | b */
6370 if (right->type == XML_EXP_OR) {
6371 if ((left == right->exp_left) ||
6372 (left == right->exp_right)) {
6373 xmlExpFree(ctxt, left);
6374 return(right);
6375 }
6376 }
6377 /* OR canonicalization rule 2 */
6378 /* linearize (a | b) | c into a | (b | c) */
6379 if (left->type == XML_EXP_OR) {
6380 xmlExpNodePtr tmp;
6381
6382 /* OR canonicalization rule 2 */
6383 if ((left->exp_right->type != XML_EXP_OR) &&
6384 (left->exp_right->key < left->exp_left->key)) {
6385 tmp = left->exp_right;
6386 left->exp_right = left->exp_left;
6387 left->exp_left = tmp;
6388 }
6389 left->exp_right->ref++;
6390 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6391 NULL, 0, 0);
6392 left->exp_left->ref++;
6393 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6394 NULL, 0, 0);
6395
6396 xmlExpFree(ctxt, left);
6397 return(tmp);
6398 }
6399 if (right->type == XML_EXP_OR) {
6400 /* Ordering in the tree */
6401 /* C | (A | B) -> A | (B | C) */
6402 if (left->key > right->exp_right->key) {
6403 xmlExpNodePtr tmp;
6404 right->exp_right->ref++;
6405 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6406 left, NULL, 0, 0);
6407 right->exp_left->ref++;
6408 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6409 tmp, NULL, 0, 0);
6410 xmlExpFree(ctxt, right);
6411 return(tmp);
6412 }
6413 /* Ordering in the tree */
6414 /* B | (A | C) -> A | (B | C) */
6415 if (left->key > right->exp_left->key) {
6416 xmlExpNodePtr tmp;
6417 right->exp_right->ref++;
6418 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6419 right->exp_right, NULL, 0, 0);
6420 right->exp_left->ref++;
6421 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6422 tmp, NULL, 0, 0);
6423 xmlExpFree(ctxt, right);
6424 return(tmp);
6425 }
6426 }
6427 /* we know both types are != XML_EXP_OR here */
6428 else if (left->key > right->key) {
6429 xmlExpNodePtr tmp = left;
6430 left = right;
6431 right = tmp;
6432 }
6433 kbase = xmlExpHashComputeKey(type, left, right);
6434 } else if (type == XML_EXP_SEQ) {
6435 /* Forbid reduction rules */
6436 if (left->type == XML_EXP_FORBID) {
6437 xmlExpFree(ctxt, right);
6438 return(left);
6439 }
6440 if (right->type == XML_EXP_FORBID) {
6441 xmlExpFree(ctxt, left);
6442 return(right);
6443 }
6444 /* Empty reduction rules */
6445 if (right->type == XML_EXP_EMPTY) {
6446 return(left);
6447 }
6448 if (left->type == XML_EXP_EMPTY) {
6449 return(right);
6450 }
6451 kbase = xmlExpHashComputeKey(type, left, right);
6452 } else
6453 return(NULL);
6454
6455 key = kbase % ctxt->size;
6456 if (ctxt->table[key] != NULL) {
6457 for (insert = ctxt->table[key]; insert != NULL;
6458 insert = insert->next) {
6459 if ((insert->key == kbase) &&
6460 (insert->type == type)) {
6461 if (type == XML_EXP_ATOM) {
6462 if (name == insert->exp_str) {
6463 insert->ref++;
6464 return(insert);
6465 }
6466 } else if (type == XML_EXP_COUNT) {
6467 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6468 (insert->exp_left == left)) {
6469 insert->ref++;
6470 left->ref--;
6471 return(insert);
6472 }
6473 } else if ((insert->exp_left == left) &&
6474 (insert->exp_right == right)) {
6475 insert->ref++;
6476 left->ref--;
6477 right->ref--;
6478 return(insert);
6479 }
6480 }
6481 }
6482 }
6483
6484 entry = xmlExpNewNode(ctxt, type);
6485 if (entry == NULL)
6486 return(NULL);
6487 entry->key = kbase;
6488 if (type == XML_EXP_ATOM) {
6489 entry->exp_str = name;
6490 entry->c_max = 1;
6491 } else if (type == XML_EXP_COUNT) {
6492 entry->exp_min = min;
6493 entry->exp_max = max;
6494 entry->exp_left = left;
6495 if ((min == 0) || (IS_NILLABLE(left)))
6496 entry->info |= XML_EXP_NILABLE;
6497 if (max < 0)
6498 entry->c_max = -1;
6499 else
6500 entry->c_max = max * entry->exp_left->c_max;
6501 } else {
6502 entry->exp_left = left;
6503 entry->exp_right = right;
6504 if (type == XML_EXP_OR) {
6505 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6506 entry->info |= XML_EXP_NILABLE;
6507 if ((entry->exp_left->c_max == -1) ||
6508 (entry->exp_right->c_max == -1))
6509 entry->c_max = -1;
6510 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6511 entry->c_max = entry->exp_left->c_max;
6512 else
6513 entry->c_max = entry->exp_right->c_max;
6514 } else {
6515 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6516 entry->info |= XML_EXP_NILABLE;
6517 if ((entry->exp_left->c_max == -1) ||
6518 (entry->exp_right->c_max == -1))
6519 entry->c_max = -1;
6520 else
6521 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6522 }
6523 }
6524 entry->ref = 1;
6525 if (ctxt->table[key] != NULL)
6526 entry->next = ctxt->table[key];
6527
6528 ctxt->table[key] = entry;
6529 ctxt->nbElems++;
6530
6531 return(entry);
6532}
6533
6534/**
6535 * xmlExpFree:
6536 * @ctxt: the expression context
6537 * @exp: the expression
6538 *
6539 * Dereference the expression
6540 */
6541void
6542xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6543 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6544 return;
6545 exp->ref--;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006546 if (exp->ref == 0) {
6547 unsigned short key;
6548
6549 /* Unlink it first from the hash table */
6550 key = exp->key % ctxt->size;
6551 if (ctxt->table[key] == exp) {
6552 ctxt->table[key] = exp->next;
6553 } else {
6554 xmlExpNodePtr tmp;
6555
6556 tmp = ctxt->table[key];
6557 while (tmp != NULL) {
6558 if (tmp->next == exp) {
6559 tmp->next = exp->next;
6560 break;
6561 }
6562 tmp = tmp->next;
6563 }
6564 }
6565
6566 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6567 xmlExpFree(ctxt, exp->exp_left);
6568 xmlExpFree(ctxt, exp->exp_right);
6569 } else if (exp->type == XML_EXP_COUNT) {
6570 xmlExpFree(ctxt, exp->exp_left);
6571 }
6572 xmlFree(exp);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006573 ctxt->nb_nodes--;
6574 }
6575}
6576
6577/**
6578 * xmlExpRef:
6579 * @exp: the expression
6580 *
6581 * Increase the reference count of the expression
6582 */
6583void
6584xmlExpRef(xmlExpNodePtr exp) {
6585 if (exp != NULL)
6586 exp->ref++;
6587}
6588
Daniel Veillardccb4d412005-08-23 13:41:17 +00006589/**
6590 * xmlExpNewAtom:
6591 * @ctxt: the expression context
6592 * @name: the atom name
6593 * @len: the atom name lenght in byte (or -1);
6594 *
6595 * Get the atom associated to this name from that context
6596 *
6597 * Returns the node or NULL in case of error
6598 */
6599xmlExpNodePtr
6600xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6601 if ((ctxt == NULL) || (name == NULL))
6602 return(NULL);
6603 name = xmlDictLookup(ctxt->dict, name, len);
6604 if (name == NULL)
6605 return(NULL);
6606 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6607}
6608
6609/**
6610 * xmlExpNewOr:
6611 * @ctxt: the expression context
6612 * @left: left expression
6613 * @right: right expression
6614 *
6615 * Get the atom associated to the choice @left | @right
6616 * Note that @left and @right are consumed in the operation, to keep
6617 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6618 * this is true even in case of failure (unless ctxt == NULL).
6619 *
6620 * Returns the node or NULL in case of error
6621 */
6622xmlExpNodePtr
6623xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6624 if ((ctxt == NULL) || (left == NULL) || (right == NULL)) {
6625 xmlExpFree(ctxt, left);
6626 xmlExpFree(ctxt, right);
6627 return(NULL);
6628 }
6629 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6630}
6631
6632/**
6633 * xmlExpNewSeq:
6634 * @ctxt: the expression context
6635 * @left: left expression
6636 * @right: right expression
6637 *
6638 * Get the atom associated to the sequence @left , @right
6639 * Note that @left and @right are consumed in the operation, to keep
6640 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6641 * this is true even in case of failure (unless ctxt == NULL).
6642 *
6643 * Returns the node or NULL in case of error
6644 */
6645xmlExpNodePtr
6646xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6647 if ((ctxt == NULL) || (left == NULL) || (right == NULL)) {
6648 xmlExpFree(ctxt, left);
6649 xmlExpFree(ctxt, right);
6650 return(NULL);
6651 }
6652 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6653}
6654
6655/**
6656 * xmlExpNewRange:
6657 * @ctxt: the expression context
6658 * @subset: the expression to be repeated
6659 * @min: the lower bound for the repetition
6660 * @max: the upper bound for the repetition, -1 means infinite
6661 *
6662 * Get the atom associated to the range (@subset){@min, @max}
6663 * Note that @subset is consumed in the operation, to keep
6664 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6665 * this is true even in case of failure (unless ctxt == NULL).
6666 *
6667 * Returns the node or NULL in case of error
6668 */
6669xmlExpNodePtr
6670xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6671 if ((ctxt == NULL) || (subset == NULL) || (min < 0) || (max < -1) ||
6672 ((max >= 0) && (min > max))) {
6673 xmlExpFree(ctxt, subset);
6674 return(NULL);
6675 }
6676 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6677 NULL, NULL, min, max));
6678}
6679
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006680/************************************************************************
6681 * *
6682 * Public API for operations on expressions *
6683 * *
6684 ************************************************************************/
6685
6686static int
6687xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6688 const xmlChar**list, int len, int nb) {
6689 int tmp, tmp2;
6690tail:
6691 switch (exp->type) {
6692 case XML_EXP_EMPTY:
6693 return(0);
6694 case XML_EXP_ATOM:
6695 for (tmp = 0;tmp < nb;tmp++)
6696 if (list[tmp] == exp->exp_str)
6697 return(0);
6698 if (nb >= len)
6699 return(-2);
6700 list[nb++] = exp->exp_str;
6701 return(1);
6702 case XML_EXP_COUNT:
6703 exp = exp->exp_left;
6704 goto tail;
6705 case XML_EXP_SEQ:
6706 case XML_EXP_OR:
6707 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6708 if (tmp < 0)
6709 return(tmp);
6710 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6711 nb + tmp);
6712 if (tmp2 < 0)
6713 return(tmp2);
6714 return(tmp + tmp2);
6715 }
6716 return(-1);
6717}
6718
6719/**
6720 * xmlExpGetLanguage:
6721 * @ctxt: the expression context
6722 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00006723 * @langList: where to store the tokens
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006724 * @len: the allocated lenght of @list
6725 *
6726 * Find all the strings used in @exp and store them in @list
6727 *
6728 * Returns the number of unique strings found, -1 in case of errors and
6729 * -2 if there is more than @len strings
6730 */
6731int
6732xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00006733 const xmlChar**langList, int len) {
6734 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006735 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00006736 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006737}
6738
6739static int
6740xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6741 const xmlChar**list, int len, int nb) {
6742 int tmp, tmp2;
6743tail:
6744 switch (exp->type) {
6745 case XML_EXP_FORBID:
6746 return(0);
6747 case XML_EXP_EMPTY:
6748 return(0);
6749 case XML_EXP_ATOM:
6750 for (tmp = 0;tmp < nb;tmp++)
6751 if (list[tmp] == exp->exp_str)
6752 return(0);
6753 if (nb >= len)
6754 return(-2);
6755 list[nb++] = exp->exp_str;
6756 return(1);
6757 case XML_EXP_COUNT:
6758 exp = exp->exp_left;
6759 goto tail;
6760 case XML_EXP_SEQ:
6761 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
6762 if (tmp < 0)
6763 return(tmp);
6764 if (IS_NILLABLE(exp->exp_left)) {
6765 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
6766 nb + tmp);
6767 if (tmp2 < 0)
6768 return(tmp2);
6769 tmp += tmp2;
6770 }
6771 return(tmp);
6772 case XML_EXP_OR:
6773 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
6774 if (tmp < 0)
6775 return(tmp);
6776 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
6777 nb + tmp);
6778 if (tmp2 < 0)
6779 return(tmp2);
6780 return(tmp + tmp2);
6781 }
6782 return(-1);
6783}
6784
6785/**
6786 * xmlExpGetStart:
6787 * @ctxt: the expression context
6788 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00006789 * @tokList: where to store the tokens
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006790 * @len: the allocated lenght of @list
6791 *
6792 * Find all the strings that appears at the start of the languages
6793 * accepted by @exp and store them in @list. E.g. for (a, b) | c
6794 * it will return the list [a, c]
6795 *
6796 * Returns the number of unique strings found, -1 in case of errors and
6797 * -2 if there is more than @len strings
6798 */
6799int
6800xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00006801 const xmlChar**tokList, int len) {
6802 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006803 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00006804 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006805}
6806
6807/**
6808 * xmlExpIsNillable:
6809 * @exp: the expression
6810 *
6811 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce
6812 *
6813 * Returns 1 if nillable, 0 if not and -1 in case of error
6814 */
6815int
6816xmlExpIsNillable(xmlExpNodePtr exp) {
6817 if (exp == NULL)
6818 return(-1);
6819 return(IS_NILLABLE(exp) != 0);
6820}
6821
6822static xmlExpNodePtr
6823xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
6824{
6825 xmlExpNodePtr ret;
6826
6827 switch (exp->type) {
6828 case XML_EXP_EMPTY:
6829 return(forbiddenExp);
6830 case XML_EXP_FORBID:
6831 return(forbiddenExp);
6832 case XML_EXP_ATOM:
6833 if (exp->exp_str == str) {
6834#ifdef DEBUG_DERIV
6835 printf("deriv atom: equal => Empty\n");
6836#endif
6837 ret = emptyExp;
6838 } else {
6839#ifdef DEBUG_DERIV
6840 printf("deriv atom: mismatch => forbid\n");
6841#endif
6842 /* TODO wildcards here */
6843 ret = forbiddenExp;
6844 }
6845 return(ret);
6846 case XML_EXP_OR: {
6847 xmlExpNodePtr tmp;
6848
6849#ifdef DEBUG_DERIV
6850 printf("deriv or: => or(derivs)\n");
6851#endif
6852 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6853 if (tmp == NULL) {
6854 return(NULL);
6855 }
6856 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
6857 if (ret == NULL) {
6858 xmlExpFree(ctxt, tmp);
6859 return(NULL);
6860 }
6861 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
6862 NULL, 0, 0);
6863 return(ret);
6864 }
6865 case XML_EXP_SEQ:
6866#ifdef DEBUG_DERIV
6867 printf("deriv seq: starting with left\n");
6868#endif
6869 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6870 if (ret == NULL) {
6871 return(NULL);
6872 } else if (ret == forbiddenExp) {
6873 if (IS_NILLABLE(exp->exp_left)) {
6874#ifdef DEBUG_DERIV
6875 printf("deriv seq: left failed but nillable\n");
6876#endif
6877 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
6878 }
6879 } else {
6880#ifdef DEBUG_DERIV
6881 printf("deriv seq: left match => sequence\n");
6882#endif
6883 exp->exp_right->ref++;
6884 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
6885 NULL, 0, 0);
6886 }
6887 return(ret);
6888 case XML_EXP_COUNT: {
6889 int min, max;
6890 xmlExpNodePtr tmp;
6891
6892 if (exp->exp_max == 0)
6893 return(forbiddenExp);
6894 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6895 if (ret == NULL)
6896 return(NULL);
6897 if (ret == forbiddenExp) {
6898#ifdef DEBUG_DERIV
6899 printf("deriv count: pattern mismatch => forbid\n");
6900#endif
6901 return(ret);
6902 }
6903 if (exp->exp_max == 1)
6904 return(ret);
6905 if (exp->exp_max < 0) /* unbounded */
6906 max = -1;
6907 else
6908 max = exp->exp_max - 1;
6909 if (exp->exp_min > 0)
6910 min = exp->exp_min - 1;
6911 else
6912 min = 0;
6913 exp->exp_left->ref++;
6914 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
6915 NULL, min, max);
6916 if (ret == emptyExp) {
6917#ifdef DEBUG_DERIV
6918 printf("deriv count: match to empty => new count\n");
6919#endif
6920 return(tmp);
6921 }
6922#ifdef DEBUG_DERIV
6923 printf("deriv count: match => sequence with new count\n");
6924#endif
6925 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
6926 NULL, 0, 0));
6927 }
6928 }
6929 return(NULL);
6930}
6931
6932/**
6933 * xmlExpStringDerive:
6934 * @ctxt: the expression context
6935 * @exp: the expression
6936 * @str: the string
6937 * @len: the string len in bytes if available
6938 *
6939 * Do one step of Brzozowski derivation of the expression @exp with
6940 * respect to the input string
6941 *
6942 * Returns the resulting expression or NULL in case of internal error
6943 */
6944xmlExpNodePtr
6945xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6946 const xmlChar *str, int len) {
6947 const xmlChar *input;
6948
6949 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
6950 return(NULL);
6951 }
6952 /*
6953 * check the string is in the dictionnary, if yes use an interned
6954 * copy, otherwise we know it's not an acceptable input
6955 */
6956 input = xmlDictExists(ctxt->dict, str, len);
6957 if (input == NULL) {
6958 return(forbiddenExp);
6959 }
6960 return(xmlExpStringDeriveInt(ctxt, exp, input));
6961}
6962
6963static int
6964xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
6965 int ret = 1;
6966
6967 if (sub->c_max == -1) {
6968 if (exp->c_max != -1)
6969 ret = 0;
6970 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
6971 ret = 0;
6972 }
6973#if 0
6974 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
6975 ret = 0;
6976#endif
6977 return(ret);
6978}
6979
6980static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6981 xmlExpNodePtr sub);
6982/**
6983 * xmlExpDivide:
6984 * @ctxt: the expressions context
6985 * @exp: the englobing expression
6986 * @sub: the subexpression
6987 * @mult: the multiple expression
6988 * @remain: the remain from the derivation of the multiple
6989 *
6990 * Check if exp is a multiple of sub, i.e. if there is a finite number n
6991 * so that sub{n} subsume exp
6992 *
6993 * Returns the multiple value if successful, 0 if it is not a multiple
6994 * and -1 in case of internel error.
6995 */
6996
6997static int
6998xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
6999 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7000 int i;
7001 xmlExpNodePtr tmp, tmp2;
7002
7003 if (mult != NULL) *mult = NULL;
7004 if (remain != NULL) *remain = NULL;
7005 if (exp->c_max == -1) return(0);
7006 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7007
7008 for (i = 1;i <= exp->c_max;i++) {
7009 sub->ref++;
7010 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7011 sub, NULL, NULL, i, i);
7012 if (tmp == NULL) {
7013 return(-1);
7014 }
7015 if (!xmlExpCheckCard(tmp, exp)) {
7016 xmlExpFree(ctxt, tmp);
7017 continue;
7018 }
7019 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7020 if (tmp2 == NULL) {
7021 xmlExpFree(ctxt, tmp);
7022 return(-1);
7023 }
7024 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7025 if (remain != NULL)
7026 *remain = tmp2;
7027 else
7028 xmlExpFree(ctxt, tmp2);
7029 if (mult != NULL)
7030 *mult = tmp;
7031 else
7032 xmlExpFree(ctxt, tmp);
7033#ifdef DEBUG_DERIV
7034 printf("Divide succeeded %d\n", i);
7035#endif
7036 return(i);
7037 }
7038 xmlExpFree(ctxt, tmp);
7039 xmlExpFree(ctxt, tmp2);
7040 }
7041#ifdef DEBUG_DERIV
7042 printf("Divide failed\n");
7043#endif
7044 return(0);
7045}
7046
7047/**
7048 * xmlExpExpDeriveInt:
7049 * @ctxt: the expressions context
7050 * @exp: the englobing expression
7051 * @sub: the subexpression
7052 *
7053 * Try to do a step of Brzozowski derivation but at a higher level
7054 * the input being a subexpression.
7055 *
7056 * Returns the resulting expression or NULL in case of internal error
7057 */
7058static xmlExpNodePtr
7059xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7060 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7061 const xmlChar **tab;
7062 int len, i;
7063
7064 /*
7065 * In case of equality and if the expression can only consume a finite
7066 * amount, then the derivation is empty
7067 */
7068 if ((exp == sub) && (exp->c_max >= 0)) {
7069#ifdef DEBUG_DERIV
7070 printf("Equal(exp, sub) and finite -> Empty\n");
7071#endif
7072 return(emptyExp);
7073 }
7074 /*
7075 * decompose sub sequence first
7076 */
7077 if (sub->type == XML_EXP_EMPTY) {
7078#ifdef DEBUG_DERIV
7079 printf("Empty(sub) -> Empty\n");
7080#endif
7081 exp->ref++;
7082 return(exp);
7083 }
7084 if (sub->type == XML_EXP_SEQ) {
7085#ifdef DEBUG_DERIV
7086 printf("Seq(sub) -> decompose\n");
7087#endif
7088 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7089 if (tmp == NULL)
7090 return(NULL);
7091 if (tmp == forbiddenExp)
7092 return(tmp);
7093 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7094 xmlExpFree(ctxt, tmp);
7095 return(ret);
7096 }
7097 if (sub->type == XML_EXP_OR) {
7098#ifdef DEBUG_DERIV
7099 printf("Or(sub) -> decompose\n");
7100#endif
7101 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7102 if (tmp == forbiddenExp)
7103 return(tmp);
7104 if (tmp == NULL)
7105 return(NULL);
7106 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7107 if ((ret == NULL) || (ret == forbiddenExp)) {
7108 xmlExpFree(ctxt, tmp);
7109 return(ret);
7110 }
7111 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7112 }
7113 if (!xmlExpCheckCard(exp, sub)) {
7114#ifdef DEBUG_DERIV
7115 printf("CheckCard(exp, sub) failed -> Forbid\n");
7116#endif
7117 return(forbiddenExp);
7118 }
7119 switch (exp->type) {
7120 case XML_EXP_EMPTY:
7121 if (sub == emptyExp)
7122 return(emptyExp);
7123#ifdef DEBUG_DERIV
7124 printf("Empty(exp) -> Forbid\n");
7125#endif
7126 return(forbiddenExp);
7127 case XML_EXP_FORBID:
7128#ifdef DEBUG_DERIV
7129 printf("Forbid(exp) -> Forbid\n");
7130#endif
7131 return(forbiddenExp);
7132 case XML_EXP_ATOM:
7133 if (sub->type == XML_EXP_ATOM) {
7134 /* TODO: handle wildcards */
7135 if (exp->exp_str == sub->exp_str) {
7136#ifdef DEBUG_DERIV
7137 printf("Atom match -> Empty\n");
7138#endif
7139 return(emptyExp);
7140 }
7141#ifdef DEBUG_DERIV
7142 printf("Atom mismatch -> Forbid\n");
7143#endif
7144 return(forbiddenExp);
7145 }
7146 if ((sub->type == XML_EXP_COUNT) &&
7147 (sub->exp_max == 1) &&
7148 (sub->exp_left->type == XML_EXP_ATOM)) {
7149 /* TODO: handle wildcards */
7150 if (exp->exp_str == sub->exp_left->exp_str) {
7151#ifdef DEBUG_DERIV
7152 printf("Atom match -> Empty\n");
7153#endif
7154 return(emptyExp);
7155 }
7156#ifdef DEBUG_DERIV
7157 printf("Atom mismatch -> Forbid\n");
7158#endif
7159 return(forbiddenExp);
7160 }
7161#ifdef DEBUG_DERIV
7162 printf("Compex exp vs Atom -> Forbid\n");
7163#endif
7164 return(forbiddenExp);
7165 case XML_EXP_SEQ:
7166 /* try to get the sequence consumed only if possible */
7167 if (xmlExpCheckCard(exp->exp_left, sub)) {
7168 /* See if the sequence can be consumed directly */
7169#ifdef DEBUG_DERIV
7170 printf("Seq trying left only\n");
7171#endif
7172 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7173 if ((ret != forbiddenExp) && (ret != NULL)) {
7174#ifdef DEBUG_DERIV
7175 printf("Seq trying left only worked\n");
7176#endif
7177 /*
7178 * TODO: assumption here that we are determinist
7179 * i.e. we won't get to a nillable exp left
7180 * subset which could be matched by the right
7181 * part too.
7182 * e.g.: (a | b)+,(a | c) and 'a+,a'
7183 */
7184 exp->exp_right->ref++;
7185 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7186 exp->exp_right, NULL, 0, 0));
7187 }
7188#ifdef DEBUG_DERIV
7189 } else {
7190 printf("Seq: left too short\n");
7191#endif
7192 }
7193 /* Try instead to decompose */
7194 if (sub->type == XML_EXP_COUNT) {
7195 int min, max;
7196
7197#ifdef DEBUG_DERIV
7198 printf("Seq: sub is a count\n");
7199#endif
7200 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7201 if (ret == NULL)
7202 return(NULL);
7203 if (ret != forbiddenExp) {
7204#ifdef DEBUG_DERIV
7205 printf("Seq , Count match on left\n");
7206#endif
7207 if (sub->exp_max < 0)
7208 max = -1;
7209 else
7210 max = sub->exp_max -1;
7211 if (sub->exp_min > 0)
7212 min = sub->exp_min -1;
7213 else
7214 min = 0;
7215 exp->exp_right->ref++;
7216 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7217 exp->exp_right, NULL, 0, 0);
7218 if (tmp == NULL)
7219 return(NULL);
7220
7221 sub->exp_left->ref++;
7222 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7223 sub->exp_left, NULL, NULL, min, max);
7224 if (tmp2 == NULL) {
7225 xmlExpFree(ctxt, tmp);
7226 return(NULL);
7227 }
7228 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7229 xmlExpFree(ctxt, tmp);
7230 xmlExpFree(ctxt, tmp2);
7231 return(ret);
7232 }
7233 }
7234 /* we made no progress on structured operations */
7235 break;
7236 case XML_EXP_OR:
7237#ifdef DEBUG_DERIV
7238 printf("Or , trying both side\n");
7239#endif
7240 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7241 if (ret == NULL)
7242 return(NULL);
7243 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7244 if (tmp == NULL) {
7245 xmlExpFree(ctxt, ret);
7246 return(NULL);
7247 }
7248 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7249 case XML_EXP_COUNT: {
7250 int min, max;
7251
7252 if (sub->type == XML_EXP_COUNT) {
7253 /*
7254 * Try to see if the loop is completely subsumed
7255 */
7256 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7257 if (tmp == NULL)
7258 return(NULL);
7259 if (tmp == forbiddenExp) {
7260 int mult;
7261
7262#ifdef DEBUG_DERIV
7263 printf("Count, Count inner don't subsume\n");
7264#endif
7265 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7266 NULL, &tmp);
7267 if (mult <= 0) {
7268#ifdef DEBUG_DERIV
7269 printf("Count, Count not multiple => forbidden\n");
7270#endif
7271 return(forbiddenExp);
7272 }
7273 if (sub->exp_max == -1) {
7274 max = -1;
7275 if (exp->exp_max == -1) {
7276 if (exp->exp_min <= sub->exp_min * mult)
7277 min = 0;
7278 else
7279 min = exp->exp_min - sub->exp_min * mult;
7280 } else {
7281#ifdef DEBUG_DERIV
7282 printf("Count, Count finite can't subsume infinite\n");
7283#endif
7284 xmlExpFree(ctxt, tmp);
7285 return(forbiddenExp);
7286 }
7287 } else {
7288 if (exp->exp_max == -1) {
7289#ifdef DEBUG_DERIV
7290 printf("Infinite loop consume mult finite loop\n");
7291#endif
7292 if (exp->exp_min > sub->exp_min * mult) {
7293 max = -1;
7294 min = exp->exp_min - sub->exp_min * mult;
7295 } else {
7296 max = -1;
7297 min = 0;
7298 }
7299 } else {
7300 if (exp->exp_max < sub->exp_max * mult) {
7301#ifdef DEBUG_DERIV
7302 printf("loops max mult mismatch => forbidden\n");
7303#endif
7304 xmlExpFree(ctxt, tmp);
7305 return(forbiddenExp);
7306 }
7307 if (sub->exp_max * mult > exp->exp_min)
7308 min = 0;
7309 else
7310 min = exp->exp_min - sub->exp_max * mult;
7311 max = exp->exp_max - sub->exp_max * mult;
7312 }
7313 }
7314 } else if (!IS_NILLABLE(tmp)) {
7315 /*
7316 * TODO: loop here to try to grow if working on finite
7317 * blocks.
7318 */
7319#ifdef DEBUG_DERIV
7320 printf("Count, Count remain not nillable => forbidden\n");
7321#endif
7322 xmlExpFree(ctxt, tmp);
7323 return(forbiddenExp);
7324 } else if (sub->exp_max == -1) {
7325 if (exp->exp_max == -1) {
7326 if (exp->exp_min <= sub->exp_min) {
7327#ifdef DEBUG_DERIV
7328 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7329#endif
7330 max = -1;
7331 min = 0;
7332 } else {
7333#ifdef DEBUG_DERIV
7334 printf("Infinite loops min => Count(X,Inf)\n");
7335#endif
7336 max = -1;
7337 min = exp->exp_min - sub->exp_min;
7338 }
7339 } else if (exp->exp_min > sub->exp_min) {
7340#ifdef DEBUG_DERIV
7341 printf("loops min mismatch 1 => forbidden ???\n");
7342#endif
7343 xmlExpFree(ctxt, tmp);
7344 return(forbiddenExp);
7345 } else {
7346 max = -1;
7347 min = 0;
7348 }
7349 } else {
7350 if (exp->exp_max == -1) {
7351#ifdef DEBUG_DERIV
7352 printf("Infinite loop consume finite loop\n");
7353#endif
7354 if (exp->exp_min > sub->exp_min) {
7355 max = -1;
7356 min = exp->exp_min - sub->exp_min;
7357 } else {
7358 max = -1;
7359 min = 0;
7360 }
7361 } else {
7362 if (exp->exp_max < sub->exp_max) {
7363#ifdef DEBUG_DERIV
7364 printf("loops max mismatch => forbidden\n");
7365#endif
7366 xmlExpFree(ctxt, tmp);
7367 return(forbiddenExp);
7368 }
7369 if (sub->exp_max > exp->exp_min)
7370 min = 0;
7371 else
7372 min = exp->exp_min - sub->exp_max;
7373 max = exp->exp_max - sub->exp_max;
7374 }
7375 }
7376#ifdef DEBUG_DERIV
7377 printf("loops match => SEQ(COUNT())\n");
7378#endif
7379 exp->exp_left->ref++;
7380 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7381 NULL, NULL, min, max);
7382 if (tmp2 == NULL) {
7383 return(NULL);
7384 }
7385 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7386 NULL, 0, 0);
7387 return(ret);
7388 }
7389 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7390 if (tmp == NULL)
7391 return(NULL);
7392 if (tmp == forbiddenExp) {
7393#ifdef DEBUG_DERIV
7394 printf("loop mismatch => forbidden\n");
7395#endif
7396 return(forbiddenExp);
7397 }
7398 if (exp->exp_min > 0)
7399 min = exp->exp_min - 1;
7400 else
7401 min = 0;
7402 if (exp->exp_max < 0)
7403 max = -1;
7404 else
7405 max = exp->exp_max - 1;
7406
7407#ifdef DEBUG_DERIV
7408 printf("loop match => SEQ(COUNT())\n");
7409#endif
7410 exp->exp_left->ref++;
7411 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7412 NULL, NULL, min, max);
7413 if (tmp2 == NULL)
7414 return(NULL);
7415 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7416 NULL, 0, 0);
7417 return(ret);
7418 }
7419 }
7420
Daniel Veillardccb4d412005-08-23 13:41:17 +00007421#ifdef DEBUG_DERIV
7422 printf("Fallback to derivative\n");
7423#endif
7424 if (IS_NILLABLE(sub)) {
7425 if (!(IS_NILLABLE(exp)))
7426 return(forbiddenExp);
7427 else
7428 ret = emptyExp;
7429 } else
7430 ret = NULL;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007431 /*
7432 * here the structured derivation made no progress so
7433 * we use the default token based derivation to force one more step
7434 */
7435 if (ctxt->tabSize == 0)
7436 ctxt->tabSize = 40;
7437
7438 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7439 sizeof(const xmlChar *));
7440 if (tab == NULL) {
7441 return(NULL);
7442 }
7443
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007444 /*
7445 * collect all the strings accepted by the subexpression on input
7446 */
7447 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7448 while (len < 0) {
7449 const xmlChar **temp;
Rob Richards54a8f672005-10-07 02:33:00 +00007450 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007451 sizeof(const xmlChar *));
7452 if (temp == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007453 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007454 return(NULL);
7455 }
7456 tab = temp;
7457 ctxt->tabSize *= 2;
7458 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7459 }
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007460 for (i = 0;i < len;i++) {
7461 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7462 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7463 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007464 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007465 return(tmp);
7466 }
7467 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7468 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7469 xmlExpFree(ctxt, tmp);
7470 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007471 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007472 return(tmp);
7473 }
7474 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7475 xmlExpFree(ctxt, tmp);
7476 xmlExpFree(ctxt, tmp2);
7477
7478 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7479 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007480 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007481 return(tmp3);
7482 }
7483
7484 if (ret == NULL)
7485 ret = tmp3;
7486 else {
7487 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7488 if (ret == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007489 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007490 return(NULL);
7491 }
7492 }
7493 }
Rob Richards54a8f672005-10-07 02:33:00 +00007494 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007495 return(ret);
7496}
7497
7498/**
Daniel Veillard0090bd52005-08-22 14:43:43 +00007499 * xmlExpExpDerive:
7500 * @ctxt: the expressions context
7501 * @exp: the englobing expression
7502 * @sub: the subexpression
7503 *
7504 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7505 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7506 * it usually tatkes less than linear time and can handle expressions generating
7507 * infinite languages.
7508 *
7509 * Returns the resulting expression or NULL in case of internal error, the
7510 * result must be freed
7511 */
7512xmlExpNodePtr
7513xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7514 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7515 return(NULL);
7516
7517 /*
7518 * O(1) speedups
7519 */
7520 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7521#ifdef DEBUG_DERIV
7522 printf("Sub nillable and not exp : can't subsume\n");
7523#endif
7524 return(forbiddenExp);
7525 }
7526 if (xmlExpCheckCard(exp, sub) == 0) {
7527#ifdef DEBUG_DERIV
7528 printf("sub generate longuer sequances than exp : can't subsume\n");
7529#endif
7530 return(forbiddenExp);
7531 }
7532 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7533}
7534
7535/**
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007536 * xmlExpSubsume:
7537 * @ctxt: the expressions context
7538 * @exp: the englobing expression
7539 * @sub: the subexpression
7540 *
7541 * Check whether @exp accepts all the languages accexpted by @sub
7542 * the input being a subexpression.
7543 *
7544 * Returns 1 if true 0 if false and -1 in case of failure.
7545 */
7546int
7547xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7548 xmlExpNodePtr tmp;
7549
7550 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7551 return(-1);
7552
7553 /*
7554 * TODO: speedup by checking the language of sub is a subset of the
7555 * language of exp
7556 */
7557 /*
7558 * O(1) speedups
7559 */
7560 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7561#ifdef DEBUG_DERIV
7562 printf("Sub nillable and not exp : can't subsume\n");
7563#endif
7564 return(0);
7565 }
7566 if (xmlExpCheckCard(exp, sub) == 0) {
7567#ifdef DEBUG_DERIV
7568 printf("sub generate longuer sequances than exp : can't subsume\n");
7569#endif
7570 return(0);
7571 }
7572 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7573#ifdef DEBUG_DERIV
7574 printf("Result derivation :\n");
7575 PRINT_EXP(tmp);
7576#endif
7577 if (tmp == NULL)
7578 return(-1);
7579 if (tmp == forbiddenExp)
7580 return(0);
7581 if (tmp == emptyExp)
7582 return(1);
7583 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7584 xmlExpFree(ctxt, tmp);
7585 return(1);
7586 }
7587 xmlExpFree(ctxt, tmp);
7588 return(0);
7589}
Daniel Veillard465a0002005-08-22 12:07:04 +00007590
7591/************************************************************************
7592 * *
7593 * Parsing expression *
7594 * *
7595 ************************************************************************/
7596
7597static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7598
/*
 * Cursor helpers for the expression parser. They all expect a local
 * variable named ctxt whose cur member points into the string being parsed.
 *
 * NEXT and SKIP_BLANKS deliberately carry their own trailing semicolon:
 * the parsing routines in this file invoke them without one.
 */
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
/* the argument is parenthesized so that expression arguments stay intact;
 * note that it is still evaluated up to four times */
#define IS_BLANK(c) (((c) == ' ') || ((c) == '\n') || ((c) == '\r') || ((c) == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7606
7607static int
7608xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7609 int ret = 0;
7610
7611 SKIP_BLANKS
7612 if (CUR == '*') {
7613 NEXT
7614 return(-1);
7615 }
7616 if ((CUR < '0') || (CUR > '9'))
7617 return(-1);
7618 while ((CUR >= '0') && (CUR <= '9')) {
7619 ret = ret * 10 + (CUR - '0');
7620 NEXT
7621 }
7622 return(ret);
7623}
7624
7625static xmlExpNodePtr
7626xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7627 const char *base;
7628 xmlExpNodePtr ret;
7629 const xmlChar *val;
7630
7631 SKIP_BLANKS
7632 base = ctxt->cur;
7633 if (*ctxt->cur == '(') {
7634 NEXT
7635 ret = xmlExpParseExpr(ctxt);
7636 SKIP_BLANKS
7637 if (*ctxt->cur != ')') {
7638 fprintf(stderr, "unbalanced '(' : %s\n", base);
7639 xmlExpFree(ctxt, ret);
7640 return(NULL);
7641 }
7642 NEXT;
7643 SKIP_BLANKS
7644 goto parse_quantifier;
7645 }
7646 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7647 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7648 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7649 NEXT;
7650 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7651 if (val == NULL)
7652 return(NULL);
7653 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7654 if (ret == NULL)
7655 return(NULL);
7656 SKIP_BLANKS
7657parse_quantifier:
7658 if (CUR == '{') {
7659 int min, max;
7660
7661 NEXT
7662 min = xmlExpParseNumber(ctxt);
7663 if (min < 0) {
7664 xmlExpFree(ctxt, ret);
7665 return(NULL);
7666 }
7667 SKIP_BLANKS
7668 if (CUR == ',') {
7669 NEXT
7670 max = xmlExpParseNumber(ctxt);
7671 SKIP_BLANKS
7672 } else
7673 max = min;
7674 if (CUR != '}') {
7675 xmlExpFree(ctxt, ret);
7676 return(NULL);
7677 }
7678 NEXT
7679 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7680 min, max);
7681 SKIP_BLANKS
7682 } else if (CUR == '?') {
7683 NEXT
7684 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7685 0, 1);
7686 SKIP_BLANKS
7687 } else if (CUR == '+') {
7688 NEXT
7689 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7690 1, -1);
7691 SKIP_BLANKS
7692 } else if (CUR == '*') {
7693 NEXT
7694 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7695 0, -1);
7696 SKIP_BLANKS
7697 }
7698 return(ret);
7699}
7700
7701
7702static xmlExpNodePtr
7703xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7704 xmlExpNodePtr ret, right;
7705
7706 ret = xmlExpParseOr(ctxt);
7707 SKIP_BLANKS
7708 while (CUR == '|') {
7709 NEXT
7710 right = xmlExpParseOr(ctxt);
7711 if (right == NULL) {
7712 xmlExpFree(ctxt, ret);
7713 return(NULL);
7714 }
7715 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7716 if (ret == NULL)
7717 return(NULL);
7718 }
7719 return(ret);
7720}
7721
7722static xmlExpNodePtr
7723xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
7724 xmlExpNodePtr ret, right;
7725
7726 ret = xmlExpParseSeq(ctxt);
7727 SKIP_BLANKS
7728 while (CUR == ',') {
7729 NEXT
7730 right = xmlExpParseSeq(ctxt);
7731 if (right == NULL) {
7732 xmlExpFree(ctxt, ret);
7733 return(NULL);
7734 }
7735 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
7736 if (ret == NULL)
7737 return(NULL);
7738 }
7739 return(ret);
7740}
7741
7742/**
7743 * xmlExpParse:
7744 * @ctxt: the expressions context
7745 * @expr: the 0 terminated string
7746 *
7747 * Minimal parser for regexps, it understand the following constructs
7748 * - string terminals
7749 * - choice operator |
7750 * - sequence operator ,
7751 * - subexpressions (...)
7752 * - usual cardinality operators + * and ?
7753 * - finite sequences { min, max }
7754 * - infinite sequences { min, * }
7755 * There is minimal checkings made especially no checking on strings values
7756 *
7757 * Returns a new expression or NULL in case of failure
7758 */
7759xmlExpNodePtr
7760xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
7761 xmlExpNodePtr ret;
7762
7763 ctxt->expr = expr;
7764 ctxt->cur = expr;
7765
7766 ret = xmlExpParseExpr(ctxt);
7767 SKIP_BLANKS
7768 if (*ctxt->cur != 0) {
7769 xmlExpFree(ctxt, ret);
7770 return(NULL);
7771 }
7772 return(ret);
7773}
7774
7775static void
7776xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
7777 xmlExpNodePtr c;
7778
7779 if (expr == NULL) return;
7780 if (glob) xmlBufferWriteChar(buf, "(");
7781 switch (expr->type) {
7782 case XML_EXP_EMPTY:
7783 xmlBufferWriteChar(buf, "empty");
7784 break;
7785 case XML_EXP_FORBID:
7786 xmlBufferWriteChar(buf, "forbidden");
7787 break;
7788 case XML_EXP_ATOM:
7789 xmlBufferWriteCHAR(buf, expr->exp_str);
7790 break;
7791 case XML_EXP_SEQ:
7792 c = expr->exp_left;
7793 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7794 xmlExpDumpInt(buf, c, 1);
7795 else
7796 xmlExpDumpInt(buf, c, 0);
7797 xmlBufferWriteChar(buf, " , ");
7798 c = expr->exp_right;
7799 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7800 xmlExpDumpInt(buf, c, 1);
7801 else
7802 xmlExpDumpInt(buf, c, 0);
7803 break;
7804 case XML_EXP_OR:
7805 c = expr->exp_left;
7806 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7807 xmlExpDumpInt(buf, c, 1);
7808 else
7809 xmlExpDumpInt(buf, c, 0);
7810 xmlBufferWriteChar(buf, " | ");
7811 c = expr->exp_right;
7812 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7813 xmlExpDumpInt(buf, c, 1);
7814 else
7815 xmlExpDumpInt(buf, c, 0);
7816 break;
7817 case XML_EXP_COUNT: {
7818 char rep[40];
7819
7820 c = expr->exp_left;
7821 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7822 xmlExpDumpInt(buf, c, 1);
7823 else
7824 xmlExpDumpInt(buf, c, 0);
7825 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
7826 rep[0] = '?';
7827 rep[1] = 0;
7828 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
7829 rep[0] = '*';
7830 rep[1] = 0;
7831 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
7832 rep[0] = '+';
7833 rep[1] = 0;
7834 } else if (expr->exp_max == expr->exp_min) {
7835 snprintf(rep, 39, "{%d}", expr->exp_min);
7836 } else if (expr->exp_max < 0) {
7837 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
7838 } else {
7839 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
7840 }
7841 rep[39] = 0;
7842 xmlBufferWriteChar(buf, rep);
7843 break;
7844 }
7845 default:
7846 fprintf(stderr, "Error in tree\n");
7847 }
7848 if (glob)
7849 xmlBufferWriteChar(buf, ")");
7850}
7851/**
7852 * xmlExpDump:
7853 * @buf: a buffer to receive the output
7854 * @expr: the compiled expression
7855 *
7856 * Serialize the expression as compiled to the buffer
7857 */
7858void
Daniel Veillard5eee7672005-08-22 21:22:27 +00007859xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
7860 if ((buf == NULL) || (expr == NULL))
Daniel Veillard465a0002005-08-22 12:07:04 +00007861 return;
Daniel Veillard5eee7672005-08-22 21:22:27 +00007862 xmlExpDumpInt(buf, expr, 0);
Daniel Veillard465a0002005-08-22 12:07:04 +00007863}
7864
7865/**
7866 * xmlExpMaxToken:
7867 * @expr: a compiled expression
7868 *
7869 * Indicate the maximum number of input a expression can accept
7870 *
7871 * Returns the maximum length or -1 in case of error
7872 */
7873int
7874xmlExpMaxToken(xmlExpNodePtr expr) {
7875 if (expr == NULL)
7876 return(-1);
7877 return(expr->c_max);
7878}
7879
7880/**
7881 * xmlExpCtxtNbNodes:
7882 * @ctxt: an expression context
7883 *
7884 * Debugging facility provides the number of allocated nodes at a that point
7885 *
7886 * Returns the number of nodes in use or -1 in case of error
7887 */
7888int
7889xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
7890 if (ctxt == NULL)
7891 return(-1);
7892 return(ctxt->nb_nodes);
7893}
7894
7895/**
7896 * xmlExpCtxtNbCons:
7897 * @ctxt: an expression context
7898 *
7899 * Debugging facility provides the number of allocated nodes over lifetime
7900 *
7901 * Returns the number of nodes ever allocated or -1 in case of error
7902 */
7903int
7904xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
7905 if (ctxt == NULL)
7906 return(-1);
7907 return(ctxt->nb_cons);
7908}
7909
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007910#endif /* LIBXML_EXPR_ENABLED */
Daniel Veillard5d4644e2005-04-01 13:11:58 +00007911#define bottom_xmlregexp
7912#include "elfgcchack.h"
Daniel Veillard4255d502002-04-16 15:50:10 +00007913#endif /* LIBXML_REGEXP_ENABLED */