blob: 8d7ee97fd066724be50ebe708ac3cf372dc93830 [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
William M. Brackddf71d62004-05-06 04:17:26 +00006 * XML related specifications these include:
Daniel Veillard4255d502002-04-16 15:50:10 +00007 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
Daniel Veillardcee2b3a2005-01-25 00:22:52 +000022/* #define DEBUG_ERR */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +000023
Daniel Veillard4255d502002-04-16 15:50:10 +000024#include <stdio.h>
25#include <string.h>
Daniel Veillardebe48c62003-12-03 12:12:27 +000026#ifdef HAVE_LIMITS_H
27#include <limits.h>
28#endif
29
Daniel Veillard4255d502002-04-16 15:50:10 +000030#include <libxml/tree.h>
31#include <libxml/parserInternals.h>
32#include <libxml/xmlregexp.h>
33#include <libxml/xmlautomata.h>
34#include <libxml/xmlunicode.h>
35
Daniel Veillardebe48c62003-12-03 12:12:27 +000036#ifndef INT_MAX
37#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
38#endif
39
Daniel Veillardc0826a72004-08-10 14:17:33 +000040/* #define DEBUG_REGEXP_GRAPH */
Daniel Veillard10752282005-08-08 13:05:13 +000041/* #define DEBUG_REGEXP_EXEC */
Daniel Veillard4255d502002-04-16 15:50:10 +000042/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000043/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000044
Daniel Veillard567a45b2005-10-18 19:11:55 +000045#define MAX_PUSH 10000000
Daniel Veillard94cc1032005-09-15 13:09:00 +000046
/*
 * Parser helper macros. They all expect a variable named ctxt, giving
 * access to the ->cur and ->error fields of the parser context, to be
 * in scope where they are expanded.
 */

/*
 * ERROR: flag a compilation error on ctxt and report it with the
 * message str. The two statements are wrapped in do { } while (0) so
 * that they stay together when the macro is the body of an unbraced
 * if/else; the invocation must be terminated by a semicolon.
 */
#define ERROR(str)							\
    do {								\
	ctxt->error = XML_REGEXP_COMPILE_ERROR;				\
	xmlRegexpErrCompile(ctxt, str);					\
    } while (0)
/* NEXT: advance the parsing position by one xmlChar */
#define NEXT ctxt->cur++
/* CUR: the xmlChar at the parsing position */
#define CUR (*(ctxt->cur))
/* NXT: the xmlChar found index positions after the current one */
#define NXT(index) (ctxt->cur[index])

/* CUR_SCHAR: decode the character starting at s, its length goes in l */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
/* NEXTL: advance the parsing position by l (expansion ends with ';') */
#define NEXTL(l) ctxt->cur += l;
#define XML_REG_STRING_SEPARATOR '|'
Daniel Veillard4255d502002-04-16 15:50:10 +000057
/**
 * TODO:
 *
 * Marker for code paths which are not implemented yet: expanding it
 * reports the file and line of the expansion site through
 * xmlGenericError(). The expansion already ends with a semicolon.
 */
#define TODO \
    xmlGenericError(xmlGenericErrorContext, "Unimplemented block at %s:%d\n", \
                    __FILE__, __LINE__);
67
Daniel Veillard4255d502002-04-16 15:50:10 +000068/************************************************************************
69 * *
70 * Datatypes and structures *
71 * *
72 ************************************************************************/
73
/*
 * The kinds of atoms handled by the engine.
 *
 * Note: the order (and hence the numeric value) of the enums below is
 * significant, do not shuffle. The first value of each group is spelled
 * out to make an accidental renumbering easier to spot.
 */
typedef enum {
    /* basic atoms */
    XML_REGEXP_EPSILON = 1,
    XML_REGEXP_CHARVAL,
    XML_REGEXP_RANGES,
    XML_REGEXP_SUBREG,		/* used for () sub regexps */
    XML_REGEXP_STRING,
    /* single character classes */
    XML_REGEXP_ANYCHAR,		/* . */
    XML_REGEXP_ANYSPACE,	/* \s */
    XML_REGEXP_NOTSPACE,	/* \S */
    XML_REGEXP_INITNAME,	/* \l */
    XML_REGEXP_NOTINITNAME,	/* \L */
    XML_REGEXP_NAMECHAR,	/* \c */
    XML_REGEXP_NOTNAMECHAR,	/* \C */
    XML_REGEXP_DECIMAL,		/* \d */
    XML_REGEXP_NOTDECIMAL,	/* \D */
    XML_REGEXP_REALCHAR,	/* \w */
    XML_REGEXP_NOTREALCHAR,	/* \W */
    /* letters */
    XML_REGEXP_LETTER = 100,
    XML_REGEXP_LETTER_UPPERCASE,
    XML_REGEXP_LETTER_LOWERCASE,
    XML_REGEXP_LETTER_TITLECASE,
    XML_REGEXP_LETTER_MODIFIER,
    XML_REGEXP_LETTER_OTHERS,
    /* marks */
    XML_REGEXP_MARK = 106,
    XML_REGEXP_MARK_NONSPACING,
    XML_REGEXP_MARK_SPACECOMBINING,
    XML_REGEXP_MARK_ENCLOSING,
    /* numbers */
    XML_REGEXP_NUMBER = 110,
    XML_REGEXP_NUMBER_DECIMAL,
    XML_REGEXP_NUMBER_LETTER,
    XML_REGEXP_NUMBER_OTHERS,
    /* punctuation */
    XML_REGEXP_PUNCT = 114,
    XML_REGEXP_PUNCT_CONNECTOR,
    XML_REGEXP_PUNCT_DASH,
    XML_REGEXP_PUNCT_OPEN,
    XML_REGEXP_PUNCT_CLOSE,
    XML_REGEXP_PUNCT_INITQUOTE,
    XML_REGEXP_PUNCT_FINQUOTE,
    XML_REGEXP_PUNCT_OTHERS,
    /* separators */
    XML_REGEXP_SEPAR = 122,
    XML_REGEXP_SEPAR_SPACE,
    XML_REGEXP_SEPAR_LINE,
    XML_REGEXP_SEPAR_PARA,
    /* symbols */
    XML_REGEXP_SYMBOL = 126,
    XML_REGEXP_SYMBOL_MATH,
    XML_REGEXP_SYMBOL_CURRENCY,
    XML_REGEXP_SYMBOL_MODIFIER,
    XML_REGEXP_SYMBOL_OTHERS,
    /* others */
    XML_REGEXP_OTHER = 131,
    XML_REGEXP_OTHER_CONTROL,
    XML_REGEXP_OTHER_FORMAT,
    XML_REGEXP_OTHER_PRIVATE,
    XML_REGEXP_OTHER_NA,
    /* named block */
    XML_REGEXP_BLOCK_NAME = 136
} xmlRegAtomType;
132
/*
 * The quantifier attached to an atom; values are consecutive from 1.
 */
typedef enum {
    XML_REGEXP_QUANT_EPSILON = 1,
    XML_REGEXP_QUANT_ONCE = 2,
    XML_REGEXP_QUANT_OPT = 3,
    XML_REGEXP_QUANT_MULT = 4,
    XML_REGEXP_QUANT_PLUS = 5,
    XML_REGEXP_QUANT_ONCEONLY = 6,
    XML_REGEXP_QUANT_ALL = 7,
    XML_REGEXP_QUANT_RANGE = 8
} xmlRegQuantType;
143
/*
 * The role of a state in the automata; values are consecutive from 1.
 */
typedef enum {
    XML_REGEXP_START_STATE = 1,
    XML_REGEXP_FINAL_STATE = 2,
    XML_REGEXP_TRANS_STATE = 3,
    XML_REGEXP_SINK_STATE = 4
} xmlRegStateType;
150
/*
 * Marks stored in the mark and reached fields of a state;
 * XML_REGEXP_MARK_NORMAL is zero, i.e. what a zeroed state starts with.
 */
typedef enum {
    XML_REGEXP_MARK_NORMAL = 0,
    XML_REGEXP_MARK_START = 1,
    XML_REGEXP_MARK_VISITED = 2
} xmlRegMarkedType;
156
157typedef struct _xmlRegRange xmlRegRange;
158typedef xmlRegRange *xmlRegRangePtr;
159
160struct _xmlRegRange {
Daniel Veillardf8b9de32003-11-24 14:27:26 +0000161 int neg; /* 0 normal, 1 not, 2 exclude */
Daniel Veillard4255d502002-04-16 15:50:10 +0000162 xmlRegAtomType type;
163 int start;
164 int end;
165 xmlChar *blockName;
166};
167
168typedef struct _xmlRegAtom xmlRegAtom;
169typedef xmlRegAtom *xmlRegAtomPtr;
170
171typedef struct _xmlAutomataState xmlRegState;
172typedef xmlRegState *xmlRegStatePtr;
173
174struct _xmlRegAtom {
175 int no;
176 xmlRegAtomType type;
177 xmlRegQuantType quant;
178 int min;
179 int max;
180
181 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000182 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000183 int neg;
184 int codepoint;
185 xmlRegStatePtr start;
186 xmlRegStatePtr stop;
187 int maxRanges;
188 int nbRanges;
189 xmlRegRangePtr *ranges;
190 void *data;
191};
192
/*
 * A counter of the automata, described by a pair of bounds.
 */
typedef struct _xmlRegCounter {
    int min;
    int max;
} xmlRegCounter;
typedef xmlRegCounter *xmlRegCounterPtr;
200
201typedef struct _xmlRegTrans xmlRegTrans;
202typedef xmlRegTrans *xmlRegTransPtr;
203
204struct _xmlRegTrans {
205 xmlRegAtomPtr atom;
206 int to;
207 int counter;
208 int count;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000209 int nd;
Daniel Veillard4255d502002-04-16 15:50:10 +0000210};
211
212struct _xmlAutomataState {
213 xmlRegStateType type;
214 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000215 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000216 int no;
Daniel Veillard4255d502002-04-16 15:50:10 +0000217 int maxTrans;
218 int nbTrans;
219 xmlRegTrans *trans;
Daniel Veillarddb68b742005-07-30 13:18:24 +0000220 /* knowing states ponting to us can speed things up */
221 int maxTransTo;
222 int nbTransTo;
223 int *transTo;
Daniel Veillard4255d502002-04-16 15:50:10 +0000224};
225
226typedef struct _xmlAutomata xmlRegParserCtxt;
227typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
228
229struct _xmlAutomata {
230 xmlChar *string;
231 xmlChar *cur;
232
233 int error;
234 int neg;
235
236 xmlRegStatePtr start;
237 xmlRegStatePtr end;
238 xmlRegStatePtr state;
239
240 xmlRegAtomPtr atom;
241
242 int maxAtoms;
243 int nbAtoms;
244 xmlRegAtomPtr *atoms;
245
246 int maxStates;
247 int nbStates;
248 xmlRegStatePtr *states;
249
250 int maxCounters;
251 int nbCounters;
252 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000253
254 int determinist;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000255 int negs;
Daniel Veillard4255d502002-04-16 15:50:10 +0000256};
257
258struct _xmlRegexp {
259 xmlChar *string;
260 int nbStates;
261 xmlRegStatePtr *states;
262 int nbAtoms;
263 xmlRegAtomPtr *atoms;
264 int nbCounters;
265 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000266 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000267 /*
268 * That's the compact form for determinists automatas
269 */
270 int nbstates;
271 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000272 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000273 int nbstrings;
274 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000275};
276
277typedef struct _xmlRegExecRollback xmlRegExecRollback;
278typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
279
280struct _xmlRegExecRollback {
281 xmlRegStatePtr state;/* the current state */
282 int index; /* the index in the input stack */
283 int nextbranch; /* the next transition to explore in that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000284 int *counts; /* save the automata state if it has some */
Daniel Veillard4255d502002-04-16 15:50:10 +0000285};
286
287typedef struct _xmlRegInputToken xmlRegInputToken;
288typedef xmlRegInputToken *xmlRegInputTokenPtr;
289
290struct _xmlRegInputToken {
291 xmlChar *value;
292 void *data;
293};
294
295struct _xmlRegExecCtxt {
296 int status; /* execution status != 0 indicate an error */
William M. Brackddf71d62004-05-06 04:17:26 +0000297 int determinist; /* did we find an indeterministic behaviour */
Daniel Veillard4255d502002-04-16 15:50:10 +0000298 xmlRegexpPtr comp; /* the compiled regexp */
299 xmlRegExecCallbacks callback;
300 void *data;
301
302 xmlRegStatePtr state;/* the current state */
303 int transno; /* the current transition on that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000304 int transcount; /* the number of chars in char counted transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +0000305
306 /*
307 * A stack of rollback states
308 */
309 int maxRollbacks;
310 int nbRollbacks;
311 xmlRegExecRollback *rollbacks;
312
313 /*
314 * The state of the automata if any
315 */
316 int *counts;
317
318 /*
319 * The input stack
320 */
321 int inputStackMax;
322 int inputStackNr;
323 int index;
324 int *charStack;
325 const xmlChar *inputString; /* when operating on characters */
326 xmlRegInputTokenPtr inputStack;/* when operating on strings */
327
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +0000328 /*
329 * error handling
330 */
331 int errStateNo; /* the error state number */
332 xmlRegStatePtr errState; /* the error state */
333 xmlChar *errString; /* the string raising the error */
334 int *errCounts; /* counters at the error state */
Daniel Veillard94cc1032005-09-15 13:09:00 +0000335 int nbPush;
Daniel Veillard4255d502002-04-16 15:50:10 +0000336};
337
Daniel Veillard441bc322002-04-20 17:38:48 +0000338#define REGEXP_ALL_COUNTER 0x123456
339#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000340
Daniel Veillard4255d502002-04-16 15:50:10 +0000341static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000342static void xmlRegFreeState(xmlRegStatePtr state);
343static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard9efc4762005-07-19 14:33:55 +0000344static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
Daniel Veillard567a45b2005-10-18 19:11:55 +0000345static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
346static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
347 int neg, int start, int end, const xmlChar *blockName);
Daniel Veillard4255d502002-04-16 15:50:10 +0000348
349/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000350 * *
351 * Regexp memory error handler *
352 * *
353 ************************************************************************/
354/**
355 * xmlRegexpErrMemory:
William M. Brackddf71d62004-05-06 04:17:26 +0000356 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000357 *
358 * Handle an out of memory condition
359 */
360static void
361xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
362{
363 const char *regexp = NULL;
364 if (ctxt != NULL) {
365 regexp = (const char *) ctxt->string;
366 ctxt->error = XML_ERR_NO_MEMORY;
367 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000368 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000369 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
370 regexp, NULL, 0, 0,
371 "Memory allocation failed : %s\n", extra);
372}
373
374/**
375 * xmlRegexpErrCompile:
William M. Brackddf71d62004-05-06 04:17:26 +0000376 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000377 *
William M. Brackddf71d62004-05-06 04:17:26 +0000378 * Handle a compilation failure
Daniel Veillardff46a042003-10-08 08:53:17 +0000379 */
380static void
381xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
382{
383 const char *regexp = NULL;
384 int idx = 0;
385
386 if (ctxt != NULL) {
387 regexp = (const char *) ctxt->string;
388 idx = ctxt->cur - ctxt->string;
389 ctxt->error = XML_REGEXP_COMPILE_ERROR;
390 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000391 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000392 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
393 regexp, NULL, idx, 0,
394 "failed to compile: %s\n", extra);
395}
396
397/************************************************************************
Daniel Veillard4255d502002-04-16 15:50:10 +0000398 * *
399 * Allocation/Deallocation *
400 * *
401 ************************************************************************/
402
Daniel Veillard23e73572002-09-19 19:56:43 +0000403static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000404/**
405 * xmlRegEpxFromParse:
406 * @ctxt: the parser context used to build it
407 *
William M. Brackddf71d62004-05-06 04:17:26 +0000408 * Allocate a new regexp and fill it with the result from the parser
Daniel Veillard4255d502002-04-16 15:50:10 +0000409 *
410 * Returns the new regexp or NULL in case of error
411 */
412static xmlRegexpPtr
413xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
414 xmlRegexpPtr ret;
415
416 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000417 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000418 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000419 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000420 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000421 memset(ret, 0, sizeof(xmlRegexp));
422 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000423 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000424 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000425 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000426 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000427 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000428 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000429 ret->determinist = ctxt->determinist;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000430 if (ret->determinist == -1) {
431 xmlRegexpIsDeterminist(ret);
432 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000433
434 if ((ret->determinist != 0) &&
435 (ret->nbCounters == 0) &&
Daniel Veillard6e65e152005-08-09 11:09:52 +0000436 (ctxt->negs == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000437 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000438 (ret->atoms[0] != NULL) &&
439 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
440 int i, j, nbstates = 0, nbatoms = 0;
441 int *stateRemap;
442 int *stringRemap;
443 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000444 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000445 xmlChar **stringMap;
446 xmlChar *value;
447
448 /*
449 * Switch to a compact representation
450 * 1/ counting the effective number of states left
William M. Brackddf71d62004-05-06 04:17:26 +0000451 * 2/ counting the unique number of atoms, and check that
Daniel Veillard23e73572002-09-19 19:56:43 +0000452 * they are all of the string type
453 * 3/ build a table state x atom for the transitions
454 */
455
456 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000457 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000458 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000459 xmlFree(ret);
460 return(NULL);
461 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000462 for (i = 0;i < ret->nbStates;i++) {
463 if (ret->states[i] != NULL) {
464 stateRemap[i] = nbstates;
465 nbstates++;
466 } else {
467 stateRemap[i] = -1;
468 }
469 }
470#ifdef DEBUG_COMPACTION
471 printf("Final: %d states\n", nbstates);
472#endif
473 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000474 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000475 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000476 xmlFree(stateRemap);
477 xmlFree(ret);
478 return(NULL);
479 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000480 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000481 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000482 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000483 xmlFree(stringMap);
484 xmlFree(stateRemap);
485 xmlFree(ret);
486 return(NULL);
487 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000488 for (i = 0;i < ret->nbAtoms;i++) {
489 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
490 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
491 value = ret->atoms[i]->valuep;
492 for (j = 0;j < nbatoms;j++) {
493 if (xmlStrEqual(stringMap[j], value)) {
494 stringRemap[i] = j;
495 break;
496 }
497 }
498 if (j >= nbatoms) {
499 stringRemap[i] = nbatoms;
500 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000501 if (stringMap[nbatoms] == NULL) {
502 for (i = 0;i < nbatoms;i++)
503 xmlFree(stringMap[i]);
504 xmlFree(stringRemap);
505 xmlFree(stringMap);
506 xmlFree(stateRemap);
507 xmlFree(ret);
508 return(NULL);
509 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000510 nbatoms++;
511 }
512 } else {
513 xmlFree(stateRemap);
514 xmlFree(stringRemap);
515 for (i = 0;i < nbatoms;i++)
516 xmlFree(stringMap[i]);
517 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000518 xmlFree(ret);
519 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000520 }
521 }
522#ifdef DEBUG_COMPACTION
523 printf("Final: %d atoms\n", nbatoms);
524#endif
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000525 transitions = (int *) xmlMalloc((nbstates + 1) *
526 (nbatoms + 1) * sizeof(int));
527 if (transitions == NULL) {
528 xmlFree(stateRemap);
529 xmlFree(stringRemap);
530 xmlFree(stringMap);
531 xmlFree(ret);
532 return(NULL);
533 }
534 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
Daniel Veillard23e73572002-09-19 19:56:43 +0000535
536 /*
537 * Allocate the transition table. The first entry for each
William M. Brackddf71d62004-05-06 04:17:26 +0000538 * state corresponds to the state type.
Daniel Veillard23e73572002-09-19 19:56:43 +0000539 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000540 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000541
542 for (i = 0;i < ret->nbStates;i++) {
543 int stateno, atomno, targetno, prev;
544 xmlRegStatePtr state;
545 xmlRegTransPtr trans;
546
547 stateno = stateRemap[i];
548 if (stateno == -1)
549 continue;
550 state = ret->states[i];
551
552 transitions[stateno * (nbatoms + 1)] = state->type;
553
554 for (j = 0;j < state->nbTrans;j++) {
555 trans = &(state->trans[j]);
556 if ((trans->to == -1) || (trans->atom == NULL))
557 continue;
558 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000559 if ((trans->atom->data != NULL) && (transdata == NULL)) {
560 transdata = (void **) xmlMalloc(nbstates * nbatoms *
561 sizeof(void *));
562 if (transdata != NULL)
563 memset(transdata, 0,
564 nbstates * nbatoms * sizeof(void *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000565 else {
Daniel Veillardff46a042003-10-08 08:53:17 +0000566 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000567 break;
568 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000569 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000570 targetno = stateRemap[trans->to];
571 /*
William M. Brackddf71d62004-05-06 04:17:26 +0000572 * if the same atom can generate transitions to 2 different
Daniel Veillard23e73572002-09-19 19:56:43 +0000573 * states then it means the automata is not determinist and
574 * the compact form can't be used !
575 */
576 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
577 if (prev != 0) {
578 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000579 ret->determinist = 0;
580#ifdef DEBUG_COMPACTION
581 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
582 i, j, trans->atom->no, trans->to, atomno, targetno);
583 printf(" previous to is %d\n", prev);
584#endif
Daniel Veillard118aed72002-09-24 14:13:13 +0000585 if (transdata != NULL)
586 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000587 xmlFree(transitions);
588 xmlFree(stateRemap);
589 xmlFree(stringRemap);
590 for (i = 0;i < nbatoms;i++)
591 xmlFree(stringMap[i]);
592 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000593 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000594 }
595 } else {
596#if 0
597 printf("State %d trans %d: atom %d to %d : %d to %d\n",
598 i, j, trans->atom->no, trans->to, atomno, targetno);
599#endif
600 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000601 targetno + 1; /* to avoid 0 */
602 if (transdata != NULL)
603 transdata[stateno * nbatoms + atomno] =
604 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000605 }
606 }
607 }
608 ret->determinist = 1;
609#ifdef DEBUG_COMPACTION
610 /*
611 * Debug
612 */
613 for (i = 0;i < nbstates;i++) {
614 for (j = 0;j < nbatoms + 1;j++) {
615 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
616 }
617 printf("\n");
618 }
619 printf("\n");
620#endif
621 /*
622 * Cleanup of the old data
623 */
624 if (ret->states != NULL) {
625 for (i = 0;i < ret->nbStates;i++)
626 xmlRegFreeState(ret->states[i]);
627 xmlFree(ret->states);
628 }
629 ret->states = NULL;
630 ret->nbStates = 0;
631 if (ret->atoms != NULL) {
632 for (i = 0;i < ret->nbAtoms;i++)
633 xmlRegFreeAtom(ret->atoms[i]);
634 xmlFree(ret->atoms);
635 }
636 ret->atoms = NULL;
637 ret->nbAtoms = 0;
638
639 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000640 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000641 ret->stringMap = stringMap;
642 ret->nbstrings = nbatoms;
643 ret->nbstates = nbstates;
644 xmlFree(stateRemap);
645 xmlFree(stringRemap);
646 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000647not_determ:
648 ctxt->string = NULL;
649 ctxt->nbStates = 0;
650 ctxt->states = NULL;
651 ctxt->nbAtoms = 0;
652 ctxt->atoms = NULL;
653 ctxt->nbCounters = 0;
654 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000655 return(ret);
656}
657
658/**
659 * xmlRegNewParserCtxt:
660 * @string: the string to parse
661 *
662 * Allocate a new regexp parser context
663 *
664 * Returns the new context or NULL in case of error
665 */
666static xmlRegParserCtxtPtr
667xmlRegNewParserCtxt(const xmlChar *string) {
668 xmlRegParserCtxtPtr ret;
669
670 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
671 if (ret == NULL)
672 return(NULL);
673 memset(ret, 0, sizeof(xmlRegParserCtxt));
674 if (string != NULL)
675 ret->string = xmlStrdup(string);
676 ret->cur = ret->string;
677 ret->neg = 0;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000678 ret->negs = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +0000679 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000680 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000681 return(ret);
682}
683
684/**
685 * xmlRegNewRange:
686 * @ctxt: the regexp parser context
687 * @neg: is that negative
688 * @type: the type of range
689 * @start: the start codepoint
690 * @end: the end codepoint
691 *
692 * Allocate a new regexp range
693 *
694 * Returns the new range or NULL in case of error
695 */
696static xmlRegRangePtr
697xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
698 int neg, xmlRegAtomType type, int start, int end) {
699 xmlRegRangePtr ret;
700
701 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
702 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000703 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000704 return(NULL);
705 }
706 ret->neg = neg;
707 ret->type = type;
708 ret->start = start;
709 ret->end = end;
710 return(ret);
711}
712
713/**
714 * xmlRegFreeRange:
715 * @range: the regexp range
716 *
717 * Free a regexp range
718 */
719static void
720xmlRegFreeRange(xmlRegRangePtr range) {
721 if (range == NULL)
722 return;
723
724 if (range->blockName != NULL)
725 xmlFree(range->blockName);
726 xmlFree(range);
727}
728
729/**
730 * xmlRegNewAtom:
731 * @ctxt: the regexp parser context
732 * @type: the type of atom
733 *
734 * Allocate a new regexp range
735 *
736 * Returns the new atom or NULL in case of error
737 */
738static xmlRegAtomPtr
739xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
740 xmlRegAtomPtr ret;
741
742 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
743 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000744 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000745 return(NULL);
746 }
747 memset(ret, 0, sizeof(xmlRegAtom));
748 ret->type = type;
749 ret->quant = XML_REGEXP_QUANT_ONCE;
750 ret->min = 0;
751 ret->max = 0;
752 return(ret);
753}
754
755/**
756 * xmlRegFreeAtom:
757 * @atom: the regexp atom
758 *
759 * Free a regexp atom
760 */
761static void
762xmlRegFreeAtom(xmlRegAtomPtr atom) {
763 int i;
764
765 if (atom == NULL)
766 return;
767
768 for (i = 0;i < atom->nbRanges;i++)
769 xmlRegFreeRange(atom->ranges[i]);
770 if (atom->ranges != NULL)
771 xmlFree(atom->ranges);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000772 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
773 xmlFree(atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +0000774 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
775 xmlFree(atom->valuep2);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000776 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +0000777 xmlFree(atom->valuep);
778 xmlFree(atom);
779}
780
781static xmlRegStatePtr
782xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
783 xmlRegStatePtr ret;
784
785 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
786 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000787 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000788 return(NULL);
789 }
790 memset(ret, 0, sizeof(xmlRegState));
791 ret->type = XML_REGEXP_TRANS_STATE;
792 ret->mark = XML_REGEXP_MARK_NORMAL;
793 return(ret);
794}
795
796/**
797 * xmlRegFreeState:
798 * @state: the regexp state
799 *
800 * Free a regexp state
801 */
802static void
803xmlRegFreeState(xmlRegStatePtr state) {
804 if (state == NULL)
805 return;
806
807 if (state->trans != NULL)
808 xmlFree(state->trans);
Daniel Veillarddb68b742005-07-30 13:18:24 +0000809 if (state->transTo != NULL)
810 xmlFree(state->transTo);
Daniel Veillard4255d502002-04-16 15:50:10 +0000811 xmlFree(state);
812}
813
814/**
815 * xmlRegFreeParserCtxt:
816 * @ctxt: the regexp parser context
817 *
818 * Free a regexp parser context
819 */
820static void
821xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
822 int i;
823 if (ctxt == NULL)
824 return;
825
826 if (ctxt->string != NULL)
827 xmlFree(ctxt->string);
828 if (ctxt->states != NULL) {
829 for (i = 0;i < ctxt->nbStates;i++)
830 xmlRegFreeState(ctxt->states[i]);
831 xmlFree(ctxt->states);
832 }
833 if (ctxt->atoms != NULL) {
834 for (i = 0;i < ctxt->nbAtoms;i++)
835 xmlRegFreeAtom(ctxt->atoms[i]);
836 xmlFree(ctxt->atoms);
837 }
838 if (ctxt->counters != NULL)
839 xmlFree(ctxt->counters);
840 xmlFree(ctxt);
841}
842
843/************************************************************************
844 * *
845 * Display of Data structures *
846 * *
847 ************************************************************************/
848
849static void
850xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
851 switch (type) {
852 case XML_REGEXP_EPSILON:
853 fprintf(output, "epsilon "); break;
854 case XML_REGEXP_CHARVAL:
855 fprintf(output, "charval "); break;
856 case XML_REGEXP_RANGES:
857 fprintf(output, "ranges "); break;
858 case XML_REGEXP_SUBREG:
859 fprintf(output, "subexpr "); break;
860 case XML_REGEXP_STRING:
861 fprintf(output, "string "); break;
862 case XML_REGEXP_ANYCHAR:
863 fprintf(output, "anychar "); break;
864 case XML_REGEXP_ANYSPACE:
865 fprintf(output, "anyspace "); break;
866 case XML_REGEXP_NOTSPACE:
867 fprintf(output, "notspace "); break;
868 case XML_REGEXP_INITNAME:
869 fprintf(output, "initname "); break;
870 case XML_REGEXP_NOTINITNAME:
871 fprintf(output, "notinitname "); break;
872 case XML_REGEXP_NAMECHAR:
873 fprintf(output, "namechar "); break;
874 case XML_REGEXP_NOTNAMECHAR:
875 fprintf(output, "notnamechar "); break;
876 case XML_REGEXP_DECIMAL:
877 fprintf(output, "decimal "); break;
878 case XML_REGEXP_NOTDECIMAL:
879 fprintf(output, "notdecimal "); break;
880 case XML_REGEXP_REALCHAR:
881 fprintf(output, "realchar "); break;
882 case XML_REGEXP_NOTREALCHAR:
883 fprintf(output, "notrealchar "); break;
884 case XML_REGEXP_LETTER:
885 fprintf(output, "LETTER "); break;
886 case XML_REGEXP_LETTER_UPPERCASE:
887 fprintf(output, "LETTER_UPPERCASE "); break;
888 case XML_REGEXP_LETTER_LOWERCASE:
889 fprintf(output, "LETTER_LOWERCASE "); break;
890 case XML_REGEXP_LETTER_TITLECASE:
891 fprintf(output, "LETTER_TITLECASE "); break;
892 case XML_REGEXP_LETTER_MODIFIER:
893 fprintf(output, "LETTER_MODIFIER "); break;
894 case XML_REGEXP_LETTER_OTHERS:
895 fprintf(output, "LETTER_OTHERS "); break;
896 case XML_REGEXP_MARK:
897 fprintf(output, "MARK "); break;
898 case XML_REGEXP_MARK_NONSPACING:
899 fprintf(output, "MARK_NONSPACING "); break;
900 case XML_REGEXP_MARK_SPACECOMBINING:
901 fprintf(output, "MARK_SPACECOMBINING "); break;
902 case XML_REGEXP_MARK_ENCLOSING:
903 fprintf(output, "MARK_ENCLOSING "); break;
904 case XML_REGEXP_NUMBER:
905 fprintf(output, "NUMBER "); break;
906 case XML_REGEXP_NUMBER_DECIMAL:
907 fprintf(output, "NUMBER_DECIMAL "); break;
908 case XML_REGEXP_NUMBER_LETTER:
909 fprintf(output, "NUMBER_LETTER "); break;
910 case XML_REGEXP_NUMBER_OTHERS:
911 fprintf(output, "NUMBER_OTHERS "); break;
912 case XML_REGEXP_PUNCT:
913 fprintf(output, "PUNCT "); break;
914 case XML_REGEXP_PUNCT_CONNECTOR:
915 fprintf(output, "PUNCT_CONNECTOR "); break;
916 case XML_REGEXP_PUNCT_DASH:
917 fprintf(output, "PUNCT_DASH "); break;
918 case XML_REGEXP_PUNCT_OPEN:
919 fprintf(output, "PUNCT_OPEN "); break;
920 case XML_REGEXP_PUNCT_CLOSE:
921 fprintf(output, "PUNCT_CLOSE "); break;
922 case XML_REGEXP_PUNCT_INITQUOTE:
923 fprintf(output, "PUNCT_INITQUOTE "); break;
924 case XML_REGEXP_PUNCT_FINQUOTE:
925 fprintf(output, "PUNCT_FINQUOTE "); break;
926 case XML_REGEXP_PUNCT_OTHERS:
927 fprintf(output, "PUNCT_OTHERS "); break;
928 case XML_REGEXP_SEPAR:
929 fprintf(output, "SEPAR "); break;
930 case XML_REGEXP_SEPAR_SPACE:
931 fprintf(output, "SEPAR_SPACE "); break;
932 case XML_REGEXP_SEPAR_LINE:
933 fprintf(output, "SEPAR_LINE "); break;
934 case XML_REGEXP_SEPAR_PARA:
935 fprintf(output, "SEPAR_PARA "); break;
936 case XML_REGEXP_SYMBOL:
937 fprintf(output, "SYMBOL "); break;
938 case XML_REGEXP_SYMBOL_MATH:
939 fprintf(output, "SYMBOL_MATH "); break;
940 case XML_REGEXP_SYMBOL_CURRENCY:
941 fprintf(output, "SYMBOL_CURRENCY "); break;
942 case XML_REGEXP_SYMBOL_MODIFIER:
943 fprintf(output, "SYMBOL_MODIFIER "); break;
944 case XML_REGEXP_SYMBOL_OTHERS:
945 fprintf(output, "SYMBOL_OTHERS "); break;
946 case XML_REGEXP_OTHER:
947 fprintf(output, "OTHER "); break;
948 case XML_REGEXP_OTHER_CONTROL:
949 fprintf(output, "OTHER_CONTROL "); break;
950 case XML_REGEXP_OTHER_FORMAT:
951 fprintf(output, "OTHER_FORMAT "); break;
952 case XML_REGEXP_OTHER_PRIVATE:
953 fprintf(output, "OTHER_PRIVATE "); break;
954 case XML_REGEXP_OTHER_NA:
955 fprintf(output, "OTHER_NA "); break;
956 case XML_REGEXP_BLOCK_NAME:
957 fprintf(output, "BLOCK "); break;
958 }
959}
960
961static void
962xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
963 switch (type) {
964 case XML_REGEXP_QUANT_EPSILON:
965 fprintf(output, "epsilon "); break;
966 case XML_REGEXP_QUANT_ONCE:
967 fprintf(output, "once "); break;
968 case XML_REGEXP_QUANT_OPT:
969 fprintf(output, "? "); break;
970 case XML_REGEXP_QUANT_MULT:
971 fprintf(output, "* "); break;
972 case XML_REGEXP_QUANT_PLUS:
973 fprintf(output, "+ "); break;
974 case XML_REGEXP_QUANT_RANGE:
975 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000976 case XML_REGEXP_QUANT_ONCEONLY:
977 fprintf(output, "onceonly "); break;
978 case XML_REGEXP_QUANT_ALL:
979 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000980 }
981}
982static void
983xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
984 fprintf(output, " range: ");
985 if (range->neg)
986 fprintf(output, "negative ");
987 xmlRegPrintAtomType(output, range->type);
988 fprintf(output, "%c - %c\n", range->start, range->end);
989}
990
991static void
992xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
993 fprintf(output, " atom: ");
994 if (atom == NULL) {
995 fprintf(output, "NULL\n");
996 return;
997 }
Daniel Veillard9efc4762005-07-19 14:33:55 +0000998 if (atom->neg)
999 fprintf(output, "not ");
Daniel Veillard4255d502002-04-16 15:50:10 +00001000 xmlRegPrintAtomType(output, atom->type);
1001 xmlRegPrintQuantType(output, atom->quant);
1002 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1003 fprintf(output, "%d-%d ", atom->min, atom->max);
1004 if (atom->type == XML_REGEXP_STRING)
1005 fprintf(output, "'%s' ", (char *) atom->valuep);
1006 if (atom->type == XML_REGEXP_CHARVAL)
1007 fprintf(output, "char %c\n", atom->codepoint);
1008 else if (atom->type == XML_REGEXP_RANGES) {
1009 int i;
1010 fprintf(output, "%d entries\n", atom->nbRanges);
1011 for (i = 0; i < atom->nbRanges;i++)
1012 xmlRegPrintRange(output, atom->ranges[i]);
1013 } else if (atom->type == XML_REGEXP_SUBREG) {
1014 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1015 } else {
1016 fprintf(output, "\n");
1017 }
1018}
1019
1020static void
1021xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1022 fprintf(output, " trans: ");
1023 if (trans == NULL) {
1024 fprintf(output, "NULL\n");
1025 return;
1026 }
1027 if (trans->to < 0) {
1028 fprintf(output, "removed\n");
1029 return;
1030 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001031 if (trans->nd != 0) {
1032 if (trans->nd == 2)
1033 fprintf(output, "last not determinist, ");
1034 else
1035 fprintf(output, "not determinist, ");
1036 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001037 if (trans->counter >= 0) {
1038 fprintf(output, "counted %d, ", trans->counter);
1039 }
Daniel Veillard8a001f62002-04-20 07:24:11 +00001040 if (trans->count == REGEXP_ALL_COUNTER) {
1041 fprintf(output, "all transition, ");
1042 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001043 fprintf(output, "count based %d, ", trans->count);
1044 }
1045 if (trans->atom == NULL) {
1046 fprintf(output, "epsilon to %d\n", trans->to);
1047 return;
1048 }
1049 if (trans->atom->type == XML_REGEXP_CHARVAL)
1050 fprintf(output, "char %c ", trans->atom->codepoint);
1051 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1052}
1053
1054static void
1055xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1056 int i;
1057
1058 fprintf(output, " state: ");
1059 if (state == NULL) {
1060 fprintf(output, "NULL\n");
1061 return;
1062 }
1063 if (state->type == XML_REGEXP_START_STATE)
1064 fprintf(output, "START ");
1065 if (state->type == XML_REGEXP_FINAL_STATE)
1066 fprintf(output, "FINAL ");
1067
1068 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1069 for (i = 0;i < state->nbTrans; i++) {
1070 xmlRegPrintTrans(output, &(state->trans[i]));
1071 }
1072}
1073
#ifdef DEBUG_REGEXP_GRAPH
/**
 * xmlRegPrintCtxt:
 * @output:  the stream receiving the dump
 * @ctxt:  the regexp parser context to describe, may be NULL
 *
 * Debug helper dumping the parser context: the expression string, the
 * error and neg flags, every registered atom, the atom being built if
 * any, every state (with the start and end state numbers) and finally the
 * min/max bounds of every counter. Only compiled when DEBUG_REGEXP_GRAPH
 * is defined.
 */
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
    int idx;

    fprintf(output, " ctxt: ");
    if (ctxt == NULL) {
        fprintf(output, "NULL\n");
        return;
    }

    /* header line: expression and flags */
    fprintf(output, "'%s' ", ctxt->string);
    if (ctxt->error != 0) {
        fprintf(output, "error ");
    }
    if (ctxt->neg != 0) {
        fprintf(output, "neg ");
    }
    fprintf(output, "\n");

    /* atoms */
    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
    for (idx = 0; idx < ctxt->nbAtoms; idx++) {
        fprintf(output, " %02d ", idx);
        xmlRegPrintAtom(output, ctxt->atoms[idx]);
    }
    if (ctxt->atom != NULL) {
        fprintf(output, "current atom:\n");
        xmlRegPrintAtom(output, ctxt->atom);
    }

    /* states */
    fprintf(output, "%d states:", ctxt->nbStates);
    if (ctxt->start != NULL) {
        fprintf(output, " start: %d", ctxt->start->no);
    }
    if (ctxt->end != NULL) {
        fprintf(output, " end: %d", ctxt->end->no);
    }
    fprintf(output, "\n");
    for (idx = 0; idx < ctxt->nbStates; idx++) {
        xmlRegPrintState(output, ctxt->states[idx]);
    }

    /* counters */
    fprintf(output, "%d counters:\n", ctxt->nbCounters);
    for (idx = 0; idx < ctxt->nbCounters; idx++) {
        fprintf(output, " %d: min %d max %d\n", idx,
                ctxt->counters[idx].min, ctxt->counters[idx].max);
    }
}
#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001115
1116/************************************************************************
1117 * *
1118 * Finite Automata structures manipulations *
1119 * *
1120 ************************************************************************/
1121
1122static void
1123xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1124 int neg, xmlRegAtomType type, int start, int end,
1125 xmlChar *blockName) {
1126 xmlRegRangePtr range;
1127
1128 if (atom == NULL) {
1129 ERROR("add range: atom is NULL");
1130 return;
1131 }
1132 if (atom->type != XML_REGEXP_RANGES) {
1133 ERROR("add range: atom is not ranges");
1134 return;
1135 }
1136 if (atom->maxRanges == 0) {
1137 atom->maxRanges = 4;
1138 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1139 sizeof(xmlRegRangePtr));
1140 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001141 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001142 atom->maxRanges = 0;
1143 return;
1144 }
1145 } else if (atom->nbRanges >= atom->maxRanges) {
1146 xmlRegRangePtr *tmp;
1147 atom->maxRanges *= 2;
1148 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1149 sizeof(xmlRegRangePtr));
1150 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001151 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001152 atom->maxRanges /= 2;
1153 return;
1154 }
1155 atom->ranges = tmp;
1156 }
1157 range = xmlRegNewRange(ctxt, neg, type, start, end);
1158 if (range == NULL)
1159 return;
1160 range->blockName = blockName;
1161 atom->ranges[atom->nbRanges++] = range;
1162
1163}
1164
1165static int
1166xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1167 if (ctxt->maxCounters == 0) {
1168 ctxt->maxCounters = 4;
1169 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1170 sizeof(xmlRegCounter));
1171 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001172 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001173 ctxt->maxCounters = 0;
1174 return(-1);
1175 }
1176 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1177 xmlRegCounter *tmp;
1178 ctxt->maxCounters *= 2;
1179 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1180 sizeof(xmlRegCounter));
1181 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001182 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001183 ctxt->maxCounters /= 2;
1184 return(-1);
1185 }
1186 ctxt->counters = tmp;
1187 }
1188 ctxt->counters[ctxt->nbCounters].min = -1;
1189 ctxt->counters[ctxt->nbCounters].max = -1;
1190 return(ctxt->nbCounters++);
1191}
1192
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001193static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001194xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1195 if (atom == NULL) {
1196 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001197 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001198 }
1199 if (ctxt->maxAtoms == 0) {
1200 ctxt->maxAtoms = 4;
1201 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1202 sizeof(xmlRegAtomPtr));
1203 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001204 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001205 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001206 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001207 }
1208 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1209 xmlRegAtomPtr *tmp;
1210 ctxt->maxAtoms *= 2;
1211 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1212 sizeof(xmlRegAtomPtr));
1213 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001214 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001215 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001216 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001217 }
1218 ctxt->atoms = tmp;
1219 }
1220 atom->no = ctxt->nbAtoms;
1221 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001222 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001223}
1224
1225static void
Daniel Veillarddb68b742005-07-30 13:18:24 +00001226xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1227 int from) {
1228 if (target->maxTransTo == 0) {
1229 target->maxTransTo = 8;
1230 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1231 sizeof(int));
1232 if (target->transTo == NULL) {
1233 xmlRegexpErrMemory(ctxt, "adding transition");
1234 target->maxTransTo = 0;
1235 return;
1236 }
1237 } else if (target->nbTransTo >= target->maxTransTo) {
1238 int *tmp;
1239 target->maxTransTo *= 2;
1240 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1241 sizeof(int));
1242 if (tmp == NULL) {
1243 xmlRegexpErrMemory(ctxt, "adding transition");
1244 target->maxTransTo /= 2;
1245 return;
1246 }
1247 target->transTo = tmp;
1248 }
1249 target->transTo[target->nbTransTo] = from;
1250 target->nbTransTo++;
1251}
1252
1253static void
Daniel Veillard4255d502002-04-16 15:50:10 +00001254xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1255 xmlRegAtomPtr atom, xmlRegStatePtr target,
Daniel Veillard5de09382005-09-26 17:18:17 +00001256 int counter, int count) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001257
1258 int nrtrans;
1259
Daniel Veillard4255d502002-04-16 15:50:10 +00001260 if (state == NULL) {
1261 ERROR("add state: state is NULL");
1262 return;
1263 }
1264 if (target == NULL) {
1265 ERROR("add state: target is NULL");
1266 return;
1267 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001268 /*
1269 * Other routines follow the philosophy 'When in doubt, add a transition'
1270 * so we check here whether such a transition is already present and, if
1271 * so, silently ignore this request.
1272 */
1273
Daniel Veillard5de09382005-09-26 17:18:17 +00001274 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1275 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1276 if ((trans->atom == atom) &&
1277 (trans->to == target->no) &&
1278 (trans->counter == counter) &&
1279 (trans->count == count)) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001280#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard5de09382005-09-26 17:18:17 +00001281 printf("Ignoring duplicate transition from %d to %d\n",
1282 state->no, target->no);
William M. Brackf9b5fa22004-05-10 07:52:15 +00001283#endif
Daniel Veillard5de09382005-09-26 17:18:17 +00001284 return;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001285 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001286 }
1287
Daniel Veillard4255d502002-04-16 15:50:10 +00001288 if (state->maxTrans == 0) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001289 state->maxTrans = 8;
Daniel Veillard4255d502002-04-16 15:50:10 +00001290 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1291 sizeof(xmlRegTrans));
1292 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001293 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001294 state->maxTrans = 0;
1295 return;
1296 }
1297 } else if (state->nbTrans >= state->maxTrans) {
1298 xmlRegTrans *tmp;
1299 state->maxTrans *= 2;
1300 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1301 sizeof(xmlRegTrans));
1302 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001303 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001304 state->maxTrans /= 2;
1305 return;
1306 }
1307 state->trans = tmp;
1308 }
1309#ifdef DEBUG_REGEXP_GRAPH
1310 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001311 if (count == REGEXP_ALL_COUNTER)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001312 printf("all transition\n");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001313 else if (count >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001314 printf("count based %d\n", count);
Daniel Veillard4255d502002-04-16 15:50:10 +00001315 else if (counter >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001316 printf("counted %d\n", counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001317 else if (atom == NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001318 printf("epsilon transition\n");
1319 else if (atom != NULL)
1320 xmlRegPrintAtom(stdout, atom);
Daniel Veillard4255d502002-04-16 15:50:10 +00001321#endif
1322
1323 state->trans[state->nbTrans].atom = atom;
1324 state->trans[state->nbTrans].to = target->no;
1325 state->trans[state->nbTrans].counter = counter;
1326 state->trans[state->nbTrans].count = count;
Daniel Veillard567a45b2005-10-18 19:11:55 +00001327 state->trans[state->nbTrans].nd = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00001328 state->nbTrans++;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001329 xmlRegStateAddTransTo(ctxt, target, state->no);
Daniel Veillard4255d502002-04-16 15:50:10 +00001330}
1331
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001332static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001333xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001334 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001335 if (ctxt->maxStates == 0) {
1336 ctxt->maxStates = 4;
1337 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1338 sizeof(xmlRegStatePtr));
1339 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001340 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001341 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001342 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001343 }
1344 } else if (ctxt->nbStates >= ctxt->maxStates) {
1345 xmlRegStatePtr *tmp;
1346 ctxt->maxStates *= 2;
1347 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1348 sizeof(xmlRegStatePtr));
1349 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001350 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001351 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001352 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001353 }
1354 ctxt->states = tmp;
1355 }
1356 state->no = ctxt->nbStates;
1357 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001358 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001359}
1360
1361/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001362 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001363 * @ctxt: a regexp parser context
1364 * @from: the from state
1365 * @to: the target state or NULL for building a new one
1366 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001367 *
1368 */
1369static void
1370xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001371 xmlRegStatePtr from, xmlRegStatePtr to,
1372 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001373 if (to == NULL) {
1374 to = xmlRegNewState(ctxt);
1375 xmlRegStatePush(ctxt, to);
1376 ctxt->state = to;
1377 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001378 if (lax)
Daniel Veillard5de09382005-09-26 17:18:17 +00001379 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
Daniel Veillard441bc322002-04-20 17:38:48 +00001380 else
Daniel Veillard5de09382005-09-26 17:18:17 +00001381 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001382}
1383
1384/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001385 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001386 * @ctxt: a regexp parser context
1387 * @from: the from state
1388 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001389 *
1390 */
1391static void
1392xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1393 xmlRegStatePtr from, xmlRegStatePtr to) {
1394 if (to == NULL) {
1395 to = xmlRegNewState(ctxt);
1396 xmlRegStatePush(ctxt, to);
1397 ctxt->state = to;
1398 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001399 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001400}
1401
1402/**
1403 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001404 * @ctxt: a regexp parser context
1405 * @from: the from state
1406 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001407 * counter: the counter for that transition
1408 *
1409 */
1410static void
1411xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1412 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1413 if (to == NULL) {
1414 to = xmlRegNewState(ctxt);
1415 xmlRegStatePush(ctxt, to);
1416 ctxt->state = to;
1417 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001418 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001419}
1420
1421/**
1422 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001423 * @ctxt: a regexp parser context
1424 * @from: the from state
1425 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001426 * counter: the counter for that transition
1427 *
1428 */
1429static void
1430xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1431 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1432 if (to == NULL) {
1433 to = xmlRegNewState(ctxt);
1434 xmlRegStatePush(ctxt, to);
1435 ctxt->state = to;
1436 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001437 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001438}
1439
1440/**
1441 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001442 * @ctxt: a regexp parser context
1443 * @from: the from state
1444 * @to: the target state or NULL for building a new one
1445 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001446 *
William M. Brackddf71d62004-05-06 04:17:26 +00001447 * Returns 0 if success and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001448 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001449static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001450xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1451 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1452 if (atom == NULL) {
1453 ERROR("genrate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001454 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001455 }
1456 if (atom->type == XML_REGEXP_SUBREG) {
1457 /*
1458 * this is a subexpression handling one should not need to
William M. Brackddf71d62004-05-06 04:17:26 +00001459 * create a new node except for XML_REGEXP_QUANT_RANGE.
Daniel Veillard4255d502002-04-16 15:50:10 +00001460 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001461 if (xmlRegAtomPush(ctxt, atom) < 0) {
1462 return(-1);
1463 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001464 if ((to != NULL) && (atom->stop != to) &&
1465 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1466 /*
1467 * Generate an epsilon transition to link to the target
1468 */
1469 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
Daniel Veillardaa622012005-10-20 15:55:25 +00001470#ifdef DV
1471 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1472 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1473 to = xmlRegNewState(ctxt);
1474 xmlRegStatePush(ctxt, to);
1475 ctxt->state = to;
1476 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1477#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001478 }
1479 switch (atom->quant) {
1480 case XML_REGEXP_QUANT_OPT:
1481 atom->quant = XML_REGEXP_QUANT_ONCE;
1482 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1483 break;
1484 case XML_REGEXP_QUANT_MULT:
1485 atom->quant = XML_REGEXP_QUANT_ONCE;
1486 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1487 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1488 break;
1489 case XML_REGEXP_QUANT_PLUS:
1490 atom->quant = XML_REGEXP_QUANT_ONCE;
1491 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1492 break;
1493 case XML_REGEXP_QUANT_RANGE: {
1494 int counter;
1495 xmlRegStatePtr newstate;
1496
1497 /*
1498 * This one is nasty:
William M. Brackddf71d62004-05-06 04:17:26 +00001499 * 1/ if range has minOccurs == 0, create a new state
1500 * and create epsilon transitions from atom->start
1501 * to atom->stop, as well as atom->start to the new
1502 * state
1503 * 2/ register a new counter
1504 * 3/ register an epsilon transition associated to
Daniel Veillard4255d502002-04-16 15:50:10 +00001505 * this counter going from atom->stop to atom->start
William M. Brackddf71d62004-05-06 04:17:26 +00001506 * 4/ create a new state
1507 * 5/ generate a counted transition from atom->stop to
Daniel Veillard4255d502002-04-16 15:50:10 +00001508 * that state
1509 */
William M. Brackddf71d62004-05-06 04:17:26 +00001510 if (atom->min == 0) {
1511 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1512 atom->stop);
1513 newstate = xmlRegNewState(ctxt);
1514 xmlRegStatePush(ctxt, newstate);
1515 ctxt->state = newstate;
1516 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1517 newstate);
1518 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001519 counter = xmlRegGetCounter(ctxt);
1520 ctxt->counters[counter].min = atom->min - 1;
1521 ctxt->counters[counter].max = atom->max - 1;
1522 atom->min = 0;
1523 atom->max = 0;
1524 atom->quant = XML_REGEXP_QUANT_ONCE;
1525 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1526 atom->start, counter);
1527 if (to != NULL) {
1528 newstate = to;
1529 } else {
1530 newstate = xmlRegNewState(ctxt);
1531 xmlRegStatePush(ctxt, newstate);
Daniel Veillard4255d502002-04-16 15:50:10 +00001532 }
Daniel Veillard9a00fd22005-11-09 08:56:26 +00001533 ctxt->state = newstate;
Daniel Veillard4255d502002-04-16 15:50:10 +00001534 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1535 newstate, counter);
1536 }
1537 default:
1538 break;
1539 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001540 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00001541 }
1542 if ((atom->min == 0) && (atom->max == 0) &&
Daniel Veillard99c394d2005-07-14 12:58:49 +00001543 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1544 /*
1545 * we can discard the atom and generate an epsilon transition instead
1546 */
1547 if (to == NULL) {
1548 to = xmlRegNewState(ctxt);
1549 if (to != NULL)
1550 xmlRegStatePush(ctxt, to);
1551 else {
1552 return(-1);
1553 }
1554 }
1555 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1556 ctxt->state = to;
1557 xmlRegFreeAtom(atom);
1558 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00001559 }
1560 if (to == NULL) {
1561 to = xmlRegNewState(ctxt);
1562 if (to != NULL)
1563 xmlRegStatePush(ctxt, to);
1564 else {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001565 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001566 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001567 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001568 if (xmlRegAtomPush(ctxt, atom) < 0) {
1569 return(-1);
1570 }
1571 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1572 ctxt->state = to;
Daniel Veillard4255d502002-04-16 15:50:10 +00001573 switch (atom->quant) {
1574 case XML_REGEXP_QUANT_OPT:
1575 atom->quant = XML_REGEXP_QUANT_ONCE;
1576 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1577 break;
1578 case XML_REGEXP_QUANT_MULT:
1579 atom->quant = XML_REGEXP_QUANT_ONCE;
1580 xmlFAGenerateEpsilonTransition(ctxt, from, to);
Daniel Veillard5de09382005-09-26 17:18:17 +00001581 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001582 break;
1583 case XML_REGEXP_QUANT_PLUS:
1584 atom->quant = XML_REGEXP_QUANT_ONCE;
Daniel Veillard5de09382005-09-26 17:18:17 +00001585 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001586 break;
1587 default:
1588 break;
1589 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001590 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001591}
1592
1593/**
1594 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001595 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001596 * @fromnr: the from state
1597 * @tonr: the to state
William M. Brackddf71d62004-05-06 04:17:26 +00001598 * @counter: should that transition be associated to a counted
Daniel Veillard4255d502002-04-16 15:50:10 +00001599 *
1600 */
1601static void
1602xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1603 int tonr, int counter) {
1604 int transnr;
1605 xmlRegStatePtr from;
1606 xmlRegStatePtr to;
1607
1608#ifdef DEBUG_REGEXP_GRAPH
1609 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1610#endif
1611 from = ctxt->states[fromnr];
1612 if (from == NULL)
1613 return;
1614 to = ctxt->states[tonr];
1615 if (to == NULL)
1616 return;
1617 if ((to->mark == XML_REGEXP_MARK_START) ||
1618 (to->mark == XML_REGEXP_MARK_VISITED))
1619 return;
1620
1621 to->mark = XML_REGEXP_MARK_VISITED;
1622 if (to->type == XML_REGEXP_FINAL_STATE) {
1623#ifdef DEBUG_REGEXP_GRAPH
1624 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1625#endif
1626 from->type = XML_REGEXP_FINAL_STATE;
1627 }
1628 for (transnr = 0;transnr < to->nbTrans;transnr++) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001629 if (to->trans[transnr].to < 0)
1630 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00001631 if (to->trans[transnr].atom == NULL) {
1632 /*
1633 * Don't remove counted transitions
1634 * Don't loop either
1635 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001636 if (to->trans[transnr].to != fromnr) {
1637 if (to->trans[transnr].count >= 0) {
1638 int newto = to->trans[transnr].to;
1639
1640 xmlRegStateAddTrans(ctxt, from, NULL,
1641 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001642 -1, to->trans[transnr].count);
Daniel Veillardb509f152002-04-17 16:28:10 +00001643 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001644#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001645 printf("Found epsilon trans %d from %d to %d\n",
1646 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001647#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001648 if (to->trans[transnr].counter >= 0) {
1649 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1650 to->trans[transnr].to,
1651 to->trans[transnr].counter);
1652 } else {
1653 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1654 to->trans[transnr].to,
1655 counter);
1656 }
1657 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001658 }
1659 } else {
1660 int newto = to->trans[transnr].to;
1661
Daniel Veillardb509f152002-04-17 16:28:10 +00001662 if (to->trans[transnr].counter >= 0) {
1663 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1664 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001665 to->trans[transnr].counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001666 } else {
1667 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
Daniel Veillard5de09382005-09-26 17:18:17 +00001668 ctxt->states[newto], counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001669 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001670 }
1671 }
1672 to->mark = XML_REGEXP_MARK_NORMAL;
1673}
1674
1675/**
Daniel Veillarddb68b742005-07-30 13:18:24 +00001676 * xmlFAEliminateSimpleEpsilonTransitions:
1677 * @ctxt: a regexp parser context
1678 *
1679 * Eliminating general epsilon transitions can get costly in the general
1680 * algorithm due to the large amount of generated new transitions and
1681 * associated comparisons. However for simple epsilon transition used just
1682 * to separate building blocks when generating the automata this can be
1683 * reduced to state elimination:
1684 * - if there exists an epsilon from X to Y
1685 * - if there is no other transition from X
1686 * then X and Y are semantically equivalent and X can be eliminated
1687 * If X is the start state then make Y the start state, else replace the
1688 * target of all transitions to X by transitions to Y.
1689 */
1690static void
1691xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1692 int statenr, i, j, newto;
1693 xmlRegStatePtr state, tmp;
1694
1695 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1696 state = ctxt->states[statenr];
1697 if (state == NULL)
1698 continue;
1699 if (state->nbTrans != 1)
1700 continue;
1701 /* is the only transition out a basic transition */
1702 if ((state->trans[0].atom == NULL) &&
1703 (state->trans[0].to >= 0) &&
1704 (state->trans[0].to != statenr) &&
1705 (state->trans[0].counter < 0) &&
1706 (state->trans[0].count < 0)) {
1707 newto = state->trans[0].to;
1708
1709 if (state->type == XML_REGEXP_START_STATE) {
1710#ifdef DEBUG_REGEXP_GRAPH
1711 printf("Found simple epsilon trans from start %d to %d\n",
1712 statenr, newto);
1713#endif
1714 } else {
1715#ifdef DEBUG_REGEXP_GRAPH
1716 printf("Found simple epsilon trans from %d to %d\n",
1717 statenr, newto);
1718#endif
1719 for (i = 0;i < state->nbTransTo;i++) {
1720 tmp = ctxt->states[state->transTo[i]];
1721 for (j = 0;j < tmp->nbTrans;j++) {
1722 if (tmp->trans[j].to == statenr) {
1723 tmp->trans[j].to = newto;
1724#ifdef DEBUG_REGEXP_GRAPH
1725 printf("Changed transition %d on %d to go to %d\n",
1726 j, tmp->no, newto);
1727#endif
1728 xmlRegStateAddTransTo(ctxt, ctxt->states[newto],
1729 tmp->no);
1730 }
1731 }
1732 }
1733#if 0
1734 for (i = 0;i < ctxt->nbStates;i++) {
1735 tmp = ctxt->states[i];
1736 for (j = 0;j < tmp->nbTrans;j++) {
1737 if (tmp->trans[j].to == statenr) {
1738 tmp->trans[j].to = newto;
1739#ifdef DEBUG_REGEXP_GRAPH
1740 printf("Changed transition %d on %d to go to %d\n",
1741 j, tmp->no, newto);
1742#endif
1743 }
1744 }
1745 }
1746#endif
1747 if (state->type == XML_REGEXP_FINAL_STATE)
1748 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1749 /* eliminate the transition completely */
1750 state->nbTrans = 0;
1751
1752
1753 }
1754
1755 }
1756 }
1757}
1758/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001759 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001760 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001761 *
1762 */
1763static void
1764xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1765 int statenr, transnr;
1766 xmlRegStatePtr state;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001767 int has_epsilon;
Daniel Veillard4255d502002-04-16 15:50:10 +00001768
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001769 if (ctxt->states == NULL) return;
1770
Daniel Veillarddb68b742005-07-30 13:18:24 +00001771 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1772
1773 has_epsilon = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001774
Daniel Veillard4255d502002-04-16 15:50:10 +00001775 /*
1776 * build the completed transitions bypassing the epsilons
1777 * Use a marking algorithm to avoid loops
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001778 * mark sink states too.
Daniel Veillard4255d502002-04-16 15:50:10 +00001779 */
1780 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1781 state = ctxt->states[statenr];
1782 if (state == NULL)
1783 continue;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001784 if ((state->nbTrans == 0) &&
1785 (state->type != XML_REGEXP_FINAL_STATE)) {
1786 state->type = XML_REGEXP_SINK_STATE;
1787 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001788 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1789 if ((state->trans[transnr].atom == NULL) &&
1790 (state->trans[transnr].to >= 0)) {
1791 if (state->trans[transnr].to == statenr) {
1792 state->trans[transnr].to = -1;
1793#ifdef DEBUG_REGEXP_GRAPH
1794 printf("Removed loopback epsilon trans %d on %d\n",
1795 transnr, statenr);
1796#endif
1797 } else if (state->trans[transnr].count < 0) {
1798 int newto = state->trans[transnr].to;
1799
1800#ifdef DEBUG_REGEXP_GRAPH
1801 printf("Found epsilon trans %d from %d to %d\n",
1802 transnr, statenr, newto);
1803#endif
1804 state->mark = XML_REGEXP_MARK_START;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001805 has_epsilon = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00001806 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1807 newto, state->trans[transnr].counter);
1808 state->mark = XML_REGEXP_MARK_NORMAL;
1809#ifdef DEBUG_REGEXP_GRAPH
1810 } else {
1811 printf("Found counted transition %d on %d\n",
1812 transnr, statenr);
1813#endif
1814 }
1815 }
1816 }
1817 }
1818 /*
1819 * Eliminate the epsilon transitions
1820 */
Daniel Veillarddb68b742005-07-30 13:18:24 +00001821 if (has_epsilon) {
1822 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1823 state = ctxt->states[statenr];
1824 if (state == NULL)
1825 continue;
1826 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1827 xmlRegTransPtr trans = &(state->trans[transnr]);
1828 if ((trans->atom == NULL) &&
1829 (trans->count < 0) &&
1830 (trans->to >= 0)) {
1831 trans->to = -1;
1832 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001833 }
1834 }
1835 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001836
1837 /*
1838 * Use this pass to detect unreachable states too
1839 */
1840 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1841 state = ctxt->states[statenr];
1842 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001843 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00001844 }
1845 state = ctxt->states[0];
1846 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001847 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001848 while (state != NULL) {
1849 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00001850 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00001851 /*
William M. Brackddf71d62004-05-06 04:17:26 +00001852 * Mark all states reachable from the current reachable state
Daniel Veillard23e73572002-09-19 19:56:43 +00001853 */
1854 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1855 if ((state->trans[transnr].to >= 0) &&
1856 ((state->trans[transnr].atom != NULL) ||
1857 (state->trans[transnr].count >= 0))) {
1858 int newto = state->trans[transnr].to;
1859
1860 if (ctxt->states[newto] == NULL)
1861 continue;
William M. Brack779af002003-08-01 15:55:39 +00001862 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1863 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001864 target = ctxt->states[newto];
1865 }
1866 }
1867 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001868
Daniel Veillard23e73572002-09-19 19:56:43 +00001869 /*
1870 * find the next accessible state not explored
1871 */
1872 if (target == NULL) {
1873 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1874 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001875 if ((state != NULL) && (state->reached ==
1876 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001877 target = state;
1878 break;
1879 }
1880 }
1881 }
1882 state = target;
1883 }
1884 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1885 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001886 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001887#ifdef DEBUG_REGEXP_GRAPH
1888 printf("Removed unreachable state %d\n", statenr);
1889#endif
1890 xmlRegFreeState(state);
1891 ctxt->states[statenr] = NULL;
1892 }
1893 }
1894
Daniel Veillard4255d502002-04-16 15:50:10 +00001895}
1896
Daniel Veillard567a45b2005-10-18 19:11:55 +00001897static int
1898xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
1899 int ret = 0;
1900
1901 if ((range1->type == XML_REGEXP_RANGES) ||
1902 (range2->type == XML_REGEXP_RANGES) ||
1903 (range2->type == XML_REGEXP_SUBREG) ||
1904 (range1->type == XML_REGEXP_SUBREG) ||
1905 (range1->type == XML_REGEXP_STRING) ||
1906 (range2->type == XML_REGEXP_STRING))
1907 return(-1);
1908
1909 /* put them in order */
1910 if (range1->type > range2->type) {
1911 xmlRegRangePtr tmp;
1912
1913 tmp = range1;
1914 range1 = range2;
1915 range2 = tmp;
1916 }
1917 if ((range1->type == XML_REGEXP_ANYCHAR) ||
1918 (range2->type == XML_REGEXP_ANYCHAR)) {
1919 ret = 1;
1920 } else if ((range1->type == XML_REGEXP_EPSILON) ||
1921 (range2->type == XML_REGEXP_EPSILON)) {
1922 return(0);
1923 } else if (range1->type == range2->type) {
1924 if ((range1->type != XML_REGEXP_CHARVAL) ||
1925 (range1->end < range2->start) ||
1926 (range2->end < range1->start))
1927 ret = 1;
1928 else
1929 ret = 0;
1930 } else if (range1->type == XML_REGEXP_CHARVAL) {
1931 int codepoint;
1932 int neg = 0;
1933
1934 /*
1935 * just check all codepoints in the range for acceptance,
1936 * this is usually way cheaper since done only once at
1937 * compilation than testing over and over at runtime or
1938 * pushing too many states when evaluating.
1939 */
1940 if (((range1->neg == 0) && (range2->neg != 0)) ||
1941 ((range1->neg != 0) && (range2->neg == 0)))
1942 neg = 1;
1943
1944 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
1945 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
1946 0, range2->start, range2->end,
1947 range2->blockName);
1948 if (ret < 0)
1949 return(-1);
1950 if (((neg == 1) && (ret == 0)) ||
1951 ((neg == 0) && (ret == 1)))
1952 return(1);
1953 }
1954 return(0);
1955 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
1956 (range2->type == XML_REGEXP_BLOCK_NAME)) {
1957 if (range1->type == range2->type) {
1958 ret = xmlStrEqual(range1->blockName, range2->blockName);
1959 } else {
1960 /*
1961 * comparing a block range with anything else is way
1962 * too costly, and maintining the table is like too much
1963 * memory too, so let's force the automata to save state
1964 * here.
1965 */
1966 return(1);
1967 }
1968 } else if ((range1->type < XML_REGEXP_LETTER) ||
1969 (range2->type < XML_REGEXP_LETTER)) {
1970 if ((range1->type == XML_REGEXP_ANYSPACE) &&
1971 (range2->type == XML_REGEXP_NOTSPACE))
1972 ret = 0;
1973 else if ((range1->type == XML_REGEXP_INITNAME) &&
1974 (range2->type == XML_REGEXP_NOTINITNAME))
1975 ret = 0;
1976 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
1977 (range2->type == XML_REGEXP_NOTNAMECHAR))
1978 ret = 0;
1979 else if ((range1->type == XML_REGEXP_DECIMAL) &&
1980 (range2->type == XML_REGEXP_NOTDECIMAL))
1981 ret = 0;
1982 else if ((range1->type == XML_REGEXP_REALCHAR) &&
1983 (range2->type == XML_REGEXP_NOTREALCHAR))
1984 ret = 0;
1985 else {
1986 /* same thing to limit complexity */
1987 return(1);
1988 }
1989 } else {
1990 ret = 0;
1991 /* range1->type < range2->type here */
1992 switch (range1->type) {
1993 case XML_REGEXP_LETTER:
1994 /* all disjoint except in the subgroups */
1995 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
1996 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
1997 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
1998 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
1999 (range2->type == XML_REGEXP_LETTER_OTHERS))
2000 ret = 1;
2001 break;
2002 case XML_REGEXP_MARK:
2003 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2004 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2005 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2006 ret = 1;
2007 break;
2008 case XML_REGEXP_NUMBER:
2009 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2010 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2011 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2012 ret = 1;
2013 break;
2014 case XML_REGEXP_PUNCT:
2015 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2016 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2017 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2018 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2019 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2020 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2021 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2022 ret = 1;
2023 break;
2024 case XML_REGEXP_SEPAR:
2025 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2026 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2027 (range2->type == XML_REGEXP_SEPAR_PARA))
2028 ret = 1;
2029 break;
2030 case XML_REGEXP_SYMBOL:
2031 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2032 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2033 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2034 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2035 ret = 1;
2036 break;
2037 case XML_REGEXP_OTHER:
2038 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2039 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2040 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2041 ret = 1;
2042 break;
2043 default:
2044 if ((range2->type >= XML_REGEXP_LETTER) &&
2045 (range2->type < XML_REGEXP_BLOCK_NAME))
2046 ret = 0;
2047 else {
2048 /* safety net ! */
2049 return(1);
2050 }
2051 }
2052 }
2053 if (((range1->neg == 0) && (range2->neg != 0)) ||
2054 ((range1->neg != 0) && (range2->neg == 0)))
2055 ret = !ret;
2056 return(1);
2057}
2058
Daniel Veillarde19fc232002-04-22 16:01:24 +00002059/**
Daniel Veillardfc011b72006-02-12 19:14:15 +00002060 * xmlFACompareAtomTypes:
2061 * @type1: an atom type
2062 * @type2: an atom type
2063 *
2064 * Compares two atoms type to check whether they intersect in some ways,
2065 * this is used by xmlFACompareAtoms only
2066 *
2067 * Returns 1 if they may intersect and 0 otherwise
2068 */
2069static int
2070xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2071 if ((type1 == XML_REGEXP_EPSILON) ||
2072 (type1 == XML_REGEXP_CHARVAL) ||
2073 (type1 == XML_REGEXP_RANGES) ||
2074 (type1 == XML_REGEXP_SUBREG) ||
2075 (type1 == XML_REGEXP_STRING) ||
2076 (type1 == XML_REGEXP_ANYCHAR))
2077 return(1);
2078 if ((type2 == XML_REGEXP_EPSILON) ||
2079 (type2 == XML_REGEXP_CHARVAL) ||
2080 (type2 == XML_REGEXP_RANGES) ||
2081 (type2 == XML_REGEXP_SUBREG) ||
2082 (type2 == XML_REGEXP_STRING) ||
2083 (type2 == XML_REGEXP_ANYCHAR))
2084 return(1);
2085
2086 if (type1 == type2) return(1);
2087
2088 /* simplify subsequent compares by making sure type1 < type2 */
2089 if (type1 > type2) {
2090 xmlRegAtomType tmp = type1;
2091 type1 = type2;
2092 type2 = tmp;
2093 }
2094 switch (type1) {
2095 case XML_REGEXP_ANYSPACE: /* \s */
2096 /* can't be a letter, number, mark, pontuation, symbol */
2097 if ((type2 == XML_REGEXP_NOTSPACE) ||
2098 ((type2 >= XML_REGEXP_LETTER) &&
2099 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2100 ((type2 >= XML_REGEXP_NUMBER) &&
2101 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2102 ((type2 >= XML_REGEXP_MARK) &&
2103 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2104 ((type2 >= XML_REGEXP_PUNCT) &&
2105 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2106 ((type2 >= XML_REGEXP_SYMBOL) &&
2107 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2108 ) return(0);
2109 break;
2110 case XML_REGEXP_NOTSPACE: /* \S */
2111 break;
2112 case XML_REGEXP_INITNAME: /* \l */
2113 /* can't be a number, mark, separator, pontuation, symbol or other */
2114 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2115 ((type2 >= XML_REGEXP_NUMBER) &&
2116 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2117 ((type2 >= XML_REGEXP_MARK) &&
2118 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2119 ((type2 >= XML_REGEXP_SEPAR) &&
2120 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2121 ((type2 >= XML_REGEXP_PUNCT) &&
2122 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2123 ((type2 >= XML_REGEXP_SYMBOL) &&
2124 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2125 ((type2 >= XML_REGEXP_OTHER) &&
2126 (type2 <= XML_REGEXP_OTHER_NA))
2127 ) return(0);
2128 break;
2129 case XML_REGEXP_NOTINITNAME: /* \L */
2130 break;
2131 case XML_REGEXP_NAMECHAR: /* \c */
2132 /* can't be a mark, separator, pontuation, symbol or other */
2133 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2134 ((type2 >= XML_REGEXP_MARK) &&
2135 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2136 ((type2 >= XML_REGEXP_PUNCT) &&
2137 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2138 ((type2 >= XML_REGEXP_SEPAR) &&
2139 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2140 ((type2 >= XML_REGEXP_SYMBOL) &&
2141 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2142 ((type2 >= XML_REGEXP_OTHER) &&
2143 (type2 <= XML_REGEXP_OTHER_NA))
2144 ) return(0);
2145 break;
2146 case XML_REGEXP_NOTNAMECHAR: /* \C */
2147 break;
2148 case XML_REGEXP_DECIMAL: /* \d */
2149 /* can't be a letter, mark, separator, pontuation, symbol or other */
2150 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2151 (type2 == XML_REGEXP_REALCHAR) ||
2152 ((type2 >= XML_REGEXP_LETTER) &&
2153 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2154 ((type2 >= XML_REGEXP_MARK) &&
2155 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2156 ((type2 >= XML_REGEXP_PUNCT) &&
2157 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2158 ((type2 >= XML_REGEXP_SEPAR) &&
2159 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2160 ((type2 >= XML_REGEXP_SYMBOL) &&
2161 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2162 ((type2 >= XML_REGEXP_OTHER) &&
2163 (type2 <= XML_REGEXP_OTHER_NA))
2164 )return(0);
2165 break;
2166 case XML_REGEXP_NOTDECIMAL: /* \D */
2167 break;
2168 case XML_REGEXP_REALCHAR: /* \w */
2169 /* can't be a mark, separator, pontuation, symbol or other */
2170 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2171 ((type2 >= XML_REGEXP_MARK) &&
2172 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2173 ((type2 >= XML_REGEXP_PUNCT) &&
2174 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2175 ((type2 >= XML_REGEXP_SEPAR) &&
2176 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2177 ((type2 >= XML_REGEXP_SYMBOL) &&
2178 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2179 ((type2 >= XML_REGEXP_OTHER) &&
2180 (type2 <= XML_REGEXP_OTHER_NA))
2181 )return(0);
2182 break;
2183 case XML_REGEXP_NOTREALCHAR: /* \W */
2184 break;
2185 /*
2186 * at that point we know both type 1 and type2 are from
2187 * character categories are ordered and are different,
2188 * it becomes simple because this is a partition
2189 */
2190 case XML_REGEXP_LETTER:
2191 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2192 return(1);
2193 return(0);
2194 case XML_REGEXP_LETTER_UPPERCASE:
2195 case XML_REGEXP_LETTER_LOWERCASE:
2196 case XML_REGEXP_LETTER_TITLECASE:
2197 case XML_REGEXP_LETTER_MODIFIER:
2198 case XML_REGEXP_LETTER_OTHERS:
2199 return(0);
2200 case XML_REGEXP_MARK:
2201 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2202 return(1);
2203 return(0);
2204 case XML_REGEXP_MARK_NONSPACING:
2205 case XML_REGEXP_MARK_SPACECOMBINING:
2206 case XML_REGEXP_MARK_ENCLOSING:
2207 return(0);
2208 case XML_REGEXP_NUMBER:
2209 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2210 return(1);
2211 return(0);
2212 case XML_REGEXP_NUMBER_DECIMAL:
2213 case XML_REGEXP_NUMBER_LETTER:
2214 case XML_REGEXP_NUMBER_OTHERS:
2215 return(0);
2216 case XML_REGEXP_PUNCT:
2217 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2218 return(1);
2219 return(0);
2220 case XML_REGEXP_PUNCT_CONNECTOR:
2221 case XML_REGEXP_PUNCT_DASH:
2222 case XML_REGEXP_PUNCT_OPEN:
2223 case XML_REGEXP_PUNCT_CLOSE:
2224 case XML_REGEXP_PUNCT_INITQUOTE:
2225 case XML_REGEXP_PUNCT_FINQUOTE:
2226 case XML_REGEXP_PUNCT_OTHERS:
2227 return(0);
2228 case XML_REGEXP_SEPAR:
2229 if (type2 <= XML_REGEXP_SEPAR_PARA)
2230 return(1);
2231 return(0);
2232 case XML_REGEXP_SEPAR_SPACE:
2233 case XML_REGEXP_SEPAR_LINE:
2234 case XML_REGEXP_SEPAR_PARA:
2235 return(0);
2236 case XML_REGEXP_SYMBOL:
2237 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2238 return(1);
2239 return(0);
2240 case XML_REGEXP_SYMBOL_MATH:
2241 case XML_REGEXP_SYMBOL_CURRENCY:
2242 case XML_REGEXP_SYMBOL_MODIFIER:
2243 case XML_REGEXP_SYMBOL_OTHERS:
2244 return(0);
2245 case XML_REGEXP_OTHER:
2246 if (type2 <= XML_REGEXP_OTHER_NA)
2247 return(1);
2248 return(0);
2249 case XML_REGEXP_OTHER_CONTROL:
2250 case XML_REGEXP_OTHER_FORMAT:
2251 case XML_REGEXP_OTHER_PRIVATE:
2252 case XML_REGEXP_OTHER_NA:
2253 return(0);
2254 default:
2255 break;
2256 }
2257 return(1);
2258}
2259
2260/**
2261 * xmlFAEqualAtoms:
Daniel Veillarde19fc232002-04-22 16:01:24 +00002262 * @atom1: an atom
2263 * @atom2: an atom
2264 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002265 * Compares two atoms to check whether they are the same exactly
2266 * this is used to remove equivalent transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002267 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002268 * Returns 1 if same and 0 otherwise
Daniel Veillarde19fc232002-04-22 16:01:24 +00002269 */
2270static int
Daniel Veillardfc011b72006-02-12 19:14:15 +00002271xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
2272 int ret = 0;
Daniel Veillard9efc4762005-07-19 14:33:55 +00002273
Daniel Veillarde19fc232002-04-22 16:01:24 +00002274 if (atom1 == atom2)
2275 return(1);
2276 if ((atom1 == NULL) || (atom2 == NULL))
2277 return(0);
2278
Daniel Veillardfc011b72006-02-12 19:14:15 +00002279 if (atom1->type != atom2->type)
2280 return(0);
2281 switch (atom1->type) {
2282 case XML_REGEXP_EPSILON:
2283 ret = 0;
2284 break;
2285 case XML_REGEXP_STRING:
2286 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2287 (xmlChar *)atom2->valuep);
2288 break;
2289 case XML_REGEXP_CHARVAL:
2290 ret = (atom1->codepoint == atom2->codepoint);
2291 break;
2292 case XML_REGEXP_RANGES:
2293 /* too hard to do in the general case */
2294 ret = 0;
2295 default:
2296 break;
2297 }
2298 return(ret);
2299}
2300
2301/**
2302 * xmlFACompareAtoms:
2303 * @atom1: an atom
2304 * @atom2: an atom
2305 *
2306 * Compares two atoms to check whether they intersect in some ways,
2307 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2308 *
2309 * Returns 1 if yes and 0 otherwise
2310 */
2311static int
2312xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
2313 int ret = 1;
2314
2315 if (atom1 == atom2)
2316 return(1);
2317 if ((atom1 == NULL) || (atom2 == NULL))
2318 return(0);
2319
2320 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2321 (atom2->type == XML_REGEXP_ANYCHAR))
2322 return(1);
2323
2324 if (atom1->type > atom2->type) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002325 xmlRegAtomPtr tmp;
2326 tmp = atom1;
2327 atom1 = atom2;
2328 atom2 = tmp;
Daniel Veillardfc011b72006-02-12 19:14:15 +00002329 }
2330 if (atom1->type != atom2->type) {
2331 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2332 /* if they can't intersect at the type level break now */
2333 if (ret == 0)
2334 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002335 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002336 switch (atom1->type) {
2337 case XML_REGEXP_STRING:
Daniel Veillard9efc4762005-07-19 14:33:55 +00002338 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
2339 (xmlChar *)atom2->valuep);
2340 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002341 case XML_REGEXP_EPSILON:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002342 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002343 case XML_REGEXP_CHARVAL:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002344 if (atom2->type == XML_REGEXP_CHARVAL) {
2345 ret = (atom1->codepoint == atom2->codepoint);
2346 } else {
2347 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2348 if (ret < 0)
2349 ret = 1;
2350 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00002351 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002352 case XML_REGEXP_RANGES:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002353 if (atom2->type == XML_REGEXP_RANGES) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002354 int i, j, res;
2355 xmlRegRangePtr r1, r2;
2356
2357 /*
2358 * need to check that none of the ranges eventually matches
2359 */
2360 for (i = 0;i < atom1->nbRanges;i++) {
2361 for (j = 0;j < atom2->nbRanges;j++) {
2362 r1 = atom1->ranges[i];
2363 r2 = atom2->ranges[j];
2364 res = xmlFACompareRanges(r1, r2);
2365 if (res == 1) {
2366 ret = 1;
2367 goto done;
2368 }
2369 }
2370 }
2371 ret = 0;
2372 }
2373 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002374 default:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002375 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002376 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002377done:
Daniel Veillard6e65e152005-08-09 11:09:52 +00002378 if (atom1->neg != atom2->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00002379 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00002380 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002381 if (ret == 0)
2382 return(0);
2383not_determinist:
2384 return(1);
Daniel Veillarde19fc232002-04-22 16:01:24 +00002385}
2386
2387/**
2388 * xmlFARecurseDeterminism:
2389 * @ctxt: a regexp parser context
2390 *
2391 * Check whether the associated regexp is determinist,
2392 * should be called after xmlFAEliminateEpsilonTransitions()
2393 *
2394 */
2395static int
2396xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2397 int to, xmlRegAtomPtr atom) {
2398 int ret = 1;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002399 int res;
Daniel Veillard5de09382005-09-26 17:18:17 +00002400 int transnr, nbTrans;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002401 xmlRegTransPtr t1;
2402
2403 if (state == NULL)
2404 return(ret);
Daniel Veillard5de09382005-09-26 17:18:17 +00002405 /*
2406 * don't recurse on transitions potentially added in the course of
2407 * the elimination.
2408 */
2409 nbTrans = state->nbTrans;
2410 for (transnr = 0;transnr < nbTrans;transnr++) {
Daniel Veillarde19fc232002-04-22 16:01:24 +00002411 t1 = &(state->trans[transnr]);
2412 /*
2413 * check transitions conflicting with the one looked at
2414 */
2415 if (t1->atom == NULL) {
2416 if (t1->to == -1)
2417 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002418 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
Daniel Veillarde19fc232002-04-22 16:01:24 +00002419 to, atom);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002420 if (res == 0) {
2421 ret = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00002422 /* t1->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002423 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002424 continue;
2425 }
2426 if (t1->to != to)
2427 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002428 if (xmlFACompareAtoms(t1->atom, atom)) {
2429 ret = 0;
2430 /* mark the transition as non-deterministic */
2431 t1->nd = 1;
2432 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002433 }
2434 return(ret);
2435}
2436
2437/**
2438 * xmlFAComputesDeterminism:
2439 * @ctxt: a regexp parser context
2440 *
2441 * Check whether the associated regexp is determinist,
2442 * should be called after xmlFAEliminateEpsilonTransitions()
2443 *
2444 */
2445static int
2446xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2447 int statenr, transnr;
2448 xmlRegStatePtr state;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002449 xmlRegTransPtr t1, t2, last;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002450 int i;
2451 int ret = 1;
2452
Daniel Veillard4402ab42002-09-12 16:02:56 +00002453#ifdef DEBUG_REGEXP_GRAPH
2454 printf("xmlFAComputesDeterminism\n");
2455 xmlRegPrintCtxt(stdout, ctxt);
2456#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00002457 if (ctxt->determinist != -1)
2458 return(ctxt->determinist);
2459
2460 /*
Daniel Veillard567a45b2005-10-18 19:11:55 +00002461 * First cleanup the automata removing cancelled transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002462 */
2463 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2464 state = ctxt->states[statenr];
2465 if (state == NULL)
2466 continue;
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00002467 if (state->nbTrans < 2)
2468 continue;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002469 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2470 t1 = &(state->trans[transnr]);
2471 /*
2472 * Determinism checks in case of counted or all transitions
2473 * will have to be handled separately
2474 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002475 if (t1->atom == NULL) {
Daniel Veillardaa622012005-10-20 15:55:25 +00002476 /* t1->nd = 1; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002477 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002478 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002479 if (t1->to == -1) /* eliminated */
2480 continue;
2481 for (i = 0;i < transnr;i++) {
2482 t2 = &(state->trans[i]);
2483 if (t2->to == -1) /* eliminated */
2484 continue;
2485 if (t2->atom != NULL) {
2486 if (t1->to == t2->to) {
Daniel Veillardfc011b72006-02-12 19:14:15 +00002487 if (xmlFAEqualAtoms(t1->atom, t2->atom))
William M. Brackddf71d62004-05-06 04:17:26 +00002488 t2->to = -1; /* eliminated */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002489 }
2490 }
2491 }
2492 }
2493 }
2494
2495 /*
2496 * Check for all states that there aren't 2 transitions
2497 * with the same atom and a different target.
2498 */
2499 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2500 state = ctxt->states[statenr];
2501 if (state == NULL)
2502 continue;
2503 if (state->nbTrans < 2)
2504 continue;
2505 last = NULL;
2506 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2507 t1 = &(state->trans[transnr]);
2508 /*
2509 * Determinism checks in case of counted or all transitions
2510 * will have to be handled separately
2511 */
2512 if (t1->atom == NULL) {
2513 continue;
2514 }
2515 if (t1->to == -1) /* eliminated */
2516 continue;
2517 for (i = 0;i < transnr;i++) {
2518 t2 = &(state->trans[i]);
2519 if (t2->to == -1) /* eliminated */
2520 continue;
2521 if (t2->atom != NULL) {
2522 /* not determinist ! */
2523 if (xmlFACompareAtoms(t1->atom, t2->atom)) {
2524 ret = 0;
2525 /* mark the transitions as non-deterministic ones */
2526 t1->nd = 1;
2527 t2->nd = 1;
2528 last = t1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002529 }
2530 } else if (t1->to != -1) {
2531 /*
2532 * do the closure in case of remaining specific
2533 * epsilon transitions like choices or all
2534 */
2535 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2536 t2->to, t2->atom);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002537 /* don't shortcut the computation so all non deterministic
2538 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002539 if (ret == 0)
Daniel Veillardaa622012005-10-20 15:55:25 +00002540 return(0);
2541 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002542 if (ret == 0) {
2543 t1->nd = 1;
Daniel Veillardaa622012005-10-20 15:55:25 +00002544 /* t2->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002545 last = t1;
2546 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002547 }
2548 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002549 /* don't shortcut the computation so all non deterministic
2550 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002551 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002552 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002553 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002554
2555 /*
2556 * mark specifically the last non-deterministic transition
2557 * from a state since there is no need to set-up rollback
2558 * from it
2559 */
2560 if (last != NULL) {
2561 last->nd = 2;
2562 }
2563
2564 /* don't shortcut the computation so all non deterministic
2565 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002566 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002567 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002568 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002569
Daniel Veillarde19fc232002-04-22 16:01:24 +00002570 ctxt->determinist = ret;
2571 return(ret);
2572}
2573
Daniel Veillard4255d502002-04-16 15:50:10 +00002574/************************************************************************
2575 * *
2576 * Routines to check input against transition atoms *
2577 * *
2578 ************************************************************************/
2579
2580static int
2581xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2582 int start, int end, const xmlChar *blockName) {
2583 int ret = 0;
2584
2585 switch (type) {
2586 case XML_REGEXP_STRING:
2587 case XML_REGEXP_SUBREG:
2588 case XML_REGEXP_RANGES:
2589 case XML_REGEXP_EPSILON:
2590 return(-1);
2591 case XML_REGEXP_ANYCHAR:
2592 ret = ((codepoint != '\n') && (codepoint != '\r'));
2593 break;
2594 case XML_REGEXP_CHARVAL:
2595 ret = ((codepoint >= start) && (codepoint <= end));
2596 break;
2597 case XML_REGEXP_NOTSPACE:
2598 neg = !neg;
2599 case XML_REGEXP_ANYSPACE:
2600 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2601 (codepoint == '\t') || (codepoint == ' '));
2602 break;
2603 case XML_REGEXP_NOTINITNAME:
2604 neg = !neg;
2605 case XML_REGEXP_INITNAME:
William M. Brack871611b2003-10-18 04:53:14 +00002606 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002607 (codepoint == '_') || (codepoint == ':'));
2608 break;
2609 case XML_REGEXP_NOTNAMECHAR:
2610 neg = !neg;
2611 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00002612 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002613 (codepoint == '.') || (codepoint == '-') ||
2614 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00002615 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00002616 break;
2617 case XML_REGEXP_NOTDECIMAL:
2618 neg = !neg;
2619 case XML_REGEXP_DECIMAL:
2620 ret = xmlUCSIsCatNd(codepoint);
2621 break;
2622 case XML_REGEXP_REALCHAR:
2623 neg = !neg;
2624 case XML_REGEXP_NOTREALCHAR:
2625 ret = xmlUCSIsCatP(codepoint);
2626 if (ret == 0)
2627 ret = xmlUCSIsCatZ(codepoint);
2628 if (ret == 0)
2629 ret = xmlUCSIsCatC(codepoint);
2630 break;
2631 case XML_REGEXP_LETTER:
2632 ret = xmlUCSIsCatL(codepoint);
2633 break;
2634 case XML_REGEXP_LETTER_UPPERCASE:
2635 ret = xmlUCSIsCatLu(codepoint);
2636 break;
2637 case XML_REGEXP_LETTER_LOWERCASE:
2638 ret = xmlUCSIsCatLl(codepoint);
2639 break;
2640 case XML_REGEXP_LETTER_TITLECASE:
2641 ret = xmlUCSIsCatLt(codepoint);
2642 break;
2643 case XML_REGEXP_LETTER_MODIFIER:
2644 ret = xmlUCSIsCatLm(codepoint);
2645 break;
2646 case XML_REGEXP_LETTER_OTHERS:
2647 ret = xmlUCSIsCatLo(codepoint);
2648 break;
2649 case XML_REGEXP_MARK:
2650 ret = xmlUCSIsCatM(codepoint);
2651 break;
2652 case XML_REGEXP_MARK_NONSPACING:
2653 ret = xmlUCSIsCatMn(codepoint);
2654 break;
2655 case XML_REGEXP_MARK_SPACECOMBINING:
2656 ret = xmlUCSIsCatMc(codepoint);
2657 break;
2658 case XML_REGEXP_MARK_ENCLOSING:
2659 ret = xmlUCSIsCatMe(codepoint);
2660 break;
2661 case XML_REGEXP_NUMBER:
2662 ret = xmlUCSIsCatN(codepoint);
2663 break;
2664 case XML_REGEXP_NUMBER_DECIMAL:
2665 ret = xmlUCSIsCatNd(codepoint);
2666 break;
2667 case XML_REGEXP_NUMBER_LETTER:
2668 ret = xmlUCSIsCatNl(codepoint);
2669 break;
2670 case XML_REGEXP_NUMBER_OTHERS:
2671 ret = xmlUCSIsCatNo(codepoint);
2672 break;
2673 case XML_REGEXP_PUNCT:
2674 ret = xmlUCSIsCatP(codepoint);
2675 break;
2676 case XML_REGEXP_PUNCT_CONNECTOR:
2677 ret = xmlUCSIsCatPc(codepoint);
2678 break;
2679 case XML_REGEXP_PUNCT_DASH:
2680 ret = xmlUCSIsCatPd(codepoint);
2681 break;
2682 case XML_REGEXP_PUNCT_OPEN:
2683 ret = xmlUCSIsCatPs(codepoint);
2684 break;
2685 case XML_REGEXP_PUNCT_CLOSE:
2686 ret = xmlUCSIsCatPe(codepoint);
2687 break;
2688 case XML_REGEXP_PUNCT_INITQUOTE:
2689 ret = xmlUCSIsCatPi(codepoint);
2690 break;
2691 case XML_REGEXP_PUNCT_FINQUOTE:
2692 ret = xmlUCSIsCatPf(codepoint);
2693 break;
2694 case XML_REGEXP_PUNCT_OTHERS:
2695 ret = xmlUCSIsCatPo(codepoint);
2696 break;
2697 case XML_REGEXP_SEPAR:
2698 ret = xmlUCSIsCatZ(codepoint);
2699 break;
2700 case XML_REGEXP_SEPAR_SPACE:
2701 ret = xmlUCSIsCatZs(codepoint);
2702 break;
2703 case XML_REGEXP_SEPAR_LINE:
2704 ret = xmlUCSIsCatZl(codepoint);
2705 break;
2706 case XML_REGEXP_SEPAR_PARA:
2707 ret = xmlUCSIsCatZp(codepoint);
2708 break;
2709 case XML_REGEXP_SYMBOL:
2710 ret = xmlUCSIsCatS(codepoint);
2711 break;
2712 case XML_REGEXP_SYMBOL_MATH:
2713 ret = xmlUCSIsCatSm(codepoint);
2714 break;
2715 case XML_REGEXP_SYMBOL_CURRENCY:
2716 ret = xmlUCSIsCatSc(codepoint);
2717 break;
2718 case XML_REGEXP_SYMBOL_MODIFIER:
2719 ret = xmlUCSIsCatSk(codepoint);
2720 break;
2721 case XML_REGEXP_SYMBOL_OTHERS:
2722 ret = xmlUCSIsCatSo(codepoint);
2723 break;
2724 case XML_REGEXP_OTHER:
2725 ret = xmlUCSIsCatC(codepoint);
2726 break;
2727 case XML_REGEXP_OTHER_CONTROL:
2728 ret = xmlUCSIsCatCc(codepoint);
2729 break;
2730 case XML_REGEXP_OTHER_FORMAT:
2731 ret = xmlUCSIsCatCf(codepoint);
2732 break;
2733 case XML_REGEXP_OTHER_PRIVATE:
2734 ret = xmlUCSIsCatCo(codepoint);
2735 break;
2736 case XML_REGEXP_OTHER_NA:
2737 /* ret = xmlUCSIsCatCn(codepoint); */
2738 /* Seems it doesn't exist anymore in recent Unicode releases */
2739 ret = 0;
2740 break;
2741 case XML_REGEXP_BLOCK_NAME:
2742 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2743 break;
2744 }
2745 if (neg)
2746 return(!ret);
2747 return(ret);
2748}
2749
2750static int
2751xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2752 int i, ret = 0;
2753 xmlRegRangePtr range;
2754
William M. Brack871611b2003-10-18 04:53:14 +00002755 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00002756 return(-1);
2757
2758 switch (atom->type) {
2759 case XML_REGEXP_SUBREG:
2760 case XML_REGEXP_EPSILON:
2761 return(-1);
2762 case XML_REGEXP_CHARVAL:
2763 return(codepoint == atom->codepoint);
2764 case XML_REGEXP_RANGES: {
2765 int accept = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002766
Daniel Veillard4255d502002-04-16 15:50:10 +00002767 for (i = 0;i < atom->nbRanges;i++) {
2768 range = atom->ranges[i];
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002769 if (range->neg == 2) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002770 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2771 0, range->start, range->end,
2772 range->blockName);
2773 if (ret != 0)
2774 return(0); /* excluded char */
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002775 } else if (range->neg) {
2776 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2777 0, range->start, range->end,
2778 range->blockName);
2779 if (ret == 0)
Daniel Veillardf2a12832003-11-24 13:04:35 +00002780 accept = 1;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002781 else
2782 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00002783 } else {
2784 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2785 0, range->start, range->end,
2786 range->blockName);
2787 if (ret != 0)
2788 accept = 1; /* might still be excluded */
2789 }
2790 }
2791 return(accept);
2792 }
2793 case XML_REGEXP_STRING:
2794 printf("TODO: XML_REGEXP_STRING\n");
2795 return(-1);
2796 case XML_REGEXP_ANYCHAR:
2797 case XML_REGEXP_ANYSPACE:
2798 case XML_REGEXP_NOTSPACE:
2799 case XML_REGEXP_INITNAME:
2800 case XML_REGEXP_NOTINITNAME:
2801 case XML_REGEXP_NAMECHAR:
2802 case XML_REGEXP_NOTNAMECHAR:
2803 case XML_REGEXP_DECIMAL:
2804 case XML_REGEXP_NOTDECIMAL:
2805 case XML_REGEXP_REALCHAR:
2806 case XML_REGEXP_NOTREALCHAR:
2807 case XML_REGEXP_LETTER:
2808 case XML_REGEXP_LETTER_UPPERCASE:
2809 case XML_REGEXP_LETTER_LOWERCASE:
2810 case XML_REGEXP_LETTER_TITLECASE:
2811 case XML_REGEXP_LETTER_MODIFIER:
2812 case XML_REGEXP_LETTER_OTHERS:
2813 case XML_REGEXP_MARK:
2814 case XML_REGEXP_MARK_NONSPACING:
2815 case XML_REGEXP_MARK_SPACECOMBINING:
2816 case XML_REGEXP_MARK_ENCLOSING:
2817 case XML_REGEXP_NUMBER:
2818 case XML_REGEXP_NUMBER_DECIMAL:
2819 case XML_REGEXP_NUMBER_LETTER:
2820 case XML_REGEXP_NUMBER_OTHERS:
2821 case XML_REGEXP_PUNCT:
2822 case XML_REGEXP_PUNCT_CONNECTOR:
2823 case XML_REGEXP_PUNCT_DASH:
2824 case XML_REGEXP_PUNCT_OPEN:
2825 case XML_REGEXP_PUNCT_CLOSE:
2826 case XML_REGEXP_PUNCT_INITQUOTE:
2827 case XML_REGEXP_PUNCT_FINQUOTE:
2828 case XML_REGEXP_PUNCT_OTHERS:
2829 case XML_REGEXP_SEPAR:
2830 case XML_REGEXP_SEPAR_SPACE:
2831 case XML_REGEXP_SEPAR_LINE:
2832 case XML_REGEXP_SEPAR_PARA:
2833 case XML_REGEXP_SYMBOL:
2834 case XML_REGEXP_SYMBOL_MATH:
2835 case XML_REGEXP_SYMBOL_CURRENCY:
2836 case XML_REGEXP_SYMBOL_MODIFIER:
2837 case XML_REGEXP_SYMBOL_OTHERS:
2838 case XML_REGEXP_OTHER:
2839 case XML_REGEXP_OTHER_CONTROL:
2840 case XML_REGEXP_OTHER_FORMAT:
2841 case XML_REGEXP_OTHER_PRIVATE:
2842 case XML_REGEXP_OTHER_NA:
2843 case XML_REGEXP_BLOCK_NAME:
2844 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
2845 (const xmlChar *)atom->valuep);
2846 if (atom->neg)
2847 ret = !ret;
2848 break;
2849 }
2850 return(ret);
2851}
2852
2853/************************************************************************
2854 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002855 * Saving and restoring state of an execution context *
Daniel Veillard4255d502002-04-16 15:50:10 +00002856 * *
2857 ************************************************************************/
2858
#ifdef DEBUG_REGEXP_EXEC
/*
 * xmlFARegDebugExec:
 * @exec:  the execution context to dump
 *
 * Debug helper, only compiled when DEBUG_REGEXP_EXEC is defined. Prints
 * the current state number, transition number and input index, followed
 * by up to three of the most recently stacked input tokens, or by the
 * remaining input string when no token stack is in use.
 */
static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
    if (exec->inputStack != NULL) {
        int i;

        printf(": ");
        for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) {
            /*
             * The stack holds token structures: print the token string,
             * not the structure itself, which is not a valid %s argument.
             */
            const xmlChar *tok =
                exec->inputStack[exec->inputStackNr - (i + 1)].value;

            printf("%s ", (tok != NULL) ? (const char *) tok : "(null)");
        }
    } else if (exec->inputString != NULL) {
        /* neither a token stack nor a string may be set on a fresh context */
        printf(": %s", (const char *) &(exec->inputString[exec->index]));
    }
    printf("\n");
}
#endif
2874
2875static void
2876xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
2877#ifdef DEBUG_REGEXP_EXEC
2878 printf("saving ");
2879 exec->transno++;
2880 xmlFARegDebugExec(exec);
2881 exec->transno--;
2882#endif
Daniel Veillard94cc1032005-09-15 13:09:00 +00002883#ifdef MAX_PUSH
2884 if (exec->nbPush > MAX_PUSH) {
2885 return;
2886 }
2887 exec->nbPush++;
2888#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00002889
2890 if (exec->maxRollbacks == 0) {
2891 exec->maxRollbacks = 4;
2892 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2893 sizeof(xmlRegExecRollback));
2894 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002895 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002896 exec->maxRollbacks = 0;
2897 return;
2898 }
2899 memset(exec->rollbacks, 0,
2900 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2901 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2902 xmlRegExecRollback *tmp;
2903 int len = exec->maxRollbacks;
2904
2905 exec->maxRollbacks *= 2;
2906 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2907 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2908 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002909 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002910 exec->maxRollbacks /= 2;
2911 return;
2912 }
2913 exec->rollbacks = tmp;
2914 tmp = &exec->rollbacks[len];
2915 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2916 }
2917 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2918 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2919 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2920 if (exec->comp->nbCounters > 0) {
2921 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2922 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2923 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2924 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002925 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002926 exec->status = -5;
2927 return;
2928 }
2929 }
2930 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2931 exec->comp->nbCounters * sizeof(int));
2932 }
2933 exec->nbRollbacks++;
2934}
2935
2936static void
2937xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2938 if (exec->nbRollbacks <= 0) {
2939 exec->status = -1;
2940#ifdef DEBUG_REGEXP_EXEC
2941 printf("rollback failed on empty stack\n");
2942#endif
2943 return;
2944 }
2945 exec->nbRollbacks--;
2946 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2947 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2948 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2949 if (exec->comp->nbCounters > 0) {
2950 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2951 fprintf(stderr, "exec save: allocation failed");
2952 exec->status = -6;
2953 return;
2954 }
2955 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2956 exec->comp->nbCounters * sizeof(int));
2957 }
2958
2959#ifdef DEBUG_REGEXP_EXEC
2960 printf("restored ");
2961 xmlFARegDebugExec(exec);
2962#endif
2963}
2964
2965/************************************************************************
2966 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002967 * Verifier, running an input against a compiled regexp *
Daniel Veillard4255d502002-04-16 15:50:10 +00002968 * *
2969 ************************************************************************/
2970
2971static int
2972xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2973 xmlRegExecCtxt execval;
2974 xmlRegExecCtxtPtr exec = &execval;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002975 int ret, codepoint = 0, len, deter;
Daniel Veillard4255d502002-04-16 15:50:10 +00002976
2977 exec->inputString = content;
2978 exec->index = 0;
Daniel Veillard94cc1032005-09-15 13:09:00 +00002979 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00002980 exec->determinist = 1;
2981 exec->maxRollbacks = 0;
2982 exec->nbRollbacks = 0;
2983 exec->rollbacks = NULL;
2984 exec->status = 0;
2985 exec->comp = comp;
2986 exec->state = comp->states[0];
2987 exec->transno = 0;
2988 exec->transcount = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002989 exec->inputStack = NULL;
2990 exec->inputStackMax = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00002991 if (comp->nbCounters > 0) {
2992 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00002993 if (exec->counts == NULL) {
2994 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002995 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00002996 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002997 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2998 } else
2999 exec->counts = NULL;
3000 while ((exec->status == 0) &&
3001 ((exec->inputString[exec->index] != 0) ||
3002 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
3003 xmlRegTransPtr trans;
3004 xmlRegAtomPtr atom;
3005
3006 /*
William M. Brack0e00b282004-04-26 15:40:47 +00003007 * If end of input on non-terminal state, rollback, however we may
Daniel Veillard4255d502002-04-16 15:50:10 +00003008 * still have epsilon like transition for counted transitions
William M. Brack0e00b282004-04-26 15:40:47 +00003009 * on counters, in that case don't break too early. Additionally,
3010 * if we are working on a range like "AB{0,2}", where B is not present,
3011 * we don't want to break.
Daniel Veillard4255d502002-04-16 15:50:10 +00003012 */
Daniel Veillard11ce4002006-03-10 00:36:23 +00003013 len = 1;
William M. Brack0e00b282004-04-26 15:40:47 +00003014 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
William M. Brackddf71d62004-05-06 04:17:26 +00003015 /*
3016 * if there is a transition, we must check if
3017 * atom allows minOccurs of 0
3018 */
3019 if (exec->transno < exec->state->nbTrans) {
William M. Brack0e00b282004-04-26 15:40:47 +00003020 trans = &exec->state->trans[exec->transno];
3021 if (trans->to >=0) {
3022 atom = trans->atom;
3023 if (!((atom->min == 0) && (atom->max > 0)))
3024 goto rollback;
3025 }
3026 } else
3027 goto rollback;
3028 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003029
3030 exec->transcount = 0;
3031 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3032 trans = &exec->state->trans[exec->transno];
3033 if (trans->to < 0)
3034 continue;
3035 atom = trans->atom;
3036 ret = 0;
Daniel Veillard567a45b2005-10-18 19:11:55 +00003037 deter = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003038 if (trans->count >= 0) {
3039 int count;
3040 xmlRegCounterPtr counter;
3041
Daniel Veillard11ce4002006-03-10 00:36:23 +00003042 if (exec->counts == NULL) {
3043 exec->status = -1;
3044 goto error;
3045 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003046 /*
3047 * A counted transition.
3048 */
3049
3050 count = exec->counts[trans->count];
3051 counter = &exec->comp->counters[trans->count];
3052#ifdef DEBUG_REGEXP_EXEC
3053 printf("testing count %d: val %d, min %d, max %d\n",
3054 trans->count, count, counter->min, counter->max);
3055#endif
3056 ret = ((count >= counter->min) && (count <= counter->max));
Daniel Veillard567a45b2005-10-18 19:11:55 +00003057 if ((ret) && (counter->min != counter->max))
3058 deter = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003059 } else if (atom == NULL) {
3060 fprintf(stderr, "epsilon transition left at runtime\n");
3061 exec->status = -2;
3062 break;
3063 } else if (exec->inputString[exec->index] != 0) {
3064 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3065 ret = xmlRegCheckCharacter(atom, codepoint);
William M. Brack0e00b282004-04-26 15:40:47 +00003066 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003067 xmlRegStatePtr to = comp->states[trans->to];
3068
3069 /*
3070 * this is a multiple input sequence
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003071 * If there is a counter associated increment it now.
3072 * before potentially saving and rollback
Daniel Veillard4255d502002-04-16 15:50:10 +00003073 */
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003074 if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003075 if (exec->counts == NULL) {
3076 exec->status = -1;
3077 goto error;
3078 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003079#ifdef DEBUG_REGEXP_EXEC
3080 printf("Increasing count %d\n", trans->counter);
3081#endif
3082 exec->counts[trans->counter]++;
3083 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003084 if (exec->state->nbTrans > exec->transno + 1) {
3085 xmlFARegExecSave(exec);
3086 }
3087 exec->transcount = 1;
3088 do {
3089 /*
3090 * Try to progress as much as possible on the input
3091 */
3092 if (exec->transcount == atom->max) {
3093 break;
3094 }
3095 exec->index += len;
3096 /*
3097 * End of input: stop here
3098 */
3099 if (exec->inputString[exec->index] == 0) {
3100 exec->index -= len;
3101 break;
3102 }
3103 if (exec->transcount >= atom->min) {
3104 int transno = exec->transno;
3105 xmlRegStatePtr state = exec->state;
3106
3107 /*
3108 * The transition is acceptable save it
3109 */
3110 exec->transno = -1; /* trick */
3111 exec->state = to;
3112 xmlFARegExecSave(exec);
3113 exec->transno = transno;
3114 exec->state = state;
3115 }
3116 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3117 len);
3118 ret = xmlRegCheckCharacter(atom, codepoint);
3119 exec->transcount++;
3120 } while (ret == 1);
3121 if (exec->transcount < atom->min)
3122 ret = 0;
3123
3124 /*
3125 * If the last check failed but one transition was found
3126 * possible, rollback
3127 */
3128 if (ret < 0)
3129 ret = 0;
3130 if (ret == 0) {
3131 goto rollback;
3132 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003133 if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003134 if (exec->counts == NULL) {
3135 exec->status = -1;
3136 goto error;
3137 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003138#ifdef DEBUG_REGEXP_EXEC
3139 printf("Decreasing count %d\n", trans->counter);
3140#endif
3141 exec->counts[trans->counter]--;
3142 }
William M. Brack0e00b282004-04-26 15:40:47 +00003143 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3144 /*
3145 * we don't match on the codepoint, but minOccurs of 0
3146 * says that's ok. Setting len to 0 inhibits stepping
3147 * over the codepoint.
3148 */
3149 exec->transcount = 1;
3150 len = 0;
3151 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003152 }
William M. Brack0e00b282004-04-26 15:40:47 +00003153 } else if ((atom->min == 0) && (atom->max > 0)) {
3154 /* another spot to match when minOccurs is 0 */
3155 exec->transcount = 1;
3156 len = 0;
3157 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003158 }
3159 if (ret == 1) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00003160 if ((trans->nd == 1) ||
3161 ((trans->count >= 0) && (deter == 0) &&
3162 (exec->state->nbTrans > exec->transno + 1))) {
Daniel Veillardaa622012005-10-20 15:55:25 +00003163#ifdef DEBUG_REGEXP_EXEC
3164 if (trans->nd == 1)
3165 printf("Saving on nd transition atom %d for %c at %d\n",
3166 trans->atom->no, codepoint, exec->index);
3167 else
3168 printf("Saving on counted transition count %d for %c at %d\n",
3169 trans->count, codepoint, exec->index);
3170#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003171 xmlFARegExecSave(exec);
3172 }
3173 if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003174 if (exec->counts == NULL) {
3175 exec->status = -1;
3176 goto error;
3177 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003178#ifdef DEBUG_REGEXP_EXEC
3179 printf("Increasing count %d\n", trans->counter);
3180#endif
3181 exec->counts[trans->counter]++;
3182 }
Daniel Veillard10752282005-08-08 13:05:13 +00003183 if ((trans->count >= 0) &&
3184 (trans->count < REGEXP_ALL_COUNTER)) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003185 if (exec->counts == NULL) {
3186 exec->status = -1;
3187 goto error;
3188 }
Daniel Veillard10752282005-08-08 13:05:13 +00003189#ifdef DEBUG_REGEXP_EXEC
3190 printf("resetting count %d on transition\n",
3191 trans->count);
3192#endif
3193 exec->counts[trans->count] = 0;
3194 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003195#ifdef DEBUG_REGEXP_EXEC
3196 printf("entering state %d\n", trans->to);
3197#endif
3198 exec->state = comp->states[trans->to];
3199 exec->transno = 0;
3200 if (trans->atom != NULL) {
3201 exec->index += len;
3202 }
3203 goto progress;
3204 } else if (ret < 0) {
3205 exec->status = -4;
3206 break;
3207 }
3208 }
3209 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3210rollback:
3211 /*
3212 * Failed to find a way out
3213 */
3214 exec->determinist = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00003215#ifdef DEBUG_REGEXP_EXEC
3216 printf("rollback from state %d on %d:%c\n", exec->state->no,
3217 codepoint,codepoint);
3218#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003219 xmlFARegExecRollBack(exec);
3220 }
3221progress:
3222 continue;
3223 }
Daniel Veillard11ce4002006-03-10 00:36:23 +00003224error:
Daniel Veillard4255d502002-04-16 15:50:10 +00003225 if (exec->rollbacks != NULL) {
3226 if (exec->counts != NULL) {
3227 int i;
3228
3229 for (i = 0;i < exec->maxRollbacks;i++)
3230 if (exec->rollbacks[i].counts != NULL)
3231 xmlFree(exec->rollbacks[i].counts);
3232 }
3233 xmlFree(exec->rollbacks);
3234 }
3235 if (exec->counts != NULL)
3236 xmlFree(exec->counts);
3237 if (exec->status == 0)
3238 return(1);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003239 if (exec->status == -1) {
3240 if (exec->nbPush > MAX_PUSH)
3241 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003242 return(0);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003243 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003244 return(exec->status);
3245}
3246
3247/************************************************************************
3248 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003249 * Progressive interface to the verifier one atom at a time *
Daniel Veillard4255d502002-04-16 15:50:10 +00003250 * *
3251 ************************************************************************/
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003252#ifdef DEBUG_ERR
3253static void testerr(xmlRegExecCtxtPtr exec);
3254#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003255
3256/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003257 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00003258 * @comp: a precompiled regular expression
3259 * @callback: a callback function used for handling progresses in the
3260 * automata matching phase
3261 * @data: the context data associated to the callback in this context
3262 *
3263 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00003264 *
3265 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00003266 */
3267xmlRegExecCtxtPtr
3268xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3269 xmlRegExecCtxtPtr exec;
3270
3271 if (comp == NULL)
3272 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003273 if ((comp->compact == NULL) && (comp->states == NULL))
3274 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00003275 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3276 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003277 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003278 return(NULL);
3279 }
3280 memset(exec, 0, sizeof(xmlRegExecCtxt));
3281 exec->inputString = NULL;
3282 exec->index = 0;
3283 exec->determinist = 1;
3284 exec->maxRollbacks = 0;
3285 exec->nbRollbacks = 0;
3286 exec->rollbacks = NULL;
3287 exec->status = 0;
3288 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00003289 if (comp->compact == NULL)
3290 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00003291 exec->transno = 0;
3292 exec->transcount = 0;
3293 exec->callback = callback;
3294 exec->data = data;
3295 if (comp->nbCounters > 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003296 /*
3297 * For error handling, exec->counts is allocated twice the size
3298 * the second half is used to store the data in case of rollback
3299 */
3300 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3301 * 2);
Daniel Veillard4255d502002-04-16 15:50:10 +00003302 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003303 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003304 xmlFree(exec);
3305 return(NULL);
3306 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003307 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3308 exec->errCounts = &exec->counts[comp->nbCounters];
3309 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00003310 exec->counts = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003311 exec->errCounts = NULL;
3312 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003313 exec->inputStackMax = 0;
3314 exec->inputStackNr = 0;
3315 exec->inputStack = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003316 exec->errStateNo = -1;
3317 exec->errString = NULL;
Daniel Veillard94cc1032005-09-15 13:09:00 +00003318 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003319 return(exec);
3320}
3321
3322/**
3323 * xmlRegFreeExecCtxt:
3324 * @exec: a regular expression evaulation context
3325 *
3326 * Free the structures associated to a regular expression evaulation context.
3327 */
3328void
3329xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3330 if (exec == NULL)
3331 return;
3332
3333 if (exec->rollbacks != NULL) {
3334 if (exec->counts != NULL) {
3335 int i;
3336
3337 for (i = 0;i < exec->maxRollbacks;i++)
3338 if (exec->rollbacks[i].counts != NULL)
3339 xmlFree(exec->rollbacks[i].counts);
3340 }
3341 xmlFree(exec->rollbacks);
3342 }
3343 if (exec->counts != NULL)
3344 xmlFree(exec->counts);
3345 if (exec->inputStack != NULL) {
3346 int i;
3347
Daniel Veillard32370232002-10-16 14:08:14 +00003348 for (i = 0;i < exec->inputStackNr;i++) {
3349 if (exec->inputStack[i].value != NULL)
3350 xmlFree(exec->inputStack[i].value);
3351 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003352 xmlFree(exec->inputStack);
3353 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003354 if (exec->errString != NULL)
3355 xmlFree(exec->errString);
Daniel Veillard4255d502002-04-16 15:50:10 +00003356 xmlFree(exec);
3357}
3358
3359static void
3360xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3361 void *data) {
3362#ifdef DEBUG_PUSH
3363 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3364#endif
3365 if (exec->inputStackMax == 0) {
3366 exec->inputStackMax = 4;
3367 exec->inputStack = (xmlRegInputTokenPtr)
3368 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3369 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003370 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003371 exec->inputStackMax = 0;
3372 return;
3373 }
3374 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3375 xmlRegInputTokenPtr tmp;
3376
3377 exec->inputStackMax *= 2;
3378 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3379 exec->inputStackMax * sizeof(xmlRegInputToken));
3380 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003381 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003382 exec->inputStackMax /= 2;
3383 return;
3384 }
3385 exec->inputStack = tmp;
3386 }
3387 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3388 exec->inputStack[exec->inputStackNr].data = data;
3389 exec->inputStackNr++;
3390 exec->inputStack[exec->inputStackNr].value = NULL;
3391 exec->inputStack[exec->inputStackNr].data = NULL;
3392}
3393
Daniel Veillardc0826a72004-08-10 14:17:33 +00003394/**
3395 * xmlRegStrEqualWildcard:
3396 * @expStr: the string to be evaluated
3397 * @valStr: the validation string
3398 *
3399 * Checks if both strings are equal or have the same content. "*"
3400 * can be used as a wildcard in @valStr; "|" is used as a seperator of
3401 * substrings in both @expStr and @valStr.
3402 *
3403 * Returns 1 if the comparison is satisfied and the number of substrings
3404 * is equal, 0 otherwise.
3405 */
3406
3407static int
3408xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3409 if (expStr == valStr) return(1);
3410 if (expStr == NULL) return(0);
3411 if (valStr == NULL) return(0);
3412 do {
3413 /*
3414 * Eval if we have a wildcard for the current item.
3415 */
3416 if (*expStr != *valStr) {
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00003417 /* if one of them starts with a wildcard make valStr be it */
3418 if (*valStr == '*') {
3419 const xmlChar *tmp;
3420
3421 tmp = valStr;
3422 valStr = expStr;
3423 expStr = tmp;
3424 }
Daniel Veillardc0826a72004-08-10 14:17:33 +00003425 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3426 do {
3427 if (*valStr == XML_REG_STRING_SEPARATOR)
3428 break;
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00003429 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00003430 } while (*valStr != 0);
3431 continue;
3432 } else
3433 return(0);
3434 }
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00003435 expStr++;
3436 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00003437 } while (*valStr != 0);
3438 if (*expStr != 0)
3439 return (0);
3440 else
3441 return (1);
3442}
Daniel Veillard4255d502002-04-16 15:50:10 +00003443
3444/**
Daniel Veillard23e73572002-09-19 19:56:43 +00003445 * xmlRegCompactPushString:
3446 * @exec: a regexp execution context
3447 * @comp: the precompiled exec with a compact table
3448 * @value: a string token input
3449 * @data: data associated to the token to reuse in callbacks
3450 *
3451 * Push one input token in the execution context
3452 *
3453 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3454 * a negative value in case of error.
3455 */
3456static int
3457xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3458 xmlRegexpPtr comp,
3459 const xmlChar *value,
3460 void *data) {
3461 int state = exec->index;
3462 int i, target;
3463
3464 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3465 return(-1);
3466
3467 if (value == NULL) {
3468 /*
3469 * are we at a final state ?
3470 */
3471 if (comp->compact[state * (comp->nbstrings + 1)] ==
3472 XML_REGEXP_FINAL_STATE)
3473 return(1);
3474 return(0);
3475 }
3476
3477#ifdef DEBUG_PUSH
3478 printf("value pushed: %s\n", value);
3479#endif
3480
3481 /*
William M. Brackddf71d62004-05-06 04:17:26 +00003482 * Examine all outside transitions from current state
Daniel Veillard23e73572002-09-19 19:56:43 +00003483 */
3484 for (i = 0;i < comp->nbstrings;i++) {
3485 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3486 if ((target > 0) && (target <= comp->nbstates)) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00003487 target--; /* to avoid 0 */
3488 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3489 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00003490 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3491 exec->callback(exec->data, value,
3492 comp->transdata[state * comp->nbstrings + i], data);
3493 }
Daniel Veillard23e73572002-09-19 19:56:43 +00003494#ifdef DEBUG_PUSH
3495 printf("entering state %d\n", target);
3496#endif
3497 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003498 XML_REGEXP_SINK_STATE)
3499 goto error;
3500
3501 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillard23e73572002-09-19 19:56:43 +00003502 XML_REGEXP_FINAL_STATE)
3503 return(1);
3504 return(0);
3505 }
3506 }
3507 }
3508 /*
3509 * Failed to find an exit transition out from current state for the
3510 * current token
3511 */
3512#ifdef DEBUG_PUSH
3513 printf("failed to find a transition for %s on state %d\n", value, state);
3514#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003515error:
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003516 if (exec->errString != NULL)
3517 xmlFree(exec->errString);
3518 exec->errString = xmlStrdup(value);
3519 exec->errStateNo = state;
Daniel Veillard23e73572002-09-19 19:56:43 +00003520 exec->status = -1;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003521#ifdef DEBUG_ERR
3522 testerr(exec);
3523#endif
Daniel Veillard23e73572002-09-19 19:56:43 +00003524 return(-1);
3525}
3526
3527/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00003528 * xmlRegExecPushStringInternal:
Daniel Veillardea7751d2002-12-20 00:16:24 +00003529 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00003530 * @value: a string token input
3531 * @data: data associated to the token to reuse in callbacks
Daniel Veillard6e65e152005-08-09 11:09:52 +00003532 * @compound: value was assembled from 2 strings
Daniel Veillard4255d502002-04-16 15:50:10 +00003533 *
3534 * Push one input token in the execution context
3535 *
3536 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3537 * a negative value in case of error.
3538 */
Daniel Veillard6e65e152005-08-09 11:09:52 +00003539static int
3540xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3541 void *data, int compound) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003542 xmlRegTransPtr trans;
3543 xmlRegAtomPtr atom;
3544 int ret;
3545 int final = 0;
Daniel Veillard90700152005-01-08 22:05:09 +00003546 int progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003547
3548 if (exec == NULL)
3549 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00003550 if (exec->comp == NULL)
3551 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003552 if (exec->status != 0)
3553 return(exec->status);
3554
Daniel Veillard23e73572002-09-19 19:56:43 +00003555 if (exec->comp->compact != NULL)
3556 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3557
Daniel Veillard4255d502002-04-16 15:50:10 +00003558 if (value == NULL) {
3559 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3560 return(1);
3561 final = 1;
3562 }
3563
3564#ifdef DEBUG_PUSH
3565 printf("value pushed: %s\n", value);
3566#endif
3567 /*
3568 * If we have an active rollback stack push the new value there
3569 * and get back to where we were left
3570 */
3571 if ((value != NULL) && (exec->inputStackNr > 0)) {
3572 xmlFARegExecSaveInputString(exec, value, data);
3573 value = exec->inputStack[exec->index].value;
3574 data = exec->inputStack[exec->index].data;
3575#ifdef DEBUG_PUSH
3576 printf("value loaded: %s\n", value);
3577#endif
3578 }
3579
3580 while ((exec->status == 0) &&
3581 ((value != NULL) ||
3582 ((final == 1) &&
3583 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3584
3585 /*
3586 * End of input on non-terminal state, rollback, however we may
3587 * still have epsilon like transition for counted transitions
3588 * on counters, in that case don't break too early.
3589 */
Daniel Veillardb509f152002-04-17 16:28:10 +00003590 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00003591 goto rollback;
3592
3593 exec->transcount = 0;
3594 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3595 trans = &exec->state->trans[exec->transno];
3596 if (trans->to < 0)
3597 continue;
3598 atom = trans->atom;
3599 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00003600 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3601 int i;
3602 int count;
3603 xmlRegTransPtr t;
3604 xmlRegCounterPtr counter;
3605
3606 ret = 0;
3607
3608#ifdef DEBUG_PUSH
3609 printf("testing all lax %d\n", trans->count);
3610#endif
3611 /*
3612 * Check all counted transitions from the current state
3613 */
3614 if ((value == NULL) && (final)) {
3615 ret = 1;
3616 } else if (value != NULL) {
3617 for (i = 0;i < exec->state->nbTrans;i++) {
3618 t = &exec->state->trans[i];
3619 if ((t->counter < 0) || (t == trans))
3620 continue;
3621 counter = &exec->comp->counters[t->counter];
3622 count = exec->counts[t->counter];
3623 if ((count < counter->max) &&
3624 (t->atom != NULL) &&
3625 (xmlStrEqual(value, t->atom->valuep))) {
3626 ret = 0;
3627 break;
3628 }
3629 if ((count >= counter->min) &&
3630 (count < counter->max) &&
Daniel Veillard11ce4002006-03-10 00:36:23 +00003631 (t->atom != NULL) &&
Daniel Veillard441bc322002-04-20 17:38:48 +00003632 (xmlStrEqual(value, t->atom->valuep))) {
3633 ret = 1;
3634 break;
3635 }
3636 }
3637 }
3638 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00003639 int i;
3640 int count;
3641 xmlRegTransPtr t;
3642 xmlRegCounterPtr counter;
3643
3644 ret = 1;
3645
3646#ifdef DEBUG_PUSH
3647 printf("testing all %d\n", trans->count);
3648#endif
3649 /*
3650 * Check all counted transitions from the current state
3651 */
3652 for (i = 0;i < exec->state->nbTrans;i++) {
3653 t = &exec->state->trans[i];
3654 if ((t->counter < 0) || (t == trans))
3655 continue;
3656 counter = &exec->comp->counters[t->counter];
3657 count = exec->counts[t->counter];
3658 if ((count < counter->min) || (count > counter->max)) {
3659 ret = 0;
3660 break;
3661 }
3662 }
3663 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003664 int count;
3665 xmlRegCounterPtr counter;
3666
3667 /*
3668 * A counted transition.
3669 */
3670
3671 count = exec->counts[trans->count];
3672 counter = &exec->comp->counters[trans->count];
3673#ifdef DEBUG_PUSH
3674 printf("testing count %d: val %d, min %d, max %d\n",
3675 trans->count, count, counter->min, counter->max);
3676#endif
3677 ret = ((count >= counter->min) && (count <= counter->max));
3678 } else if (atom == NULL) {
3679 fprintf(stderr, "epsilon transition left at runtime\n");
3680 exec->status = -2;
3681 break;
3682 } else if (value != NULL) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00003683 ret = xmlRegStrEqualWildcard(atom->valuep, value);
Daniel Veillard6e65e152005-08-09 11:09:52 +00003684 if (atom->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00003685 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00003686 if (!compound)
3687 ret = 0;
3688 }
Daniel Veillard441bc322002-04-20 17:38:48 +00003689 if ((ret == 1) && (trans->counter >= 0)) {
3690 xmlRegCounterPtr counter;
3691 int count;
3692
3693 count = exec->counts[trans->counter];
3694 counter = &exec->comp->counters[trans->counter];
3695 if (count >= counter->max)
3696 ret = 0;
3697 }
3698
Daniel Veillard4255d502002-04-16 15:50:10 +00003699 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3700 xmlRegStatePtr to = exec->comp->states[trans->to];
3701
3702 /*
3703 * this is a multiple input sequence
3704 */
3705 if (exec->state->nbTrans > exec->transno + 1) {
3706 if (exec->inputStackNr <= 0) {
3707 xmlFARegExecSaveInputString(exec, value, data);
3708 }
3709 xmlFARegExecSave(exec);
3710 }
3711 exec->transcount = 1;
3712 do {
3713 /*
3714 * Try to progress as much as possible on the input
3715 */
3716 if (exec->transcount == atom->max) {
3717 break;
3718 }
3719 exec->index++;
3720 value = exec->inputStack[exec->index].value;
3721 data = exec->inputStack[exec->index].data;
3722#ifdef DEBUG_PUSH
3723 printf("value loaded: %s\n", value);
3724#endif
3725
3726 /*
3727 * End of input: stop here
3728 */
3729 if (value == NULL) {
3730 exec->index --;
3731 break;
3732 }
3733 if (exec->transcount >= atom->min) {
3734 int transno = exec->transno;
3735 xmlRegStatePtr state = exec->state;
3736
3737 /*
3738 * The transition is acceptable save it
3739 */
3740 exec->transno = -1; /* trick */
3741 exec->state = to;
3742 if (exec->inputStackNr <= 0) {
3743 xmlFARegExecSaveInputString(exec, value, data);
3744 }
3745 xmlFARegExecSave(exec);
3746 exec->transno = transno;
3747 exec->state = state;
3748 }
3749 ret = xmlStrEqual(value, atom->valuep);
3750 exec->transcount++;
3751 } while (ret == 1);
3752 if (exec->transcount < atom->min)
3753 ret = 0;
3754
3755 /*
3756 * If the last check failed but one transition was found
3757 * possible, rollback
3758 */
3759 if (ret < 0)
3760 ret = 0;
3761 if (ret == 0) {
3762 goto rollback;
3763 }
3764 }
3765 }
3766 if (ret == 1) {
William M. Brack98873952003-12-26 06:03:14 +00003767 if ((exec->callback != NULL) && (atom != NULL) &&
3768 (data != NULL)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003769 exec->callback(exec->data, atom->valuep,
3770 atom->data, data);
3771 }
3772 if (exec->state->nbTrans > exec->transno + 1) {
3773 if (exec->inputStackNr <= 0) {
3774 xmlFARegExecSaveInputString(exec, value, data);
3775 }
3776 xmlFARegExecSave(exec);
3777 }
3778 if (trans->counter >= 0) {
3779#ifdef DEBUG_PUSH
3780 printf("Increasing count %d\n", trans->counter);
3781#endif
3782 exec->counts[trans->counter]++;
3783 }
Daniel Veillard10752282005-08-08 13:05:13 +00003784 if ((trans->count >= 0) &&
3785 (trans->count < REGEXP_ALL_COUNTER)) {
3786#ifdef DEBUG_REGEXP_EXEC
3787 printf("resetting count %d on transition\n",
3788 trans->count);
3789#endif
3790 exec->counts[trans->count] = 0;
3791 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003792#ifdef DEBUG_PUSH
3793 printf("entering state %d\n", trans->to);
3794#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003795 if ((exec->comp->states[trans->to] != NULL) &&
3796 (exec->comp->states[trans->to]->type ==
3797 XML_REGEXP_SINK_STATE)) {
3798 /*
3799 * entering a sink state, save the current state as error
3800 * state.
3801 */
3802 if (exec->errString != NULL)
3803 xmlFree(exec->errString);
3804 exec->errString = xmlStrdup(value);
3805 exec->errState = exec->state;
3806 memcpy(exec->errCounts, exec->counts,
3807 exec->comp->nbCounters * sizeof(int));
3808 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003809 exec->state = exec->comp->states[trans->to];
3810 exec->transno = 0;
3811 if (trans->atom != NULL) {
3812 if (exec->inputStack != NULL) {
3813 exec->index++;
3814 if (exec->index < exec->inputStackNr) {
3815 value = exec->inputStack[exec->index].value;
3816 data = exec->inputStack[exec->index].data;
3817#ifdef DEBUG_PUSH
3818 printf("value loaded: %s\n", value);
3819#endif
3820 } else {
3821 value = NULL;
3822 data = NULL;
3823#ifdef DEBUG_PUSH
3824 printf("end of input\n");
3825#endif
3826 }
3827 } else {
3828 value = NULL;
3829 data = NULL;
3830#ifdef DEBUG_PUSH
3831 printf("end of input\n");
3832#endif
3833 }
3834 }
3835 goto progress;
3836 } else if (ret < 0) {
3837 exec->status = -4;
3838 break;
3839 }
3840 }
3841 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3842rollback:
Daniel Veillard90700152005-01-08 22:05:09 +00003843 /*
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003844 * if we didn't yet rollback on the current input
3845 * store the current state as the error state.
Daniel Veillard90700152005-01-08 22:05:09 +00003846 */
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003847 if ((progress) && (exec->state != NULL) &&
3848 (exec->state->type != XML_REGEXP_SINK_STATE)) {
Daniel Veillard90700152005-01-08 22:05:09 +00003849 progress = 0;
3850 if (exec->errString != NULL)
3851 xmlFree(exec->errString);
3852 exec->errString = xmlStrdup(value);
3853 exec->errState = exec->state;
3854 memcpy(exec->errCounts, exec->counts,
3855 exec->comp->nbCounters * sizeof(int));
3856 }
3857
Daniel Veillard4255d502002-04-16 15:50:10 +00003858 /*
3859 * Failed to find a way out
3860 */
3861 exec->determinist = 0;
3862 xmlFARegExecRollBack(exec);
3863 if (exec->status == 0) {
3864 value = exec->inputStack[exec->index].value;
3865 data = exec->inputStack[exec->index].data;
3866#ifdef DEBUG_PUSH
3867 printf("value loaded: %s\n", value);
3868#endif
3869 }
3870 }
Daniel Veillard90700152005-01-08 22:05:09 +00003871 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00003872progress:
Daniel Veillard90700152005-01-08 22:05:09 +00003873 progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003874 continue;
3875 }
3876 if (exec->status == 0) {
3877 return(exec->state->type == XML_REGEXP_FINAL_STATE);
3878 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003879#ifdef DEBUG_ERR
Daniel Veillard90700152005-01-08 22:05:09 +00003880 if (exec->status < 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003881 testerr(exec);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003882 }
Daniel Veillard90700152005-01-08 22:05:09 +00003883#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003884 return(exec->status);
3885}
3886
Daniel Veillard52b48c72003-04-13 19:53:42 +00003887/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00003888 * xmlRegExecPushString:
3889 * @exec: a regexp execution context or NULL to indicate the end
3890 * @value: a string token input
3891 * @data: data associated to the token to reuse in callbacks
3892 *
3893 * Push one input token in the execution context
3894 *
3895 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3896 * a negative value in case of error.
3897 */
3898int
3899xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3900 void *data) {
3901 return(xmlRegExecPushStringInternal(exec, value, data, 0));
3902}
3903
3904/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00003905 * xmlRegExecPushString2:
3906 * @exec: a regexp execution context or NULL to indicate the end
3907 * @value: the first string token input
3908 * @value2: the second string token input
3909 * @data: data associated to the token to reuse in callbacks
3910 *
3911 * Push one input token in the execution context
3912 *
3913 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3914 * a negative value in case of error.
3915 */
3916int
3917xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
3918 const xmlChar *value2, void *data) {
3919 xmlChar buf[150];
3920 int lenn, lenp, ret;
3921 xmlChar *str;
3922
3923 if (exec == NULL)
3924 return(-1);
3925 if (exec->comp == NULL)
3926 return(-1);
3927 if (exec->status != 0)
3928 return(exec->status);
3929
3930 if (value2 == NULL)
3931 return(xmlRegExecPushString(exec, value, data));
3932
3933 lenn = strlen((char *) value2);
3934 lenp = strlen((char *) value);
3935
3936 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003937 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003938 if (str == NULL) {
3939 exec->status = -1;
3940 return(-1);
3941 }
3942 } else {
3943 str = buf;
3944 }
3945 memcpy(&str[0], value, lenp);
Daniel Veillardc0826a72004-08-10 14:17:33 +00003946 str[lenp] = XML_REG_STRING_SEPARATOR;
Daniel Veillard52b48c72003-04-13 19:53:42 +00003947 memcpy(&str[lenp + 1], value2, lenn);
3948 str[lenn + lenp + 1] = 0;
3949
3950 if (exec->comp->compact != NULL)
3951 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
3952 else
Daniel Veillard6e65e152005-08-09 11:09:52 +00003953 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003954
3955 if (str != buf)
Daniel Veillard0b1ff142005-12-28 21:13:33 +00003956 xmlFree(str);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003957 return(ret);
3958}
3959
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003960/**
Daniel Veillard77005e62005-07-19 16:26:18 +00003961 * xmlRegExecGetValues:
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003962 * @exec: a regexp execution context
3963 * @err: error extraction or normal one
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003964 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003965 * @nbneg: return number of negative transitions
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003966 * @values: pointer to the array of acceptable values
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003967 * @terminal: return value if this was a terminal state
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003968 *
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003969 * Extract informations from the regexp execution, internal routine to
3970 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003971 *
3972 * Returns: 0 in case of success or -1 in case of error.
3973 */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003974static int
3975xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003976 int *nbval, int *nbneg,
3977 xmlChar **values, int *terminal) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003978 int maxval;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003979 int nb = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003980
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003981 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
3982 (values == NULL) || (*nbval <= 0))
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003983 return(-1);
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003984
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003985 maxval = *nbval;
3986 *nbval = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003987 *nbneg = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003988 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
3989 xmlRegexpPtr comp;
3990 int target, i, state;
3991
3992 comp = exec->comp;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003993
3994 if (err) {
3995 if (exec->errStateNo == -1) return(-1);
3996 state = exec->errStateNo;
3997 } else {
3998 state = exec->index;
3999 }
4000 if (terminal != NULL) {
4001 if (comp->compact[state * (comp->nbstrings + 1)] ==
4002 XML_REGEXP_FINAL_STATE)
4003 *terminal = 1;
4004 else
4005 *terminal = 0;
4006 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004007 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004008 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004009 if ((target > 0) && (target <= comp->nbstates) &&
4010 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4011 XML_REGEXP_SINK_STATE)) {
4012 values[nb++] = comp->stringMap[i];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004013 (*nbval)++;
4014 }
4015 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004016 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4017 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4018 if ((target > 0) && (target <= comp->nbstates) &&
4019 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4020 XML_REGEXP_SINK_STATE)) {
4021 values[nb++] = comp->stringMap[i];
4022 (*nbneg)++;
4023 }
4024 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004025 } else {
4026 int transno;
4027 xmlRegTransPtr trans;
4028 xmlRegAtomPtr atom;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004029 xmlRegStatePtr state;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004030
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004031 if (terminal != NULL) {
4032 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4033 *terminal = 1;
4034 else
4035 *terminal = 0;
4036 }
4037
4038 if (err) {
4039 if (exec->errState == NULL) return(-1);
4040 state = exec->errState;
4041 } else {
4042 if (exec->state == NULL) return(-1);
4043 state = exec->state;
4044 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004045 for (transno = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004046 (transno < state->nbTrans) && (nb < maxval);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004047 transno++) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004048 trans = &state->trans[transno];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004049 if (trans->to < 0)
4050 continue;
4051 atom = trans->atom;
4052 if ((atom == NULL) || (atom->valuep == NULL))
4053 continue;
4054 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004055 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004056 TODO;
4057 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004058 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004059 TODO;
4060 } else if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00004061 xmlRegCounterPtr counter = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004062 int count;
4063
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004064 if (err)
4065 count = exec->errCounts[trans->counter];
4066 else
4067 count = exec->counts[trans->counter];
Daniel Veillard11ce4002006-03-10 00:36:23 +00004068 if (exec->comp != NULL)
4069 counter = &exec->comp->counters[trans->counter];
4070 if ((counter == NULL) || (count < counter->max)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004071 if (atom->neg)
4072 values[nb++] = (xmlChar *) atom->valuep2;
4073 else
4074 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004075 (*nbval)++;
4076 }
4077 } else {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004078 if ((exec->comp->states[trans->to] != NULL) &&
4079 (exec->comp->states[trans->to]->type !=
4080 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004081 if (atom->neg)
4082 values[nb++] = (xmlChar *) atom->valuep2;
4083 else
4084 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004085 (*nbval)++;
4086 }
4087 }
4088 }
4089 for (transno = 0;
4090 (transno < state->nbTrans) && (nb < maxval);
4091 transno++) {
4092 trans = &state->trans[transno];
4093 if (trans->to < 0)
4094 continue;
4095 atom = trans->atom;
4096 if ((atom == NULL) || (atom->valuep == NULL))
4097 continue;
4098 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4099 continue;
4100 } else if (trans->count == REGEXP_ALL_COUNTER) {
4101 continue;
4102 } else if (trans->counter >= 0) {
4103 continue;
4104 } else {
4105 if ((exec->comp->states[trans->to] != NULL) &&
4106 (exec->comp->states[trans->to]->type ==
4107 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004108 if (atom->neg)
4109 values[nb++] = (xmlChar *) atom->valuep2;
4110 else
4111 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004112 (*nbneg)++;
4113 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004114 }
4115 }
4116 }
4117 return(0);
4118}
4119
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004120/**
4121 * xmlRegExecNextValues:
4122 * @exec: a regexp execution context
4123 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004124 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004125 * @values: pointer to the array of acceptable values
4126 * @terminal: return value if this was a terminal state
4127 *
4128 * Extract informations from the regexp execution,
4129 * the parameter @values must point to an array of @nbval string pointers
4130 * on return nbval will contain the number of possible strings in that
4131 * state and the @values array will be updated with them. The string values
4132 * returned will be freed with the @exec context and don't need to be
4133 * deallocated.
4134 *
4135 * Returns: 0 in case of success or -1 in case of error.
4136 */
4137int
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004138xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4139 xmlChar **values, int *terminal) {
4140 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004141}
4142
4143/**
4144 * xmlRegExecErrInfo:
4145 * @exec: a regexp execution context generating an error
4146 * @string: return value for the error string
4147 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004148 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004149 * @values: pointer to the array of acceptable values
4150 * @terminal: return value if this was a terminal state
4151 *
4152 * Extract error informations from the regexp execution, the parameter
4153 * @string will be updated with the value pushed and not accepted,
4154 * the parameter @values must point to an array of @nbval string pointers
4155 * on return nbval will contain the number of possible strings in that
4156 * state and the @values array will be updated with them. The string values
4157 * returned will be freed with the @exec context and don't need to be
4158 * deallocated.
4159 *
4160 * Returns: 0 in case of success or -1 in case of error.
4161 */
4162int
4163xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004164 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004165 if (exec == NULL)
4166 return(-1);
4167 if (string != NULL) {
4168 if (exec->status != 0)
4169 *string = exec->errString;
4170 else
4171 *string = NULL;
4172 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004173 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004174}
4175
#ifdef DEBUG_ERR
/*
 * testerr:
 * @exec: a regexp execution context
 *
 * Debugging helper: query the error informations of @exec through
 * xmlRegExecErrInfo() with room for 5 values; the results are discarded.
 */
static void testerr(xmlRegExecCtxtPtr exec) {
    const xmlChar *errToken;
    xmlChar *expected[5];
    int nbExpected = 5;
    int nbNegative;
    int isTerminal;

    xmlRegExecErrInfo(exec, &errToken, &nbExpected, &nbNegative,
                      expected, &isTerminal);
}
#endif
4186
#if 0
/*
 * xmlRegExecPushChar:
 * @exec: a regexp execution context
 * @UCS: unused by the body below, which reads exec->inputString instead
 *
 * Disabled (compiled out) character based variant of the push loop.
 *
 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
 *     a negative value in case of error.
 */
static int
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
    xmlRegTransPtr trans;
    xmlRegAtomPtr atom;
    int ret;
    int codepoint, len;

    if (exec == NULL)
        return(-1);
    if (exec->status != 0)
        return(exec->status);

    while ((exec->status == 0) &&
           ((exec->inputString[exec->index] != 0) ||
            (exec->state->type != XML_REGEXP_FINAL_STATE))) {

        /*
         * End of input on non-terminal state, rollback, however we may
         * still have epsilon like transition for counted transitions
         * on counters, in that case don't break too early.
         */
        if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
            goto rollback;

        exec->transcount = 0;
        for (;exec->transno < exec->state->nbTrans;exec->transno++) {
            trans = &exec->state->trans[exec->transno];
            if (trans->to < 0)
                continue;
            atom = trans->atom;
            ret = 0;
            if (trans->count >= 0) {
                int count;
                xmlRegCounterPtr counter;

                /*
                 * A counted transition.
                 */

                count = exec->counts[trans->count];
                counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
                printf("testing count %d: val %d, min %d, max %d\n",
                       trans->count, count, counter->min, counter->max);
#endif
                ret = ((count >= counter->min) && (count <= counter->max));
            } else if (atom == NULL) {
                fprintf(stderr, "epsilon transition left at runtime\n");
                exec->status = -2;
                break;
            } else if (exec->inputString[exec->index] != 0) {
                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
                ret = xmlRegCheckCharacter(atom, codepoint);
                if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
                    xmlRegStatePtr to = exec->comp->states[trans->to];

                    /*
                     * this is a multiple input sequence
                     */
                    if (exec->state->nbTrans > exec->transno + 1) {
                        xmlFARegExecSave(exec);
                    }
                    exec->transcount = 1;
                    do {
                        /*
                         * Try to progress as much as possible on the input
                         */
                        if (exec->transcount == atom->max) {
                            break;
                        }
                        exec->index += len;
                        /*
                         * End of input: stop here
                         */
                        if (exec->inputString[exec->index] == 0) {
                            exec->index -= len;
                            break;
                        }
                        if (exec->transcount >= atom->min) {
                            int transno = exec->transno;
                            xmlRegStatePtr state = exec->state;

                            /*
                             * The transition is acceptable save it
                             */
                            exec->transno = -1; /* trick */
                            exec->state = to;
                            xmlFARegExecSave(exec);
                            exec->transno = transno;
                            exec->state = state;
                        }
                        codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
                                              len);
                        ret = xmlRegCheckCharacter(atom, codepoint);
                        exec->transcount++;
                    } while (ret == 1);
                    if (exec->transcount < atom->min)
                        ret = 0;

                    /*
                     * If the last check failed but one transition was found
                     * possible, rollback
                     */
                    if (ret < 0)
                        ret = 0;
                    if (ret == 0) {
                        goto rollback;
                    }
                }
            }
            if (ret == 1) {
                if (exec->state->nbTrans > exec->transno + 1) {
                    xmlFARegExecSave(exec);
                }
                if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Increasing count %d\n", trans->counter);
#endif
                    exec->counts[trans->counter]++;
                }
#ifdef DEBUG_REGEXP_EXEC
                printf("entering state %d\n", trans->to);
#endif
                exec->state = exec->comp->states[trans->to];
                exec->transno = 0;
                if (trans->atom != NULL) {
                    exec->index += len;
                }
                goto progress;
            } else if (ret < 0) {
                exec->status = -4;
                break;
            }
        }
        if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
            /*
             * Failed to find a way out
             */
            exec->determinist = 0;
            xmlFARegExecRollBack(exec);
        }
progress:
        continue;
    }
    /*
     * The function is declared as returning int: report the outcome the
     * same way xmlRegExecPushStringInternal() does instead of falling
     * off the end.
     */
    if (exec->status == 0)
        return(exec->state->type == XML_REGEXP_FINAL_STATE);
    return(exec->status);
}
#endif
4335/************************************************************************
4336 * *
William M. Brackddf71d62004-05-06 04:17:26 +00004337 * Parser for the Schemas Datatype Regular Expressions *
Daniel Veillard4255d502002-04-16 15:50:10 +00004338 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4339 * *
4340 ************************************************************************/
4341
4342/**
4343 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00004344 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004345 *
4346 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4347 */
4348static int
4349xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4350 int cur;
4351 int len;
4352
4353 cur = CUR_SCHAR(ctxt->cur, len);
4354 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4355 (cur == '*') || (cur == '+') || (cur == '(') ||
4356 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4357 (cur == 0x5D) || (cur == 0))
4358 return(-1);
4359 return(cur);
4360}
4361
4362/**
4363 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00004364 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004365 *
4366 * [27] charProp ::= IsCategory | IsBlock
4367 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4368 * Separators | Symbols | Others
4369 * [29] Letters ::= 'L' [ultmo]?
4370 * [30] Marks ::= 'M' [nce]?
4371 * [31] Numbers ::= 'N' [dlo]?
4372 * [32] Punctuation ::= 'P' [cdseifo]?
4373 * [33] Separators ::= 'Z' [slp]?
4374 * [34] Symbols ::= 'S' [mcko]?
4375 * [35] Others ::= 'C' [cfon]?
4376 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4377 */
4378static void
4379xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4380 int cur;
William M. Brack779af002003-08-01 15:55:39 +00004381 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00004382 xmlChar *blockName = NULL;
4383
4384 cur = CUR;
4385 if (cur == 'L') {
4386 NEXT;
4387 cur = CUR;
4388 if (cur == 'u') {
4389 NEXT;
4390 type = XML_REGEXP_LETTER_UPPERCASE;
4391 } else if (cur == 'l') {
4392 NEXT;
4393 type = XML_REGEXP_LETTER_LOWERCASE;
4394 } else if (cur == 't') {
4395 NEXT;
4396 type = XML_REGEXP_LETTER_TITLECASE;
4397 } else if (cur == 'm') {
4398 NEXT;
4399 type = XML_REGEXP_LETTER_MODIFIER;
4400 } else if (cur == 'o') {
4401 NEXT;
4402 type = XML_REGEXP_LETTER_OTHERS;
4403 } else {
4404 type = XML_REGEXP_LETTER;
4405 }
4406 } else if (cur == 'M') {
4407 NEXT;
4408 cur = CUR;
4409 if (cur == 'n') {
4410 NEXT;
4411 /* nonspacing */
4412 type = XML_REGEXP_MARK_NONSPACING;
4413 } else if (cur == 'c') {
4414 NEXT;
4415 /* spacing combining */
4416 type = XML_REGEXP_MARK_SPACECOMBINING;
4417 } else if (cur == 'e') {
4418 NEXT;
4419 /* enclosing */
4420 type = XML_REGEXP_MARK_ENCLOSING;
4421 } else {
4422 /* all marks */
4423 type = XML_REGEXP_MARK;
4424 }
4425 } else if (cur == 'N') {
4426 NEXT;
4427 cur = CUR;
4428 if (cur == 'd') {
4429 NEXT;
4430 /* digital */
4431 type = XML_REGEXP_NUMBER_DECIMAL;
4432 } else if (cur == 'l') {
4433 NEXT;
4434 /* letter */
4435 type = XML_REGEXP_NUMBER_LETTER;
4436 } else if (cur == 'o') {
4437 NEXT;
4438 /* other */
4439 type = XML_REGEXP_NUMBER_OTHERS;
4440 } else {
4441 /* all numbers */
4442 type = XML_REGEXP_NUMBER;
4443 }
4444 } else if (cur == 'P') {
4445 NEXT;
4446 cur = CUR;
4447 if (cur == 'c') {
4448 NEXT;
4449 /* connector */
4450 type = XML_REGEXP_PUNCT_CONNECTOR;
4451 } else if (cur == 'd') {
4452 NEXT;
4453 /* dash */
4454 type = XML_REGEXP_PUNCT_DASH;
4455 } else if (cur == 's') {
4456 NEXT;
4457 /* open */
4458 type = XML_REGEXP_PUNCT_OPEN;
4459 } else if (cur == 'e') {
4460 NEXT;
4461 /* close */
4462 type = XML_REGEXP_PUNCT_CLOSE;
4463 } else if (cur == 'i') {
4464 NEXT;
4465 /* initial quote */
4466 type = XML_REGEXP_PUNCT_INITQUOTE;
4467 } else if (cur == 'f') {
4468 NEXT;
4469 /* final quote */
4470 type = XML_REGEXP_PUNCT_FINQUOTE;
4471 } else if (cur == 'o') {
4472 NEXT;
4473 /* other */
4474 type = XML_REGEXP_PUNCT_OTHERS;
4475 } else {
4476 /* all punctuation */
4477 type = XML_REGEXP_PUNCT;
4478 }
4479 } else if (cur == 'Z') {
4480 NEXT;
4481 cur = CUR;
4482 if (cur == 's') {
4483 NEXT;
4484 /* space */
4485 type = XML_REGEXP_SEPAR_SPACE;
4486 } else if (cur == 'l') {
4487 NEXT;
4488 /* line */
4489 type = XML_REGEXP_SEPAR_LINE;
4490 } else if (cur == 'p') {
4491 NEXT;
4492 /* paragraph */
4493 type = XML_REGEXP_SEPAR_PARA;
4494 } else {
4495 /* all separators */
4496 type = XML_REGEXP_SEPAR;
4497 }
4498 } else if (cur == 'S') {
4499 NEXT;
4500 cur = CUR;
4501 if (cur == 'm') {
4502 NEXT;
4503 type = XML_REGEXP_SYMBOL_MATH;
4504 /* math */
4505 } else if (cur == 'c') {
4506 NEXT;
4507 type = XML_REGEXP_SYMBOL_CURRENCY;
4508 /* currency */
4509 } else if (cur == 'k') {
4510 NEXT;
4511 type = XML_REGEXP_SYMBOL_MODIFIER;
4512 /* modifiers */
4513 } else if (cur == 'o') {
4514 NEXT;
4515 type = XML_REGEXP_SYMBOL_OTHERS;
4516 /* other */
4517 } else {
4518 /* all symbols */
4519 type = XML_REGEXP_SYMBOL;
4520 }
4521 } else if (cur == 'C') {
4522 NEXT;
4523 cur = CUR;
4524 if (cur == 'c') {
4525 NEXT;
4526 /* control */
4527 type = XML_REGEXP_OTHER_CONTROL;
4528 } else if (cur == 'f') {
4529 NEXT;
4530 /* format */
4531 type = XML_REGEXP_OTHER_FORMAT;
4532 } else if (cur == 'o') {
4533 NEXT;
4534 /* private use */
4535 type = XML_REGEXP_OTHER_PRIVATE;
4536 } else if (cur == 'n') {
4537 NEXT;
4538 /* not assigned */
4539 type = XML_REGEXP_OTHER_NA;
4540 } else {
4541 /* all others */
4542 type = XML_REGEXP_OTHER;
4543 }
4544 } else if (cur == 'I') {
4545 const xmlChar *start;
4546 NEXT;
4547 cur = CUR;
4548 if (cur != 's') {
4549 ERROR("IsXXXX expected");
4550 return;
4551 }
4552 NEXT;
4553 start = ctxt->cur;
4554 cur = CUR;
4555 if (((cur >= 'a') && (cur <= 'z')) ||
4556 ((cur >= 'A') && (cur <= 'Z')) ||
4557 ((cur >= '0') && (cur <= '9')) ||
4558 (cur == 0x2D)) {
4559 NEXT;
4560 cur = CUR;
4561 while (((cur >= 'a') && (cur <= 'z')) ||
4562 ((cur >= 'A') && (cur <= 'Z')) ||
4563 ((cur >= '0') && (cur <= '9')) ||
4564 (cur == 0x2D)) {
4565 NEXT;
4566 cur = CUR;
4567 }
4568 }
4569 type = XML_REGEXP_BLOCK_NAME;
4570 blockName = xmlStrndup(start, ctxt->cur - start);
4571 } else {
4572 ERROR("Unknown char property");
4573 return;
4574 }
4575 if (ctxt->atom == NULL) {
4576 ctxt->atom = xmlRegNewAtom(ctxt, type);
4577 if (ctxt->atom != NULL)
4578 ctxt->atom->valuep = blockName;
4579 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4580 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4581 type, 0, 0, blockName);
4582 }
4583}
4584
4585/**
4586 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00004587 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004588 *
4589 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4590 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4591 * [25] catEsc ::= '\p{' charProp '}'
4592 * [26] complEsc ::= '\P{' charProp '}'
4593 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4594 */
4595static void
4596xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4597 int cur;
4598
4599 if (CUR == '.') {
4600 if (ctxt->atom == NULL) {
4601 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4602 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4603 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4604 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4605 }
4606 NEXT;
4607 return;
4608 }
4609 if (CUR != '\\') {
4610 ERROR("Escaped sequence: expecting \\");
4611 return;
4612 }
4613 NEXT;
4614 cur = CUR;
4615 if (cur == 'p') {
4616 NEXT;
4617 if (CUR != '{') {
4618 ERROR("Expecting '{'");
4619 return;
4620 }
4621 NEXT;
4622 xmlFAParseCharProp(ctxt);
4623 if (CUR != '}') {
4624 ERROR("Expecting '}'");
4625 return;
4626 }
4627 NEXT;
4628 } else if (cur == 'P') {
4629 NEXT;
4630 if (CUR != '{') {
4631 ERROR("Expecting '{'");
4632 return;
4633 }
4634 NEXT;
4635 xmlFAParseCharProp(ctxt);
4636 ctxt->atom->neg = 1;
4637 if (CUR != '}') {
4638 ERROR("Expecting '}'");
4639 return;
4640 }
4641 NEXT;
4642 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4643 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4644 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4645 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4646 (cur == 0x5E)) {
4647 if (ctxt->atom == NULL) {
4648 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
Daniel Veillard99c394d2005-07-14 12:58:49 +00004649 if (ctxt->atom != NULL) {
4650 switch (cur) {
4651 case 'n':
4652 ctxt->atom->codepoint = '\n';
4653 break;
4654 case 'r':
4655 ctxt->atom->codepoint = '\r';
4656 break;
4657 case 't':
4658 ctxt->atom->codepoint = '\t';
4659 break;
4660 default:
4661 ctxt->atom->codepoint = cur;
4662 }
4663 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004664 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4665 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4666 XML_REGEXP_CHARVAL, cur, cur, NULL);
4667 }
4668 NEXT;
4669 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4670 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4671 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00004672 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00004673
4674 switch (cur) {
4675 case 's':
4676 type = XML_REGEXP_ANYSPACE;
4677 break;
4678 case 'S':
4679 type = XML_REGEXP_NOTSPACE;
4680 break;
4681 case 'i':
4682 type = XML_REGEXP_INITNAME;
4683 break;
4684 case 'I':
4685 type = XML_REGEXP_NOTINITNAME;
4686 break;
4687 case 'c':
4688 type = XML_REGEXP_NAMECHAR;
4689 break;
4690 case 'C':
4691 type = XML_REGEXP_NOTNAMECHAR;
4692 break;
4693 case 'd':
4694 type = XML_REGEXP_DECIMAL;
4695 break;
4696 case 'D':
4697 type = XML_REGEXP_NOTDECIMAL;
4698 break;
4699 case 'w':
4700 type = XML_REGEXP_REALCHAR;
4701 break;
4702 case 'W':
4703 type = XML_REGEXP_NOTREALCHAR;
4704 break;
4705 }
4706 NEXT;
4707 if (ctxt->atom == NULL) {
4708 ctxt->atom = xmlRegNewAtom(ctxt, type);
4709 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4710 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4711 type, 0, 0, NULL);
4712 }
4713 }
4714}
4715
4716/**
4717 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00004718 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004719 *
4720 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
4721 */
4722static int
4723xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
4724 int ret = 0, cur;
4725
4726 if ((CUR != '&') || (NXT(1) != '#'))
4727 return(-1);
4728 NEXT;
4729 NEXT;
4730 cur = CUR;
4731 if (cur == 'x') {
4732 NEXT;
4733 cur = CUR;
4734 if (((cur >= '0') && (cur <= '9')) ||
4735 ((cur >= 'a') && (cur <= 'f')) ||
4736 ((cur >= 'A') && (cur <= 'F'))) {
4737 while (((cur >= '0') && (cur <= '9')) ||
Daniel Veillard11ce4002006-03-10 00:36:23 +00004738 ((cur >= 'a') && (cur <= 'f')) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00004739 ((cur >= 'A') && (cur <= 'F'))) {
4740 if ((cur >= '0') && (cur <= '9'))
4741 ret = ret * 16 + cur - '0';
4742 else if ((cur >= 'a') && (cur <= 'f'))
4743 ret = ret * 16 + 10 + (cur - 'a');
4744 else
4745 ret = ret * 16 + 10 + (cur - 'A');
4746 NEXT;
4747 cur = CUR;
4748 }
4749 } else {
4750 ERROR("Char ref: expecting [0-9A-F]");
4751 return(-1);
4752 }
4753 } else {
4754 if ((cur >= '0') && (cur <= '9')) {
4755 while ((cur >= '0') && (cur <= '9')) {
4756 ret = ret * 10 + cur - '0';
4757 NEXT;
4758 cur = CUR;
4759 }
4760 } else {
4761 ERROR("Char ref: expecting [0-9]");
4762 return(-1);
4763 }
4764 }
4765 if (cur != ';') {
4766 ERROR("Char ref: expecting ';'");
4767 return(-1);
4768 } else {
4769 NEXT;
4770 }
4771 return(ret);
4772}
4773
4774/**
4775 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00004776 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004777 *
4778 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
4779 * [18] seRange ::= charOrEsc '-' charOrEsc
4780 * [20] charOrEsc ::= XmlChar | SingleCharEsc
4781 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
4782 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
4783 */
4784static void
4785xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
William M. Brackdc99df92003-12-27 01:54:25 +00004786 int cur, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00004787 int start = -1;
4788 int end = -1;
4789
4790 if ((CUR == '&') && (NXT(1) == '#')) {
4791 end = start = xmlFAParseCharRef(ctxt);
4792 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4793 XML_REGEXP_CHARVAL, start, end, NULL);
4794 return;
4795 }
4796 cur = CUR;
4797 if (cur == '\\') {
4798 NEXT;
4799 cur = CUR;
4800 switch (cur) {
4801 case 'n': start = 0xA; break;
4802 case 'r': start = 0xD; break;
4803 case 't': start = 0x9; break;
4804 case '\\': case '|': case '.': case '-': case '^': case '?':
4805 case '*': case '+': case '{': case '}': case '(': case ')':
4806 case '[': case ']':
4807 start = cur; break;
4808 default:
4809 ERROR("Invalid escape value");
4810 return;
4811 }
4812 end = start;
William M. Brackdc99df92003-12-27 01:54:25 +00004813 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004814 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00004815 end = start = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004816 } else {
4817 ERROR("Expecting a char range");
4818 return;
4819 }
William M. Brackdc99df92003-12-27 01:54:25 +00004820 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004821 if (start == '-') {
4822 return;
4823 }
4824 cur = CUR;
William M. Brack10f1ef42004-03-20 14:51:25 +00004825 if ((cur != '-') || (NXT(1) == ']')) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004826 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4827 XML_REGEXP_CHARVAL, start, end, NULL);
4828 return;
4829 }
4830 NEXT;
4831 cur = CUR;
4832 if (cur == '\\') {
4833 NEXT;
4834 cur = CUR;
4835 switch (cur) {
4836 case 'n': end = 0xA; break;
4837 case 'r': end = 0xD; break;
4838 case 't': end = 0x9; break;
4839 case '\\': case '|': case '.': case '-': case '^': case '?':
4840 case '*': case '+': case '{': case '}': case '(': case ')':
4841 case '[': case ']':
4842 end = cur; break;
4843 default:
4844 ERROR("Invalid escape value");
4845 return;
4846 }
William M. Brackdc99df92003-12-27 01:54:25 +00004847 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004848 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00004849 end = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004850 } else {
4851 ERROR("Expecting the end of a char range");
4852 return;
4853 }
William M. Brackdc99df92003-12-27 01:54:25 +00004854 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004855 /* TODO check that the values are acceptable character ranges for XML */
4856 if (end < start) {
4857 ERROR("End of range is before start of range");
4858 } else {
4859 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4860 XML_REGEXP_CHARVAL, start, end, NULL);
4861 }
4862 return;
4863}
4864
4865/**
4866 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00004867 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004868 *
4869 * [14] posCharGroup ::= ( charRange | charClassEsc )+
4870 */
4871static void
4872xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
4873 do {
4874 if ((CUR == '\\') || (CUR == '.')) {
4875 xmlFAParseCharClassEsc(ctxt);
4876 } else {
4877 xmlFAParseCharRange(ctxt);
4878 }
4879 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
4880 (ctxt->error == 0));
4881}
4882
4883/**
4884 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00004885 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004886 *
4887 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
4888 * [15] negCharGroup ::= '^' posCharGroup
4889 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
4890 * [12] charClassExpr ::= '[' charGroup ']'
4891 */
4892static void
4893xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
4894 int n = ctxt->neg;
4895 while ((CUR != ']') && (ctxt->error == 0)) {
4896 if (CUR == '^') {
4897 int neg = ctxt->neg;
4898
4899 NEXT;
4900 ctxt->neg = !ctxt->neg;
4901 xmlFAParsePosCharGroup(ctxt);
4902 ctxt->neg = neg;
William M. Brack10f1ef42004-03-20 14:51:25 +00004903 } else if ((CUR == '-') && (NXT(1) == '[')) {
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004904 int neg = ctxt->neg;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004905 ctxt->neg = 2;
William M. Brack10f1ef42004-03-20 14:51:25 +00004906 NEXT; /* eat the '-' */
4907 NEXT; /* eat the '[' */
Daniel Veillard4255d502002-04-16 15:50:10 +00004908 xmlFAParseCharGroup(ctxt);
4909 if (CUR == ']') {
4910 NEXT;
4911 } else {
4912 ERROR("charClassExpr: ']' expected");
4913 break;
4914 }
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004915 ctxt->neg = neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00004916 break;
4917 } else if (CUR != ']') {
4918 xmlFAParsePosCharGroup(ctxt);
4919 }
4920 }
4921 ctxt->neg = n;
4922}
4923
4924/**
4925 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00004926 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004927 *
4928 * [11] charClass ::= charClassEsc | charClassExpr
4929 * [12] charClassExpr ::= '[' charGroup ']'
4930 */
4931static void
4932xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
4933 if (CUR == '[') {
4934 NEXT;
4935 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
4936 if (ctxt->atom == NULL)
4937 return;
4938 xmlFAParseCharGroup(ctxt);
4939 if (CUR == ']') {
4940 NEXT;
4941 } else {
4942 ERROR("xmlFAParseCharClass: ']' expected");
4943 }
4944 } else {
4945 xmlFAParseCharClassEsc(ctxt);
4946 }
4947}
4948
4949/**
4950 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00004951 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004952 *
4953 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004954 *
4955 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00004956 */
4957static int
4958xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
4959 int ret = 0;
4960 int ok = 0;
4961
4962 while ((CUR >= '0') && (CUR <= '9')) {
4963 ret = ret * 10 + (CUR - '0');
4964 ok = 1;
4965 NEXT;
4966 }
4967 if (ok != 1) {
4968 return(-1);
4969 }
4970 return(ret);
4971}
4972
4973/**
4974 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00004975 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004976 *
4977 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
4978 * [5] quantity ::= quantRange | quantMin | QuantExact
4979 * [6] quantRange ::= QuantExact ',' QuantExact
4980 * [7] quantMin ::= QuantExact ','
4981 * [8] QuantExact ::= [0-9]+
4982 */
4983static int
4984xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
4985 int cur;
4986
4987 cur = CUR;
4988 if ((cur == '?') || (cur == '*') || (cur == '+')) {
4989 if (ctxt->atom != NULL) {
4990 if (cur == '?')
4991 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
4992 else if (cur == '*')
4993 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
4994 else if (cur == '+')
4995 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
4996 }
4997 NEXT;
4998 return(1);
4999 }
5000 if (cur == '{') {
5001 int min = 0, max = 0;
5002
5003 NEXT;
5004 cur = xmlFAParseQuantExact(ctxt);
5005 if (cur >= 0)
5006 min = cur;
5007 if (CUR == ',') {
5008 NEXT;
Daniel Veillardebe48c62003-12-03 12:12:27 +00005009 if (CUR == '}')
5010 max = INT_MAX;
5011 else {
5012 cur = xmlFAParseQuantExact(ctxt);
5013 if (cur >= 0)
5014 max = cur;
5015 else {
5016 ERROR("Improper quantifier");
5017 }
5018 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005019 }
5020 if (CUR == '}') {
5021 NEXT;
5022 } else {
5023 ERROR("Unterminated quantifier");
5024 }
5025 if (max == 0)
5026 max = min;
5027 if (ctxt->atom != NULL) {
5028 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5029 ctxt->atom->min = min;
5030 ctxt->atom->max = max;
5031 }
5032 return(1);
5033 }
5034 return(0);
5035}
5036
5037/**
5038 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00005039 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005040 *
5041 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5042 */
5043static int
5044xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5045 int codepoint, len;
5046
5047 codepoint = xmlFAIsChar(ctxt);
5048 if (codepoint > 0) {
5049 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5050 if (ctxt->atom == NULL)
5051 return(-1);
5052 codepoint = CUR_SCHAR(ctxt->cur, len);
5053 ctxt->atom->codepoint = codepoint;
5054 NEXTL(len);
5055 return(1);
5056 } else if (CUR == '|') {
5057 return(0);
5058 } else if (CUR == 0) {
5059 return(0);
5060 } else if (CUR == ')') {
5061 return(0);
5062 } else if (CUR == '(') {
5063 xmlRegStatePtr start, oldend;
5064
5065 NEXT;
5066 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5067 start = ctxt->state;
5068 oldend = ctxt->end;
5069 ctxt->end = NULL;
5070 ctxt->atom = NULL;
5071 xmlFAParseRegExp(ctxt, 0);
5072 if (CUR == ')') {
5073 NEXT;
5074 } else {
5075 ERROR("xmlFAParseAtom: expecting ')'");
5076 }
5077 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5078 if (ctxt->atom == NULL)
5079 return(-1);
5080 ctxt->atom->start = start;
5081 ctxt->atom->stop = ctxt->state;
5082 ctxt->end = oldend;
5083 return(1);
5084 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5085 xmlFAParseCharClass(ctxt);
5086 return(1);
5087 }
5088 return(0);
5089}
5090
5091/**
5092 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00005093 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005094 *
5095 * [3] piece ::= atom quantifier?
5096 */
5097static int
5098xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5099 int ret;
5100
5101 ctxt->atom = NULL;
5102 ret = xmlFAParseAtom(ctxt);
5103 if (ret == 0)
5104 return(0);
5105 if (ctxt->atom == NULL) {
5106 ERROR("internal: no atom generated");
5107 }
5108 xmlFAParseQuantifier(ctxt);
5109 return(1);
5110}
5111
5112/**
5113 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00005114 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005115 *
5116 * [2] branch ::= piece*
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005117 8
Daniel Veillard4255d502002-04-16 15:50:10 +00005118 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005119static int
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005120xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005121 xmlRegStatePtr previous;
Daniel Veillard4255d502002-04-16 15:50:10 +00005122 int ret;
5123
5124 previous = ctxt->state;
5125 ret = xmlFAParsePiece(ctxt);
5126 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005127 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0)
5128 return(-1);
5129 previous = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00005130 ctxt->atom = NULL;
5131 }
5132 while ((ret != 0) && (ctxt->error == 0)) {
5133 ret = xmlFAParsePiece(ctxt);
5134 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005135 if (xmlFAGenerateTransitions(ctxt, previous, NULL,
5136 ctxt->atom) < 0)
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005137 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00005138 previous = ctxt->state;
5139 ctxt->atom = NULL;
5140 }
5141 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005142 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00005143}
5144
5145/**
5146 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00005147 * @ctxt: a regexp parser context
William M. Brackddf71d62004-05-06 04:17:26 +00005148 * @top: is this the top-level expression ?
Daniel Veillard4255d502002-04-16 15:50:10 +00005149 *
5150 * [1] regExp ::= branch ( '|' branch )*
5151 */
5152static void
5153xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
Daniel Veillardc7e3cc42004-09-28 12:33:52 +00005154 xmlRegStatePtr start, end;
Daniel Veillard4255d502002-04-16 15:50:10 +00005155
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005156 /* if not top start should have been generated by an epsilon trans */
Daniel Veillard4255d502002-04-16 15:50:10 +00005157 start = ctxt->state;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005158 ctxt->end = NULL;
5159 xmlFAParseBranch(ctxt);
5160 if (top) {
5161#ifdef DEBUG_REGEXP_GRAPH
5162 printf("State %d is final\n", ctxt->state->no);
5163#endif
5164 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5165 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005166 if (CUR != '|') {
5167 ctxt->end = ctxt->state;
5168 return;
5169 }
5170 end = ctxt->state;
5171 while ((CUR == '|') && (ctxt->error == 0)) {
5172 NEXT;
5173 ctxt->state = start;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005174 ctxt->end = NULL;
5175 xmlFAParseBranch(ctxt);
5176 if (top) {
5177 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5178#ifdef DEBUG_REGEXP_GRAPH
5179 printf("State %d is final\n", ctxt->state->no);
5180#endif
5181 } else {
5182 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end);
5183 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005184 }
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005185 if (!top) {
5186 ctxt->state = end;
5187 ctxt->end = end;
5188 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005189}
5190
5191/************************************************************************
5192 * *
5193 * The basic API *
5194 * *
5195 ************************************************************************/
5196
5197/**
5198 * xmlRegexpPrint:
5199 * @output: the file for the output debug
5200 * @regexp: the compiled regexp
5201 *
5202 * Print the content of the compiled regular expression
5203 */
5204void
5205xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5206 int i;
5207
Daniel Veillarda82b1822004-11-08 16:24:57 +00005208 if (output == NULL)
5209 return;
Daniel Veillard4255d502002-04-16 15:50:10 +00005210 fprintf(output, " regexp: ");
5211 if (regexp == NULL) {
5212 fprintf(output, "NULL\n");
5213 return;
5214 }
5215 fprintf(output, "'%s' ", regexp->string);
5216 fprintf(output, "\n");
5217 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5218 for (i = 0;i < regexp->nbAtoms; i++) {
5219 fprintf(output, " %02d ", i);
5220 xmlRegPrintAtom(output, regexp->atoms[i]);
5221 }
5222 fprintf(output, "%d states:", regexp->nbStates);
5223 fprintf(output, "\n");
5224 for (i = 0;i < regexp->nbStates; i++) {
5225 xmlRegPrintState(output, regexp->states[i]);
5226 }
5227 fprintf(output, "%d counters:\n", regexp->nbCounters);
5228 for (i = 0;i < regexp->nbCounters; i++) {
5229 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5230 regexp->counters[i].max);
5231 }
5232}
5233
5234/**
5235 * xmlRegexpCompile:
5236 * @regexp: a regular expression string
5237 *
5238 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
William M. Brackddf71d62004-05-06 04:17:26 +00005239 * Appendix F and builds an automata suitable for testing strings against
Daniel Veillard4255d502002-04-16 15:50:10 +00005240 * that regular expression
5241 *
5242 * Returns the compiled expression or NULL in case of error
5243 */
5244xmlRegexpPtr
5245xmlRegexpCompile(const xmlChar *regexp) {
5246 xmlRegexpPtr ret;
5247 xmlRegParserCtxtPtr ctxt;
5248
5249 ctxt = xmlRegNewParserCtxt(regexp);
5250 if (ctxt == NULL)
5251 return(NULL);
5252
5253 /* initialize the parser */
5254 ctxt->end = NULL;
5255 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5256 xmlRegStatePush(ctxt, ctxt->start);
5257
5258 /* parse the expression building an automata */
5259 xmlFAParseRegExp(ctxt, 1);
5260 if (CUR != 0) {
5261 ERROR("xmlFAParseRegExp: extra characters");
5262 }
5263 ctxt->end = ctxt->state;
5264 ctxt->start->type = XML_REGEXP_START_STATE;
5265 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5266
5267 /* remove the Epsilon except for counted transitions */
5268 xmlFAEliminateEpsilonTransitions(ctxt);
5269
5270
5271 if (ctxt->error != 0) {
5272 xmlRegFreeParserCtxt(ctxt);
5273 return(NULL);
5274 }
5275 ret = xmlRegEpxFromParse(ctxt);
5276 xmlRegFreeParserCtxt(ctxt);
5277 return(ret);
5278}
5279
5280/**
5281 * xmlRegexpExec:
5282 * @comp: the compiled regular expression
5283 * @content: the value to check against the regular expression
5284 *
William M. Brackddf71d62004-05-06 04:17:26 +00005285 * Check if the regular expression generates the value
Daniel Veillard4255d502002-04-16 15:50:10 +00005286 *
William M. Brackddf71d62004-05-06 04:17:26 +00005287 * Returns 1 if it matches, 0 if not and a negative value in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00005288 */
5289int
5290xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5291 if ((comp == NULL) || (content == NULL))
5292 return(-1);
5293 return(xmlFARegExec(comp, content));
5294}
5295
5296/**
Daniel Veillard23e73572002-09-19 19:56:43 +00005297 * xmlRegexpIsDeterminist:
5298 * @comp: the compiled regular expression
5299 *
5300 * Check if the regular expression is determinist
5301 *
William M. Brackddf71d62004-05-06 04:17:26 +00005302 * Returns 1 if it yes, 0 if not and a negative value in case of error
Daniel Veillard23e73572002-09-19 19:56:43 +00005303 */
5304int
5305xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5306 xmlAutomataPtr am;
5307 int ret;
5308
5309 if (comp == NULL)
5310 return(-1);
5311 if (comp->determinist != -1)
5312 return(comp->determinist);
5313
5314 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00005315 if (am->states != NULL) {
5316 int i;
5317
5318 for (i = 0;i < am->nbStates;i++)
5319 xmlRegFreeState(am->states[i]);
5320 xmlFree(am->states);
5321 }
Daniel Veillard23e73572002-09-19 19:56:43 +00005322 am->nbAtoms = comp->nbAtoms;
5323 am->atoms = comp->atoms;
5324 am->nbStates = comp->nbStates;
5325 am->states = comp->states;
5326 am->determinist = -1;
5327 ret = xmlFAComputesDeterminism(am);
5328 am->atoms = NULL;
5329 am->states = NULL;
5330 xmlFreeAutomata(am);
5331 return(ret);
5332}
5333
5334/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005335 * xmlRegFreeRegexp:
5336 * @regexp: the regexp
5337 *
5338 * Free a regexp
5339 */
5340void
5341xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5342 int i;
5343 if (regexp == NULL)
5344 return;
5345
5346 if (regexp->string != NULL)
5347 xmlFree(regexp->string);
5348 if (regexp->states != NULL) {
5349 for (i = 0;i < regexp->nbStates;i++)
5350 xmlRegFreeState(regexp->states[i]);
5351 xmlFree(regexp->states);
5352 }
5353 if (regexp->atoms != NULL) {
5354 for (i = 0;i < regexp->nbAtoms;i++)
5355 xmlRegFreeAtom(regexp->atoms[i]);
5356 xmlFree(regexp->atoms);
5357 }
5358 if (regexp->counters != NULL)
5359 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00005360 if (regexp->compact != NULL)
5361 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00005362 if (regexp->transdata != NULL)
5363 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00005364 if (regexp->stringMap != NULL) {
5365 for (i = 0; i < regexp->nbstrings;i++)
5366 xmlFree(regexp->stringMap[i]);
5367 xmlFree(regexp->stringMap);
5368 }
5369
Daniel Veillard4255d502002-04-16 15:50:10 +00005370 xmlFree(regexp);
5371}
5372
5373#ifdef LIBXML_AUTOMATA_ENABLED
5374/************************************************************************
5375 * *
5376 * The Automata interface *
5377 * *
5378 ************************************************************************/
5379
5380/**
5381 * xmlNewAutomata:
5382 *
5383 * Create a new automata
5384 *
5385 * Returns the new object or NULL in case of failure
5386 */
5387xmlAutomataPtr
5388xmlNewAutomata(void) {
5389 xmlAutomataPtr ctxt;
5390
5391 ctxt = xmlRegNewParserCtxt(NULL);
5392 if (ctxt == NULL)
5393 return(NULL);
5394
5395 /* initialize the parser */
5396 ctxt->end = NULL;
5397 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005398 if (ctxt->start == NULL) {
5399 xmlFreeAutomata(ctxt);
5400 return(NULL);
5401 }
Daniel Veillardd0271472006-01-02 10:22:02 +00005402 ctxt->start->type = XML_REGEXP_START_STATE;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005403 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5404 xmlRegFreeState(ctxt->start);
5405 xmlFreeAutomata(ctxt);
5406 return(NULL);
5407 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005408
5409 return(ctxt);
5410}
5411
5412/**
5413 * xmlFreeAutomata:
5414 * @am: an automata
5415 *
5416 * Free an automata
5417 */
5418void
5419xmlFreeAutomata(xmlAutomataPtr am) {
5420 if (am == NULL)
5421 return;
5422 xmlRegFreeParserCtxt(am);
5423}
5424
5425/**
5426 * xmlAutomataGetInitState:
5427 * @am: an automata
5428 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005429 * Initial state lookup
5430 *
Daniel Veillard4255d502002-04-16 15:50:10 +00005431 * Returns the initial state of the automata
5432 */
5433xmlAutomataStatePtr
5434xmlAutomataGetInitState(xmlAutomataPtr am) {
5435 if (am == NULL)
5436 return(NULL);
5437 return(am->start);
5438}
5439
5440/**
5441 * xmlAutomataSetFinalState:
5442 * @am: an automata
5443 * @state: a state in this automata
5444 *
5445 * Makes that state a final state
5446 *
5447 * Returns 0 or -1 in case of error
5448 */
5449int
5450xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5451 if ((am == NULL) || (state == NULL))
5452 return(-1);
5453 state->type = XML_REGEXP_FINAL_STATE;
5454 return(0);
5455}
5456
5457/**
5458 * xmlAutomataNewTransition:
5459 * @am: an automata
5460 * @from: the starting point of the transition
5461 * @to: the target point of the transition or NULL
5462 * @token: the input string associated to that transition
5463 * @data: data passed to the callback function if the transition is activated
5464 *
William M. Brackddf71d62004-05-06 04:17:26 +00005465 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00005466 * and then adds a transition from the @from state to the target state
5467 * activated by the value of @token
5468 *
5469 * Returns the target state or NULL in case of error
5470 */
5471xmlAutomataStatePtr
5472xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5473 xmlAutomataStatePtr to, const xmlChar *token,
5474 void *data) {
5475 xmlRegAtomPtr atom;
5476
5477 if ((am == NULL) || (from == NULL) || (token == NULL))
5478 return(NULL);
5479 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005480 if (atom == NULL)
5481 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00005482 atom->data = data;
5483 if (atom == NULL)
5484 return(NULL);
5485 atom->valuep = xmlStrdup(token);
5486
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005487 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5488 xmlRegFreeAtom(atom);
5489 return(NULL);
5490 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005491 if (to == NULL)
5492 return(am->state);
5493 return(to);
5494}
5495
5496/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00005497 * xmlAutomataNewTransition2:
5498 * @am: an automata
5499 * @from: the starting point of the transition
5500 * @to: the target point of the transition or NULL
5501 * @token: the first input string associated to that transition
5502 * @token2: the second input string associated to that transition
5503 * @data: data passed to the callback function if the transition is activated
5504 *
William M. Brackddf71d62004-05-06 04:17:26 +00005505 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard52b48c72003-04-13 19:53:42 +00005506 * and then adds a transition from the @from state to the target state
5507 * activated by the value of @token
5508 *
5509 * Returns the target state or NULL in case of error
5510 */
5511xmlAutomataStatePtr
5512xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5513 xmlAutomataStatePtr to, const xmlChar *token,
5514 const xmlChar *token2, void *data) {
5515 xmlRegAtomPtr atom;
5516
5517 if ((am == NULL) || (from == NULL) || (token == NULL))
5518 return(NULL);
5519 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005520 if (atom == NULL)
5521 return(NULL);
Daniel Veillard11ce4002006-03-10 00:36:23 +00005522 atom->data = data;
Daniel Veillard52b48c72003-04-13 19:53:42 +00005523 if ((token2 == NULL) || (*token2 == 0)) {
5524 atom->valuep = xmlStrdup(token);
5525 } else {
5526 int lenn, lenp;
5527 xmlChar *str;
5528
5529 lenn = strlen((char *) token2);
5530 lenp = strlen((char *) token);
5531
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005532 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005533 if (str == NULL) {
5534 xmlRegFreeAtom(atom);
5535 return(NULL);
5536 }
5537 memcpy(&str[0], token, lenp);
5538 str[lenp] = '|';
5539 memcpy(&str[lenp + 1], token2, lenn);
5540 str[lenn + lenp + 1] = 0;
5541
5542 atom->valuep = str;
5543 }
5544
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005545 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5546 xmlRegFreeAtom(atom);
5547 return(NULL);
5548 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00005549 if (to == NULL)
5550 return(am->state);
5551 return(to);
5552}
5553
5554/**
Daniel Veillard9efc4762005-07-19 14:33:55 +00005555 * xmlAutomataNewNegTrans:
5556 * @am: an automata
5557 * @from: the starting point of the transition
5558 * @to: the target point of the transition or NULL
5559 * @token: the first input string associated to that transition
5560 * @token2: the second input string associated to that transition
5561 * @data: data passed to the callback function if the transition is activated
5562 *
5563 * If @to is NULL, this creates first a new target state in the automata
5564 * and then adds a transition from the @from state to the target state
5565 * activated by any value except (@token,@token2)
Daniel Veillard6e65e152005-08-09 11:09:52 +00005566 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5567 # the semantic of XSD ##other
Daniel Veillard9efc4762005-07-19 14:33:55 +00005568 *
5569 * Returns the target state or NULL in case of error
5570 */
5571xmlAutomataStatePtr
5572xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5573 xmlAutomataStatePtr to, const xmlChar *token,
5574 const xmlChar *token2, void *data) {
5575 xmlRegAtomPtr atom;
Daniel Veillard77005e62005-07-19 16:26:18 +00005576 xmlChar err_msg[200];
Daniel Veillard9efc4762005-07-19 14:33:55 +00005577
5578 if ((am == NULL) || (from == NULL) || (token == NULL))
5579 return(NULL);
5580 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5581 if (atom == NULL)
5582 return(NULL);
5583 atom->data = data;
5584 atom->neg = 1;
5585 if ((token2 == NULL) || (*token2 == 0)) {
5586 atom->valuep = xmlStrdup(token);
5587 } else {
5588 int lenn, lenp;
5589 xmlChar *str;
5590
5591 lenn = strlen((char *) token2);
5592 lenp = strlen((char *) token);
5593
5594 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5595 if (str == NULL) {
5596 xmlRegFreeAtom(atom);
5597 return(NULL);
5598 }
5599 memcpy(&str[0], token, lenp);
5600 str[lenp] = '|';
5601 memcpy(&str[lenp + 1], token2, lenn);
5602 str[lenn + lenp + 1] = 0;
5603
5604 atom->valuep = str;
5605 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00005606 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +00005607 err_msg[199] = 0;
5608 atom->valuep2 = xmlStrdup(err_msg);
Daniel Veillard9efc4762005-07-19 14:33:55 +00005609
5610 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5611 xmlRegFreeAtom(atom);
5612 return(NULL);
5613 }
Daniel Veillard6e65e152005-08-09 11:09:52 +00005614 am->negs++;
Daniel Veillard9efc4762005-07-19 14:33:55 +00005615 if (to == NULL)
5616 return(am->state);
5617 return(to);
5618}
5619
5620/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005621 * xmlAutomataNewCountTrans2:
5622 * @am: an automata
5623 * @from: the starting point of the transition
5624 * @to: the target point of the transition or NULL
5625 * @token: the input string associated to that transition
5626 * @token2: the second input string associated to that transition
5627 * @min: the minimum successive occurences of token
5628 * @max: the maximum successive occurences of token
5629 * @data: data associated to the transition
5630 *
5631 * If @to is NULL, this creates first a new target state in the automata
5632 * and then adds a transition from the @from state to the target state
5633 * activated by a succession of input of value @token and @token2 and
5634 * whose number is between @min and @max
5635 *
5636 * Returns the target state or NULL in case of error
5637 */
5638xmlAutomataStatePtr
5639xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5640 xmlAutomataStatePtr to, const xmlChar *token,
5641 const xmlChar *token2,
5642 int min, int max, void *data) {
5643 xmlRegAtomPtr atom;
5644 int counter;
5645
5646 if ((am == NULL) || (from == NULL) || (token == NULL))
5647 return(NULL);
5648 if (min < 0)
5649 return(NULL);
5650 if ((max < min) || (max < 1))
5651 return(NULL);
5652 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5653 if (atom == NULL)
5654 return(NULL);
5655 if ((token2 == NULL) || (*token2 == 0)) {
5656 atom->valuep = xmlStrdup(token);
5657 } else {
5658 int lenn, lenp;
5659 xmlChar *str;
5660
5661 lenn = strlen((char *) token2);
5662 lenp = strlen((char *) token);
5663
5664 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5665 if (str == NULL) {
5666 xmlRegFreeAtom(atom);
5667 return(NULL);
5668 }
5669 memcpy(&str[0], token, lenp);
5670 str[lenp] = '|';
5671 memcpy(&str[lenp + 1], token2, lenn);
5672 str[lenn + lenp + 1] = 0;
5673
5674 atom->valuep = str;
5675 }
5676 atom->data = data;
5677 if (min == 0)
5678 atom->min = 1;
5679 else
5680 atom->min = min;
5681 atom->max = max;
5682
5683 /*
5684 * associate a counter to the transition.
5685 */
5686 counter = xmlRegGetCounter(am);
5687 am->counters[counter].min = min;
5688 am->counters[counter].max = max;
5689
5690 /* xmlFAGenerateTransitions(am, from, to, atom); */
5691 if (to == NULL) {
5692 to = xmlRegNewState(am);
5693 xmlRegStatePush(am, to);
5694 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005695 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005696 xmlRegAtomPush(am, atom);
5697 am->state = to;
5698
5699 if (to == NULL)
5700 to = am->state;
5701 if (to == NULL)
5702 return(NULL);
5703 if (min == 0)
5704 xmlFAGenerateEpsilonTransition(am, from, to);
5705 return(to);
5706}
5707
5708/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005709 * xmlAutomataNewCountTrans:
5710 * @am: an automata
5711 * @from: the starting point of the transition
5712 * @to: the target point of the transition or NULL
5713 * @token: the input string associated to that transition
5714 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005715 * @max: the maximum successive occurences of token
5716 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00005717 *
William M. Brackddf71d62004-05-06 04:17:26 +00005718 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00005719 * and then adds a transition from the @from state to the target state
5720 * activated by a succession of input of value @token and whose number
5721 * is between @min and @max
5722 *
5723 * Returns the target state or NULL in case of error
5724 */
5725xmlAutomataStatePtr
5726xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5727 xmlAutomataStatePtr to, const xmlChar *token,
5728 int min, int max, void *data) {
5729 xmlRegAtomPtr atom;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00005730 int counter;
Daniel Veillard4255d502002-04-16 15:50:10 +00005731
5732 if ((am == NULL) || (from == NULL) || (token == NULL))
5733 return(NULL);
5734 if (min < 0)
5735 return(NULL);
5736 if ((max < min) || (max < 1))
5737 return(NULL);
5738 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5739 if (atom == NULL)
5740 return(NULL);
5741 atom->valuep = xmlStrdup(token);
5742 atom->data = data;
5743 if (min == 0)
5744 atom->min = 1;
5745 else
5746 atom->min = min;
5747 atom->max = max;
5748
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00005749 /*
5750 * associate a counter to the transition.
5751 */
5752 counter = xmlRegGetCounter(am);
5753 am->counters[counter].min = min;
5754 am->counters[counter].max = max;
5755
5756 /* xmlFAGenerateTransitions(am, from, to, atom); */
5757 if (to == NULL) {
5758 to = xmlRegNewState(am);
5759 xmlRegStatePush(am, to);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005760 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005761 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00005762 xmlRegAtomPush(am, atom);
5763 am->state = to;
5764
Daniel Veillard4255d502002-04-16 15:50:10 +00005765 if (to == NULL)
5766 to = am->state;
5767 if (to == NULL)
5768 return(NULL);
5769 if (min == 0)
5770 xmlFAGenerateEpsilonTransition(am, from, to);
5771 return(to);
5772}
5773
5774/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005775 * xmlAutomataNewOnceTrans2:
5776 * @am: an automata
5777 * @from: the starting point of the transition
5778 * @to: the target point of the transition or NULL
5779 * @token: the input string associated to that transition
5780 * @token2: the second input string associated to that transition
5781 * @min: the minimum successive occurences of token
5782 * @max: the maximum successive occurences of token
5783 * @data: data associated to the transition
5784 *
5785 * If @to is NULL, this creates first a new target state in the automata
5786 * and then adds a transition from the @from state to the target state
5787 * activated by a succession of input of value @token and @token2 and whose
5788 * number is between @min and @max, moreover that transition can only be
5789 * crossed once.
5790 *
5791 * Returns the target state or NULL in case of error
5792 */
5793xmlAutomataStatePtr
5794xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5795 xmlAutomataStatePtr to, const xmlChar *token,
5796 const xmlChar *token2,
5797 int min, int max, void *data) {
5798 xmlRegAtomPtr atom;
5799 int counter;
5800
5801 if ((am == NULL) || (from == NULL) || (token == NULL))
5802 return(NULL);
5803 if (min < 1)
5804 return(NULL);
5805 if ((max < min) || (max < 1))
5806 return(NULL);
5807 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5808 if (atom == NULL)
5809 return(NULL);
5810 if ((token2 == NULL) || (*token2 == 0)) {
5811 atom->valuep = xmlStrdup(token);
5812 } else {
5813 int lenn, lenp;
5814 xmlChar *str;
5815
5816 lenn = strlen((char *) token2);
5817 lenp = strlen((char *) token);
5818
5819 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5820 if (str == NULL) {
5821 xmlRegFreeAtom(atom);
5822 return(NULL);
5823 }
5824 memcpy(&str[0], token, lenp);
5825 str[lenp] = '|';
5826 memcpy(&str[lenp + 1], token2, lenn);
5827 str[lenn + lenp + 1] = 0;
5828
5829 atom->valuep = str;
5830 }
5831 atom->data = data;
5832 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
Daniel Veillard11ce4002006-03-10 00:36:23 +00005833 atom->min = min;
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005834 atom->max = max;
5835 /*
5836 * associate a counter to the transition.
5837 */
5838 counter = xmlRegGetCounter(am);
5839 am->counters[counter].min = 1;
5840 am->counters[counter].max = 1;
5841
5842 /* xmlFAGenerateTransitions(am, from, to, atom); */
5843 if (to == NULL) {
5844 to = xmlRegNewState(am);
5845 xmlRegStatePush(am, to);
5846 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005847 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005848 xmlRegAtomPush(am, atom);
5849 am->state = to;
5850 return(to);
5851}
5852
5853
5854
5855/**
Daniel Veillard7646b182002-04-20 06:41:40 +00005856 * xmlAutomataNewOnceTrans:
5857 * @am: an automata
5858 * @from: the starting point of the transition
5859 * @to: the target point of the transition or NULL
5860 * @token: the input string associated to that transition
5861 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005862 * @max: the maximum successive occurences of token
5863 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00005864 *
William M. Brackddf71d62004-05-06 04:17:26 +00005865 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00005866 * and then adds a transition from the @from state to the target state
5867 * activated by a succession of input of value @token and whose number
William M. Brackddf71d62004-05-06 04:17:26 +00005868 * is between @min and @max, moreover that transition can only be crossed
Daniel Veillard7646b182002-04-20 06:41:40 +00005869 * once.
5870 *
5871 * Returns the target state or NULL in case of error
5872 */
5873xmlAutomataStatePtr
5874xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5875 xmlAutomataStatePtr to, const xmlChar *token,
5876 int min, int max, void *data) {
5877 xmlRegAtomPtr atom;
5878 int counter;
5879
5880 if ((am == NULL) || (from == NULL) || (token == NULL))
5881 return(NULL);
5882 if (min < 1)
5883 return(NULL);
5884 if ((max < min) || (max < 1))
5885 return(NULL);
5886 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5887 if (atom == NULL)
5888 return(NULL);
5889 atom->valuep = xmlStrdup(token);
5890 atom->data = data;
5891 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
Daniel Veillard11ce4002006-03-10 00:36:23 +00005892 atom->min = min;
Daniel Veillard7646b182002-04-20 06:41:40 +00005893 atom->max = max;
5894 /*
5895 * associate a counter to the transition.
5896 */
5897 counter = xmlRegGetCounter(am);
5898 am->counters[counter].min = 1;
5899 am->counters[counter].max = 1;
5900
5901 /* xmlFAGenerateTransitions(am, from, to, atom); */
5902 if (to == NULL) {
5903 to = xmlRegNewState(am);
5904 xmlRegStatePush(am, to);
5905 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005906 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard7646b182002-04-20 06:41:40 +00005907 xmlRegAtomPush(am, atom);
5908 am->state = to;
Daniel Veillard7646b182002-04-20 06:41:40 +00005909 return(to);
5910}
5911
5912/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005913 * xmlAutomataNewState:
5914 * @am: an automata
5915 *
5916 * Create a new disconnected state in the automata
5917 *
5918 * Returns the new state or NULL in case of error
5919 */
5920xmlAutomataStatePtr
5921xmlAutomataNewState(xmlAutomataPtr am) {
5922 xmlAutomataStatePtr to;
5923
5924 if (am == NULL)
5925 return(NULL);
5926 to = xmlRegNewState(am);
5927 xmlRegStatePush(am, to);
5928 return(to);
5929}
5930
5931/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005932 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00005933 * @am: an automata
5934 * @from: the starting point of the transition
5935 * @to: the target point of the transition or NULL
5936 *
William M. Brackddf71d62004-05-06 04:17:26 +00005937 * If @to is NULL, this creates first a new target state in the automata
5938 * and then adds an epsilon transition from the @from state to the
Daniel Veillard4255d502002-04-16 15:50:10 +00005939 * target state
5940 *
5941 * Returns the target state or NULL in case of error
5942 */
5943xmlAutomataStatePtr
5944xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
5945 xmlAutomataStatePtr to) {
5946 if ((am == NULL) || (from == NULL))
5947 return(NULL);
5948 xmlFAGenerateEpsilonTransition(am, from, to);
5949 if (to == NULL)
5950 return(am->state);
5951 return(to);
5952}
5953
Daniel Veillardb509f152002-04-17 16:28:10 +00005954/**
Daniel Veillard7646b182002-04-20 06:41:40 +00005955 * xmlAutomataNewAllTrans:
5956 * @am: an automata
5957 * @from: the starting point of the transition
5958 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005959 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00005960 *
William M. Brackddf71d62004-05-06 04:17:26 +00005961 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00005962 * and then adds a an ALL transition from the @from state to the
5963 * target state. That transition is an epsilon transition allowed only when
5964 * all transitions from the @from node have been activated.
5965 *
5966 * Returns the target state or NULL in case of error
5967 */
5968xmlAutomataStatePtr
5969xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00005970 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00005971 if ((am == NULL) || (from == NULL))
5972 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00005973 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00005974 if (to == NULL)
5975 return(am->state);
5976 return(to);
5977}
5978
5979/**
Daniel Veillardb509f152002-04-17 16:28:10 +00005980 * xmlAutomataNewCounter:
5981 * @am: an automata
5982 * @min: the minimal value on the counter
5983 * @max: the maximal value on the counter
5984 *
5985 * Create a new counter
5986 *
5987 * Returns the counter number or -1 in case of error
5988 */
5989int
5990xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
5991 int ret;
5992
5993 if (am == NULL)
5994 return(-1);
5995
5996 ret = xmlRegGetCounter(am);
5997 if (ret < 0)
5998 return(-1);
5999 am->counters[ret].min = min;
6000 am->counters[ret].max = max;
6001 return(ret);
6002}
6003
6004/**
6005 * xmlAutomataNewCountedTrans:
6006 * @am: an automata
6007 * @from: the starting point of the transition
6008 * @to: the target point of the transition or NULL
6009 * @counter: the counter associated to that transition
6010 *
William M. Brackddf71d62004-05-06 04:17:26 +00006011 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006012 * and then adds an epsilon transition from the @from state to the target state
6013 * which will increment the counter provided
6014 *
6015 * Returns the target state or NULL in case of error
6016 */
6017xmlAutomataStatePtr
6018xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6019 xmlAutomataStatePtr to, int counter) {
6020 if ((am == NULL) || (from == NULL) || (counter < 0))
6021 return(NULL);
6022 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6023 if (to == NULL)
6024 return(am->state);
6025 return(to);
6026}
6027
6028/**
6029 * xmlAutomataNewCounterTrans:
6030 * @am: an automata
6031 * @from: the starting point of the transition
6032 * @to: the target point of the transition or NULL
6033 * @counter: the counter associated to that transition
6034 *
William M. Brackddf71d62004-05-06 04:17:26 +00006035 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006036 * and then adds an epsilon transition from the @from state to the target state
6037 * which will be allowed only if the counter is within the right range.
6038 *
6039 * Returns the target state or NULL in case of error
6040 */
6041xmlAutomataStatePtr
6042xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6043 xmlAutomataStatePtr to, int counter) {
6044 if ((am == NULL) || (from == NULL) || (counter < 0))
6045 return(NULL);
6046 xmlFAGenerateCountedTransition(am, from, to, counter);
6047 if (to == NULL)
6048 return(am->state);
6049 return(to);
6050}
Daniel Veillard4255d502002-04-16 15:50:10 +00006051
6052/**
6053 * xmlAutomataCompile:
6054 * @am: an automata
6055 *
6056 * Compile the automata into a Reg Exp ready for being executed.
6057 * The automata should be free after this point.
6058 *
6059 * Returns the compiled regexp or NULL in case of error
6060 */
6061xmlRegexpPtr
6062xmlAutomataCompile(xmlAutomataPtr am) {
6063 xmlRegexpPtr ret;
6064
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00006065 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00006066 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00006067 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00006068 ret = xmlRegEpxFromParse(am);
6069
6070 return(ret);
6071}
Daniel Veillarde19fc232002-04-22 16:01:24 +00006072
6073/**
6074 * xmlAutomataIsDeterminist:
6075 * @am: an automata
6076 *
6077 * Checks if an automata is determinist.
6078 *
6079 * Returns 1 if true, 0 if not, and -1 in case of error
6080 */
6081int
6082xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6083 int ret;
6084
6085 if (am == NULL)
6086 return(-1);
6087
6088 ret = xmlFAComputesDeterminism(am);
6089 return(ret);
6090}
Daniel Veillard4255d502002-04-16 15:50:10 +00006091#endif /* LIBXML_AUTOMATA_ENABLED */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006092
6093#ifdef LIBXML_EXPR_ENABLED
6094/************************************************************************
6095 * *
6096 * Formal Expression handling code *
6097 * *
6098 ************************************************************************/
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006099/************************************************************************
6100 * *
6101 * Expression handling context *
6102 * *
6103 ************************************************************************/
6104
6105struct _xmlExpCtxt {
6106 xmlDictPtr dict;
6107 xmlExpNodePtr *table;
6108 int size;
6109 int nbElems;
6110 int nb_nodes;
6111 const char *expr;
6112 const char *cur;
6113 int nb_cons;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006114 int tabSize;
6115};
6116
6117/**
6118 * xmlExpNewCtxt:
6119 * @maxNodes: the maximum number of nodes
6120 * @dict: optional dictionnary to use internally
6121 *
6122 * Creates a new context for manipulating expressions
6123 *
6124 * Returns the context or NULL in case of error
6125 */
6126xmlExpCtxtPtr
6127xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6128 xmlExpCtxtPtr ret;
6129 int size = 256;
6130
6131 if (maxNodes <= 4096)
6132 maxNodes = 4096;
6133
6134 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6135 if (ret == NULL)
6136 return(NULL);
6137 memset(ret, 0, sizeof(xmlExpCtxt));
6138 ret->size = size;
6139 ret->nbElems = 0;
6140 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6141 if (ret->table == NULL) {
6142 xmlFree(ret);
6143 return(NULL);
6144 }
6145 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6146 if (dict == NULL) {
6147 ret->dict = xmlDictCreate();
6148 if (ret->dict == NULL) {
6149 xmlFree(ret->table);
6150 xmlFree(ret);
6151 return(NULL);
6152 }
6153 } else {
6154 ret->dict = dict;
6155 xmlDictReference(ret->dict);
6156 }
6157 return(ret);
6158}
6159
6160/**
6161 * xmlExpFreeCtxt:
6162 * @ctxt: an expression context
6163 *
6164 * Free an expression context
6165 */
6166void
6167xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6168 if (ctxt == NULL)
6169 return;
6170 xmlDictFree(ctxt->dict);
6171 if (ctxt->table != NULL)
6172 xmlFree(ctxt->table);
6173 xmlFree(ctxt);
6174}
6175
6176/************************************************************************
6177 * *
6178 * Structure associated to an expression node *
6179 * *
6180 ************************************************************************/
Daniel Veillard465a0002005-08-22 12:07:04 +00006181#define MAX_NODES 10000
6182
6183/* #define DEBUG_DERIV */
6184
6185/*
6186 * TODO:
6187 * - Wildcards
6188 * - public API for creation
6189 *
6190 * Started
6191 * - regression testing
6192 *
6193 * Done
6194 * - split into module and test tool
6195 * - memleaks
6196 */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006197
/* Bit flags OR-ed together in the info byte of an expression node */
typedef enum {
    XML_EXP_NILABLE = (1 << 0)
} xmlExpNodeInfo;

/* Non-zero when the XML_EXP_NILABLE bit is set in the node's info byte */
#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6203
6204struct _xmlExpNode {
6205 unsigned char type;/* xmlExpNodeType */
6206 unsigned char info;/* OR of xmlExpNodeInfo */
6207 unsigned short key; /* the hash key */
6208 unsigned int ref; /* The number of references */
6209 int c_max; /* the maximum length it can consume */
6210 xmlExpNodePtr exp_left;
6211 xmlExpNodePtr next;/* the next node in the hash table or free list */
6212 union {
6213 struct {
6214 int f_min;
6215 int f_max;
6216 } count;
6217 struct {
6218 xmlExpNodePtr f_right;
6219 } children;
6220 const xmlChar *f_str;
6221 } field;
6222};
6223
6224#define exp_min field.count.f_min
6225#define exp_max field.count.f_max
6226/* #define exp_left field.children.f_left */
6227#define exp_right field.children.f_right
6228#define exp_str field.f_str
6229
6230static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6231static xmlExpNode forbiddenExpNode = {
6232 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6233};
6234xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6235static xmlExpNode emptyExpNode = {
6236 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6237};
6238xmlExpNodePtr emptyExp = &emptyExpNode;
6239
6240/************************************************************************
6241 * *
6242 * The custom hash table for unicity and canonicalization *
6243 * of sub-expressions pointers *
6244 * *
6245 ************************************************************************/
6246/*
6247 * xmlExpHashNameComputeKey:
6248 * Calculate the hash key for a token
6249 */
6250static unsigned short
6251xmlExpHashNameComputeKey(const xmlChar *name) {
6252 unsigned short value = 0L;
6253 char ch;
6254
6255 if (name != NULL) {
6256 value += 30 * (*name);
6257 while ((ch = *name++) != 0) {
6258 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6259 }
6260 }
6261 return (value);
6262}
6263
6264/*
6265 * xmlExpHashComputeKey:
6266 * Calculate the hash key for a compound expression
6267 */
6268static unsigned short
6269xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6270 xmlExpNodePtr right) {
6271 unsigned long value;
6272 unsigned short ret;
6273
6274 switch (type) {
6275 case XML_EXP_SEQ:
6276 value = left->key;
6277 value += right->key;
6278 value *= 3;
6279 ret = (unsigned short) value;
6280 break;
6281 case XML_EXP_OR:
6282 value = left->key;
6283 value += right->key;
6284 value *= 7;
6285 ret = (unsigned short) value;
6286 break;
6287 case XML_EXP_COUNT:
6288 value = left->key;
6289 value += right->key;
6290 ret = (unsigned short) value;
6291 break;
6292 default:
6293 ret = 0;
6294 }
6295 return(ret);
6296}
6297
6298
6299static xmlExpNodePtr
6300xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6301 xmlExpNodePtr ret;
6302
6303 if (ctxt->nb_nodes >= MAX_NODES)
6304 return(NULL);
6305 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6306 if (ret == NULL)
6307 return(NULL);
6308 memset(ret, 0, sizeof(xmlExpNode));
6309 ret->type = type;
6310 ret->next = NULL;
6311 ctxt->nb_nodes++;
6312 ctxt->nb_cons++;
6313 return(ret);
6314}
6315
6316/**
6317 * xmlExpHashGetEntry:
6318 * @table: the hash table
6319 *
6320 * Get the unique entry from the hash table. The entry is created if
6321 * needed. @left and @right are consumed, i.e. their ref count will
6322 * be decremented by the operation.
6323 *
6324 * Returns the pointer or NULL in case of error
6325 */
6326static xmlExpNodePtr
6327xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6328 xmlExpNodePtr left, xmlExpNodePtr right,
6329 const xmlChar *name, int min, int max) {
6330 unsigned short kbase, key;
6331 xmlExpNodePtr entry;
6332 xmlExpNodePtr insert;
6333
6334 if (ctxt == NULL)
6335 return(NULL);
6336
6337 /*
6338 * Check for duplicate and insertion location.
6339 */
6340 if (type == XML_EXP_ATOM) {
6341 kbase = xmlExpHashNameComputeKey(name);
6342 } else if (type == XML_EXP_COUNT) {
6343 /* COUNT reduction rule 1 */
6344 /* a{1} -> a */
6345 if (min == max) {
6346 if (min == 1) {
6347 return(left);
6348 }
6349 if (min == 0) {
6350 xmlExpFree(ctxt, left);
6351 return(emptyExp);
6352 }
6353 }
6354 if (min < 0) {
6355 xmlExpFree(ctxt, left);
6356 return(forbiddenExp);
6357 }
6358 if (max == -1)
6359 kbase = min + 79;
6360 else
6361 kbase = max - min;
6362 kbase += left->key;
6363 } else if (type == XML_EXP_OR) {
6364 /* Forbid reduction rules */
6365 if (left->type == XML_EXP_FORBID) {
6366 xmlExpFree(ctxt, left);
6367 return(right);
6368 }
6369 if (right->type == XML_EXP_FORBID) {
6370 xmlExpFree(ctxt, right);
6371 return(left);
6372 }
6373
6374 /* OR reduction rule 1 */
6375 /* a | a reduced to a */
6376 if (left == right) {
6377 left->ref--;
6378 return(left);
6379 }
6380 /* OR canonicalization rule 1 */
6381 /* linearize (a | b) | c into a | (b | c) */
6382 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6383 xmlExpNodePtr tmp = left;
6384 left = right;
6385 right = tmp;
6386 }
6387 /* OR reduction rule 2 */
6388 /* a | (a | b) and b | (a | b) are reduced to a | b */
6389 if (right->type == XML_EXP_OR) {
6390 if ((left == right->exp_left) ||
6391 (left == right->exp_right)) {
6392 xmlExpFree(ctxt, left);
6393 return(right);
6394 }
6395 }
6396 /* OR canonicalization rule 2 */
6397 /* linearize (a | b) | c into a | (b | c) */
6398 if (left->type == XML_EXP_OR) {
6399 xmlExpNodePtr tmp;
6400
6401 /* OR canonicalization rule 2 */
6402 if ((left->exp_right->type != XML_EXP_OR) &&
6403 (left->exp_right->key < left->exp_left->key)) {
6404 tmp = left->exp_right;
6405 left->exp_right = left->exp_left;
6406 left->exp_left = tmp;
6407 }
6408 left->exp_right->ref++;
6409 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6410 NULL, 0, 0);
6411 left->exp_left->ref++;
6412 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6413 NULL, 0, 0);
6414
6415 xmlExpFree(ctxt, left);
6416 return(tmp);
6417 }
6418 if (right->type == XML_EXP_OR) {
6419 /* Ordering in the tree */
6420 /* C | (A | B) -> A | (B | C) */
6421 if (left->key > right->exp_right->key) {
6422 xmlExpNodePtr tmp;
6423 right->exp_right->ref++;
6424 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6425 left, NULL, 0, 0);
6426 right->exp_left->ref++;
6427 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6428 tmp, NULL, 0, 0);
6429 xmlExpFree(ctxt, right);
6430 return(tmp);
6431 }
6432 /* Ordering in the tree */
6433 /* B | (A | C) -> A | (B | C) */
6434 if (left->key > right->exp_left->key) {
6435 xmlExpNodePtr tmp;
6436 right->exp_right->ref++;
6437 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6438 right->exp_right, NULL, 0, 0);
6439 right->exp_left->ref++;
6440 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6441 tmp, NULL, 0, 0);
6442 xmlExpFree(ctxt, right);
6443 return(tmp);
6444 }
6445 }
6446 /* we know both types are != XML_EXP_OR here */
6447 else if (left->key > right->key) {
6448 xmlExpNodePtr tmp = left;
6449 left = right;
6450 right = tmp;
6451 }
6452 kbase = xmlExpHashComputeKey(type, left, right);
6453 } else if (type == XML_EXP_SEQ) {
6454 /* Forbid reduction rules */
6455 if (left->type == XML_EXP_FORBID) {
6456 xmlExpFree(ctxt, right);
6457 return(left);
6458 }
6459 if (right->type == XML_EXP_FORBID) {
6460 xmlExpFree(ctxt, left);
6461 return(right);
6462 }
6463 /* Empty reduction rules */
6464 if (right->type == XML_EXP_EMPTY) {
6465 return(left);
6466 }
6467 if (left->type == XML_EXP_EMPTY) {
6468 return(right);
6469 }
6470 kbase = xmlExpHashComputeKey(type, left, right);
6471 } else
6472 return(NULL);
6473
6474 key = kbase % ctxt->size;
6475 if (ctxt->table[key] != NULL) {
6476 for (insert = ctxt->table[key]; insert != NULL;
6477 insert = insert->next) {
6478 if ((insert->key == kbase) &&
6479 (insert->type == type)) {
6480 if (type == XML_EXP_ATOM) {
6481 if (name == insert->exp_str) {
6482 insert->ref++;
6483 return(insert);
6484 }
6485 } else if (type == XML_EXP_COUNT) {
6486 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6487 (insert->exp_left == left)) {
6488 insert->ref++;
6489 left->ref--;
6490 return(insert);
6491 }
6492 } else if ((insert->exp_left == left) &&
6493 (insert->exp_right == right)) {
6494 insert->ref++;
6495 left->ref--;
6496 right->ref--;
6497 return(insert);
6498 }
6499 }
6500 }
6501 }
6502
6503 entry = xmlExpNewNode(ctxt, type);
6504 if (entry == NULL)
6505 return(NULL);
6506 entry->key = kbase;
6507 if (type == XML_EXP_ATOM) {
6508 entry->exp_str = name;
6509 entry->c_max = 1;
6510 } else if (type == XML_EXP_COUNT) {
6511 entry->exp_min = min;
6512 entry->exp_max = max;
6513 entry->exp_left = left;
6514 if ((min == 0) || (IS_NILLABLE(left)))
6515 entry->info |= XML_EXP_NILABLE;
6516 if (max < 0)
6517 entry->c_max = -1;
6518 else
6519 entry->c_max = max * entry->exp_left->c_max;
6520 } else {
6521 entry->exp_left = left;
6522 entry->exp_right = right;
6523 if (type == XML_EXP_OR) {
6524 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6525 entry->info |= XML_EXP_NILABLE;
6526 if ((entry->exp_left->c_max == -1) ||
6527 (entry->exp_right->c_max == -1))
6528 entry->c_max = -1;
6529 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6530 entry->c_max = entry->exp_left->c_max;
6531 else
6532 entry->c_max = entry->exp_right->c_max;
6533 } else {
6534 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6535 entry->info |= XML_EXP_NILABLE;
6536 if ((entry->exp_left->c_max == -1) ||
6537 (entry->exp_right->c_max == -1))
6538 entry->c_max = -1;
6539 else
6540 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6541 }
6542 }
6543 entry->ref = 1;
6544 if (ctxt->table[key] != NULL)
6545 entry->next = ctxt->table[key];
6546
6547 ctxt->table[key] = entry;
6548 ctxt->nbElems++;
6549
6550 return(entry);
6551}
6552
6553/**
6554 * xmlExpFree:
6555 * @ctxt: the expression context
6556 * @exp: the expression
6557 *
6558 * Dereference the expression
6559 */
6560void
6561xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6562 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6563 return;
6564 exp->ref--;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006565 if (exp->ref == 0) {
6566 unsigned short key;
6567
6568 /* Unlink it first from the hash table */
6569 key = exp->key % ctxt->size;
6570 if (ctxt->table[key] == exp) {
6571 ctxt->table[key] = exp->next;
6572 } else {
6573 xmlExpNodePtr tmp;
6574
6575 tmp = ctxt->table[key];
6576 while (tmp != NULL) {
6577 if (tmp->next == exp) {
6578 tmp->next = exp->next;
6579 break;
6580 }
6581 tmp = tmp->next;
6582 }
6583 }
6584
6585 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6586 xmlExpFree(ctxt, exp->exp_left);
6587 xmlExpFree(ctxt, exp->exp_right);
6588 } else if (exp->type == XML_EXP_COUNT) {
6589 xmlExpFree(ctxt, exp->exp_left);
6590 }
6591 xmlFree(exp);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006592 ctxt->nb_nodes--;
6593 }
6594}
6595
6596/**
6597 * xmlExpRef:
6598 * @exp: the expression
6599 *
6600 * Increase the reference count of the expression
6601 */
6602void
6603xmlExpRef(xmlExpNodePtr exp) {
6604 if (exp != NULL)
6605 exp->ref++;
6606}
6607
Daniel Veillardccb4d412005-08-23 13:41:17 +00006608/**
6609 * xmlExpNewAtom:
6610 * @ctxt: the expression context
6611 * @name: the atom name
6612 * @len: the atom name lenght in byte (or -1);
6613 *
6614 * Get the atom associated to this name from that context
6615 *
6616 * Returns the node or NULL in case of error
6617 */
6618xmlExpNodePtr
6619xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6620 if ((ctxt == NULL) || (name == NULL))
6621 return(NULL);
6622 name = xmlDictLookup(ctxt->dict, name, len);
6623 if (name == NULL)
6624 return(NULL);
6625 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6626}
6627
6628/**
6629 * xmlExpNewOr:
6630 * @ctxt: the expression context
6631 * @left: left expression
6632 * @right: right expression
6633 *
6634 * Get the atom associated to the choice @left | @right
6635 * Note that @left and @right are consumed in the operation, to keep
6636 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6637 * this is true even in case of failure (unless ctxt == NULL).
6638 *
6639 * Returns the node or NULL in case of error
6640 */
6641xmlExpNodePtr
6642xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006643 if (ctxt == NULL)
6644 return(NULL);
6645 if ((left == NULL) || (right == NULL)) {
Daniel Veillardccb4d412005-08-23 13:41:17 +00006646 xmlExpFree(ctxt, left);
6647 xmlExpFree(ctxt, right);
6648 return(NULL);
6649 }
6650 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6651}
6652
6653/**
6654 * xmlExpNewSeq:
6655 * @ctxt: the expression context
6656 * @left: left expression
6657 * @right: right expression
6658 *
6659 * Get the atom associated to the sequence @left , @right
6660 * Note that @left and @right are consumed in the operation, to keep
6661 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6662 * this is true even in case of failure (unless ctxt == NULL).
6663 *
6664 * Returns the node or NULL in case of error
6665 */
6666xmlExpNodePtr
6667xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006668 if (ctxt == NULL)
6669 return(NULL);
6670 if ((left == NULL) || (right == NULL)) {
Daniel Veillardccb4d412005-08-23 13:41:17 +00006671 xmlExpFree(ctxt, left);
6672 xmlExpFree(ctxt, right);
6673 return(NULL);
6674 }
6675 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6676}
6677
6678/**
6679 * xmlExpNewRange:
6680 * @ctxt: the expression context
6681 * @subset: the expression to be repeated
6682 * @min: the lower bound for the repetition
6683 * @max: the upper bound for the repetition, -1 means infinite
6684 *
6685 * Get the atom associated to the range (@subset){@min, @max}
6686 * Note that @subset is consumed in the operation, to keep
6687 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6688 * this is true even in case of failure (unless ctxt == NULL).
6689 *
6690 * Returns the node or NULL in case of error
6691 */
6692xmlExpNodePtr
6693xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006694 if (ctxt == NULL)
6695 return(NULL);
6696 if ((subset == NULL) || (min < 0) || (max < -1) ||
Daniel Veillardccb4d412005-08-23 13:41:17 +00006697 ((max >= 0) && (min > max))) {
6698 xmlExpFree(ctxt, subset);
6699 return(NULL);
6700 }
6701 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6702 NULL, NULL, min, max));
6703}
6704
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006705/************************************************************************
6706 * *
6707 * Public API for operations on expressions *
6708 * *
6709 ************************************************************************/
6710
6711static int
6712xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6713 const xmlChar**list, int len, int nb) {
6714 int tmp, tmp2;
6715tail:
6716 switch (exp->type) {
6717 case XML_EXP_EMPTY:
6718 return(0);
6719 case XML_EXP_ATOM:
6720 for (tmp = 0;tmp < nb;tmp++)
6721 if (list[tmp] == exp->exp_str)
6722 return(0);
6723 if (nb >= len)
6724 return(-2);
6725 list[nb++] = exp->exp_str;
6726 return(1);
6727 case XML_EXP_COUNT:
6728 exp = exp->exp_left;
6729 goto tail;
6730 case XML_EXP_SEQ:
6731 case XML_EXP_OR:
6732 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6733 if (tmp < 0)
6734 return(tmp);
6735 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6736 nb + tmp);
6737 if (tmp2 < 0)
6738 return(tmp2);
6739 return(tmp + tmp2);
6740 }
6741 return(-1);
6742}
6743
6744/**
6745 * xmlExpGetLanguage:
6746 * @ctxt: the expression context
6747 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00006748 * @langList: where to store the tokens
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006749 * @len: the allocated lenght of @list
6750 *
6751 * Find all the strings used in @exp and store them in @list
6752 *
6753 * Returns the number of unique strings found, -1 in case of errors and
6754 * -2 if there is more than @len strings
6755 */
6756int
6757xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00006758 const xmlChar**langList, int len) {
6759 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006760 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00006761 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006762}
6763
6764static int
6765xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6766 const xmlChar**list, int len, int nb) {
6767 int tmp, tmp2;
6768tail:
6769 switch (exp->type) {
6770 case XML_EXP_FORBID:
6771 return(0);
6772 case XML_EXP_EMPTY:
6773 return(0);
6774 case XML_EXP_ATOM:
6775 for (tmp = 0;tmp < nb;tmp++)
6776 if (list[tmp] == exp->exp_str)
6777 return(0);
6778 if (nb >= len)
6779 return(-2);
6780 list[nb++] = exp->exp_str;
6781 return(1);
6782 case XML_EXP_COUNT:
6783 exp = exp->exp_left;
6784 goto tail;
6785 case XML_EXP_SEQ:
6786 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
6787 if (tmp < 0)
6788 return(tmp);
6789 if (IS_NILLABLE(exp->exp_left)) {
6790 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
6791 nb + tmp);
6792 if (tmp2 < 0)
6793 return(tmp2);
6794 tmp += tmp2;
6795 }
6796 return(tmp);
6797 case XML_EXP_OR:
6798 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
6799 if (tmp < 0)
6800 return(tmp);
6801 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
6802 nb + tmp);
6803 if (tmp2 < 0)
6804 return(tmp2);
6805 return(tmp + tmp2);
6806 }
6807 return(-1);
6808}
6809
6810/**
6811 * xmlExpGetStart:
6812 * @ctxt: the expression context
6813 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00006814 * @tokList: where to store the tokens
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006815 * @len: the allocated lenght of @list
6816 *
6817 * Find all the strings that appears at the start of the languages
6818 * accepted by @exp and store them in @list. E.g. for (a, b) | c
6819 * it will return the list [a, c]
6820 *
6821 * Returns the number of unique strings found, -1 in case of errors and
6822 * -2 if there is more than @len strings
6823 */
6824int
6825xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00006826 const xmlChar**tokList, int len) {
6827 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006828 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00006829 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006830}
6831
6832/**
6833 * xmlExpIsNillable:
6834 * @exp: the expression
6835 *
6836 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce
6837 *
6838 * Returns 1 if nillable, 0 if not and -1 in case of error
6839 */
6840int
6841xmlExpIsNillable(xmlExpNodePtr exp) {
6842 if (exp == NULL)
6843 return(-1);
6844 return(IS_NILLABLE(exp) != 0);
6845}
6846
6847static xmlExpNodePtr
6848xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
6849{
6850 xmlExpNodePtr ret;
6851
6852 switch (exp->type) {
6853 case XML_EXP_EMPTY:
6854 return(forbiddenExp);
6855 case XML_EXP_FORBID:
6856 return(forbiddenExp);
6857 case XML_EXP_ATOM:
6858 if (exp->exp_str == str) {
6859#ifdef DEBUG_DERIV
6860 printf("deriv atom: equal => Empty\n");
6861#endif
6862 ret = emptyExp;
6863 } else {
6864#ifdef DEBUG_DERIV
6865 printf("deriv atom: mismatch => forbid\n");
6866#endif
6867 /* TODO wildcards here */
6868 ret = forbiddenExp;
6869 }
6870 return(ret);
6871 case XML_EXP_OR: {
6872 xmlExpNodePtr tmp;
6873
6874#ifdef DEBUG_DERIV
6875 printf("deriv or: => or(derivs)\n");
6876#endif
6877 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6878 if (tmp == NULL) {
6879 return(NULL);
6880 }
6881 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
6882 if (ret == NULL) {
6883 xmlExpFree(ctxt, tmp);
6884 return(NULL);
6885 }
6886 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
6887 NULL, 0, 0);
6888 return(ret);
6889 }
6890 case XML_EXP_SEQ:
6891#ifdef DEBUG_DERIV
6892 printf("deriv seq: starting with left\n");
6893#endif
6894 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6895 if (ret == NULL) {
6896 return(NULL);
6897 } else if (ret == forbiddenExp) {
6898 if (IS_NILLABLE(exp->exp_left)) {
6899#ifdef DEBUG_DERIV
6900 printf("deriv seq: left failed but nillable\n");
6901#endif
6902 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
6903 }
6904 } else {
6905#ifdef DEBUG_DERIV
6906 printf("deriv seq: left match => sequence\n");
6907#endif
6908 exp->exp_right->ref++;
6909 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
6910 NULL, 0, 0);
6911 }
6912 return(ret);
6913 case XML_EXP_COUNT: {
6914 int min, max;
6915 xmlExpNodePtr tmp;
6916
6917 if (exp->exp_max == 0)
6918 return(forbiddenExp);
6919 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
6920 if (ret == NULL)
6921 return(NULL);
6922 if (ret == forbiddenExp) {
6923#ifdef DEBUG_DERIV
6924 printf("deriv count: pattern mismatch => forbid\n");
6925#endif
6926 return(ret);
6927 }
6928 if (exp->exp_max == 1)
6929 return(ret);
6930 if (exp->exp_max < 0) /* unbounded */
6931 max = -1;
6932 else
6933 max = exp->exp_max - 1;
6934 if (exp->exp_min > 0)
6935 min = exp->exp_min - 1;
6936 else
6937 min = 0;
6938 exp->exp_left->ref++;
6939 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
6940 NULL, min, max);
6941 if (ret == emptyExp) {
6942#ifdef DEBUG_DERIV
6943 printf("deriv count: match to empty => new count\n");
6944#endif
6945 return(tmp);
6946 }
6947#ifdef DEBUG_DERIV
6948 printf("deriv count: match => sequence with new count\n");
6949#endif
6950 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
6951 NULL, 0, 0));
6952 }
6953 }
6954 return(NULL);
6955}
6956
6957/**
6958 * xmlExpStringDerive:
6959 * @ctxt: the expression context
6960 * @exp: the expression
6961 * @str: the string
6962 * @len: the string len in bytes if available
6963 *
6964 * Do one step of Brzozowski derivation of the expression @exp with
6965 * respect to the input string
6966 *
6967 * Returns the resulting expression or NULL in case of internal error
6968 */
6969xmlExpNodePtr
6970xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6971 const xmlChar *str, int len) {
6972 const xmlChar *input;
6973
6974 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
6975 return(NULL);
6976 }
6977 /*
6978 * check the string is in the dictionnary, if yes use an interned
6979 * copy, otherwise we know it's not an acceptable input
6980 */
6981 input = xmlDictExists(ctxt->dict, str, len);
6982 if (input == NULL) {
6983 return(forbiddenExp);
6984 }
6985 return(xmlExpStringDeriveInt(ctxt, exp, input));
6986}
6987
6988static int
6989xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
6990 int ret = 1;
6991
6992 if (sub->c_max == -1) {
6993 if (exp->c_max != -1)
6994 ret = 0;
6995 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
6996 ret = 0;
6997 }
6998#if 0
6999 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7000 ret = 0;
7001#endif
7002 return(ret);
7003}
7004
7005static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7006 xmlExpNodePtr sub);
7007/**
7008 * xmlExpDivide:
7009 * @ctxt: the expressions context
7010 * @exp: the englobing expression
7011 * @sub: the subexpression
7012 * @mult: the multiple expression
7013 * @remain: the remain from the derivation of the multiple
7014 *
7015 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7016 * so that sub{n} subsume exp
7017 *
7018 * Returns the multiple value if successful, 0 if it is not a multiple
7019 * and -1 in case of internel error.
7020 */
7021
7022static int
7023xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7024 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7025 int i;
7026 xmlExpNodePtr tmp, tmp2;
7027
7028 if (mult != NULL) *mult = NULL;
7029 if (remain != NULL) *remain = NULL;
7030 if (exp->c_max == -1) return(0);
7031 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7032
7033 for (i = 1;i <= exp->c_max;i++) {
7034 sub->ref++;
7035 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7036 sub, NULL, NULL, i, i);
7037 if (tmp == NULL) {
7038 return(-1);
7039 }
7040 if (!xmlExpCheckCard(tmp, exp)) {
7041 xmlExpFree(ctxt, tmp);
7042 continue;
7043 }
7044 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7045 if (tmp2 == NULL) {
7046 xmlExpFree(ctxt, tmp);
7047 return(-1);
7048 }
7049 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7050 if (remain != NULL)
7051 *remain = tmp2;
7052 else
7053 xmlExpFree(ctxt, tmp2);
7054 if (mult != NULL)
7055 *mult = tmp;
7056 else
7057 xmlExpFree(ctxt, tmp);
7058#ifdef DEBUG_DERIV
7059 printf("Divide succeeded %d\n", i);
7060#endif
7061 return(i);
7062 }
7063 xmlExpFree(ctxt, tmp);
7064 xmlExpFree(ctxt, tmp2);
7065 }
7066#ifdef DEBUG_DERIV
7067 printf("Divide failed\n");
7068#endif
7069 return(0);
7070}
7071
7072/**
7073 * xmlExpExpDeriveInt:
7074 * @ctxt: the expressions context
7075 * @exp: the englobing expression
7076 * @sub: the subexpression
7077 *
7078 * Try to do a step of Brzozowski derivation but at a higher level
7079 * the input being a subexpression.
7080 *
7081 * Returns the resulting expression or NULL in case of internal error
7082 */
7083static xmlExpNodePtr
7084xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7085 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7086 const xmlChar **tab;
7087 int len, i;
7088
7089 /*
7090 * In case of equality and if the expression can only consume a finite
7091 * amount, then the derivation is empty
7092 */
7093 if ((exp == sub) && (exp->c_max >= 0)) {
7094#ifdef DEBUG_DERIV
7095 printf("Equal(exp, sub) and finite -> Empty\n");
7096#endif
7097 return(emptyExp);
7098 }
7099 /*
7100 * decompose sub sequence first
7101 */
7102 if (sub->type == XML_EXP_EMPTY) {
7103#ifdef DEBUG_DERIV
7104 printf("Empty(sub) -> Empty\n");
7105#endif
7106 exp->ref++;
7107 return(exp);
7108 }
7109 if (sub->type == XML_EXP_SEQ) {
7110#ifdef DEBUG_DERIV
7111 printf("Seq(sub) -> decompose\n");
7112#endif
7113 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7114 if (tmp == NULL)
7115 return(NULL);
7116 if (tmp == forbiddenExp)
7117 return(tmp);
7118 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7119 xmlExpFree(ctxt, tmp);
7120 return(ret);
7121 }
7122 if (sub->type == XML_EXP_OR) {
7123#ifdef DEBUG_DERIV
7124 printf("Or(sub) -> decompose\n");
7125#endif
7126 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7127 if (tmp == forbiddenExp)
7128 return(tmp);
7129 if (tmp == NULL)
7130 return(NULL);
7131 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7132 if ((ret == NULL) || (ret == forbiddenExp)) {
7133 xmlExpFree(ctxt, tmp);
7134 return(ret);
7135 }
7136 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7137 }
7138 if (!xmlExpCheckCard(exp, sub)) {
7139#ifdef DEBUG_DERIV
7140 printf("CheckCard(exp, sub) failed -> Forbid\n");
7141#endif
7142 return(forbiddenExp);
7143 }
7144 switch (exp->type) {
7145 case XML_EXP_EMPTY:
7146 if (sub == emptyExp)
7147 return(emptyExp);
7148#ifdef DEBUG_DERIV
7149 printf("Empty(exp) -> Forbid\n");
7150#endif
7151 return(forbiddenExp);
7152 case XML_EXP_FORBID:
7153#ifdef DEBUG_DERIV
7154 printf("Forbid(exp) -> Forbid\n");
7155#endif
7156 return(forbiddenExp);
7157 case XML_EXP_ATOM:
7158 if (sub->type == XML_EXP_ATOM) {
7159 /* TODO: handle wildcards */
7160 if (exp->exp_str == sub->exp_str) {
7161#ifdef DEBUG_DERIV
7162 printf("Atom match -> Empty\n");
7163#endif
7164 return(emptyExp);
7165 }
7166#ifdef DEBUG_DERIV
7167 printf("Atom mismatch -> Forbid\n");
7168#endif
7169 return(forbiddenExp);
7170 }
7171 if ((sub->type == XML_EXP_COUNT) &&
7172 (sub->exp_max == 1) &&
7173 (sub->exp_left->type == XML_EXP_ATOM)) {
7174 /* TODO: handle wildcards */
7175 if (exp->exp_str == sub->exp_left->exp_str) {
7176#ifdef DEBUG_DERIV
7177 printf("Atom match -> Empty\n");
7178#endif
7179 return(emptyExp);
7180 }
7181#ifdef DEBUG_DERIV
7182 printf("Atom mismatch -> Forbid\n");
7183#endif
7184 return(forbiddenExp);
7185 }
7186#ifdef DEBUG_DERIV
7187 printf("Compex exp vs Atom -> Forbid\n");
7188#endif
7189 return(forbiddenExp);
7190 case XML_EXP_SEQ:
7191 /* try to get the sequence consumed only if possible */
7192 if (xmlExpCheckCard(exp->exp_left, sub)) {
7193 /* See if the sequence can be consumed directly */
7194#ifdef DEBUG_DERIV
7195 printf("Seq trying left only\n");
7196#endif
7197 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7198 if ((ret != forbiddenExp) && (ret != NULL)) {
7199#ifdef DEBUG_DERIV
7200 printf("Seq trying left only worked\n");
7201#endif
7202 /*
7203 * TODO: assumption here that we are determinist
7204 * i.e. we won't get to a nillable exp left
7205 * subset which could be matched by the right
7206 * part too.
7207 * e.g.: (a | b)+,(a | c) and 'a+,a'
7208 */
7209 exp->exp_right->ref++;
7210 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7211 exp->exp_right, NULL, 0, 0));
7212 }
7213#ifdef DEBUG_DERIV
7214 } else {
7215 printf("Seq: left too short\n");
7216#endif
7217 }
7218 /* Try instead to decompose */
7219 if (sub->type == XML_EXP_COUNT) {
7220 int min, max;
7221
7222#ifdef DEBUG_DERIV
7223 printf("Seq: sub is a count\n");
7224#endif
7225 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7226 if (ret == NULL)
7227 return(NULL);
7228 if (ret != forbiddenExp) {
7229#ifdef DEBUG_DERIV
7230 printf("Seq , Count match on left\n");
7231#endif
7232 if (sub->exp_max < 0)
7233 max = -1;
7234 else
7235 max = sub->exp_max -1;
7236 if (sub->exp_min > 0)
7237 min = sub->exp_min -1;
7238 else
7239 min = 0;
7240 exp->exp_right->ref++;
7241 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7242 exp->exp_right, NULL, 0, 0);
7243 if (tmp == NULL)
7244 return(NULL);
7245
7246 sub->exp_left->ref++;
7247 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7248 sub->exp_left, NULL, NULL, min, max);
7249 if (tmp2 == NULL) {
7250 xmlExpFree(ctxt, tmp);
7251 return(NULL);
7252 }
7253 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7254 xmlExpFree(ctxt, tmp);
7255 xmlExpFree(ctxt, tmp2);
7256 return(ret);
7257 }
7258 }
7259 /* we made no progress on structured operations */
7260 break;
7261 case XML_EXP_OR:
7262#ifdef DEBUG_DERIV
7263 printf("Or , trying both side\n");
7264#endif
7265 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7266 if (ret == NULL)
7267 return(NULL);
7268 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7269 if (tmp == NULL) {
7270 xmlExpFree(ctxt, ret);
7271 return(NULL);
7272 }
7273 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7274 case XML_EXP_COUNT: {
7275 int min, max;
7276
7277 if (sub->type == XML_EXP_COUNT) {
7278 /*
7279 * Try to see if the loop is completely subsumed
7280 */
7281 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7282 if (tmp == NULL)
7283 return(NULL);
7284 if (tmp == forbiddenExp) {
7285 int mult;
7286
7287#ifdef DEBUG_DERIV
7288 printf("Count, Count inner don't subsume\n");
7289#endif
7290 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7291 NULL, &tmp);
7292 if (mult <= 0) {
7293#ifdef DEBUG_DERIV
7294 printf("Count, Count not multiple => forbidden\n");
7295#endif
7296 return(forbiddenExp);
7297 }
7298 if (sub->exp_max == -1) {
7299 max = -1;
7300 if (exp->exp_max == -1) {
7301 if (exp->exp_min <= sub->exp_min * mult)
7302 min = 0;
7303 else
7304 min = exp->exp_min - sub->exp_min * mult;
7305 } else {
7306#ifdef DEBUG_DERIV
7307 printf("Count, Count finite can't subsume infinite\n");
7308#endif
7309 xmlExpFree(ctxt, tmp);
7310 return(forbiddenExp);
7311 }
7312 } else {
7313 if (exp->exp_max == -1) {
7314#ifdef DEBUG_DERIV
7315 printf("Infinite loop consume mult finite loop\n");
7316#endif
7317 if (exp->exp_min > sub->exp_min * mult) {
7318 max = -1;
7319 min = exp->exp_min - sub->exp_min * mult;
7320 } else {
7321 max = -1;
7322 min = 0;
7323 }
7324 } else {
7325 if (exp->exp_max < sub->exp_max * mult) {
7326#ifdef DEBUG_DERIV
7327 printf("loops max mult mismatch => forbidden\n");
7328#endif
7329 xmlExpFree(ctxt, tmp);
7330 return(forbiddenExp);
7331 }
7332 if (sub->exp_max * mult > exp->exp_min)
7333 min = 0;
7334 else
7335 min = exp->exp_min - sub->exp_max * mult;
7336 max = exp->exp_max - sub->exp_max * mult;
7337 }
7338 }
7339 } else if (!IS_NILLABLE(tmp)) {
7340 /*
7341 * TODO: loop here to try to grow if working on finite
7342 * blocks.
7343 */
7344#ifdef DEBUG_DERIV
7345 printf("Count, Count remain not nillable => forbidden\n");
7346#endif
7347 xmlExpFree(ctxt, tmp);
7348 return(forbiddenExp);
7349 } else if (sub->exp_max == -1) {
7350 if (exp->exp_max == -1) {
7351 if (exp->exp_min <= sub->exp_min) {
7352#ifdef DEBUG_DERIV
7353 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7354#endif
7355 max = -1;
7356 min = 0;
7357 } else {
7358#ifdef DEBUG_DERIV
7359 printf("Infinite loops min => Count(X,Inf)\n");
7360#endif
7361 max = -1;
7362 min = exp->exp_min - sub->exp_min;
7363 }
7364 } else if (exp->exp_min > sub->exp_min) {
7365#ifdef DEBUG_DERIV
7366 printf("loops min mismatch 1 => forbidden ???\n");
7367#endif
7368 xmlExpFree(ctxt, tmp);
7369 return(forbiddenExp);
7370 } else {
7371 max = -1;
7372 min = 0;
7373 }
7374 } else {
7375 if (exp->exp_max == -1) {
7376#ifdef DEBUG_DERIV
7377 printf("Infinite loop consume finite loop\n");
7378#endif
7379 if (exp->exp_min > sub->exp_min) {
7380 max = -1;
7381 min = exp->exp_min - sub->exp_min;
7382 } else {
7383 max = -1;
7384 min = 0;
7385 }
7386 } else {
7387 if (exp->exp_max < sub->exp_max) {
7388#ifdef DEBUG_DERIV
7389 printf("loops max mismatch => forbidden\n");
7390#endif
7391 xmlExpFree(ctxt, tmp);
7392 return(forbiddenExp);
7393 }
7394 if (sub->exp_max > exp->exp_min)
7395 min = 0;
7396 else
7397 min = exp->exp_min - sub->exp_max;
7398 max = exp->exp_max - sub->exp_max;
7399 }
7400 }
7401#ifdef DEBUG_DERIV
7402 printf("loops match => SEQ(COUNT())\n");
7403#endif
7404 exp->exp_left->ref++;
7405 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7406 NULL, NULL, min, max);
7407 if (tmp2 == NULL) {
7408 return(NULL);
7409 }
7410 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7411 NULL, 0, 0);
7412 return(ret);
7413 }
7414 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7415 if (tmp == NULL)
7416 return(NULL);
7417 if (tmp == forbiddenExp) {
7418#ifdef DEBUG_DERIV
7419 printf("loop mismatch => forbidden\n");
7420#endif
7421 return(forbiddenExp);
7422 }
7423 if (exp->exp_min > 0)
7424 min = exp->exp_min - 1;
7425 else
7426 min = 0;
7427 if (exp->exp_max < 0)
7428 max = -1;
7429 else
7430 max = exp->exp_max - 1;
7431
7432#ifdef DEBUG_DERIV
7433 printf("loop match => SEQ(COUNT())\n");
7434#endif
7435 exp->exp_left->ref++;
7436 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7437 NULL, NULL, min, max);
7438 if (tmp2 == NULL)
7439 return(NULL);
7440 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7441 NULL, 0, 0);
7442 return(ret);
7443 }
7444 }
7445
Daniel Veillardccb4d412005-08-23 13:41:17 +00007446#ifdef DEBUG_DERIV
7447 printf("Fallback to derivative\n");
7448#endif
7449 if (IS_NILLABLE(sub)) {
7450 if (!(IS_NILLABLE(exp)))
7451 return(forbiddenExp);
7452 else
7453 ret = emptyExp;
7454 } else
7455 ret = NULL;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007456 /*
7457 * here the structured derivation made no progress so
7458 * we use the default token based derivation to force one more step
7459 */
7460 if (ctxt->tabSize == 0)
7461 ctxt->tabSize = 40;
7462
7463 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7464 sizeof(const xmlChar *));
7465 if (tab == NULL) {
7466 return(NULL);
7467 }
7468
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007469 /*
7470 * collect all the strings accepted by the subexpression on input
7471 */
7472 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7473 while (len < 0) {
7474 const xmlChar **temp;
Rob Richards54a8f672005-10-07 02:33:00 +00007475 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007476 sizeof(const xmlChar *));
7477 if (temp == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007478 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007479 return(NULL);
7480 }
7481 tab = temp;
7482 ctxt->tabSize *= 2;
7483 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7484 }
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007485 for (i = 0;i < len;i++) {
7486 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7487 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7488 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007489 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007490 return(tmp);
7491 }
7492 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7493 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7494 xmlExpFree(ctxt, tmp);
7495 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007496 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007497 return(tmp);
7498 }
7499 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7500 xmlExpFree(ctxt, tmp);
7501 xmlExpFree(ctxt, tmp2);
7502
7503 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7504 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007505 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007506 return(tmp3);
7507 }
7508
7509 if (ret == NULL)
7510 ret = tmp3;
7511 else {
7512 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7513 if (ret == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007514 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007515 return(NULL);
7516 }
7517 }
7518 }
Rob Richards54a8f672005-10-07 02:33:00 +00007519 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007520 return(ret);
7521}
7522
7523/**
Daniel Veillard0090bd52005-08-22 14:43:43 +00007524 * xmlExpExpDerive:
7525 * @ctxt: the expressions context
7526 * @exp: the englobing expression
7527 * @sub: the subexpression
7528 *
7529 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7530 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7531 * it usually tatkes less than linear time and can handle expressions generating
7532 * infinite languages.
7533 *
7534 * Returns the resulting expression or NULL in case of internal error, the
7535 * result must be freed
7536 */
7537xmlExpNodePtr
7538xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7539 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7540 return(NULL);
7541
7542 /*
7543 * O(1) speedups
7544 */
7545 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7546#ifdef DEBUG_DERIV
7547 printf("Sub nillable and not exp : can't subsume\n");
7548#endif
7549 return(forbiddenExp);
7550 }
7551 if (xmlExpCheckCard(exp, sub) == 0) {
7552#ifdef DEBUG_DERIV
7553 printf("sub generate longuer sequances than exp : can't subsume\n");
7554#endif
7555 return(forbiddenExp);
7556 }
7557 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7558}
7559
7560/**
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007561 * xmlExpSubsume:
7562 * @ctxt: the expressions context
7563 * @exp: the englobing expression
7564 * @sub: the subexpression
7565 *
7566 * Check whether @exp accepts all the languages accexpted by @sub
7567 * the input being a subexpression.
7568 *
7569 * Returns 1 if true 0 if false and -1 in case of failure.
7570 */
7571int
7572xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7573 xmlExpNodePtr tmp;
7574
7575 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7576 return(-1);
7577
7578 /*
7579 * TODO: speedup by checking the language of sub is a subset of the
7580 * language of exp
7581 */
7582 /*
7583 * O(1) speedups
7584 */
7585 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7586#ifdef DEBUG_DERIV
7587 printf("Sub nillable and not exp : can't subsume\n");
7588#endif
7589 return(0);
7590 }
7591 if (xmlExpCheckCard(exp, sub) == 0) {
7592#ifdef DEBUG_DERIV
7593 printf("sub generate longuer sequances than exp : can't subsume\n");
7594#endif
7595 return(0);
7596 }
7597 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7598#ifdef DEBUG_DERIV
7599 printf("Result derivation :\n");
7600 PRINT_EXP(tmp);
7601#endif
7602 if (tmp == NULL)
7603 return(-1);
7604 if (tmp == forbiddenExp)
7605 return(0);
7606 if (tmp == emptyExp)
7607 return(1);
7608 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7609 xmlExpFree(ctxt, tmp);
7610 return(1);
7611 }
7612 xmlExpFree(ctxt, tmp);
7613 return(0);
7614}
Daniel Veillard465a0002005-08-22 12:07:04 +00007615
7616/************************************************************************
7617 * *
7618 * Parsing expression *
7619 * *
7620 ************************************************************************/
7621
7622static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7623
/*
 * Cursor helpers for the expression parser. They all expect a variable
 * named "ctxt" in scope whose "cur" member points into the string being
 * parsed. Each #undef drops any earlier definition of the same name.
 *
 * NEXT and SKIP_BLANKS deliberately carry their own trailing ';' since
 * the parser code invokes them without one.
 */
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
/* The argument is parenthesized but still evaluated up to four times. */
#define IS_BLANK(c) (((c) == ' ') || ((c) == '\n') || \
                     ((c) == '\r') || ((c) == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7631
7632static int
7633xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7634 int ret = 0;
7635
7636 SKIP_BLANKS
7637 if (CUR == '*') {
7638 NEXT
7639 return(-1);
7640 }
7641 if ((CUR < '0') || (CUR > '9'))
7642 return(-1);
7643 while ((CUR >= '0') && (CUR <= '9')) {
7644 ret = ret * 10 + (CUR - '0');
7645 NEXT
7646 }
7647 return(ret);
7648}
7649
7650static xmlExpNodePtr
7651xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7652 const char *base;
7653 xmlExpNodePtr ret;
7654 const xmlChar *val;
7655
7656 SKIP_BLANKS
7657 base = ctxt->cur;
7658 if (*ctxt->cur == '(') {
7659 NEXT
7660 ret = xmlExpParseExpr(ctxt);
7661 SKIP_BLANKS
7662 if (*ctxt->cur != ')') {
7663 fprintf(stderr, "unbalanced '(' : %s\n", base);
7664 xmlExpFree(ctxt, ret);
7665 return(NULL);
7666 }
7667 NEXT;
7668 SKIP_BLANKS
7669 goto parse_quantifier;
7670 }
7671 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7672 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7673 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7674 NEXT;
7675 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7676 if (val == NULL)
7677 return(NULL);
7678 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7679 if (ret == NULL)
7680 return(NULL);
7681 SKIP_BLANKS
7682parse_quantifier:
7683 if (CUR == '{') {
7684 int min, max;
7685
7686 NEXT
7687 min = xmlExpParseNumber(ctxt);
7688 if (min < 0) {
7689 xmlExpFree(ctxt, ret);
7690 return(NULL);
7691 }
7692 SKIP_BLANKS
7693 if (CUR == ',') {
7694 NEXT
7695 max = xmlExpParseNumber(ctxt);
7696 SKIP_BLANKS
7697 } else
7698 max = min;
7699 if (CUR != '}') {
7700 xmlExpFree(ctxt, ret);
7701 return(NULL);
7702 }
7703 NEXT
7704 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7705 min, max);
7706 SKIP_BLANKS
7707 } else if (CUR == '?') {
7708 NEXT
7709 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7710 0, 1);
7711 SKIP_BLANKS
7712 } else if (CUR == '+') {
7713 NEXT
7714 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7715 1, -1);
7716 SKIP_BLANKS
7717 } else if (CUR == '*') {
7718 NEXT
7719 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7720 0, -1);
7721 SKIP_BLANKS
7722 }
7723 return(ret);
7724}
7725
7726
7727static xmlExpNodePtr
7728xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7729 xmlExpNodePtr ret, right;
7730
7731 ret = xmlExpParseOr(ctxt);
7732 SKIP_BLANKS
7733 while (CUR == '|') {
7734 NEXT
7735 right = xmlExpParseOr(ctxt);
7736 if (right == NULL) {
7737 xmlExpFree(ctxt, ret);
7738 return(NULL);
7739 }
7740 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7741 if (ret == NULL)
7742 return(NULL);
7743 }
7744 return(ret);
7745}
7746
7747static xmlExpNodePtr
7748xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
7749 xmlExpNodePtr ret, right;
7750
7751 ret = xmlExpParseSeq(ctxt);
7752 SKIP_BLANKS
7753 while (CUR == ',') {
7754 NEXT
7755 right = xmlExpParseSeq(ctxt);
7756 if (right == NULL) {
7757 xmlExpFree(ctxt, ret);
7758 return(NULL);
7759 }
7760 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
7761 if (ret == NULL)
7762 return(NULL);
7763 }
7764 return(ret);
7765}
7766
7767/**
7768 * xmlExpParse:
7769 * @ctxt: the expressions context
7770 * @expr: the 0 terminated string
7771 *
7772 * Minimal parser for regexps, it understand the following constructs
7773 * - string terminals
7774 * - choice operator |
7775 * - sequence operator ,
7776 * - subexpressions (...)
7777 * - usual cardinality operators + * and ?
7778 * - finite sequences { min, max }
7779 * - infinite sequences { min, * }
7780 * There is minimal checkings made especially no checking on strings values
7781 *
7782 * Returns a new expression or NULL in case of failure
7783 */
7784xmlExpNodePtr
7785xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
7786 xmlExpNodePtr ret;
7787
7788 ctxt->expr = expr;
7789 ctxt->cur = expr;
7790
7791 ret = xmlExpParseExpr(ctxt);
7792 SKIP_BLANKS
7793 if (*ctxt->cur != 0) {
7794 xmlExpFree(ctxt, ret);
7795 return(NULL);
7796 }
7797 return(ret);
7798}
7799
7800static void
7801xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
7802 xmlExpNodePtr c;
7803
7804 if (expr == NULL) return;
7805 if (glob) xmlBufferWriteChar(buf, "(");
7806 switch (expr->type) {
7807 case XML_EXP_EMPTY:
7808 xmlBufferWriteChar(buf, "empty");
7809 break;
7810 case XML_EXP_FORBID:
7811 xmlBufferWriteChar(buf, "forbidden");
7812 break;
7813 case XML_EXP_ATOM:
7814 xmlBufferWriteCHAR(buf, expr->exp_str);
7815 break;
7816 case XML_EXP_SEQ:
7817 c = expr->exp_left;
7818 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7819 xmlExpDumpInt(buf, c, 1);
7820 else
7821 xmlExpDumpInt(buf, c, 0);
7822 xmlBufferWriteChar(buf, " , ");
7823 c = expr->exp_right;
7824 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7825 xmlExpDumpInt(buf, c, 1);
7826 else
7827 xmlExpDumpInt(buf, c, 0);
7828 break;
7829 case XML_EXP_OR:
7830 c = expr->exp_left;
7831 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7832 xmlExpDumpInt(buf, c, 1);
7833 else
7834 xmlExpDumpInt(buf, c, 0);
7835 xmlBufferWriteChar(buf, " | ");
7836 c = expr->exp_right;
7837 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7838 xmlExpDumpInt(buf, c, 1);
7839 else
7840 xmlExpDumpInt(buf, c, 0);
7841 break;
7842 case XML_EXP_COUNT: {
7843 char rep[40];
7844
7845 c = expr->exp_left;
7846 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7847 xmlExpDumpInt(buf, c, 1);
7848 else
7849 xmlExpDumpInt(buf, c, 0);
7850 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
7851 rep[0] = '?';
7852 rep[1] = 0;
7853 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
7854 rep[0] = '*';
7855 rep[1] = 0;
7856 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
7857 rep[0] = '+';
7858 rep[1] = 0;
7859 } else if (expr->exp_max == expr->exp_min) {
7860 snprintf(rep, 39, "{%d}", expr->exp_min);
7861 } else if (expr->exp_max < 0) {
7862 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
7863 } else {
7864 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
7865 }
7866 rep[39] = 0;
7867 xmlBufferWriteChar(buf, rep);
7868 break;
7869 }
7870 default:
7871 fprintf(stderr, "Error in tree\n");
7872 }
7873 if (glob)
7874 xmlBufferWriteChar(buf, ")");
7875}
7876/**
7877 * xmlExpDump:
7878 * @buf: a buffer to receive the output
7879 * @expr: the compiled expression
7880 *
7881 * Serialize the expression as compiled to the buffer
7882 */
7883void
Daniel Veillard5eee7672005-08-22 21:22:27 +00007884xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
7885 if ((buf == NULL) || (expr == NULL))
Daniel Veillard465a0002005-08-22 12:07:04 +00007886 return;
Daniel Veillard5eee7672005-08-22 21:22:27 +00007887 xmlExpDumpInt(buf, expr, 0);
Daniel Veillard465a0002005-08-22 12:07:04 +00007888}
7889
7890/**
7891 * xmlExpMaxToken:
7892 * @expr: a compiled expression
7893 *
7894 * Indicate the maximum number of input a expression can accept
7895 *
7896 * Returns the maximum length or -1 in case of error
7897 */
7898int
7899xmlExpMaxToken(xmlExpNodePtr expr) {
7900 if (expr == NULL)
7901 return(-1);
7902 return(expr->c_max);
7903}
7904
7905/**
7906 * xmlExpCtxtNbNodes:
7907 * @ctxt: an expression context
7908 *
7909 * Debugging facility provides the number of allocated nodes at a that point
7910 *
7911 * Returns the number of nodes in use or -1 in case of error
7912 */
7913int
7914xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
7915 if (ctxt == NULL)
7916 return(-1);
7917 return(ctxt->nb_nodes);
7918}
7919
7920/**
7921 * xmlExpCtxtNbCons:
7922 * @ctxt: an expression context
7923 *
7924 * Debugging facility provides the number of allocated nodes over lifetime
7925 *
7926 * Returns the number of nodes ever allocated or -1 in case of error
7927 */
7928int
7929xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
7930 if (ctxt == NULL)
7931 return(-1);
7932 return(ctxt->nb_cons);
7933}
7934
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007935#endif /* LIBXML_EXPR_ENABLED */
Daniel Veillard5d4644e2005-04-01 13:11:58 +00007936#define bottom_xmlregexp
7937#include "elfgcchack.h"
Daniel Veillard4255d502002-04-16 15:50:10 +00007938#endif /* LIBXML_REGEXP_ENABLED */