blob: 6234a879f8d078c2d0014e4cfba251557bf795b1 [file] [log] [blame]
/*
 * regexp.c: generic and extensible Regular Expression engine
 *
 * Basically designed with the purpose of compiling regexps for
 * the variety of validation/schemas mechanisms now available in
 * XML related specifications; these include:
 *    - XML-1.0 DTD validation
 *    - XML Schemas structure part 1
 *    - XML Schemas Datatypes part 2 especially Appendix F
 *    - RELAX-NG/TREX i.e. the counter proposal
 *
 * See Copyright for the status of this software.
 *
 * Daniel Veillard <veillard@redhat.com>
 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
Daniel Veillardcee2b3a2005-01-25 00:22:52 +000022/* #define DEBUG_ERR */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +000023
Daniel Veillard4255d502002-04-16 15:50:10 +000024#include <stdio.h>
25#include <string.h>
Daniel Veillardebe48c62003-12-03 12:12:27 +000026#ifdef HAVE_LIMITS_H
27#include <limits.h>
28#endif
29
Daniel Veillard4255d502002-04-16 15:50:10 +000030#include <libxml/tree.h>
31#include <libxml/parserInternals.h>
32#include <libxml/xmlregexp.h>
33#include <libxml/xmlautomata.h>
34#include <libxml/xmlunicode.h>
35
/*
 * Fallback used only when no header provided INT_MAX (<limits.h> is
 * included above solely under HAVE_LIMITS_H).
 */
#ifndef INT_MAX
#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
#endif

/*
 * Compile-time debugging switches; all disabled by default.
 */
/* #define DEBUG_REGEXP_GRAPH */
/* #define DEBUG_REGEXP_EXEC */
/* #define DEBUG_PUSH */
/* #define DEBUG_COMPACTION */

/*
 * NOTE(review): presumably the upper bound checked against the nbPush
 * field of the execution context - confirm against the exec code.
 */
#define MAX_PUSH 10000000
Daniel Veillard94cc1032005-09-15 13:09:00 +000046
/*
 * ERROR:
 * @str:  the message describing the compilation failure
 *
 * Flag the current parser context as failed and raise a compile error.
 * The macro expects a variable named `ctxt` (the regexp parser context)
 * to be in scope at the point of use.
 *
 * The expansion is a single comma expression so that both actions stay
 * together when the macro is the body of an unbraced `if`.  The trailing
 * semicolon is kept on purpose so existing call sites, written with or
 * without their own `;`, keep compiling.
 */
#ifdef ERROR
#undef ERROR
#endif
#define ERROR(str)							\
    (ctxt->error = XML_REGEXP_COMPILE_ERROR,				\
     xmlRegexpErrCompile(ctxt, str));
/*
 * Cursor helpers for the regexp parser.  They all act on a variable
 * named `ctxt` that must be in scope where the macro is expanded.
 */

/* Current byte under the cursor, and the byte `index` positions away. */
#define CUR (*(ctxt->cur))
#define NXT(index) (ctxt->cur[index])

/* Advance the cursor by one byte. */
#define NEXT ctxt->cur++

/* Decode the character at `s`, storing its length in `l`. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)

/* Advance the cursor by `l` bytes (the expansion carries its own `;`). */
#define NEXTL(l) ctxt->cur += l;

#define XML_REG_STRING_SEPARATOR '|'

/*
 * Need PREV to check on a '-' within a Character Group. May only be used
 * when it's guaranteed that cur is not at the beginning of ctxt->string!
 */
#define PREV (ctxt->cur[-1])
Daniel Veillard4255d502002-04-16 15:50:10 +000065
/**
 * TODO:
 *
 * Macro flagging an unimplemented block: reports the source file and
 * line of the expansion through xmlGenericError.  The expansion already
 * ends with a semicolon.
 */
#define TODO								\
    xmlGenericError(xmlGenericErrorContext,				\
                    "Unimplemented block at %s:%d\n",			\
                    __FILE__, __LINE__);
75
Daniel Veillard4255d502002-04-16 15:50:10 +000076/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +080077 * *
78 * Datatypes and structures *
79 * *
Daniel Veillard4255d502002-04-16 15:50:10 +000080 ************************************************************************/
81
/*
 * Kinds of atoms (and of ranges inside a character group).
 *
 * Note: the order of the enums below is significant, do not shuffle
 */
typedef enum {
    /* structural atoms and escapes, numbered from 1 */
    XML_REGEXP_EPSILON = 1,
    XML_REGEXP_CHARVAL,
    XML_REGEXP_RANGES,
    XML_REGEXP_SUBREG,          /* used for () sub regexps */
    XML_REGEXP_STRING,
    XML_REGEXP_ANYCHAR,         /* . */
    XML_REGEXP_ANYSPACE,        /* \s */
    XML_REGEXP_NOTSPACE,        /* \S */
    XML_REGEXP_INITNAME,        /* \l */
    XML_REGEXP_NOTINITNAME,     /* \L */
    XML_REGEXP_NAMECHAR,        /* \c */
    XML_REGEXP_NOTNAMECHAR,     /* \C */
    XML_REGEXP_DECIMAL,         /* \d */
    XML_REGEXP_NOTDECIMAL,      /* \D */
    XML_REGEXP_REALCHAR,        /* \w */
    XML_REGEXP_NOTREALCHAR,     /* \W */

    /* character classes, numbered from 100 */
    XML_REGEXP_LETTER = 100,
    XML_REGEXP_LETTER_UPPERCASE,
    XML_REGEXP_LETTER_LOWERCASE,
    XML_REGEXP_LETTER_TITLECASE,
    XML_REGEXP_LETTER_MODIFIER,
    XML_REGEXP_LETTER_OTHERS,

    XML_REGEXP_MARK,
    XML_REGEXP_MARK_NONSPACING,
    XML_REGEXP_MARK_SPACECOMBINING,
    XML_REGEXP_MARK_ENCLOSING,

    XML_REGEXP_NUMBER,
    XML_REGEXP_NUMBER_DECIMAL,
    XML_REGEXP_NUMBER_LETTER,
    XML_REGEXP_NUMBER_OTHERS,

    XML_REGEXP_PUNCT,
    XML_REGEXP_PUNCT_CONNECTOR,
    XML_REGEXP_PUNCT_DASH,
    XML_REGEXP_PUNCT_OPEN,
    XML_REGEXP_PUNCT_CLOSE,
    XML_REGEXP_PUNCT_INITQUOTE,
    XML_REGEXP_PUNCT_FINQUOTE,
    XML_REGEXP_PUNCT_OTHERS,

    XML_REGEXP_SEPAR,
    XML_REGEXP_SEPAR_SPACE,
    XML_REGEXP_SEPAR_LINE,
    XML_REGEXP_SEPAR_PARA,

    XML_REGEXP_SYMBOL,
    XML_REGEXP_SYMBOL_MATH,
    XML_REGEXP_SYMBOL_CURRENCY,
    XML_REGEXP_SYMBOL_MODIFIER,
    XML_REGEXP_SYMBOL_OTHERS,

    XML_REGEXP_OTHER,
    XML_REGEXP_OTHER_CONTROL,
    XML_REGEXP_OTHER_FORMAT,
    XML_REGEXP_OTHER_PRIVATE,
    XML_REGEXP_OTHER_NA,

    XML_REGEXP_BLOCK_NAME
} xmlRegAtomType;
140
/*
 * Quantifier attached to an atom.
 */
typedef enum {
    XML_REGEXP_QUANT_EPSILON  = 1,
    XML_REGEXP_QUANT_ONCE     = 2,
    XML_REGEXP_QUANT_OPT      = 3,
    XML_REGEXP_QUANT_MULT     = 4,
    XML_REGEXP_QUANT_PLUS     = 5,
    XML_REGEXP_QUANT_ONCEONLY = 6,
    XML_REGEXP_QUANT_ALL      = 7,
    XML_REGEXP_QUANT_RANGE    = 8
} xmlRegQuantType;
151
/*
 * Role of a state inside the automaton.
 */
typedef enum {
    XML_REGEXP_START_STATE   = 1,
    XML_REGEXP_FINAL_STATE   = 2,
    XML_REGEXP_TRANS_STATE   = 3,
    XML_REGEXP_SINK_STATE    = 4,
    XML_REGEXP_UNREACH_STATE = 5
} xmlRegStateType;
159
/*
 * Marker values stored in the mark/markd/reached fields of a state.
 */
typedef enum {
    XML_REGEXP_MARK_NORMAL  = 0,
    XML_REGEXP_MARK_START   = 1,
    XML_REGEXP_MARK_VISITED = 2
} xmlRegMarkedType;
165
166typedef struct _xmlRegRange xmlRegRange;
167typedef xmlRegRange *xmlRegRangePtr;
168
169struct _xmlRegRange {
Daniel Veillardf8b9de32003-11-24 14:27:26 +0000170 int neg; /* 0 normal, 1 not, 2 exclude */
Daniel Veillard4255d502002-04-16 15:50:10 +0000171 xmlRegAtomType type;
172 int start;
173 int end;
174 xmlChar *blockName;
175};
176
177typedef struct _xmlRegAtom xmlRegAtom;
178typedef xmlRegAtom *xmlRegAtomPtr;
179
180typedef struct _xmlAutomataState xmlRegState;
181typedef xmlRegState *xmlRegStatePtr;
182
183struct _xmlRegAtom {
184 int no;
185 xmlRegAtomType type;
186 xmlRegQuantType quant;
187 int min;
188 int max;
189
190 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000191 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000192 int neg;
193 int codepoint;
194 xmlRegStatePtr start;
Daniel Veillard76d59b62007-08-22 16:29:21 +0000195 xmlRegStatePtr start0;
Daniel Veillard4255d502002-04-16 15:50:10 +0000196 xmlRegStatePtr stop;
197 int maxRanges;
198 int nbRanges;
199 xmlRegRangePtr *ranges;
200 void *data;
201};
202
typedef struct _xmlRegCounter xmlRegCounter;
typedef xmlRegCounter *xmlRegCounterPtr;

/*
 * A counter: a pair of bounds.
 */
struct _xmlRegCounter {
    int min;
    int max;
};
210
211typedef struct _xmlRegTrans xmlRegTrans;
212typedef xmlRegTrans *xmlRegTransPtr;
213
214struct _xmlRegTrans {
215 xmlRegAtomPtr atom;
216 int to;
217 int counter;
218 int count;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000219 int nd;
Daniel Veillard4255d502002-04-16 15:50:10 +0000220};
221
222struct _xmlAutomataState {
223 xmlRegStateType type;
224 xmlRegMarkedType mark;
Daniel Veillard466fcda2012-08-27 12:03:40 +0800225 xmlRegMarkedType markd;
Daniel Veillard23e73572002-09-19 19:56:43 +0000226 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000227 int no;
Daniel Veillard4255d502002-04-16 15:50:10 +0000228 int maxTrans;
229 int nbTrans;
230 xmlRegTrans *trans;
Daniel Veillarddb68b742005-07-30 13:18:24 +0000231 /* knowing states ponting to us can speed things up */
232 int maxTransTo;
233 int nbTransTo;
234 int *transTo;
Daniel Veillard4255d502002-04-16 15:50:10 +0000235};
236
237typedef struct _xmlAutomata xmlRegParserCtxt;
238typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
239
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200240#define AM_AUTOMATA_RNG 1
241
Daniel Veillard4255d502002-04-16 15:50:10 +0000242struct _xmlAutomata {
243 xmlChar *string;
244 xmlChar *cur;
245
246 int error;
247 int neg;
248
249 xmlRegStatePtr start;
250 xmlRegStatePtr end;
251 xmlRegStatePtr state;
252
253 xmlRegAtomPtr atom;
254
255 int maxAtoms;
256 int nbAtoms;
257 xmlRegAtomPtr *atoms;
258
259 int maxStates;
260 int nbStates;
261 xmlRegStatePtr *states;
262
263 int maxCounters;
264 int nbCounters;
265 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000266
267 int determinist;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000268 int negs;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200269 int flags;
Daniel Veillard4255d502002-04-16 15:50:10 +0000270};
271
272struct _xmlRegexp {
273 xmlChar *string;
274 int nbStates;
275 xmlRegStatePtr *states;
276 int nbAtoms;
277 xmlRegAtomPtr *atoms;
278 int nbCounters;
279 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000280 int determinist;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200281 int flags;
Daniel Veillard23e73572002-09-19 19:56:43 +0000282 /*
283 * That's the compact form for determinists automatas
284 */
285 int nbstates;
286 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000287 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000288 int nbstrings;
289 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000290};
291
292typedef struct _xmlRegExecRollback xmlRegExecRollback;
293typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
294
295struct _xmlRegExecRollback {
296 xmlRegStatePtr state;/* the current state */
297 int index; /* the index in the input stack */
298 int nextbranch; /* the next transition to explore in that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000299 int *counts; /* save the automata state if it has some */
Daniel Veillard4255d502002-04-16 15:50:10 +0000300};
301
302typedef struct _xmlRegInputToken xmlRegInputToken;
303typedef xmlRegInputToken *xmlRegInputTokenPtr;
304
305struct _xmlRegInputToken {
306 xmlChar *value;
307 void *data;
308};
309
310struct _xmlRegExecCtxt {
311 int status; /* execution status != 0 indicate an error */
William M. Brackddf71d62004-05-06 04:17:26 +0000312 int determinist; /* did we find an indeterministic behaviour */
Daniel Veillard4255d502002-04-16 15:50:10 +0000313 xmlRegexpPtr comp; /* the compiled regexp */
314 xmlRegExecCallbacks callback;
315 void *data;
316
317 xmlRegStatePtr state;/* the current state */
318 int transno; /* the current transition on that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000319 int transcount; /* the number of chars in char counted transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +0000320
321 /*
322 * A stack of rollback states
323 */
324 int maxRollbacks;
325 int nbRollbacks;
326 xmlRegExecRollback *rollbacks;
327
328 /*
329 * The state of the automata if any
330 */
331 int *counts;
332
333 /*
334 * The input stack
335 */
336 int inputStackMax;
337 int inputStackNr;
338 int index;
339 int *charStack;
340 const xmlChar *inputString; /* when operating on characters */
341 xmlRegInputTokenPtr inputStack;/* when operating on strings */
342
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +0000343 /*
344 * error handling
345 */
346 int errStateNo; /* the error state number */
347 xmlRegStatePtr errState; /* the error state */
348 xmlChar *errString; /* the string raising the error */
349 int *errCounts; /* counters at the error state */
Daniel Veillard94cc1032005-09-15 13:09:00 +0000350 int nbPush;
Daniel Veillard4255d502002-04-16 15:50:10 +0000351};
352
/* Reserved counter identifiers; the two values are consecutive. */
#define REGEXP_ALL_COUNTER      0x123456
#define REGEXP_ALL_LAX_COUNTER  0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000355
Daniel Veillard4255d502002-04-16 15:50:10 +0000356static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000357static void xmlRegFreeState(xmlRegStatePtr state);
358static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard9efc4762005-07-19 14:33:55 +0000359static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
Daniel Veillard567a45b2005-10-18 19:11:55 +0000360static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
361static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
362 int neg, int start, int end, const xmlChar *blockName);
Daniel Veillard4255d502002-04-16 15:50:10 +0000363
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200364void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
365
Daniel Veillard4255d502002-04-16 15:50:10 +0000366/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000367 * *
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800368 * Regexp memory error handler *
Daniel Veillardff46a042003-10-08 08:53:17 +0000369 * *
370 ************************************************************************/
371/**
372 * xmlRegexpErrMemory:
William M. Brackddf71d62004-05-06 04:17:26 +0000373 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000374 *
375 * Handle an out of memory condition
376 */
377static void
378xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
379{
380 const char *regexp = NULL;
381 if (ctxt != NULL) {
382 regexp = (const char *) ctxt->string;
383 ctxt->error = XML_ERR_NO_MEMORY;
384 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
387 regexp, NULL, 0, 0,
388 "Memory allocation failed : %s\n", extra);
389}
390
391/**
392 * xmlRegexpErrCompile:
William M. Brackddf71d62004-05-06 04:17:26 +0000393 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000394 *
William M. Brackddf71d62004-05-06 04:17:26 +0000395 * Handle a compilation failure
Daniel Veillardff46a042003-10-08 08:53:17 +0000396 */
397static void
398xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
399{
400 const char *regexp = NULL;
401 int idx = 0;
402
403 if (ctxt != NULL) {
404 regexp = (const char *) ctxt->string;
405 idx = ctxt->cur - ctxt->string;
406 ctxt->error = XML_REGEXP_COMPILE_ERROR;
407 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
410 regexp, NULL, idx, 0,
411 "failed to compile: %s\n", extra);
412}
413
414/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800415 * *
416 * Allocation/Deallocation *
417 * *
Daniel Veillard4255d502002-04-16 15:50:10 +0000418 ************************************************************************/
419
Daniel Veillard23e73572002-09-19 19:56:43 +0000420static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000421/**
422 * xmlRegEpxFromParse:
423 * @ctxt: the parser context used to build it
424 *
William M. Brackddf71d62004-05-06 04:17:26 +0000425 * Allocate a new regexp and fill it with the result from the parser
Daniel Veillard4255d502002-04-16 15:50:10 +0000426 *
427 * Returns the new regexp or NULL in case of error
428 */
429static xmlRegexpPtr
430xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
431 xmlRegexpPtr ret;
432
433 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000434 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000435 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000436 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000437 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000438 memset(ret, 0, sizeof(xmlRegexp));
439 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000440 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000441 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000442 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000443 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000444 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000445 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000446 ret->determinist = ctxt->determinist;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200447 ret->flags = ctxt->flags;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000448 if (ret->determinist == -1) {
449 xmlRegexpIsDeterminist(ret);
450 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000451
452 if ((ret->determinist != 0) &&
453 (ret->nbCounters == 0) &&
Daniel Veillard6e65e152005-08-09 11:09:52 +0000454 (ctxt->negs == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000455 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000456 (ret->atoms[0] != NULL) &&
457 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
458 int i, j, nbstates = 0, nbatoms = 0;
459 int *stateRemap;
460 int *stringRemap;
461 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000462 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000463 xmlChar **stringMap;
464 xmlChar *value;
465
466 /*
467 * Switch to a compact representation
468 * 1/ counting the effective number of states left
William M. Brackddf71d62004-05-06 04:17:26 +0000469 * 2/ counting the unique number of atoms, and check that
Daniel Veillard23e73572002-09-19 19:56:43 +0000470 * they are all of the string type
471 * 3/ build a table state x atom for the transitions
472 */
473
474 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000475 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000476 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000477 xmlFree(ret);
478 return(NULL);
479 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000480 for (i = 0;i < ret->nbStates;i++) {
481 if (ret->states[i] != NULL) {
482 stateRemap[i] = nbstates;
483 nbstates++;
484 } else {
485 stateRemap[i] = -1;
486 }
487 }
488#ifdef DEBUG_COMPACTION
489 printf("Final: %d states\n", nbstates);
490#endif
491 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000492 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000493 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000494 xmlFree(stateRemap);
495 xmlFree(ret);
496 return(NULL);
497 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000498 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000499 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000500 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000501 xmlFree(stringMap);
502 xmlFree(stateRemap);
503 xmlFree(ret);
504 return(NULL);
505 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000506 for (i = 0;i < ret->nbAtoms;i++) {
507 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
508 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
509 value = ret->atoms[i]->valuep;
510 for (j = 0;j < nbatoms;j++) {
511 if (xmlStrEqual(stringMap[j], value)) {
512 stringRemap[i] = j;
513 break;
514 }
515 }
516 if (j >= nbatoms) {
517 stringRemap[i] = nbatoms;
518 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000519 if (stringMap[nbatoms] == NULL) {
520 for (i = 0;i < nbatoms;i++)
521 xmlFree(stringMap[i]);
522 xmlFree(stringRemap);
523 xmlFree(stringMap);
524 xmlFree(stateRemap);
525 xmlFree(ret);
526 return(NULL);
527 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000528 nbatoms++;
529 }
530 } else {
531 xmlFree(stateRemap);
532 xmlFree(stringRemap);
533 for (i = 0;i < nbatoms;i++)
534 xmlFree(stringMap[i]);
535 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000536 xmlFree(ret);
537 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000538 }
539 }
540#ifdef DEBUG_COMPACTION
541 printf("Final: %d atoms\n", nbatoms);
542#endif
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000543 transitions = (int *) xmlMalloc((nbstates + 1) *
544 (nbatoms + 1) * sizeof(int));
545 if (transitions == NULL) {
546 xmlFree(stateRemap);
547 xmlFree(stringRemap);
548 xmlFree(stringMap);
549 xmlFree(ret);
550 return(NULL);
551 }
552 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
Daniel Veillard23e73572002-09-19 19:56:43 +0000553
554 /*
555 * Allocate the transition table. The first entry for each
William M. Brackddf71d62004-05-06 04:17:26 +0000556 * state corresponds to the state type.
Daniel Veillard23e73572002-09-19 19:56:43 +0000557 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000558 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000559
560 for (i = 0;i < ret->nbStates;i++) {
561 int stateno, atomno, targetno, prev;
562 xmlRegStatePtr state;
563 xmlRegTransPtr trans;
564
565 stateno = stateRemap[i];
566 if (stateno == -1)
567 continue;
568 state = ret->states[i];
569
570 transitions[stateno * (nbatoms + 1)] = state->type;
571
572 for (j = 0;j < state->nbTrans;j++) {
573 trans = &(state->trans[j]);
574 if ((trans->to == -1) || (trans->atom == NULL))
575 continue;
576 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000577 if ((trans->atom->data != NULL) && (transdata == NULL)) {
578 transdata = (void **) xmlMalloc(nbstates * nbatoms *
579 sizeof(void *));
580 if (transdata != NULL)
581 memset(transdata, 0,
582 nbstates * nbatoms * sizeof(void *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000583 else {
Daniel Veillardff46a042003-10-08 08:53:17 +0000584 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000585 break;
586 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000587 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000588 targetno = stateRemap[trans->to];
589 /*
William M. Brackddf71d62004-05-06 04:17:26 +0000590 * if the same atom can generate transitions to 2 different
Daniel Veillard23e73572002-09-19 19:56:43 +0000591 * states then it means the automata is not determinist and
592 * the compact form can't be used !
593 */
594 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
595 if (prev != 0) {
596 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000597 ret->determinist = 0;
598#ifdef DEBUG_COMPACTION
599 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
600 i, j, trans->atom->no, trans->to, atomno, targetno);
601 printf(" previous to is %d\n", prev);
602#endif
Daniel Veillard118aed72002-09-24 14:13:13 +0000603 if (transdata != NULL)
604 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000605 xmlFree(transitions);
606 xmlFree(stateRemap);
607 xmlFree(stringRemap);
608 for (i = 0;i < nbatoms;i++)
609 xmlFree(stringMap[i]);
610 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000611 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000612 }
613 } else {
614#if 0
615 printf("State %d trans %d: atom %d to %d : %d to %d\n",
616 i, j, trans->atom->no, trans->to, atomno, targetno);
617#endif
618 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000619 targetno + 1; /* to avoid 0 */
620 if (transdata != NULL)
621 transdata[stateno * nbatoms + atomno] =
622 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000623 }
624 }
625 }
626 ret->determinist = 1;
627#ifdef DEBUG_COMPACTION
628 /*
629 * Debug
630 */
631 for (i = 0;i < nbstates;i++) {
632 for (j = 0;j < nbatoms + 1;j++) {
633 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
634 }
635 printf("\n");
636 }
637 printf("\n");
638#endif
639 /*
640 * Cleanup of the old data
641 */
642 if (ret->states != NULL) {
643 for (i = 0;i < ret->nbStates;i++)
644 xmlRegFreeState(ret->states[i]);
645 xmlFree(ret->states);
646 }
647 ret->states = NULL;
648 ret->nbStates = 0;
649 if (ret->atoms != NULL) {
650 for (i = 0;i < ret->nbAtoms;i++)
651 xmlRegFreeAtom(ret->atoms[i]);
652 xmlFree(ret->atoms);
653 }
654 ret->atoms = NULL;
655 ret->nbAtoms = 0;
656
657 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000658 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000659 ret->stringMap = stringMap;
660 ret->nbstrings = nbatoms;
661 ret->nbstates = nbstates;
662 xmlFree(stateRemap);
663 xmlFree(stringRemap);
664 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000665not_determ:
666 ctxt->string = NULL;
667 ctxt->nbStates = 0;
668 ctxt->states = NULL;
669 ctxt->nbAtoms = 0;
670 ctxt->atoms = NULL;
671 ctxt->nbCounters = 0;
672 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000673 return(ret);
674}
675
676/**
677 * xmlRegNewParserCtxt:
678 * @string: the string to parse
679 *
680 * Allocate a new regexp parser context
681 *
682 * Returns the new context or NULL in case of error
683 */
684static xmlRegParserCtxtPtr
685xmlRegNewParserCtxt(const xmlChar *string) {
686 xmlRegParserCtxtPtr ret;
687
688 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
689 if (ret == NULL)
690 return(NULL);
691 memset(ret, 0, sizeof(xmlRegParserCtxt));
692 if (string != NULL)
693 ret->string = xmlStrdup(string);
694 ret->cur = ret->string;
695 ret->neg = 0;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000696 ret->negs = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +0000697 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000698 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000699 return(ret);
700}
701
702/**
703 * xmlRegNewRange:
704 * @ctxt: the regexp parser context
705 * @neg: is that negative
706 * @type: the type of range
707 * @start: the start codepoint
708 * @end: the end codepoint
709 *
710 * Allocate a new regexp range
711 *
712 * Returns the new range or NULL in case of error
713 */
714static xmlRegRangePtr
715xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
716 int neg, xmlRegAtomType type, int start, int end) {
717 xmlRegRangePtr ret;
718
719 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
720 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000721 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000722 return(NULL);
723 }
724 ret->neg = neg;
725 ret->type = type;
726 ret->start = start;
727 ret->end = end;
728 return(ret);
729}
730
731/**
732 * xmlRegFreeRange:
733 * @range: the regexp range
734 *
735 * Free a regexp range
736 */
737static void
738xmlRegFreeRange(xmlRegRangePtr range) {
739 if (range == NULL)
740 return;
741
742 if (range->blockName != NULL)
743 xmlFree(range->blockName);
744 xmlFree(range);
745}
746
747/**
Daniel Veillard76d59b62007-08-22 16:29:21 +0000748 * xmlRegCopyRange:
749 * @range: the regexp range
750 *
751 * Copy a regexp range
752 *
753 * Returns the new copy or NULL in case of error.
754 */
755static xmlRegRangePtr
756xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
757 xmlRegRangePtr ret;
758
759 if (range == NULL)
760 return(NULL);
761
762 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
763 range->end);
764 if (ret == NULL)
765 return(NULL);
766 if (range->blockName != NULL) {
767 ret->blockName = xmlStrdup(range->blockName);
768 if (ret->blockName == NULL) {
769 xmlRegexpErrMemory(ctxt, "allocating range");
770 xmlRegFreeRange(ret);
771 return(NULL);
772 }
773 }
774 return(ret);
775}
776
777/**
Daniel Veillard4255d502002-04-16 15:50:10 +0000778 * xmlRegNewAtom:
779 * @ctxt: the regexp parser context
780 * @type: the type of atom
781 *
Daniel Veillard76d59b62007-08-22 16:29:21 +0000782 * Allocate a new atom
Daniel Veillard4255d502002-04-16 15:50:10 +0000783 *
784 * Returns the new atom or NULL in case of error
785 */
786static xmlRegAtomPtr
787xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
788 xmlRegAtomPtr ret;
789
790 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
791 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000792 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000793 return(NULL);
794 }
795 memset(ret, 0, sizeof(xmlRegAtom));
796 ret->type = type;
797 ret->quant = XML_REGEXP_QUANT_ONCE;
798 ret->min = 0;
799 ret->max = 0;
800 return(ret);
801}
802
803/**
804 * xmlRegFreeAtom:
805 * @atom: the regexp atom
806 *
807 * Free a regexp atom
808 */
809static void
810xmlRegFreeAtom(xmlRegAtomPtr atom) {
811 int i;
812
813 if (atom == NULL)
814 return;
815
816 for (i = 0;i < atom->nbRanges;i++)
817 xmlRegFreeRange(atom->ranges[i]);
818 if (atom->ranges != NULL)
819 xmlFree(atom->ranges);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000820 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
821 xmlFree(atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +0000822 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
823 xmlFree(atom->valuep2);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000824 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +0000825 xmlFree(atom->valuep);
826 xmlFree(atom);
827}
828
Daniel Veillard76d59b62007-08-22 16:29:21 +0000829/**
830 * xmlRegCopyAtom:
831 * @ctxt: the regexp parser context
832 * @atom: the oiginal atom
833 *
834 * Allocate a new regexp range
835 *
836 * Returns the new atom or NULL in case of error
837 */
838static xmlRegAtomPtr
839xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
840 xmlRegAtomPtr ret;
841
842 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
843 if (ret == NULL) {
844 xmlRegexpErrMemory(ctxt, "copying atom");
845 return(NULL);
846 }
847 memset(ret, 0, sizeof(xmlRegAtom));
848 ret->type = atom->type;
849 ret->quant = atom->quant;
850 ret->min = atom->min;
851 ret->max = atom->max;
852 if (atom->nbRanges > 0) {
853 int i;
854
855 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
856 atom->nbRanges);
857 if (ret->ranges == NULL) {
858 xmlRegexpErrMemory(ctxt, "copying atom");
859 goto error;
860 }
861 for (i = 0;i < atom->nbRanges;i++) {
862 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
863 if (ret->ranges[i] == NULL)
864 goto error;
865 ret->nbRanges = i + 1;
866 }
867 }
868 return(ret);
869
870error:
871 xmlRegFreeAtom(ret);
872 return(NULL);
873}
874
Daniel Veillard4255d502002-04-16 15:50:10 +0000875static xmlRegStatePtr
876xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
877 xmlRegStatePtr ret;
878
879 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
880 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000881 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000882 return(NULL);
883 }
884 memset(ret, 0, sizeof(xmlRegState));
885 ret->type = XML_REGEXP_TRANS_STATE;
886 ret->mark = XML_REGEXP_MARK_NORMAL;
887 return(ret);
888}
889
890/**
891 * xmlRegFreeState:
892 * @state: the regexp state
893 *
894 * Free a regexp state
895 */
896static void
897xmlRegFreeState(xmlRegStatePtr state) {
898 if (state == NULL)
899 return;
900
901 if (state->trans != NULL)
902 xmlFree(state->trans);
Daniel Veillarddb68b742005-07-30 13:18:24 +0000903 if (state->transTo != NULL)
904 xmlFree(state->transTo);
Daniel Veillard4255d502002-04-16 15:50:10 +0000905 xmlFree(state);
906}
907
908/**
909 * xmlRegFreeParserCtxt:
910 * @ctxt: the regexp parser context
911 *
912 * Free a regexp parser context
913 */
914static void
915xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
916 int i;
917 if (ctxt == NULL)
918 return;
919
920 if (ctxt->string != NULL)
921 xmlFree(ctxt->string);
922 if (ctxt->states != NULL) {
923 for (i = 0;i < ctxt->nbStates;i++)
924 xmlRegFreeState(ctxt->states[i]);
925 xmlFree(ctxt->states);
926 }
927 if (ctxt->atoms != NULL) {
928 for (i = 0;i < ctxt->nbAtoms;i++)
929 xmlRegFreeAtom(ctxt->atoms[i]);
930 xmlFree(ctxt->atoms);
931 }
932 if (ctxt->counters != NULL)
933 xmlFree(ctxt->counters);
934 xmlFree(ctxt);
935}
936
937/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800938 * *
939 * Display of Data structures *
940 * *
Daniel Veillard4255d502002-04-16 15:50:10 +0000941 ************************************************************************/
942
943static void
944xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
945 switch (type) {
946 case XML_REGEXP_EPSILON:
947 fprintf(output, "epsilon "); break;
948 case XML_REGEXP_CHARVAL:
949 fprintf(output, "charval "); break;
950 case XML_REGEXP_RANGES:
951 fprintf(output, "ranges "); break;
952 case XML_REGEXP_SUBREG:
953 fprintf(output, "subexpr "); break;
954 case XML_REGEXP_STRING:
955 fprintf(output, "string "); break;
956 case XML_REGEXP_ANYCHAR:
957 fprintf(output, "anychar "); break;
958 case XML_REGEXP_ANYSPACE:
959 fprintf(output, "anyspace "); break;
960 case XML_REGEXP_NOTSPACE:
961 fprintf(output, "notspace "); break;
962 case XML_REGEXP_INITNAME:
963 fprintf(output, "initname "); break;
964 case XML_REGEXP_NOTINITNAME:
965 fprintf(output, "notinitname "); break;
966 case XML_REGEXP_NAMECHAR:
967 fprintf(output, "namechar "); break;
968 case XML_REGEXP_NOTNAMECHAR:
969 fprintf(output, "notnamechar "); break;
970 case XML_REGEXP_DECIMAL:
971 fprintf(output, "decimal "); break;
972 case XML_REGEXP_NOTDECIMAL:
973 fprintf(output, "notdecimal "); break;
974 case XML_REGEXP_REALCHAR:
975 fprintf(output, "realchar "); break;
976 case XML_REGEXP_NOTREALCHAR:
977 fprintf(output, "notrealchar "); break;
978 case XML_REGEXP_LETTER:
979 fprintf(output, "LETTER "); break;
980 case XML_REGEXP_LETTER_UPPERCASE:
981 fprintf(output, "LETTER_UPPERCASE "); break;
982 case XML_REGEXP_LETTER_LOWERCASE:
983 fprintf(output, "LETTER_LOWERCASE "); break;
984 case XML_REGEXP_LETTER_TITLECASE:
985 fprintf(output, "LETTER_TITLECASE "); break;
986 case XML_REGEXP_LETTER_MODIFIER:
987 fprintf(output, "LETTER_MODIFIER "); break;
988 case XML_REGEXP_LETTER_OTHERS:
989 fprintf(output, "LETTER_OTHERS "); break;
990 case XML_REGEXP_MARK:
991 fprintf(output, "MARK "); break;
992 case XML_REGEXP_MARK_NONSPACING:
993 fprintf(output, "MARK_NONSPACING "); break;
994 case XML_REGEXP_MARK_SPACECOMBINING:
995 fprintf(output, "MARK_SPACECOMBINING "); break;
996 case XML_REGEXP_MARK_ENCLOSING:
997 fprintf(output, "MARK_ENCLOSING "); break;
998 case XML_REGEXP_NUMBER:
999 fprintf(output, "NUMBER "); break;
1000 case XML_REGEXP_NUMBER_DECIMAL:
1001 fprintf(output, "NUMBER_DECIMAL "); break;
1002 case XML_REGEXP_NUMBER_LETTER:
1003 fprintf(output, "NUMBER_LETTER "); break;
1004 case XML_REGEXP_NUMBER_OTHERS:
1005 fprintf(output, "NUMBER_OTHERS "); break;
1006 case XML_REGEXP_PUNCT:
1007 fprintf(output, "PUNCT "); break;
1008 case XML_REGEXP_PUNCT_CONNECTOR:
1009 fprintf(output, "PUNCT_CONNECTOR "); break;
1010 case XML_REGEXP_PUNCT_DASH:
1011 fprintf(output, "PUNCT_DASH "); break;
1012 case XML_REGEXP_PUNCT_OPEN:
1013 fprintf(output, "PUNCT_OPEN "); break;
1014 case XML_REGEXP_PUNCT_CLOSE:
1015 fprintf(output, "PUNCT_CLOSE "); break;
1016 case XML_REGEXP_PUNCT_INITQUOTE:
1017 fprintf(output, "PUNCT_INITQUOTE "); break;
1018 case XML_REGEXP_PUNCT_FINQUOTE:
1019 fprintf(output, "PUNCT_FINQUOTE "); break;
1020 case XML_REGEXP_PUNCT_OTHERS:
1021 fprintf(output, "PUNCT_OTHERS "); break;
1022 case XML_REGEXP_SEPAR:
1023 fprintf(output, "SEPAR "); break;
1024 case XML_REGEXP_SEPAR_SPACE:
1025 fprintf(output, "SEPAR_SPACE "); break;
1026 case XML_REGEXP_SEPAR_LINE:
1027 fprintf(output, "SEPAR_LINE "); break;
1028 case XML_REGEXP_SEPAR_PARA:
1029 fprintf(output, "SEPAR_PARA "); break;
1030 case XML_REGEXP_SYMBOL:
1031 fprintf(output, "SYMBOL "); break;
1032 case XML_REGEXP_SYMBOL_MATH:
1033 fprintf(output, "SYMBOL_MATH "); break;
1034 case XML_REGEXP_SYMBOL_CURRENCY:
1035 fprintf(output, "SYMBOL_CURRENCY "); break;
1036 case XML_REGEXP_SYMBOL_MODIFIER:
1037 fprintf(output, "SYMBOL_MODIFIER "); break;
1038 case XML_REGEXP_SYMBOL_OTHERS:
1039 fprintf(output, "SYMBOL_OTHERS "); break;
1040 case XML_REGEXP_OTHER:
1041 fprintf(output, "OTHER "); break;
1042 case XML_REGEXP_OTHER_CONTROL:
1043 fprintf(output, "OTHER_CONTROL "); break;
1044 case XML_REGEXP_OTHER_FORMAT:
1045 fprintf(output, "OTHER_FORMAT "); break;
1046 case XML_REGEXP_OTHER_PRIVATE:
1047 fprintf(output, "OTHER_PRIVATE "); break;
1048 case XML_REGEXP_OTHER_NA:
1049 fprintf(output, "OTHER_NA "); break;
1050 case XML_REGEXP_BLOCK_NAME:
1051 fprintf(output, "BLOCK "); break;
1052 }
1053}
1054
1055static void
1056xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1057 switch (type) {
1058 case XML_REGEXP_QUANT_EPSILON:
1059 fprintf(output, "epsilon "); break;
1060 case XML_REGEXP_QUANT_ONCE:
1061 fprintf(output, "once "); break;
1062 case XML_REGEXP_QUANT_OPT:
1063 fprintf(output, "? "); break;
1064 case XML_REGEXP_QUANT_MULT:
1065 fprintf(output, "* "); break;
1066 case XML_REGEXP_QUANT_PLUS:
1067 fprintf(output, "+ "); break;
1068 case XML_REGEXP_QUANT_RANGE:
1069 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +00001070 case XML_REGEXP_QUANT_ONCEONLY:
1071 fprintf(output, "onceonly "); break;
1072 case XML_REGEXP_QUANT_ALL:
1073 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +00001074 }
1075}
1076static void
1077xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1078 fprintf(output, " range: ");
1079 if (range->neg)
1080 fprintf(output, "negative ");
1081 xmlRegPrintAtomType(output, range->type);
1082 fprintf(output, "%c - %c\n", range->start, range->end);
1083}
1084
1085static void
1086xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1087 fprintf(output, " atom: ");
1088 if (atom == NULL) {
1089 fprintf(output, "NULL\n");
1090 return;
1091 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00001092 if (atom->neg)
1093 fprintf(output, "not ");
Daniel Veillard4255d502002-04-16 15:50:10 +00001094 xmlRegPrintAtomType(output, atom->type);
1095 xmlRegPrintQuantType(output, atom->quant);
1096 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1097 fprintf(output, "%d-%d ", atom->min, atom->max);
1098 if (atom->type == XML_REGEXP_STRING)
1099 fprintf(output, "'%s' ", (char *) atom->valuep);
1100 if (atom->type == XML_REGEXP_CHARVAL)
1101 fprintf(output, "char %c\n", atom->codepoint);
1102 else if (atom->type == XML_REGEXP_RANGES) {
1103 int i;
1104 fprintf(output, "%d entries\n", atom->nbRanges);
1105 for (i = 0; i < atom->nbRanges;i++)
1106 xmlRegPrintRange(output, atom->ranges[i]);
1107 } else if (atom->type == XML_REGEXP_SUBREG) {
1108 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1109 } else {
1110 fprintf(output, "\n");
1111 }
1112}
1113
1114static void
1115xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1116 fprintf(output, " trans: ");
1117 if (trans == NULL) {
1118 fprintf(output, "NULL\n");
1119 return;
1120 }
1121 if (trans->to < 0) {
1122 fprintf(output, "removed\n");
1123 return;
1124 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001125 if (trans->nd != 0) {
1126 if (trans->nd == 2)
1127 fprintf(output, "last not determinist, ");
1128 else
1129 fprintf(output, "not determinist, ");
1130 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001131 if (trans->counter >= 0) {
1132 fprintf(output, "counted %d, ", trans->counter);
1133 }
Daniel Veillard8a001f62002-04-20 07:24:11 +00001134 if (trans->count == REGEXP_ALL_COUNTER) {
1135 fprintf(output, "all transition, ");
1136 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001137 fprintf(output, "count based %d, ", trans->count);
1138 }
1139 if (trans->atom == NULL) {
1140 fprintf(output, "epsilon to %d\n", trans->to);
1141 return;
1142 }
1143 if (trans->atom->type == XML_REGEXP_CHARVAL)
1144 fprintf(output, "char %c ", trans->atom->codepoint);
1145 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1146}
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001147
Daniel Veillard4255d502002-04-16 15:50:10 +00001148static void
1149xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1150 int i;
1151
1152 fprintf(output, " state: ");
1153 if (state == NULL) {
1154 fprintf(output, "NULL\n");
1155 return;
1156 }
1157 if (state->type == XML_REGEXP_START_STATE)
1158 fprintf(output, "START ");
1159 if (state->type == XML_REGEXP_FINAL_STATE)
1160 fprintf(output, "FINAL ");
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001161
Daniel Veillard4255d502002-04-16 15:50:10 +00001162 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1163 for (i = 0;i < state->nbTrans; i++) {
1164 xmlRegPrintTrans(output, &(state->trans[i]));
1165 }
1166}
1167
#ifdef DEBUG_REGEXP_GRAPH
/**
 * xmlRegPrintCtxt:
 * @output:  the stream to write to
 * @ctxt:  the regexp parser context, may be NULL
 *
 * Debugging routine, only compiled when DEBUG_REGEXP_GRAPH is defined.
 * Write to @output the string of @ctxt, its error and neg flags, its
 * atoms, the current atom if any, its states and its counters.
 */
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
    int idx;

    fputs(" ctxt: ", output);
    if (ctxt == NULL) {
        fputs("NULL\n", output);
        return;
    }

    fprintf(output, "'%s' ", ctxt->string);
    if (ctxt->error)
        fputs("error ", output);
    if (ctxt->neg)
        fputs("neg ", output);
    fputs("\n", output);

    /* atoms registered in the context, then the one being built */
    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
    for (idx = 0; idx < ctxt->nbAtoms; idx++) {
        fprintf(output, " %02d ", idx);
        xmlRegPrintAtom(output, ctxt->atoms[idx]);
    }
    if (ctxt->atom != NULL) {
        fputs("current atom:\n", output);
        xmlRegPrintAtom(output, ctxt->atom);
    }

    /* states, with the numbers of the start and end ones when set */
    fprintf(output, "%d states:", ctxt->nbStates);
    if (ctxt->start != NULL)
        fprintf(output, " start: %d", ctxt->start->no);
    if (ctxt->end != NULL)
        fprintf(output, " end: %d", ctxt->end->no);
    fputs("\n", output);
    for (idx = 0; idx < ctxt->nbStates; idx++)
        xmlRegPrintState(output, ctxt->states[idx]);

    fprintf(output, "%d counters:\n", ctxt->nbCounters);
    for (idx = 0; idx < ctxt->nbCounters; idx++)
        fprintf(output, " %d: min %d max %d\n", idx,
                ctxt->counters[idx].min, ctxt->counters[idx].max);
}
#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001209
1210/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001211 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00001212 * Finite Automata structures manipulations *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001213 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00001214 ************************************************************************/
1215
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001216static void
Daniel Veillard4255d502002-04-16 15:50:10 +00001217xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1218 int neg, xmlRegAtomType type, int start, int end,
1219 xmlChar *blockName) {
1220 xmlRegRangePtr range;
1221
1222 if (atom == NULL) {
1223 ERROR("add range: atom is NULL");
1224 return;
1225 }
1226 if (atom->type != XML_REGEXP_RANGES) {
1227 ERROR("add range: atom is not ranges");
1228 return;
1229 }
1230 if (atom->maxRanges == 0) {
1231 atom->maxRanges = 4;
1232 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1233 sizeof(xmlRegRangePtr));
1234 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001235 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001236 atom->maxRanges = 0;
1237 return;
1238 }
1239 } else if (atom->nbRanges >= atom->maxRanges) {
1240 xmlRegRangePtr *tmp;
1241 atom->maxRanges *= 2;
1242 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1243 sizeof(xmlRegRangePtr));
1244 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001245 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001246 atom->maxRanges /= 2;
1247 return;
1248 }
1249 atom->ranges = tmp;
1250 }
1251 range = xmlRegNewRange(ctxt, neg, type, start, end);
1252 if (range == NULL)
1253 return;
1254 range->blockName = blockName;
1255 atom->ranges[atom->nbRanges++] = range;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001256
Daniel Veillard4255d502002-04-16 15:50:10 +00001257}
1258
1259static int
1260xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1261 if (ctxt->maxCounters == 0) {
1262 ctxt->maxCounters = 4;
1263 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1264 sizeof(xmlRegCounter));
1265 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001266 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001267 ctxt->maxCounters = 0;
1268 return(-1);
1269 }
1270 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1271 xmlRegCounter *tmp;
1272 ctxt->maxCounters *= 2;
1273 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1274 sizeof(xmlRegCounter));
1275 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001276 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001277 ctxt->maxCounters /= 2;
1278 return(-1);
1279 }
1280 ctxt->counters = tmp;
1281 }
1282 ctxt->counters[ctxt->nbCounters].min = -1;
1283 ctxt->counters[ctxt->nbCounters].max = -1;
1284 return(ctxt->nbCounters++);
1285}
1286
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001287static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001288xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1289 if (atom == NULL) {
1290 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001291 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001292 }
1293 if (ctxt->maxAtoms == 0) {
1294 ctxt->maxAtoms = 4;
1295 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1296 sizeof(xmlRegAtomPtr));
1297 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001298 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001299 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001300 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001301 }
1302 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1303 xmlRegAtomPtr *tmp;
1304 ctxt->maxAtoms *= 2;
1305 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1306 sizeof(xmlRegAtomPtr));
1307 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001308 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001309 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001310 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001311 }
1312 ctxt->atoms = tmp;
1313 }
1314 atom->no = ctxt->nbAtoms;
1315 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001316 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001317}
1318
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001319static void
Daniel Veillarddb68b742005-07-30 13:18:24 +00001320xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1321 int from) {
1322 if (target->maxTransTo == 0) {
1323 target->maxTransTo = 8;
1324 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1325 sizeof(int));
1326 if (target->transTo == NULL) {
1327 xmlRegexpErrMemory(ctxt, "adding transition");
1328 target->maxTransTo = 0;
1329 return;
1330 }
1331 } else if (target->nbTransTo >= target->maxTransTo) {
1332 int *tmp;
1333 target->maxTransTo *= 2;
1334 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1335 sizeof(int));
1336 if (tmp == NULL) {
1337 xmlRegexpErrMemory(ctxt, "adding transition");
1338 target->maxTransTo /= 2;
1339 return;
1340 }
1341 target->transTo = tmp;
1342 }
1343 target->transTo[target->nbTransTo] = from;
1344 target->nbTransTo++;
1345}
1346
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001347static void
Daniel Veillard4255d502002-04-16 15:50:10 +00001348xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1349 xmlRegAtomPtr atom, xmlRegStatePtr target,
Daniel Veillard5de09382005-09-26 17:18:17 +00001350 int counter, int count) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001351
1352 int nrtrans;
1353
Daniel Veillard4255d502002-04-16 15:50:10 +00001354 if (state == NULL) {
1355 ERROR("add state: state is NULL");
1356 return;
1357 }
1358 if (target == NULL) {
1359 ERROR("add state: target is NULL");
1360 return;
1361 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001362 /*
1363 * Other routines follow the philosophy 'When in doubt, add a transition'
1364 * so we check here whether such a transition is already present and, if
1365 * so, silently ignore this request.
1366 */
1367
Daniel Veillard5de09382005-09-26 17:18:17 +00001368 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1369 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1370 if ((trans->atom == atom) &&
1371 (trans->to == target->no) &&
1372 (trans->counter == counter) &&
1373 (trans->count == count)) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001374#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard5de09382005-09-26 17:18:17 +00001375 printf("Ignoring duplicate transition from %d to %d\n",
1376 state->no, target->no);
William M. Brackf9b5fa22004-05-10 07:52:15 +00001377#endif
Daniel Veillard5de09382005-09-26 17:18:17 +00001378 return;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001379 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001380 }
1381
Daniel Veillard4255d502002-04-16 15:50:10 +00001382 if (state->maxTrans == 0) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001383 state->maxTrans = 8;
Daniel Veillard4255d502002-04-16 15:50:10 +00001384 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1385 sizeof(xmlRegTrans));
1386 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001387 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001388 state->maxTrans = 0;
1389 return;
1390 }
1391 } else if (state->nbTrans >= state->maxTrans) {
1392 xmlRegTrans *tmp;
1393 state->maxTrans *= 2;
1394 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1395 sizeof(xmlRegTrans));
1396 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001397 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001398 state->maxTrans /= 2;
1399 return;
1400 }
1401 state->trans = tmp;
1402 }
1403#ifdef DEBUG_REGEXP_GRAPH
1404 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001405 if (count == REGEXP_ALL_COUNTER)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001406 printf("all transition\n");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001407 else if (count >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001408 printf("count based %d\n", count);
Daniel Veillard4255d502002-04-16 15:50:10 +00001409 else if (counter >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001410 printf("counted %d\n", counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001411 else if (atom == NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001412 printf("epsilon transition\n");
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001413 else if (atom != NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001414 xmlRegPrintAtom(stdout, atom);
Daniel Veillard4255d502002-04-16 15:50:10 +00001415#endif
1416
1417 state->trans[state->nbTrans].atom = atom;
1418 state->trans[state->nbTrans].to = target->no;
1419 state->trans[state->nbTrans].counter = counter;
1420 state->trans[state->nbTrans].count = count;
Daniel Veillard567a45b2005-10-18 19:11:55 +00001421 state->trans[state->nbTrans].nd = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00001422 state->nbTrans++;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001423 xmlRegStateAddTransTo(ctxt, target, state->no);
Daniel Veillard4255d502002-04-16 15:50:10 +00001424}
1425
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001426static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001427xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001428 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001429 if (ctxt->maxStates == 0) {
1430 ctxt->maxStates = 4;
1431 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1432 sizeof(xmlRegStatePtr));
1433 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001434 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001435 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001436 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001437 }
1438 } else if (ctxt->nbStates >= ctxt->maxStates) {
1439 xmlRegStatePtr *tmp;
1440 ctxt->maxStates *= 2;
1441 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1442 sizeof(xmlRegStatePtr));
1443 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001444 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001445 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001446 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001447 }
1448 ctxt->states = tmp;
1449 }
1450 state->no = ctxt->nbStates;
1451 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001452 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001453}
1454
1455/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001456 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001457 * @ctxt: a regexp parser context
1458 * @from: the from state
1459 * @to: the target state or NULL for building a new one
1460 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001461 *
1462 */
1463static void
1464xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001465 xmlRegStatePtr from, xmlRegStatePtr to,
1466 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001467 if (to == NULL) {
1468 to = xmlRegNewState(ctxt);
1469 xmlRegStatePush(ctxt, to);
1470 ctxt->state = to;
1471 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001472 if (lax)
Daniel Veillard5de09382005-09-26 17:18:17 +00001473 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
Daniel Veillard441bc322002-04-20 17:38:48 +00001474 else
Daniel Veillard5de09382005-09-26 17:18:17 +00001475 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001476}
1477
1478/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001479 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001480 * @ctxt: a regexp parser context
1481 * @from: the from state
1482 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001483 *
1484 */
1485static void
1486xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1487 xmlRegStatePtr from, xmlRegStatePtr to) {
1488 if (to == NULL) {
1489 to = xmlRegNewState(ctxt);
1490 xmlRegStatePush(ctxt, to);
1491 ctxt->state = to;
1492 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001493 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001494}
1495
1496/**
1497 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001498 * @ctxt: a regexp parser context
1499 * @from: the from state
1500 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001501 * counter: the counter for that transition
1502 *
1503 */
1504static void
1505xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1506 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1507 if (to == NULL) {
1508 to = xmlRegNewState(ctxt);
1509 xmlRegStatePush(ctxt, to);
1510 ctxt->state = to;
1511 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001512 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001513}
1514
1515/**
1516 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001517 * @ctxt: a regexp parser context
1518 * @from: the from state
1519 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001520 * counter: the counter for that transition
1521 *
1522 */
1523static void
1524xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1525 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1526 if (to == NULL) {
1527 to = xmlRegNewState(ctxt);
1528 xmlRegStatePush(ctxt, to);
1529 ctxt->state = to;
1530 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001531 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001532}
1533
1534/**
1535 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001536 * @ctxt: a regexp parser context
1537 * @from: the from state
1538 * @to: the target state or NULL for building a new one
1539 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001540 *
William M. Brackddf71d62004-05-06 04:17:26 +00001541 * Returns 0 if success and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001542 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001543static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001544xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1545 xmlRegStatePtr to, xmlRegAtomPtr atom) {
Daniel Veillard10bda622008-03-13 07:27:24 +00001546 xmlRegStatePtr end;
Daniel Veillard34b35002016-05-09 09:28:38 +08001547 int nullable = 0;
Daniel Veillard10bda622008-03-13 07:27:24 +00001548
Daniel Veillard4255d502002-04-16 15:50:10 +00001549 if (atom == NULL) {
1550 ERROR("genrate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001551 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001552 }
1553 if (atom->type == XML_REGEXP_SUBREG) {
1554 /*
1555 * this is a subexpression handling one should not need to
William M. Brackddf71d62004-05-06 04:17:26 +00001556 * create a new node except for XML_REGEXP_QUANT_RANGE.
Daniel Veillard4255d502002-04-16 15:50:10 +00001557 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001558 if (xmlRegAtomPush(ctxt, atom) < 0) {
1559 return(-1);
1560 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001561 if ((to != NULL) && (atom->stop != to) &&
1562 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1563 /*
1564 * Generate an epsilon transition to link to the target
1565 */
1566 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
Daniel Veillardaa622012005-10-20 15:55:25 +00001567#ifdef DV
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001568 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
Daniel Veillardaa622012005-10-20 15:55:25 +00001569 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1570 to = xmlRegNewState(ctxt);
1571 xmlRegStatePush(ctxt, to);
1572 ctxt->state = to;
1573 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1574#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001575 }
1576 switch (atom->quant) {
1577 case XML_REGEXP_QUANT_OPT:
1578 atom->quant = XML_REGEXP_QUANT_ONCE;
Daniel Veillard54eb0242006-03-21 23:17:57 +00001579 /*
1580 * transition done to the state after end of atom.
1581 * 1. set transition from atom start to new state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001582 * 2. set transition from atom end to this state.
Daniel Veillard54eb0242006-03-21 23:17:57 +00001583 */
Daniel Veillardd80d0722009-08-22 18:56:01 +02001584 if (to == NULL) {
1585 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1586 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1587 ctxt->state);
1588 } else {
1589 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1590 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001591 break;
1592 case XML_REGEXP_QUANT_MULT:
1593 atom->quant = XML_REGEXP_QUANT_ONCE;
1594 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1595 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1596 break;
1597 case XML_REGEXP_QUANT_PLUS:
1598 atom->quant = XML_REGEXP_QUANT_ONCE;
1599 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1600 break;
1601 case XML_REGEXP_QUANT_RANGE: {
1602 int counter;
Daniel Veillard76d59b62007-08-22 16:29:21 +00001603 xmlRegStatePtr inter, newstate;
Daniel Veillard4255d502002-04-16 15:50:10 +00001604
1605 /*
Daniel Veillard76d59b62007-08-22 16:29:21 +00001606 * create the final state now if needed
Daniel Veillard4255d502002-04-16 15:50:10 +00001607 */
Daniel Veillard4255d502002-04-16 15:50:10 +00001608 if (to != NULL) {
1609 newstate = to;
1610 } else {
1611 newstate = xmlRegNewState(ctxt);
1612 xmlRegStatePush(ctxt, newstate);
Daniel Veillard4255d502002-04-16 15:50:10 +00001613 }
Daniel Veillard76d59b62007-08-22 16:29:21 +00001614
1615 /*
1616 * The principle here is to use counted transition
1617 * to avoid explosion in the number of states in the
1618 * graph. This is clearly more complex but should not
1619 * be exploitable at runtime.
Daniel Veillard54eb0242006-03-21 23:17:57 +00001620 */
Daniel Veillard76d59b62007-08-22 16:29:21 +00001621 if ((atom->min == 0) && (atom->start0 == NULL)) {
1622 xmlRegAtomPtr copy;
1623 /*
1624 * duplicate a transition based on atom to count next
1625 * occurences after 1. We cannot loop to atom->start
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001626 * directly because we need an epsilon transition to
Daniel Veillard76d59b62007-08-22 16:29:21 +00001627 * newstate.
1628 */
1629 /* ???? For some reason it seems we never reach that
1630 case, I suppose this got optimized out before when
1631 building the automata */
Daniel Veillardc821e032007-08-28 17:33:45 +00001632 copy = xmlRegCopyAtom(ctxt, atom);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001633 if (copy == NULL)
1634 return(-1);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001635 copy->quant = XML_REGEXP_QUANT_ONCE;
1636 copy->min = 0;
1637 copy->max = 0;
1638
1639 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1640 < 0)
1641 return(-1);
1642 inter = ctxt->state;
1643 counter = xmlRegGetCounter(ctxt);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01001644 if (counter < 0)
1645 return(-1);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001646 ctxt->counters[counter].min = atom->min - 1;
1647 ctxt->counters[counter].max = atom->max - 1;
1648 /* count the number of times we see it again */
1649 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1650 atom->stop, counter);
1651 /* allow a way out based on the count */
1652 xmlFAGenerateCountedTransition(ctxt, inter,
1653 newstate, counter);
1654 /* and also allow a direct exit for 0 */
1655 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1656 newstate);
1657 } else {
1658 /*
1659 * either we need the atom at least once or there
1660 * is an atom->start0 allowing to easilly plug the
1661 * epsilon transition.
1662 */
1663 counter = xmlRegGetCounter(ctxt);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01001664 if (counter < 0)
1665 return(-1);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001666 ctxt->counters[counter].min = atom->min - 1;
1667 ctxt->counters[counter].max = atom->max - 1;
1668 /* count the number of times we see it again */
1669 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1670 atom->start, counter);
1671 /* allow a way out based on the count */
1672 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1673 newstate, counter);
1674 /* and if needed allow a direct exit for 0 */
1675 if (atom->min == 0)
1676 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1677 newstate);
1678
1679 }
1680 atom->min = 0;
1681 atom->max = 0;
1682 atom->quant = XML_REGEXP_QUANT_ONCE;
1683 ctxt->state = newstate;
Daniel Veillard4255d502002-04-16 15:50:10 +00001684 }
1685 default:
1686 break;
1687 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001688 return(0);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001689 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001690 if ((atom->min == 0) && (atom->max == 0) &&
Daniel Veillard99c394d2005-07-14 12:58:49 +00001691 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1692 /*
1693 * we can discard the atom and generate an epsilon transition instead
1694 */
1695 if (to == NULL) {
1696 to = xmlRegNewState(ctxt);
1697 if (to != NULL)
1698 xmlRegStatePush(ctxt, to);
1699 else {
1700 return(-1);
1701 }
1702 }
1703 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1704 ctxt->state = to;
1705 xmlRegFreeAtom(atom);
1706 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00001707 }
1708 if (to == NULL) {
1709 to = xmlRegNewState(ctxt);
1710 if (to != NULL)
1711 xmlRegStatePush(ctxt, to);
1712 else {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001713 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001714 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001715 }
Daniel Veillard10bda622008-03-13 07:27:24 +00001716 end = to;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001717 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
Daniel Veillard10bda622008-03-13 07:27:24 +00001718 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1719 /*
1720 * Do not pollute the target state by adding transitions from
1721 * it as it is likely to be the shared target of multiple branches.
1722 * So isolate with an epsilon transition.
1723 */
1724 xmlRegStatePtr tmp;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001725
Daniel Veillard10bda622008-03-13 07:27:24 +00001726 tmp = xmlRegNewState(ctxt);
1727 if (tmp != NULL)
1728 xmlRegStatePush(ctxt, tmp);
1729 else {
1730 return(-1);
1731 }
1732 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1733 to = tmp;
Daniel Veillard4255d502002-04-16 15:50:10 +00001734 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001735 if (xmlRegAtomPush(ctxt, atom) < 0) {
1736 return(-1);
1737 }
Daniel Veillard34b35002016-05-09 09:28:38 +08001738 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1739 (atom->min == 0) && (atom->max > 0)) {
1740 nullable = 1;
1741 atom->min = 1;
1742 if (atom->max == 1)
1743 atom->quant = XML_REGEXP_QUANT_OPT;
1744 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001745 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
Daniel Veillard10bda622008-03-13 07:27:24 +00001746 ctxt->state = end;
Daniel Veillard4255d502002-04-16 15:50:10 +00001747 switch (atom->quant) {
1748 case XML_REGEXP_QUANT_OPT:
1749 atom->quant = XML_REGEXP_QUANT_ONCE;
1750 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1751 break;
1752 case XML_REGEXP_QUANT_MULT:
1753 atom->quant = XML_REGEXP_QUANT_ONCE;
1754 xmlFAGenerateEpsilonTransition(ctxt, from, to);
Daniel Veillard5de09382005-09-26 17:18:17 +00001755 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001756 break;
1757 case XML_REGEXP_QUANT_PLUS:
1758 atom->quant = XML_REGEXP_QUANT_ONCE;
Daniel Veillard5de09382005-09-26 17:18:17 +00001759 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001760 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001761 case XML_REGEXP_QUANT_RANGE:
Daniel Veillard34b35002016-05-09 09:28:38 +08001762 if (nullable)
William M. Brack56578372007-04-11 14:33:46 +00001763 xmlFAGenerateEpsilonTransition(ctxt, from, to);
William M. Brack56578372007-04-11 14:33:46 +00001764 break;
Daniel Veillard4255d502002-04-16 15:50:10 +00001765 default:
1766 break;
1767 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001768 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001769}
1770
1771/**
1772 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001773 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001774 * @fromnr: the from state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001775 * @tonr: the to state
William M. Brackddf71d62004-05-06 04:17:26 +00001776 * @counter: should that transition be associated to a counted
Daniel Veillard4255d502002-04-16 15:50:10 +00001777 *
1778 */
1779static void
1780xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1781 int tonr, int counter) {
1782 int transnr;
1783 xmlRegStatePtr from;
1784 xmlRegStatePtr to;
1785
1786#ifdef DEBUG_REGEXP_GRAPH
1787 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1788#endif
1789 from = ctxt->states[fromnr];
1790 if (from == NULL)
1791 return;
1792 to = ctxt->states[tonr];
1793 if (to == NULL)
1794 return;
1795 if ((to->mark == XML_REGEXP_MARK_START) ||
1796 (to->mark == XML_REGEXP_MARK_VISITED))
1797 return;
1798
1799 to->mark = XML_REGEXP_MARK_VISITED;
1800 if (to->type == XML_REGEXP_FINAL_STATE) {
1801#ifdef DEBUG_REGEXP_GRAPH
1802 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1803#endif
1804 from->type = XML_REGEXP_FINAL_STATE;
1805 }
1806 for (transnr = 0;transnr < to->nbTrans;transnr++) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001807 if (to->trans[transnr].to < 0)
1808 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00001809 if (to->trans[transnr].atom == NULL) {
1810 /*
1811 * Don't remove counted transitions
1812 * Don't loop either
1813 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001814 if (to->trans[transnr].to != fromnr) {
1815 if (to->trans[transnr].count >= 0) {
1816 int newto = to->trans[transnr].to;
1817
1818 xmlRegStateAddTrans(ctxt, from, NULL,
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001819 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001820 -1, to->trans[transnr].count);
Daniel Veillardb509f152002-04-17 16:28:10 +00001821 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001822#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001823 printf("Found epsilon trans %d from %d to %d\n",
1824 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001825#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001826 if (to->trans[transnr].counter >= 0) {
1827 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1828 to->trans[transnr].to,
1829 to->trans[transnr].counter);
1830 } else {
1831 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1832 to->trans[transnr].to,
1833 counter);
1834 }
1835 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001836 }
1837 } else {
1838 int newto = to->trans[transnr].to;
1839
Daniel Veillardb509f152002-04-17 16:28:10 +00001840 if (to->trans[transnr].counter >= 0) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001841 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1842 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001843 to->trans[transnr].counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001844 } else {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001845 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
Daniel Veillard5de09382005-09-26 17:18:17 +00001846 ctxt->states[newto], counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001847 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001848 }
1849 }
1850 to->mark = XML_REGEXP_MARK_NORMAL;
1851}
1852
1853/**
Daniel Veillarddb68b742005-07-30 13:18:24 +00001854 * xmlFAEliminateSimpleEpsilonTransitions:
1855 * @ctxt: a regexp parser context
1856 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001857 * Eliminating general epsilon transitions can get costly in the general
Daniel Veillarddb68b742005-07-30 13:18:24 +00001858 * algorithm due to the large amount of generated new transitions and
1859 * associated comparisons. However for simple epsilon transition used just
1860 * to separate building blocks when generating the automata this can be
1861 * reduced to state elimination:
1862 * - if there exists an epsilon from X to Y
1863 * - if there is no other transition from X
1864 * then X and Y are semantically equivalent and X can be eliminated
1865 * If X is the start state then make Y the start state, else replace the
1866 * target of all transitions to X by transitions to Y.
1867 */
1868static void
1869xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1870 int statenr, i, j, newto;
1871 xmlRegStatePtr state, tmp;
1872
1873 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1874 state = ctxt->states[statenr];
1875 if (state == NULL)
1876 continue;
1877 if (state->nbTrans != 1)
1878 continue;
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001879 if (state->type == XML_REGEXP_UNREACH_STATE)
1880 continue;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001881 /* is the only transition out a basic transition */
1882 if ((state->trans[0].atom == NULL) &&
1883 (state->trans[0].to >= 0) &&
1884 (state->trans[0].to != statenr) &&
1885 (state->trans[0].counter < 0) &&
1886 (state->trans[0].count < 0)) {
1887 newto = state->trans[0].to;
1888
1889 if (state->type == XML_REGEXP_START_STATE) {
1890#ifdef DEBUG_REGEXP_GRAPH
1891 printf("Found simple epsilon trans from start %d to %d\n",
1892 statenr, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001893#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00001894 } else {
1895#ifdef DEBUG_REGEXP_GRAPH
1896 printf("Found simple epsilon trans from %d to %d\n",
1897 statenr, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001898#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00001899 for (i = 0;i < state->nbTransTo;i++) {
1900 tmp = ctxt->states[state->transTo[i]];
1901 for (j = 0;j < tmp->nbTrans;j++) {
1902 if (tmp->trans[j].to == statenr) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001903#ifdef DEBUG_REGEXP_GRAPH
1904 printf("Changed transition %d on %d to go to %d\n",
1905 j, tmp->no, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001906#endif
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001907 tmp->trans[j].to = -1;
1908 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001909 ctxt->states[newto],
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001910 tmp->trans[j].counter,
1911 tmp->trans[j].count);
Daniel Veillarddb68b742005-07-30 13:18:24 +00001912 }
1913 }
1914 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00001915 if (state->type == XML_REGEXP_FINAL_STATE)
1916 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1917 /* eliminate the transition completely */
1918 state->nbTrans = 0;
1919
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001920 state->type = XML_REGEXP_UNREACH_STATE;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001921
1922 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001923
Daniel Veillarddb68b742005-07-30 13:18:24 +00001924 }
1925 }
1926}
1927/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001928 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001929 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001930 *
1931 */
1932static void
1933xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1934 int statenr, transnr;
1935 xmlRegStatePtr state;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001936 int has_epsilon;
Daniel Veillard4255d502002-04-16 15:50:10 +00001937
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001938 if (ctxt->states == NULL) return;
1939
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001940 /*
1941 * Eliminate simple epsilon transition and the associated unreachable
1942 * states.
1943 */
Daniel Veillarddb68b742005-07-30 13:18:24 +00001944 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001945 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1946 state = ctxt->states[statenr];
1947 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1948#ifdef DEBUG_REGEXP_GRAPH
1949 printf("Removed unreachable state %d\n", statenr);
1950#endif
1951 xmlRegFreeState(state);
1952 ctxt->states[statenr] = NULL;
1953 }
1954 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00001955
1956 has_epsilon = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001957
Daniel Veillard4255d502002-04-16 15:50:10 +00001958 /*
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00001959 * Build the completed transitions bypassing the epsilons
Daniel Veillard4255d502002-04-16 15:50:10 +00001960 * Use a marking algorithm to avoid loops
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00001961 * Mark sink states too.
1962 * Process from the latests states backward to the start when
1963 * there is long cascading epsilon chains this minimize the
1964 * recursions and transition compares when adding the new ones
Daniel Veillard4255d502002-04-16 15:50:10 +00001965 */
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00001966 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001967 state = ctxt->states[statenr];
1968 if (state == NULL)
1969 continue;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001970 if ((state->nbTrans == 0) &&
1971 (state->type != XML_REGEXP_FINAL_STATE)) {
1972 state->type = XML_REGEXP_SINK_STATE;
1973 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001974 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1975 if ((state->trans[transnr].atom == NULL) &&
1976 (state->trans[transnr].to >= 0)) {
1977 if (state->trans[transnr].to == statenr) {
1978 state->trans[transnr].to = -1;
1979#ifdef DEBUG_REGEXP_GRAPH
1980 printf("Removed loopback epsilon trans %d on %d\n",
1981 transnr, statenr);
1982#endif
1983 } else if (state->trans[transnr].count < 0) {
1984 int newto = state->trans[transnr].to;
1985
1986#ifdef DEBUG_REGEXP_GRAPH
1987 printf("Found epsilon trans %d from %d to %d\n",
1988 transnr, statenr, newto);
1989#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00001990 has_epsilon = 1;
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001991 state->trans[transnr].to = -2;
1992 state->mark = XML_REGEXP_MARK_START;
Daniel Veillard4255d502002-04-16 15:50:10 +00001993 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1994 newto, state->trans[transnr].counter);
1995 state->mark = XML_REGEXP_MARK_NORMAL;
1996#ifdef DEBUG_REGEXP_GRAPH
1997 } else {
1998 printf("Found counted transition %d on %d\n",
1999 transnr, statenr);
2000#endif
2001 }
2002 }
2003 }
2004 }
2005 /*
2006 * Eliminate the epsilon transitions
2007 */
Daniel Veillarddb68b742005-07-30 13:18:24 +00002008 if (has_epsilon) {
2009 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2010 state = ctxt->states[statenr];
2011 if (state == NULL)
2012 continue;
2013 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2014 xmlRegTransPtr trans = &(state->trans[transnr]);
2015 if ((trans->atom == NULL) &&
2016 (trans->count < 0) &&
2017 (trans->to >= 0)) {
2018 trans->to = -1;
2019 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002020 }
2021 }
2022 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002023
2024 /*
2025 * Use this pass to detect unreachable states too
2026 */
2027 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2028 state = ctxt->states[statenr];
2029 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00002030 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00002031 }
2032 state = ctxt->states[0];
2033 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00002034 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00002035 while (state != NULL) {
2036 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00002037 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00002038 /*
William M. Brackddf71d62004-05-06 04:17:26 +00002039 * Mark all states reachable from the current reachable state
Daniel Veillard23e73572002-09-19 19:56:43 +00002040 */
2041 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2042 if ((state->trans[transnr].to >= 0) &&
2043 ((state->trans[transnr].atom != NULL) ||
2044 (state->trans[transnr].count >= 0))) {
2045 int newto = state->trans[transnr].to;
2046
2047 if (ctxt->states[newto] == NULL)
2048 continue;
William M. Brack779af002003-08-01 15:55:39 +00002049 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2050 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00002051 target = ctxt->states[newto];
2052 }
2053 }
2054 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00002055
Daniel Veillard23e73572002-09-19 19:56:43 +00002056 /*
2057 * find the next accessible state not explored
2058 */
2059 if (target == NULL) {
2060 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2061 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00002062 if ((state != NULL) && (state->reached ==
2063 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00002064 target = state;
2065 break;
2066 }
2067 }
2068 }
2069 state = target;
2070 }
2071 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2072 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00002073 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00002074#ifdef DEBUG_REGEXP_GRAPH
2075 printf("Removed unreachable state %d\n", statenr);
2076#endif
2077 xmlRegFreeState(state);
2078 ctxt->states[statenr] = NULL;
2079 }
2080 }
2081
Daniel Veillard4255d502002-04-16 15:50:10 +00002082}
2083
Daniel Veillard567a45b2005-10-18 19:11:55 +00002084static int
2085xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2086 int ret = 0;
2087
2088 if ((range1->type == XML_REGEXP_RANGES) ||
2089 (range2->type == XML_REGEXP_RANGES) ||
2090 (range2->type == XML_REGEXP_SUBREG) ||
2091 (range1->type == XML_REGEXP_SUBREG) ||
2092 (range1->type == XML_REGEXP_STRING) ||
2093 (range2->type == XML_REGEXP_STRING))
2094 return(-1);
2095
2096 /* put them in order */
2097 if (range1->type > range2->type) {
2098 xmlRegRangePtr tmp;
2099
2100 tmp = range1;
2101 range1 = range2;
2102 range2 = tmp;
2103 }
2104 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2105 (range2->type == XML_REGEXP_ANYCHAR)) {
2106 ret = 1;
2107 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2108 (range2->type == XML_REGEXP_EPSILON)) {
2109 return(0);
2110 } else if (range1->type == range2->type) {
Daniel Veillard9332b482009-09-23 18:28:43 +02002111 if (range1->type != XML_REGEXP_CHARVAL)
2112 ret = 1;
2113 else if ((range1->end < range2->start) ||
2114 (range2->end < range1->start))
Daniel Veillard567a45b2005-10-18 19:11:55 +00002115 ret = 0;
Daniel Veillard9332b482009-09-23 18:28:43 +02002116 else
2117 ret = 1;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002118 } else if (range1->type == XML_REGEXP_CHARVAL) {
2119 int codepoint;
2120 int neg = 0;
2121
2122 /*
2123 * just check all codepoints in the range for acceptance,
2124 * this is usually way cheaper since done only once at
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002125 * compilation than testing over and over at runtime or
Daniel Veillard567a45b2005-10-18 19:11:55 +00002126 * pushing too many states when evaluating.
2127 */
2128 if (((range1->neg == 0) && (range2->neg != 0)) ||
2129 ((range1->neg != 0) && (range2->neg == 0)))
2130 neg = 1;
2131
2132 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2133 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2134 0, range2->start, range2->end,
2135 range2->blockName);
2136 if (ret < 0)
2137 return(-1);
2138 if (((neg == 1) && (ret == 0)) ||
2139 ((neg == 0) && (ret == 1)))
2140 return(1);
2141 }
2142 return(0);
2143 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2144 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2145 if (range1->type == range2->type) {
2146 ret = xmlStrEqual(range1->blockName, range2->blockName);
2147 } else {
2148 /*
2149 * comparing a block range with anything else is way
2150 * too costly, and maintining the table is like too much
2151 * memory too, so let's force the automata to save state
2152 * here.
2153 */
2154 return(1);
2155 }
2156 } else if ((range1->type < XML_REGEXP_LETTER) ||
2157 (range2->type < XML_REGEXP_LETTER)) {
2158 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2159 (range2->type == XML_REGEXP_NOTSPACE))
2160 ret = 0;
2161 else if ((range1->type == XML_REGEXP_INITNAME) &&
2162 (range2->type == XML_REGEXP_NOTINITNAME))
2163 ret = 0;
2164 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2165 (range2->type == XML_REGEXP_NOTNAMECHAR))
2166 ret = 0;
2167 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2168 (range2->type == XML_REGEXP_NOTDECIMAL))
2169 ret = 0;
2170 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2171 (range2->type == XML_REGEXP_NOTREALCHAR))
2172 ret = 0;
2173 else {
2174 /* same thing to limit complexity */
2175 return(1);
2176 }
2177 } else {
2178 ret = 0;
2179 /* range1->type < range2->type here */
2180 switch (range1->type) {
2181 case XML_REGEXP_LETTER:
2182 /* all disjoint except in the subgroups */
2183 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2184 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2185 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2186 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2187 (range2->type == XML_REGEXP_LETTER_OTHERS))
2188 ret = 1;
2189 break;
2190 case XML_REGEXP_MARK:
2191 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2192 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2193 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2194 ret = 1;
2195 break;
2196 case XML_REGEXP_NUMBER:
2197 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2198 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2199 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2200 ret = 1;
2201 break;
2202 case XML_REGEXP_PUNCT:
2203 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2204 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2205 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2206 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2207 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2208 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2209 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2210 ret = 1;
2211 break;
2212 case XML_REGEXP_SEPAR:
2213 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2214 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2215 (range2->type == XML_REGEXP_SEPAR_PARA))
2216 ret = 1;
2217 break;
2218 case XML_REGEXP_SYMBOL:
2219 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2220 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2221 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2222 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2223 ret = 1;
2224 break;
2225 case XML_REGEXP_OTHER:
2226 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2227 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2228 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2229 ret = 1;
2230 break;
2231 default:
2232 if ((range2->type >= XML_REGEXP_LETTER) &&
2233 (range2->type < XML_REGEXP_BLOCK_NAME))
2234 ret = 0;
2235 else {
2236 /* safety net ! */
2237 return(1);
2238 }
2239 }
2240 }
2241 if (((range1->neg == 0) && (range2->neg != 0)) ||
2242 ((range1->neg != 0) && (range2->neg == 0)))
2243 ret = !ret;
Daniel Veillard594e5df2009-09-07 14:58:47 +02002244 return(ret);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002245}
2246
Daniel Veillarde19fc232002-04-22 16:01:24 +00002247/**
Daniel Veillardfc011b72006-02-12 19:14:15 +00002248 * xmlFACompareAtomTypes:
2249 * @type1: an atom type
2250 * @type2: an atom type
2251 *
2252 * Compares two atoms type to check whether they intersect in some ways,
2253 * this is used by xmlFACompareAtoms only
2254 *
2255 * Returns 1 if they may intersect and 0 otherwise
2256 */
2257static int
2258xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2259 if ((type1 == XML_REGEXP_EPSILON) ||
2260 (type1 == XML_REGEXP_CHARVAL) ||
2261 (type1 == XML_REGEXP_RANGES) ||
2262 (type1 == XML_REGEXP_SUBREG) ||
2263 (type1 == XML_REGEXP_STRING) ||
2264 (type1 == XML_REGEXP_ANYCHAR))
2265 return(1);
2266 if ((type2 == XML_REGEXP_EPSILON) ||
2267 (type2 == XML_REGEXP_CHARVAL) ||
2268 (type2 == XML_REGEXP_RANGES) ||
2269 (type2 == XML_REGEXP_SUBREG) ||
2270 (type2 == XML_REGEXP_STRING) ||
2271 (type2 == XML_REGEXP_ANYCHAR))
2272 return(1);
2273
2274 if (type1 == type2) return(1);
2275
2276 /* simplify subsequent compares by making sure type1 < type2 */
2277 if (type1 > type2) {
2278 xmlRegAtomType tmp = type1;
2279 type1 = type2;
2280 type2 = tmp;
2281 }
2282 switch (type1) {
2283 case XML_REGEXP_ANYSPACE: /* \s */
2284 /* can't be a letter, number, mark, pontuation, symbol */
2285 if ((type2 == XML_REGEXP_NOTSPACE) ||
2286 ((type2 >= XML_REGEXP_LETTER) &&
2287 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2288 ((type2 >= XML_REGEXP_NUMBER) &&
2289 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2290 ((type2 >= XML_REGEXP_MARK) &&
2291 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2292 ((type2 >= XML_REGEXP_PUNCT) &&
2293 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2294 ((type2 >= XML_REGEXP_SYMBOL) &&
2295 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2296 ) return(0);
2297 break;
2298 case XML_REGEXP_NOTSPACE: /* \S */
2299 break;
2300 case XML_REGEXP_INITNAME: /* \l */
2301 /* can't be a number, mark, separator, pontuation, symbol or other */
2302 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2303 ((type2 >= XML_REGEXP_NUMBER) &&
2304 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2305 ((type2 >= XML_REGEXP_MARK) &&
2306 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2307 ((type2 >= XML_REGEXP_SEPAR) &&
2308 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2309 ((type2 >= XML_REGEXP_PUNCT) &&
2310 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2311 ((type2 >= XML_REGEXP_SYMBOL) &&
2312 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2313 ((type2 >= XML_REGEXP_OTHER) &&
2314 (type2 <= XML_REGEXP_OTHER_NA))
2315 ) return(0);
2316 break;
2317 case XML_REGEXP_NOTINITNAME: /* \L */
2318 break;
2319 case XML_REGEXP_NAMECHAR: /* \c */
2320 /* can't be a mark, separator, pontuation, symbol or other */
2321 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2322 ((type2 >= XML_REGEXP_MARK) &&
2323 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2324 ((type2 >= XML_REGEXP_PUNCT) &&
2325 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2326 ((type2 >= XML_REGEXP_SEPAR) &&
2327 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2328 ((type2 >= XML_REGEXP_SYMBOL) &&
2329 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2330 ((type2 >= XML_REGEXP_OTHER) &&
2331 (type2 <= XML_REGEXP_OTHER_NA))
2332 ) return(0);
2333 break;
2334 case XML_REGEXP_NOTNAMECHAR: /* \C */
2335 break;
2336 case XML_REGEXP_DECIMAL: /* \d */
2337 /* can't be a letter, mark, separator, pontuation, symbol or other */
2338 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2339 (type2 == XML_REGEXP_REALCHAR) ||
2340 ((type2 >= XML_REGEXP_LETTER) &&
2341 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2342 ((type2 >= XML_REGEXP_MARK) &&
2343 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2344 ((type2 >= XML_REGEXP_PUNCT) &&
2345 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2346 ((type2 >= XML_REGEXP_SEPAR) &&
2347 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2348 ((type2 >= XML_REGEXP_SYMBOL) &&
2349 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2350 ((type2 >= XML_REGEXP_OTHER) &&
2351 (type2 <= XML_REGEXP_OTHER_NA))
2352 )return(0);
2353 break;
2354 case XML_REGEXP_NOTDECIMAL: /* \D */
2355 break;
2356 case XML_REGEXP_REALCHAR: /* \w */
2357 /* can't be a mark, separator, pontuation, symbol or other */
2358 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2359 ((type2 >= XML_REGEXP_MARK) &&
2360 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2361 ((type2 >= XML_REGEXP_PUNCT) &&
2362 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2363 ((type2 >= XML_REGEXP_SEPAR) &&
2364 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2365 ((type2 >= XML_REGEXP_SYMBOL) &&
2366 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2367 ((type2 >= XML_REGEXP_OTHER) &&
2368 (type2 <= XML_REGEXP_OTHER_NA))
2369 )return(0);
2370 break;
2371 case XML_REGEXP_NOTREALCHAR: /* \W */
2372 break;
2373 /*
2374 * at that point we know both type 1 and type2 are from
2375 * character categories are ordered and are different,
2376 * it becomes simple because this is a partition
2377 */
2378 case XML_REGEXP_LETTER:
2379 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2380 return(1);
2381 return(0);
2382 case XML_REGEXP_LETTER_UPPERCASE:
2383 case XML_REGEXP_LETTER_LOWERCASE:
2384 case XML_REGEXP_LETTER_TITLECASE:
2385 case XML_REGEXP_LETTER_MODIFIER:
2386 case XML_REGEXP_LETTER_OTHERS:
2387 return(0);
2388 case XML_REGEXP_MARK:
2389 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2390 return(1);
2391 return(0);
2392 case XML_REGEXP_MARK_NONSPACING:
2393 case XML_REGEXP_MARK_SPACECOMBINING:
2394 case XML_REGEXP_MARK_ENCLOSING:
2395 return(0);
2396 case XML_REGEXP_NUMBER:
2397 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2398 return(1);
2399 return(0);
2400 case XML_REGEXP_NUMBER_DECIMAL:
2401 case XML_REGEXP_NUMBER_LETTER:
2402 case XML_REGEXP_NUMBER_OTHERS:
2403 return(0);
2404 case XML_REGEXP_PUNCT:
2405 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2406 return(1);
2407 return(0);
2408 case XML_REGEXP_PUNCT_CONNECTOR:
2409 case XML_REGEXP_PUNCT_DASH:
2410 case XML_REGEXP_PUNCT_OPEN:
2411 case XML_REGEXP_PUNCT_CLOSE:
2412 case XML_REGEXP_PUNCT_INITQUOTE:
2413 case XML_REGEXP_PUNCT_FINQUOTE:
2414 case XML_REGEXP_PUNCT_OTHERS:
2415 return(0);
2416 case XML_REGEXP_SEPAR:
2417 if (type2 <= XML_REGEXP_SEPAR_PARA)
2418 return(1);
2419 return(0);
2420 case XML_REGEXP_SEPAR_SPACE:
2421 case XML_REGEXP_SEPAR_LINE:
2422 case XML_REGEXP_SEPAR_PARA:
2423 return(0);
2424 case XML_REGEXP_SYMBOL:
2425 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2426 return(1);
2427 return(0);
2428 case XML_REGEXP_SYMBOL_MATH:
2429 case XML_REGEXP_SYMBOL_CURRENCY:
2430 case XML_REGEXP_SYMBOL_MODIFIER:
2431 case XML_REGEXP_SYMBOL_OTHERS:
2432 return(0);
2433 case XML_REGEXP_OTHER:
2434 if (type2 <= XML_REGEXP_OTHER_NA)
2435 return(1);
2436 return(0);
2437 case XML_REGEXP_OTHER_CONTROL:
2438 case XML_REGEXP_OTHER_FORMAT:
2439 case XML_REGEXP_OTHER_PRIVATE:
2440 case XML_REGEXP_OTHER_NA:
2441 return(0);
2442 default:
2443 break;
2444 }
2445 return(1);
2446}
2447
2448/**
2449 * xmlFAEqualAtoms:
Daniel Veillarde19fc232002-04-22 16:01:24 +00002450 * @atom1: an atom
2451 * @atom2: an atom
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002452 * @deep: if not set only compare string pointers
Daniel Veillarde19fc232002-04-22 16:01:24 +00002453 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002454 * Compares two atoms to check whether they are the same exactly
2455 * this is used to remove equivalent transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002456 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002457 * Returns 1 if same and 0 otherwise
Daniel Veillarde19fc232002-04-22 16:01:24 +00002458 */
2459static int
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002460xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
Daniel Veillardfc011b72006-02-12 19:14:15 +00002461 int ret = 0;
Daniel Veillard9efc4762005-07-19 14:33:55 +00002462
Daniel Veillarde19fc232002-04-22 16:01:24 +00002463 if (atom1 == atom2)
2464 return(1);
2465 if ((atom1 == NULL) || (atom2 == NULL))
2466 return(0);
2467
Daniel Veillardfc011b72006-02-12 19:14:15 +00002468 if (atom1->type != atom2->type)
2469 return(0);
2470 switch (atom1->type) {
2471 case XML_REGEXP_EPSILON:
2472 ret = 0;
2473 break;
2474 case XML_REGEXP_STRING:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002475 if (!deep)
2476 ret = (atom1->valuep == atom2->valuep);
2477 else
2478 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2479 (xmlChar *)atom2->valuep);
Daniel Veillardfc011b72006-02-12 19:14:15 +00002480 break;
2481 case XML_REGEXP_CHARVAL:
2482 ret = (atom1->codepoint == atom2->codepoint);
2483 break;
2484 case XML_REGEXP_RANGES:
2485 /* too hard to do in the general case */
2486 ret = 0;
2487 default:
2488 break;
2489 }
2490 return(ret);
2491}
2492
2493/**
2494 * xmlFACompareAtoms:
2495 * @atom1: an atom
2496 * @atom2: an atom
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002497 * @deep: if not set only compare string pointers
Daniel Veillardfc011b72006-02-12 19:14:15 +00002498 *
2499 * Compares two atoms to check whether they intersect in some ways,
2500 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2501 *
2502 * Returns 1 if yes and 0 otherwise
2503 */
2504static int
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002505xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
Daniel Veillardfc011b72006-02-12 19:14:15 +00002506 int ret = 1;
2507
2508 if (atom1 == atom2)
2509 return(1);
2510 if ((atom1 == NULL) || (atom2 == NULL))
2511 return(0);
2512
2513 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2514 (atom2->type == XML_REGEXP_ANYCHAR))
2515 return(1);
2516
2517 if (atom1->type > atom2->type) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002518 xmlRegAtomPtr tmp;
2519 tmp = atom1;
2520 atom1 = atom2;
2521 atom2 = tmp;
Daniel Veillardfc011b72006-02-12 19:14:15 +00002522 }
2523 if (atom1->type != atom2->type) {
2524 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2525 /* if they can't intersect at the type level break now */
2526 if (ret == 0)
2527 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002528 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002529 switch (atom1->type) {
2530 case XML_REGEXP_STRING:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002531 if (!deep)
2532 ret = (atom1->valuep != atom2->valuep);
2533 else
2534 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
2535 (xmlChar *)atom2->valuep);
Daniel Veillard9efc4762005-07-19 14:33:55 +00002536 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002537 case XML_REGEXP_EPSILON:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002538 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002539 case XML_REGEXP_CHARVAL:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002540 if (atom2->type == XML_REGEXP_CHARVAL) {
2541 ret = (atom1->codepoint == atom2->codepoint);
2542 } else {
2543 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2544 if (ret < 0)
2545 ret = 1;
2546 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00002547 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002548 case XML_REGEXP_RANGES:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002549 if (atom2->type == XML_REGEXP_RANGES) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002550 int i, j, res;
2551 xmlRegRangePtr r1, r2;
2552
2553 /*
2554 * need to check that none of the ranges eventually matches
2555 */
2556 for (i = 0;i < atom1->nbRanges;i++) {
2557 for (j = 0;j < atom2->nbRanges;j++) {
2558 r1 = atom1->ranges[i];
2559 r2 = atom2->ranges[j];
2560 res = xmlFACompareRanges(r1, r2);
2561 if (res == 1) {
2562 ret = 1;
2563 goto done;
2564 }
2565 }
2566 }
2567 ret = 0;
2568 }
2569 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002570 default:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002571 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002572 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002573done:
Daniel Veillard6e65e152005-08-09 11:09:52 +00002574 if (atom1->neg != atom2->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00002575 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00002576 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002577 if (ret == 0)
2578 return(0);
2579not_determinist:
2580 return(1);
Daniel Veillarde19fc232002-04-22 16:01:24 +00002581}
2582
2583/**
2584 * xmlFARecurseDeterminism:
2585 * @ctxt: a regexp parser context
2586 *
2587 * Check whether the associated regexp is determinist,
2588 * should be called after xmlFAEliminateEpsilonTransitions()
2589 *
2590 */
2591static int
2592xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2593 int to, xmlRegAtomPtr atom) {
2594 int ret = 1;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002595 int res;
Daniel Veillard5de09382005-09-26 17:18:17 +00002596 int transnr, nbTrans;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002597 xmlRegTransPtr t1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002598 int deep = 1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002599
2600 if (state == NULL)
2601 return(ret);
Daniel Veillard466fcda2012-08-27 12:03:40 +08002602 if (state->markd == XML_REGEXP_MARK_VISITED)
2603 return(ret);
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002604
2605 if (ctxt->flags & AM_AUTOMATA_RNG)
2606 deep = 0;
2607
Daniel Veillard5de09382005-09-26 17:18:17 +00002608 /*
2609 * don't recurse on transitions potentially added in the course of
2610 * the elimination.
2611 */
2612 nbTrans = state->nbTrans;
2613 for (transnr = 0;transnr < nbTrans;transnr++) {
Daniel Veillarde19fc232002-04-22 16:01:24 +00002614 t1 = &(state->trans[transnr]);
2615 /*
2616 * check transitions conflicting with the one looked at
2617 */
2618 if (t1->atom == NULL) {
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00002619 if (t1->to < 0)
Daniel Veillarde19fc232002-04-22 16:01:24 +00002620 continue;
Daniel Veillard466fcda2012-08-27 12:03:40 +08002621 state->markd = XML_REGEXP_MARK_VISITED;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002622 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
Daniel Veillarde19fc232002-04-22 16:01:24 +00002623 to, atom);
Daniel Veillard466fcda2012-08-27 12:03:40 +08002624 state->markd = 0;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002625 if (res == 0) {
2626 ret = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00002627 /* t1->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002628 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002629 continue;
2630 }
2631 if (t1->to != to)
2632 continue;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002633 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002634 ret = 0;
2635 /* mark the transition as non-deterministic */
2636 t1->nd = 1;
2637 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002638 }
2639 return(ret);
2640}
2641
2642/**
2643 * xmlFAComputesDeterminism:
2644 * @ctxt: a regexp parser context
2645 *
2646 * Check whether the associated regexp is determinist,
2647 * should be called after xmlFAEliminateEpsilonTransitions()
2648 *
2649 */
2650static int
2651xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2652 int statenr, transnr;
2653 xmlRegStatePtr state;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002654 xmlRegTransPtr t1, t2, last;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002655 int i;
2656 int ret = 1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002657 int deep = 1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002658
Daniel Veillard4402ab42002-09-12 16:02:56 +00002659#ifdef DEBUG_REGEXP_GRAPH
2660 printf("xmlFAComputesDeterminism\n");
2661 xmlRegPrintCtxt(stdout, ctxt);
2662#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00002663 if (ctxt->determinist != -1)
2664 return(ctxt->determinist);
2665
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002666 if (ctxt->flags & AM_AUTOMATA_RNG)
2667 deep = 0;
2668
Daniel Veillarde19fc232002-04-22 16:01:24 +00002669 /*
Daniel Veillard567a45b2005-10-18 19:11:55 +00002670 * First cleanup the automata removing cancelled transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002671 */
2672 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2673 state = ctxt->states[statenr];
2674 if (state == NULL)
2675 continue;
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00002676 if (state->nbTrans < 2)
2677 continue;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002678 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2679 t1 = &(state->trans[transnr]);
2680 /*
2681 * Determinism checks in case of counted or all transitions
2682 * will have to be handled separately
2683 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002684 if (t1->atom == NULL) {
Daniel Veillardaa622012005-10-20 15:55:25 +00002685 /* t1->nd = 1; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002686 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002687 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002688 if (t1->to == -1) /* eliminated */
2689 continue;
2690 for (i = 0;i < transnr;i++) {
2691 t2 = &(state->trans[i]);
2692 if (t2->to == -1) /* eliminated */
2693 continue;
2694 if (t2->atom != NULL) {
2695 if (t1->to == t2->to) {
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002696 /*
2697 * Here we use deep because we want to keep the
2698 * transitions which indicate a conflict
2699 */
2700 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
Daniel Veillard11e28e42009-08-12 12:21:42 +02002701 (t1->counter == t2->counter) &&
2702 (t1->count == t2->count))
William M. Brackddf71d62004-05-06 04:17:26 +00002703 t2->to = -1; /* eliminated */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002704 }
2705 }
2706 }
2707 }
2708 }
2709
2710 /*
2711 * Check for all states that there aren't 2 transitions
2712 * with the same atom and a different target.
2713 */
2714 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2715 state = ctxt->states[statenr];
2716 if (state == NULL)
2717 continue;
2718 if (state->nbTrans < 2)
2719 continue;
2720 last = NULL;
2721 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2722 t1 = &(state->trans[transnr]);
2723 /*
2724 * Determinism checks in case of counted or all transitions
2725 * will have to be handled separately
2726 */
2727 if (t1->atom == NULL) {
2728 continue;
2729 }
2730 if (t1->to == -1) /* eliminated */
2731 continue;
2732 for (i = 0;i < transnr;i++) {
2733 t2 = &(state->trans[i]);
2734 if (t2->to == -1) /* eliminated */
2735 continue;
2736 if (t2->atom != NULL) {
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002737 /*
2738 * But here we don't use deep because we want to
2739 * find transitions which indicate a conflict
2740 */
2741 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002742 ret = 0;
2743 /* mark the transitions as non-deterministic ones */
2744 t1->nd = 1;
2745 t2->nd = 1;
2746 last = t1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002747 }
2748 } else if (t1->to != -1) {
2749 /*
2750 * do the closure in case of remaining specific
2751 * epsilon transitions like choices or all
2752 */
2753 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2754 t2->to, t2->atom);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002755 /* don't shortcut the computation so all non deterministic
2756 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002757 if (ret == 0)
Daniel Veillardaa622012005-10-20 15:55:25 +00002758 return(0);
2759 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002760 if (ret == 0) {
2761 t1->nd = 1;
Daniel Veillardaa622012005-10-20 15:55:25 +00002762 /* t2->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002763 last = t1;
2764 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002765 }
2766 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002767 /* don't shortcut the computation so all non deterministic
2768 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002769 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002770 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002771 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002772
2773 /*
2774 * mark specifically the last non-deterministic transition
2775 * from a state since there is no need to set-up rollback
2776 * from it
2777 */
2778 if (last != NULL) {
2779 last->nd = 2;
2780 }
2781
2782 /* don't shortcut the computation so all non deterministic
2783 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002784 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002785 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002786 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002787
Daniel Veillarde19fc232002-04-22 16:01:24 +00002788 ctxt->determinist = ret;
2789 return(ret);
2790}
2791
Daniel Veillard4255d502002-04-16 15:50:10 +00002792/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002793 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00002794 * Routines to check input against transition atoms *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002795 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00002796 ************************************************************************/
2797
2798static int
2799xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2800 int start, int end, const xmlChar *blockName) {
2801 int ret = 0;
2802
2803 switch (type) {
2804 case XML_REGEXP_STRING:
2805 case XML_REGEXP_SUBREG:
2806 case XML_REGEXP_RANGES:
2807 case XML_REGEXP_EPSILON:
2808 return(-1);
2809 case XML_REGEXP_ANYCHAR:
2810 ret = ((codepoint != '\n') && (codepoint != '\r'));
2811 break;
2812 case XML_REGEXP_CHARVAL:
2813 ret = ((codepoint >= start) && (codepoint <= end));
2814 break;
2815 case XML_REGEXP_NOTSPACE:
2816 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002817 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002818 case XML_REGEXP_ANYSPACE:
2819 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2820 (codepoint == '\t') || (codepoint == ' '));
2821 break;
2822 case XML_REGEXP_NOTINITNAME:
2823 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002824 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002825 case XML_REGEXP_INITNAME:
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002826 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002827 (codepoint == '_') || (codepoint == ':'));
2828 break;
2829 case XML_REGEXP_NOTNAMECHAR:
2830 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002831 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002832 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00002833 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002834 (codepoint == '.') || (codepoint == '-') ||
2835 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00002836 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00002837 break;
2838 case XML_REGEXP_NOTDECIMAL:
2839 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002840 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002841 case XML_REGEXP_DECIMAL:
2842 ret = xmlUCSIsCatNd(codepoint);
2843 break;
2844 case XML_REGEXP_REALCHAR:
2845 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002846 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002847 case XML_REGEXP_NOTREALCHAR:
2848 ret = xmlUCSIsCatP(codepoint);
2849 if (ret == 0)
2850 ret = xmlUCSIsCatZ(codepoint);
2851 if (ret == 0)
2852 ret = xmlUCSIsCatC(codepoint);
2853 break;
2854 case XML_REGEXP_LETTER:
2855 ret = xmlUCSIsCatL(codepoint);
2856 break;
2857 case XML_REGEXP_LETTER_UPPERCASE:
2858 ret = xmlUCSIsCatLu(codepoint);
2859 break;
2860 case XML_REGEXP_LETTER_LOWERCASE:
2861 ret = xmlUCSIsCatLl(codepoint);
2862 break;
2863 case XML_REGEXP_LETTER_TITLECASE:
2864 ret = xmlUCSIsCatLt(codepoint);
2865 break;
2866 case XML_REGEXP_LETTER_MODIFIER:
2867 ret = xmlUCSIsCatLm(codepoint);
2868 break;
2869 case XML_REGEXP_LETTER_OTHERS:
2870 ret = xmlUCSIsCatLo(codepoint);
2871 break;
2872 case XML_REGEXP_MARK:
2873 ret = xmlUCSIsCatM(codepoint);
2874 break;
2875 case XML_REGEXP_MARK_NONSPACING:
2876 ret = xmlUCSIsCatMn(codepoint);
2877 break;
2878 case XML_REGEXP_MARK_SPACECOMBINING:
2879 ret = xmlUCSIsCatMc(codepoint);
2880 break;
2881 case XML_REGEXP_MARK_ENCLOSING:
2882 ret = xmlUCSIsCatMe(codepoint);
2883 break;
2884 case XML_REGEXP_NUMBER:
2885 ret = xmlUCSIsCatN(codepoint);
2886 break;
2887 case XML_REGEXP_NUMBER_DECIMAL:
2888 ret = xmlUCSIsCatNd(codepoint);
2889 break;
2890 case XML_REGEXP_NUMBER_LETTER:
2891 ret = xmlUCSIsCatNl(codepoint);
2892 break;
2893 case XML_REGEXP_NUMBER_OTHERS:
2894 ret = xmlUCSIsCatNo(codepoint);
2895 break;
2896 case XML_REGEXP_PUNCT:
2897 ret = xmlUCSIsCatP(codepoint);
2898 break;
2899 case XML_REGEXP_PUNCT_CONNECTOR:
2900 ret = xmlUCSIsCatPc(codepoint);
2901 break;
2902 case XML_REGEXP_PUNCT_DASH:
2903 ret = xmlUCSIsCatPd(codepoint);
2904 break;
2905 case XML_REGEXP_PUNCT_OPEN:
2906 ret = xmlUCSIsCatPs(codepoint);
2907 break;
2908 case XML_REGEXP_PUNCT_CLOSE:
2909 ret = xmlUCSIsCatPe(codepoint);
2910 break;
2911 case XML_REGEXP_PUNCT_INITQUOTE:
2912 ret = xmlUCSIsCatPi(codepoint);
2913 break;
2914 case XML_REGEXP_PUNCT_FINQUOTE:
2915 ret = xmlUCSIsCatPf(codepoint);
2916 break;
2917 case XML_REGEXP_PUNCT_OTHERS:
2918 ret = xmlUCSIsCatPo(codepoint);
2919 break;
2920 case XML_REGEXP_SEPAR:
2921 ret = xmlUCSIsCatZ(codepoint);
2922 break;
2923 case XML_REGEXP_SEPAR_SPACE:
2924 ret = xmlUCSIsCatZs(codepoint);
2925 break;
2926 case XML_REGEXP_SEPAR_LINE:
2927 ret = xmlUCSIsCatZl(codepoint);
2928 break;
2929 case XML_REGEXP_SEPAR_PARA:
2930 ret = xmlUCSIsCatZp(codepoint);
2931 break;
2932 case XML_REGEXP_SYMBOL:
2933 ret = xmlUCSIsCatS(codepoint);
2934 break;
2935 case XML_REGEXP_SYMBOL_MATH:
2936 ret = xmlUCSIsCatSm(codepoint);
2937 break;
2938 case XML_REGEXP_SYMBOL_CURRENCY:
2939 ret = xmlUCSIsCatSc(codepoint);
2940 break;
2941 case XML_REGEXP_SYMBOL_MODIFIER:
2942 ret = xmlUCSIsCatSk(codepoint);
2943 break;
2944 case XML_REGEXP_SYMBOL_OTHERS:
2945 ret = xmlUCSIsCatSo(codepoint);
2946 break;
2947 case XML_REGEXP_OTHER:
2948 ret = xmlUCSIsCatC(codepoint);
2949 break;
2950 case XML_REGEXP_OTHER_CONTROL:
2951 ret = xmlUCSIsCatCc(codepoint);
2952 break;
2953 case XML_REGEXP_OTHER_FORMAT:
2954 ret = xmlUCSIsCatCf(codepoint);
2955 break;
2956 case XML_REGEXP_OTHER_PRIVATE:
2957 ret = xmlUCSIsCatCo(codepoint);
2958 break;
2959 case XML_REGEXP_OTHER_NA:
2960 /* ret = xmlUCSIsCatCn(codepoint); */
2961 /* Seems it doesn't exist anymore in recent Unicode releases */
2962 ret = 0;
2963 break;
2964 case XML_REGEXP_BLOCK_NAME:
2965 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2966 break;
2967 }
2968 if (neg)
2969 return(!ret);
2970 return(ret);
2971}
2972
2973static int
2974xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2975 int i, ret = 0;
2976 xmlRegRangePtr range;
2977
William M. Brack871611b2003-10-18 04:53:14 +00002978 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00002979 return(-1);
2980
2981 switch (atom->type) {
2982 case XML_REGEXP_SUBREG:
2983 case XML_REGEXP_EPSILON:
2984 return(-1);
2985 case XML_REGEXP_CHARVAL:
2986 return(codepoint == atom->codepoint);
2987 case XML_REGEXP_RANGES: {
2988 int accept = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002989
Daniel Veillard4255d502002-04-16 15:50:10 +00002990 for (i = 0;i < atom->nbRanges;i++) {
2991 range = atom->ranges[i];
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002992 if (range->neg == 2) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002993 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2994 0, range->start, range->end,
2995 range->blockName);
2996 if (ret != 0)
2997 return(0); /* excluded char */
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002998 } else if (range->neg) {
2999 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3000 0, range->start, range->end,
3001 range->blockName);
3002 if (ret == 0)
Daniel Veillardf2a12832003-11-24 13:04:35 +00003003 accept = 1;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003004 else
3005 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00003006 } else {
3007 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3008 0, range->start, range->end,
3009 range->blockName);
3010 if (ret != 0)
3011 accept = 1; /* might still be excluded */
3012 }
3013 }
3014 return(accept);
3015 }
3016 case XML_REGEXP_STRING:
3017 printf("TODO: XML_REGEXP_STRING\n");
3018 return(-1);
3019 case XML_REGEXP_ANYCHAR:
3020 case XML_REGEXP_ANYSPACE:
3021 case XML_REGEXP_NOTSPACE:
3022 case XML_REGEXP_INITNAME:
3023 case XML_REGEXP_NOTINITNAME:
3024 case XML_REGEXP_NAMECHAR:
3025 case XML_REGEXP_NOTNAMECHAR:
3026 case XML_REGEXP_DECIMAL:
3027 case XML_REGEXP_NOTDECIMAL:
3028 case XML_REGEXP_REALCHAR:
3029 case XML_REGEXP_NOTREALCHAR:
3030 case XML_REGEXP_LETTER:
3031 case XML_REGEXP_LETTER_UPPERCASE:
3032 case XML_REGEXP_LETTER_LOWERCASE:
3033 case XML_REGEXP_LETTER_TITLECASE:
3034 case XML_REGEXP_LETTER_MODIFIER:
3035 case XML_REGEXP_LETTER_OTHERS:
3036 case XML_REGEXP_MARK:
3037 case XML_REGEXP_MARK_NONSPACING:
3038 case XML_REGEXP_MARK_SPACECOMBINING:
3039 case XML_REGEXP_MARK_ENCLOSING:
3040 case XML_REGEXP_NUMBER:
3041 case XML_REGEXP_NUMBER_DECIMAL:
3042 case XML_REGEXP_NUMBER_LETTER:
3043 case XML_REGEXP_NUMBER_OTHERS:
3044 case XML_REGEXP_PUNCT:
3045 case XML_REGEXP_PUNCT_CONNECTOR:
3046 case XML_REGEXP_PUNCT_DASH:
3047 case XML_REGEXP_PUNCT_OPEN:
3048 case XML_REGEXP_PUNCT_CLOSE:
3049 case XML_REGEXP_PUNCT_INITQUOTE:
3050 case XML_REGEXP_PUNCT_FINQUOTE:
3051 case XML_REGEXP_PUNCT_OTHERS:
3052 case XML_REGEXP_SEPAR:
3053 case XML_REGEXP_SEPAR_SPACE:
3054 case XML_REGEXP_SEPAR_LINE:
3055 case XML_REGEXP_SEPAR_PARA:
3056 case XML_REGEXP_SYMBOL:
3057 case XML_REGEXP_SYMBOL_MATH:
3058 case XML_REGEXP_SYMBOL_CURRENCY:
3059 case XML_REGEXP_SYMBOL_MODIFIER:
3060 case XML_REGEXP_SYMBOL_OTHERS:
3061 case XML_REGEXP_OTHER:
3062 case XML_REGEXP_OTHER_CONTROL:
3063 case XML_REGEXP_OTHER_FORMAT:
3064 case XML_REGEXP_OTHER_PRIVATE:
3065 case XML_REGEXP_OTHER_NA:
3066 case XML_REGEXP_BLOCK_NAME:
3067 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3068 (const xmlChar *)atom->valuep);
3069 if (atom->neg)
3070 ret = !ret;
3071 break;
3072 }
3073 return(ret);
3074}
3075
3076/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003077 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003078 * Saving and restoring state of an execution context *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003079 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00003080 ************************************************************************/
3081
#ifdef DEBUG_REGEXP_EXEC
/**
 * xmlFARegDebugExec:
 * @exec:  a regexp execution context
 *
 * Debugging helper: prints the current state number, transition number
 * and input index, followed by up to three values taken from the top of
 * the input stack, or by the remaining input string when there is no
 * input stack.
 */
static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
    if (exec->inputStack == NULL) {
        printf(": %s", &(exec->inputString[exec->index]));
    } else {
        int depth;

        printf(": ");
        /* most recent entry first, three entries at most */
        for (depth = 1; (depth <= 3) && (depth <= exec->inputStackNr); depth++)
            printf("%s ", (const char *)
                   exec->inputStack[exec->inputStackNr - depth].value);
    }
    printf("\n");
}
#endif
3098
3099static void
3100xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3101#ifdef DEBUG_REGEXP_EXEC
3102 printf("saving ");
3103 exec->transno++;
3104 xmlFARegDebugExec(exec);
3105 exec->transno--;
3106#endif
Daniel Veillard94cc1032005-09-15 13:09:00 +00003107#ifdef MAX_PUSH
3108 if (exec->nbPush > MAX_PUSH) {
3109 return;
3110 }
3111 exec->nbPush++;
3112#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003113
3114 if (exec->maxRollbacks == 0) {
3115 exec->maxRollbacks = 4;
3116 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3117 sizeof(xmlRegExecRollback));
3118 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003119 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003120 exec->maxRollbacks = 0;
3121 return;
3122 }
3123 memset(exec->rollbacks, 0,
3124 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3125 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3126 xmlRegExecRollback *tmp;
3127 int len = exec->maxRollbacks;
3128
3129 exec->maxRollbacks *= 2;
3130 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3131 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3132 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003133 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003134 exec->maxRollbacks /= 2;
3135 return;
3136 }
3137 exec->rollbacks = tmp;
3138 tmp = &exec->rollbacks[len];
3139 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3140 }
3141 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3142 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3143 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3144 if (exec->comp->nbCounters > 0) {
3145 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3146 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3147 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3148 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003149 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003150 exec->status = -5;
3151 return;
3152 }
3153 }
3154 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3155 exec->comp->nbCounters * sizeof(int));
3156 }
3157 exec->nbRollbacks++;
3158}
3159
3160static void
3161xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3162 if (exec->nbRollbacks <= 0) {
3163 exec->status = -1;
3164#ifdef DEBUG_REGEXP_EXEC
3165 printf("rollback failed on empty stack\n");
3166#endif
3167 return;
3168 }
3169 exec->nbRollbacks--;
3170 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3171 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3172 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3173 if (exec->comp->nbCounters > 0) {
3174 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3175 fprintf(stderr, "exec save: allocation failed");
3176 exec->status = -6;
3177 return;
3178 }
Gaurav2671b012013-09-11 14:59:06 +08003179 if (exec->counts) {
3180 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
Daniel Veillard4255d502002-04-16 15:50:10 +00003181 exec->comp->nbCounters * sizeof(int));
Gaurav2671b012013-09-11 14:59:06 +08003182 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003183 }
3184
3185#ifdef DEBUG_REGEXP_EXEC
3186 printf("restored ");
3187 xmlFARegDebugExec(exec);
3188#endif
3189}
3190
3191/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003192 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003193 * Verifier, running an input against a compiled regexp *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003194 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00003195 ************************************************************************/
3196
3197static int
3198xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3199 xmlRegExecCtxt execval;
3200 xmlRegExecCtxtPtr exec = &execval;
Daniel Veillard567a45b2005-10-18 19:11:55 +00003201 int ret, codepoint = 0, len, deter;
Daniel Veillard4255d502002-04-16 15:50:10 +00003202
3203 exec->inputString = content;
3204 exec->index = 0;
Daniel Veillard94cc1032005-09-15 13:09:00 +00003205 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003206 exec->determinist = 1;
3207 exec->maxRollbacks = 0;
3208 exec->nbRollbacks = 0;
3209 exec->rollbacks = NULL;
3210 exec->status = 0;
3211 exec->comp = comp;
3212 exec->state = comp->states[0];
3213 exec->transno = 0;
3214 exec->transcount = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00003215 exec->inputStack = NULL;
3216 exec->inputStackMax = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003217 if (comp->nbCounters > 0) {
3218 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00003219 if (exec->counts == NULL) {
3220 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003221 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00003222 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003223 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3224 } else
3225 exec->counts = NULL;
Daniel Veillard40851d02012-08-17 20:34:05 +08003226 while ((exec->status == 0) && (exec->state != NULL) &&
Daniel Veillard4255d502002-04-16 15:50:10 +00003227 ((exec->inputString[exec->index] != 0) ||
Daniel Veillardad559982008-05-12 13:15:35 +00003228 ((exec->state != NULL) &&
3229 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003230 xmlRegTransPtr trans;
3231 xmlRegAtomPtr atom;
3232
3233 /*
William M. Brack0e00b282004-04-26 15:40:47 +00003234 * If end of input on non-terminal state, rollback, however we may
Daniel Veillard4255d502002-04-16 15:50:10 +00003235 * still have epsilon like transition for counted transitions
William M. Brack0e00b282004-04-26 15:40:47 +00003236 * on counters, in that case don't break too early. Additionally,
3237 * if we are working on a range like "AB{0,2}", where B is not present,
3238 * we don't want to break.
Daniel Veillard4255d502002-04-16 15:50:10 +00003239 */
Daniel Veillard11ce4002006-03-10 00:36:23 +00003240 len = 1;
William M. Brack0e00b282004-04-26 15:40:47 +00003241 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
William M. Brackddf71d62004-05-06 04:17:26 +00003242 /*
3243 * if there is a transition, we must check if
3244 * atom allows minOccurs of 0
3245 */
3246 if (exec->transno < exec->state->nbTrans) {
William M. Brack0e00b282004-04-26 15:40:47 +00003247 trans = &exec->state->trans[exec->transno];
3248 if (trans->to >=0) {
3249 atom = trans->atom;
3250 if (!((atom->min == 0) && (atom->max > 0)))
3251 goto rollback;
3252 }
3253 } else
3254 goto rollback;
3255 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003256
3257 exec->transcount = 0;
3258 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3259 trans = &exec->state->trans[exec->transno];
3260 if (trans->to < 0)
3261 continue;
3262 atom = trans->atom;
3263 ret = 0;
Daniel Veillard567a45b2005-10-18 19:11:55 +00003264 deter = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003265 if (trans->count >= 0) {
3266 int count;
3267 xmlRegCounterPtr counter;
3268
Daniel Veillard11ce4002006-03-10 00:36:23 +00003269 if (exec->counts == NULL) {
3270 exec->status = -1;
3271 goto error;
3272 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003273 /*
3274 * A counted transition.
3275 */
3276
3277 count = exec->counts[trans->count];
3278 counter = &exec->comp->counters[trans->count];
3279#ifdef DEBUG_REGEXP_EXEC
3280 printf("testing count %d: val %d, min %d, max %d\n",
3281 trans->count, count, counter->min, counter->max);
3282#endif
3283 ret = ((count >= counter->min) && (count <= counter->max));
Daniel Veillard567a45b2005-10-18 19:11:55 +00003284 if ((ret) && (counter->min != counter->max))
3285 deter = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003286 } else if (atom == NULL) {
3287 fprintf(stderr, "epsilon transition left at runtime\n");
3288 exec->status = -2;
3289 break;
3290 } else if (exec->inputString[exec->index] != 0) {
3291 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3292 ret = xmlRegCheckCharacter(atom, codepoint);
William M. Brack0e00b282004-04-26 15:40:47 +00003293 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003294 xmlRegStatePtr to = comp->states[trans->to];
3295
3296 /*
3297 * this is a multiple input sequence
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003298 * If there is a counter associated increment it now.
3299 * before potentially saving and rollback
Daniel Veillardc821e032007-08-28 17:33:45 +00003300 * do not increment if the counter is already over the
3301 * maximum limit in which case get to next transition
Daniel Veillard4255d502002-04-16 15:50:10 +00003302 */
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003303 if (trans->counter >= 0) {
Daniel Veillardc821e032007-08-28 17:33:45 +00003304 xmlRegCounterPtr counter;
3305
3306 if ((exec->counts == NULL) ||
3307 (exec->comp == NULL) ||
3308 (exec->comp->counters == NULL)) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003309 exec->status = -1;
3310 goto error;
3311 }
Daniel Veillardc821e032007-08-28 17:33:45 +00003312 counter = &exec->comp->counters[trans->counter];
3313 if (exec->counts[trans->counter] >= counter->max)
3314 continue; /* for loop on transitions */
3315
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003316#ifdef DEBUG_REGEXP_EXEC
3317 printf("Increasing count %d\n", trans->counter);
3318#endif
3319 exec->counts[trans->counter]++;
3320 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003321 if (exec->state->nbTrans > exec->transno + 1) {
3322 xmlFARegExecSave(exec);
3323 }
3324 exec->transcount = 1;
3325 do {
3326 /*
3327 * Try to progress as much as possible on the input
3328 */
3329 if (exec->transcount == atom->max) {
3330 break;
3331 }
3332 exec->index += len;
3333 /*
3334 * End of input: stop here
3335 */
3336 if (exec->inputString[exec->index] == 0) {
3337 exec->index -= len;
3338 break;
3339 }
3340 if (exec->transcount >= atom->min) {
3341 int transno = exec->transno;
3342 xmlRegStatePtr state = exec->state;
3343
3344 /*
3345 * The transition is acceptable save it
3346 */
3347 exec->transno = -1; /* trick */
3348 exec->state = to;
3349 xmlFARegExecSave(exec);
3350 exec->transno = transno;
3351 exec->state = state;
3352 }
3353 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3354 len);
3355 ret = xmlRegCheckCharacter(atom, codepoint);
3356 exec->transcount++;
3357 } while (ret == 1);
3358 if (exec->transcount < atom->min)
3359 ret = 0;
3360
3361 /*
3362 * If the last check failed but one transition was found
3363 * possible, rollback
3364 */
3365 if (ret < 0)
3366 ret = 0;
3367 if (ret == 0) {
3368 goto rollback;
3369 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003370 if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003371 if (exec->counts == NULL) {
3372 exec->status = -1;
3373 goto error;
3374 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003375#ifdef DEBUG_REGEXP_EXEC
3376 printf("Decreasing count %d\n", trans->counter);
3377#endif
3378 exec->counts[trans->counter]--;
3379 }
William M. Brack0e00b282004-04-26 15:40:47 +00003380 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3381 /*
3382 * we don't match on the codepoint, but minOccurs of 0
3383 * says that's ok. Setting len to 0 inhibits stepping
3384 * over the codepoint.
3385 */
3386 exec->transcount = 1;
3387 len = 0;
3388 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003389 }
William M. Brack0e00b282004-04-26 15:40:47 +00003390 } else if ((atom->min == 0) && (atom->max > 0)) {
3391 /* another spot to match when minOccurs is 0 */
3392 exec->transcount = 1;
3393 len = 0;
3394 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003395 }
3396 if (ret == 1) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00003397 if ((trans->nd == 1) ||
3398 ((trans->count >= 0) && (deter == 0) &&
3399 (exec->state->nbTrans > exec->transno + 1))) {
Daniel Veillardaa622012005-10-20 15:55:25 +00003400#ifdef DEBUG_REGEXP_EXEC
3401 if (trans->nd == 1)
3402 printf("Saving on nd transition atom %d for %c at %d\n",
3403 trans->atom->no, codepoint, exec->index);
3404 else
3405 printf("Saving on counted transition count %d for %c at %d\n",
3406 trans->count, codepoint, exec->index);
3407#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003408 xmlFARegExecSave(exec);
3409 }
3410 if (trans->counter >= 0) {
Daniel Veillardc821e032007-08-28 17:33:45 +00003411 xmlRegCounterPtr counter;
3412
3413 /* make sure we don't go over the counter maximum value */
3414 if ((exec->counts == NULL) ||
3415 (exec->comp == NULL) ||
3416 (exec->comp->counters == NULL)) {
3417 exec->status = -1;
Daniel Veillard11ce4002006-03-10 00:36:23 +00003418 goto error;
3419 }
Daniel Veillardc821e032007-08-28 17:33:45 +00003420 counter = &exec->comp->counters[trans->counter];
3421 if (exec->counts[trans->counter] >= counter->max)
3422 continue; /* for loop on transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +00003423#ifdef DEBUG_REGEXP_EXEC
3424 printf("Increasing count %d\n", trans->counter);
3425#endif
3426 exec->counts[trans->counter]++;
3427 }
Daniel Veillard10752282005-08-08 13:05:13 +00003428 if ((trans->count >= 0) &&
3429 (trans->count < REGEXP_ALL_COUNTER)) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003430 if (exec->counts == NULL) {
3431 exec->status = -1;
3432 goto error;
3433 }
Daniel Veillard10752282005-08-08 13:05:13 +00003434#ifdef DEBUG_REGEXP_EXEC
3435 printf("resetting count %d on transition\n",
3436 trans->count);
3437#endif
3438 exec->counts[trans->count] = 0;
3439 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003440#ifdef DEBUG_REGEXP_EXEC
3441 printf("entering state %d\n", trans->to);
3442#endif
3443 exec->state = comp->states[trans->to];
3444 exec->transno = 0;
3445 if (trans->atom != NULL) {
3446 exec->index += len;
3447 }
3448 goto progress;
3449 } else if (ret < 0) {
3450 exec->status = -4;
3451 break;
3452 }
3453 }
3454 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3455rollback:
3456 /*
3457 * Failed to find a way out
3458 */
3459 exec->determinist = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00003460#ifdef DEBUG_REGEXP_EXEC
3461 printf("rollback from state %d on %d:%c\n", exec->state->no,
3462 codepoint,codepoint);
3463#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003464 xmlFARegExecRollBack(exec);
3465 }
3466progress:
3467 continue;
3468 }
Daniel Veillard11ce4002006-03-10 00:36:23 +00003469error:
Daniel Veillard4255d502002-04-16 15:50:10 +00003470 if (exec->rollbacks != NULL) {
3471 if (exec->counts != NULL) {
3472 int i;
3473
3474 for (i = 0;i < exec->maxRollbacks;i++)
3475 if (exec->rollbacks[i].counts != NULL)
3476 xmlFree(exec->rollbacks[i].counts);
3477 }
3478 xmlFree(exec->rollbacks);
3479 }
Daniel Veillard40851d02012-08-17 20:34:05 +08003480 if (exec->state == NULL)
3481 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003482 if (exec->counts != NULL)
3483 xmlFree(exec->counts);
3484 if (exec->status == 0)
3485 return(1);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003486 if (exec->status == -1) {
3487 if (exec->nbPush > MAX_PUSH)
3488 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003489 return(0);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003490 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003491 return(exec->status);
3492}
3493
3494/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003495 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003496 * Progressive interface to the verifier one atom at a time *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003497 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00003498 ************************************************************************/
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003499#ifdef DEBUG_ERR
3500static void testerr(xmlRegExecCtxtPtr exec);
3501#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003502
3503/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003504 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00003505 * @comp: a precompiled regular expression
3506 * @callback: a callback function used for handling progresses in the
3507 * automata matching phase
3508 * @data: the context data associated to the callback in this context
3509 *
3510 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00003511 *
3512 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00003513 */
3514xmlRegExecCtxtPtr
3515xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3516 xmlRegExecCtxtPtr exec;
3517
3518 if (comp == NULL)
3519 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003520 if ((comp->compact == NULL) && (comp->states == NULL))
3521 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00003522 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3523 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003524 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003525 return(NULL);
3526 }
3527 memset(exec, 0, sizeof(xmlRegExecCtxt));
3528 exec->inputString = NULL;
3529 exec->index = 0;
3530 exec->determinist = 1;
3531 exec->maxRollbacks = 0;
3532 exec->nbRollbacks = 0;
3533 exec->rollbacks = NULL;
3534 exec->status = 0;
3535 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00003536 if (comp->compact == NULL)
3537 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00003538 exec->transno = 0;
3539 exec->transcount = 0;
3540 exec->callback = callback;
3541 exec->data = data;
3542 if (comp->nbCounters > 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003543 /*
3544 * For error handling, exec->counts is allocated twice the size
3545 * the second half is used to store the data in case of rollback
3546 */
3547 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3548 * 2);
Daniel Veillard4255d502002-04-16 15:50:10 +00003549 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003550 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003551 xmlFree(exec);
3552 return(NULL);
3553 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003554 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3555 exec->errCounts = &exec->counts[comp->nbCounters];
3556 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00003557 exec->counts = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003558 exec->errCounts = NULL;
3559 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003560 exec->inputStackMax = 0;
3561 exec->inputStackNr = 0;
3562 exec->inputStack = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003563 exec->errStateNo = -1;
3564 exec->errString = NULL;
Daniel Veillard94cc1032005-09-15 13:09:00 +00003565 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003566 return(exec);
3567}
3568
3569/**
3570 * xmlRegFreeExecCtxt:
3571 * @exec: a regular expression evaulation context
3572 *
3573 * Free the structures associated to a regular expression evaulation context.
3574 */
3575void
3576xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3577 if (exec == NULL)
3578 return;
3579
3580 if (exec->rollbacks != NULL) {
3581 if (exec->counts != NULL) {
3582 int i;
3583
3584 for (i = 0;i < exec->maxRollbacks;i++)
3585 if (exec->rollbacks[i].counts != NULL)
3586 xmlFree(exec->rollbacks[i].counts);
3587 }
3588 xmlFree(exec->rollbacks);
3589 }
3590 if (exec->counts != NULL)
3591 xmlFree(exec->counts);
3592 if (exec->inputStack != NULL) {
3593 int i;
3594
Daniel Veillard32370232002-10-16 14:08:14 +00003595 for (i = 0;i < exec->inputStackNr;i++) {
3596 if (exec->inputStack[i].value != NULL)
3597 xmlFree(exec->inputStack[i].value);
3598 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003599 xmlFree(exec->inputStack);
3600 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003601 if (exec->errString != NULL)
3602 xmlFree(exec->errString);
Daniel Veillard4255d502002-04-16 15:50:10 +00003603 xmlFree(exec);
3604}
3605
3606static void
3607xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3608 void *data) {
3609#ifdef DEBUG_PUSH
3610 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3611#endif
3612 if (exec->inputStackMax == 0) {
3613 exec->inputStackMax = 4;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003614 exec->inputStack = (xmlRegInputTokenPtr)
Daniel Veillard4255d502002-04-16 15:50:10 +00003615 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3616 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003617 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003618 exec->inputStackMax = 0;
3619 return;
3620 }
3621 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3622 xmlRegInputTokenPtr tmp;
3623
3624 exec->inputStackMax *= 2;
3625 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3626 exec->inputStackMax * sizeof(xmlRegInputToken));
3627 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003628 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003629 exec->inputStackMax /= 2;
3630 return;
3631 }
3632 exec->inputStack = tmp;
3633 }
3634 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3635 exec->inputStack[exec->inputStackNr].data = data;
3636 exec->inputStackNr++;
3637 exec->inputStack[exec->inputStackNr].value = NULL;
3638 exec->inputStack[exec->inputStackNr].data = NULL;
3639}
3640
Daniel Veillardc0826a72004-08-10 14:17:33 +00003641/**
3642 * xmlRegStrEqualWildcard:
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003643 * @expStr: the string to be evaluated
Daniel Veillardc0826a72004-08-10 14:17:33 +00003644 * @valStr: the validation string
3645 *
3646 * Checks if both strings are equal or have the same content. "*"
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003647 * can be used as a wildcard in @valStr; "|" is used as a seperator of
Daniel Veillardc0826a72004-08-10 14:17:33 +00003648 * substrings in both @expStr and @valStr.
3649 *
3650 * Returns 1 if the comparison is satisfied and the number of substrings
3651 * is equal, 0 otherwise.
3652 */
3653
3654static int
3655xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3656 if (expStr == valStr) return(1);
3657 if (expStr == NULL) return(0);
3658 if (valStr == NULL) return(0);
3659 do {
3660 /*
3661 * Eval if we have a wildcard for the current item.
3662 */
3663 if (*expStr != *valStr) {
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00003664 /* if one of them starts with a wildcard make valStr be it */
3665 if (*valStr == '*') {
3666 const xmlChar *tmp;
3667
3668 tmp = valStr;
3669 valStr = expStr;
3670 expStr = tmp;
3671 }
Daniel Veillardc0826a72004-08-10 14:17:33 +00003672 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3673 do {
3674 if (*valStr == XML_REG_STRING_SEPARATOR)
3675 break;
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00003676 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00003677 } while (*valStr != 0);
3678 continue;
3679 } else
3680 return(0);
3681 }
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00003682 expStr++;
3683 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00003684 } while (*valStr != 0);
3685 if (*expStr != 0)
3686 return (0);
3687 else
3688 return (1);
3689}
Daniel Veillard4255d502002-04-16 15:50:10 +00003690
3691/**
Daniel Veillard23e73572002-09-19 19:56:43 +00003692 * xmlRegCompactPushString:
3693 * @exec: a regexp execution context
3694 * @comp: the precompiled exec with a compact table
3695 * @value: a string token input
3696 * @data: data associated to the token to reuse in callbacks
3697 *
3698 * Push one input token in the execution context
3699 *
3700 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3701 * a negative value in case of error.
3702 */
3703static int
3704xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3705 xmlRegexpPtr comp,
3706 const xmlChar *value,
3707 void *data) {
3708 int state = exec->index;
3709 int i, target;
3710
3711 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3712 return(-1);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003713
Daniel Veillard23e73572002-09-19 19:56:43 +00003714 if (value == NULL) {
3715 /*
3716 * are we at a final state ?
3717 */
3718 if (comp->compact[state * (comp->nbstrings + 1)] ==
3719 XML_REGEXP_FINAL_STATE)
3720 return(1);
3721 return(0);
3722 }
3723
3724#ifdef DEBUG_PUSH
3725 printf("value pushed: %s\n", value);
3726#endif
3727
3728 /*
William M. Brackddf71d62004-05-06 04:17:26 +00003729 * Examine all outside transitions from current state
Daniel Veillard23e73572002-09-19 19:56:43 +00003730 */
3731 for (i = 0;i < comp->nbstrings;i++) {
3732 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3733 if ((target > 0) && (target <= comp->nbstates)) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003734 target--; /* to avoid 0 */
Daniel Veillardc0826a72004-08-10 14:17:33 +00003735 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003736 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00003737 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3738 exec->callback(exec->data, value,
3739 comp->transdata[state * comp->nbstrings + i], data);
3740 }
Daniel Veillard23e73572002-09-19 19:56:43 +00003741#ifdef DEBUG_PUSH
3742 printf("entering state %d\n", target);
3743#endif
3744 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003745 XML_REGEXP_SINK_STATE)
3746 goto error;
3747
3748 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillard23e73572002-09-19 19:56:43 +00003749 XML_REGEXP_FINAL_STATE)
3750 return(1);
3751 return(0);
3752 }
3753 }
3754 }
3755 /*
3756 * Failed to find an exit transition out from current state for the
3757 * current token
3758 */
3759#ifdef DEBUG_PUSH
3760 printf("failed to find a transition for %s on state %d\n", value, state);
3761#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003762error:
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003763 if (exec->errString != NULL)
3764 xmlFree(exec->errString);
3765 exec->errString = xmlStrdup(value);
3766 exec->errStateNo = state;
Daniel Veillard23e73572002-09-19 19:56:43 +00003767 exec->status = -1;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003768#ifdef DEBUG_ERR
3769 testerr(exec);
3770#endif
Daniel Veillard23e73572002-09-19 19:56:43 +00003771 return(-1);
3772}
3773
3774/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00003775 * xmlRegExecPushStringInternal:
Daniel Veillardea7751d2002-12-20 00:16:24 +00003776 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00003777 * @value: a string token input
3778 * @data: data associated to the token to reuse in callbacks
Daniel Veillard6e65e152005-08-09 11:09:52 +00003779 * @compound: value was assembled from 2 strings
Daniel Veillard4255d502002-04-16 15:50:10 +00003780 *
3781 * Push one input token in the execution context
3782 *
3783 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3784 * a negative value in case of error.
3785 */
Daniel Veillard6e65e152005-08-09 11:09:52 +00003786static int
3787xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3788 void *data, int compound) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003789 xmlRegTransPtr trans;
3790 xmlRegAtomPtr atom;
3791 int ret;
3792 int final = 0;
Daniel Veillard90700152005-01-08 22:05:09 +00003793 int progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003794
3795 if (exec == NULL)
3796 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00003797 if (exec->comp == NULL)
3798 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003799 if (exec->status != 0)
3800 return(exec->status);
3801
Daniel Veillard23e73572002-09-19 19:56:43 +00003802 if (exec->comp->compact != NULL)
3803 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3804
Daniel Veillard4255d502002-04-16 15:50:10 +00003805 if (value == NULL) {
3806 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3807 return(1);
3808 final = 1;
3809 }
3810
3811#ifdef DEBUG_PUSH
3812 printf("value pushed: %s\n", value);
3813#endif
3814 /*
3815 * If we have an active rollback stack push the new value there
3816 * and get back to where we were left
3817 */
3818 if ((value != NULL) && (exec->inputStackNr > 0)) {
3819 xmlFARegExecSaveInputString(exec, value, data);
3820 value = exec->inputStack[exec->index].value;
3821 data = exec->inputStack[exec->index].data;
3822#ifdef DEBUG_PUSH
3823 printf("value loaded: %s\n", value);
3824#endif
3825 }
3826
3827 while ((exec->status == 0) &&
3828 ((value != NULL) ||
3829 ((final == 1) &&
3830 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3831
3832 /*
3833 * End of input on non-terminal state, rollback, however we may
3834 * still have epsilon like transition for counted transitions
3835 * on counters, in that case don't break too early.
3836 */
Daniel Veillardb509f152002-04-17 16:28:10 +00003837 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00003838 goto rollback;
3839
3840 exec->transcount = 0;
3841 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3842 trans = &exec->state->trans[exec->transno];
3843 if (trans->to < 0)
3844 continue;
3845 atom = trans->atom;
3846 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00003847 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3848 int i;
3849 int count;
3850 xmlRegTransPtr t;
3851 xmlRegCounterPtr counter;
3852
3853 ret = 0;
3854
3855#ifdef DEBUG_PUSH
3856 printf("testing all lax %d\n", trans->count);
3857#endif
3858 /*
3859 * Check all counted transitions from the current state
3860 */
3861 if ((value == NULL) && (final)) {
3862 ret = 1;
3863 } else if (value != NULL) {
3864 for (i = 0;i < exec->state->nbTrans;i++) {
3865 t = &exec->state->trans[i];
3866 if ((t->counter < 0) || (t == trans))
3867 continue;
3868 counter = &exec->comp->counters[t->counter];
3869 count = exec->counts[t->counter];
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003870 if ((count < counter->max) &&
Daniel Veillard441bc322002-04-20 17:38:48 +00003871 (t->atom != NULL) &&
3872 (xmlStrEqual(value, t->atom->valuep))) {
3873 ret = 0;
3874 break;
3875 }
3876 if ((count >= counter->min) &&
3877 (count < counter->max) &&
Daniel Veillard11ce4002006-03-10 00:36:23 +00003878 (t->atom != NULL) &&
Daniel Veillard441bc322002-04-20 17:38:48 +00003879 (xmlStrEqual(value, t->atom->valuep))) {
3880 ret = 1;
3881 break;
3882 }
3883 }
3884 }
3885 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00003886 int i;
3887 int count;
3888 xmlRegTransPtr t;
3889 xmlRegCounterPtr counter;
3890
3891 ret = 1;
3892
3893#ifdef DEBUG_PUSH
3894 printf("testing all %d\n", trans->count);
3895#endif
3896 /*
3897 * Check all counted transitions from the current state
3898 */
3899 for (i = 0;i < exec->state->nbTrans;i++) {
3900 t = &exec->state->trans[i];
3901 if ((t->counter < 0) || (t == trans))
3902 continue;
3903 counter = &exec->comp->counters[t->counter];
3904 count = exec->counts[t->counter];
3905 if ((count < counter->min) || (count > counter->max)) {
3906 ret = 0;
3907 break;
3908 }
3909 }
3910 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003911 int count;
3912 xmlRegCounterPtr counter;
3913
3914 /*
3915 * A counted transition.
3916 */
3917
3918 count = exec->counts[trans->count];
3919 counter = &exec->comp->counters[trans->count];
3920#ifdef DEBUG_PUSH
3921 printf("testing count %d: val %d, min %d, max %d\n",
3922 trans->count, count, counter->min, counter->max);
3923#endif
3924 ret = ((count >= counter->min) && (count <= counter->max));
3925 } else if (atom == NULL) {
3926 fprintf(stderr, "epsilon transition left at runtime\n");
3927 exec->status = -2;
3928 break;
3929 } else if (value != NULL) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00003930 ret = xmlRegStrEqualWildcard(atom->valuep, value);
Daniel Veillard6e65e152005-08-09 11:09:52 +00003931 if (atom->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00003932 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00003933 if (!compound)
3934 ret = 0;
3935 }
Daniel Veillard441bc322002-04-20 17:38:48 +00003936 if ((ret == 1) && (trans->counter >= 0)) {
3937 xmlRegCounterPtr counter;
3938 int count;
3939
3940 count = exec->counts[trans->counter];
3941 counter = &exec->comp->counters[trans->counter];
3942 if (count >= counter->max)
3943 ret = 0;
3944 }
3945
Daniel Veillard4255d502002-04-16 15:50:10 +00003946 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3947 xmlRegStatePtr to = exec->comp->states[trans->to];
3948
3949 /*
3950 * this is a multiple input sequence
3951 */
3952 if (exec->state->nbTrans > exec->transno + 1) {
3953 if (exec->inputStackNr <= 0) {
3954 xmlFARegExecSaveInputString(exec, value, data);
3955 }
3956 xmlFARegExecSave(exec);
3957 }
3958 exec->transcount = 1;
3959 do {
3960 /*
3961 * Try to progress as much as possible on the input
3962 */
3963 if (exec->transcount == atom->max) {
3964 break;
3965 }
3966 exec->index++;
3967 value = exec->inputStack[exec->index].value;
3968 data = exec->inputStack[exec->index].data;
3969#ifdef DEBUG_PUSH
3970 printf("value loaded: %s\n", value);
3971#endif
3972
3973 /*
3974 * End of input: stop here
3975 */
3976 if (value == NULL) {
3977 exec->index --;
3978 break;
3979 }
3980 if (exec->transcount >= atom->min) {
3981 int transno = exec->transno;
3982 xmlRegStatePtr state = exec->state;
3983
3984 /*
3985 * The transition is acceptable save it
3986 */
3987 exec->transno = -1; /* trick */
3988 exec->state = to;
3989 if (exec->inputStackNr <= 0) {
3990 xmlFARegExecSaveInputString(exec, value, data);
3991 }
3992 xmlFARegExecSave(exec);
3993 exec->transno = transno;
3994 exec->state = state;
3995 }
3996 ret = xmlStrEqual(value, atom->valuep);
3997 exec->transcount++;
3998 } while (ret == 1);
3999 if (exec->transcount < atom->min)
4000 ret = 0;
4001
4002 /*
4003 * If the last check failed but one transition was found
4004 * possible, rollback
4005 */
4006 if (ret < 0)
4007 ret = 0;
4008 if (ret == 0) {
4009 goto rollback;
4010 }
4011 }
4012 }
4013 if (ret == 1) {
William M. Brack98873952003-12-26 06:03:14 +00004014 if ((exec->callback != NULL) && (atom != NULL) &&
4015 (data != NULL)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004016 exec->callback(exec->data, atom->valuep,
4017 atom->data, data);
4018 }
4019 if (exec->state->nbTrans > exec->transno + 1) {
4020 if (exec->inputStackNr <= 0) {
4021 xmlFARegExecSaveInputString(exec, value, data);
4022 }
4023 xmlFARegExecSave(exec);
4024 }
4025 if (trans->counter >= 0) {
4026#ifdef DEBUG_PUSH
4027 printf("Increasing count %d\n", trans->counter);
4028#endif
4029 exec->counts[trans->counter]++;
4030 }
Daniel Veillard10752282005-08-08 13:05:13 +00004031 if ((trans->count >= 0) &&
4032 (trans->count < REGEXP_ALL_COUNTER)) {
4033#ifdef DEBUG_REGEXP_EXEC
4034 printf("resetting count %d on transition\n",
4035 trans->count);
4036#endif
4037 exec->counts[trans->count] = 0;
4038 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004039#ifdef DEBUG_PUSH
4040 printf("entering state %d\n", trans->to);
4041#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004042 if ((exec->comp->states[trans->to] != NULL) &&
4043 (exec->comp->states[trans->to]->type ==
4044 XML_REGEXP_SINK_STATE)) {
4045 /*
4046 * entering a sink state, save the current state as error
4047 * state.
4048 */
4049 if (exec->errString != NULL)
4050 xmlFree(exec->errString);
4051 exec->errString = xmlStrdup(value);
4052 exec->errState = exec->state;
4053 memcpy(exec->errCounts, exec->counts,
4054 exec->comp->nbCounters * sizeof(int));
4055 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004056 exec->state = exec->comp->states[trans->to];
4057 exec->transno = 0;
4058 if (trans->atom != NULL) {
4059 if (exec->inputStack != NULL) {
4060 exec->index++;
4061 if (exec->index < exec->inputStackNr) {
4062 value = exec->inputStack[exec->index].value;
4063 data = exec->inputStack[exec->index].data;
4064#ifdef DEBUG_PUSH
4065 printf("value loaded: %s\n", value);
4066#endif
4067 } else {
4068 value = NULL;
4069 data = NULL;
4070#ifdef DEBUG_PUSH
4071 printf("end of input\n");
4072#endif
4073 }
4074 } else {
4075 value = NULL;
4076 data = NULL;
4077#ifdef DEBUG_PUSH
4078 printf("end of input\n");
4079#endif
4080 }
4081 }
4082 goto progress;
4083 } else if (ret < 0) {
4084 exec->status = -4;
4085 break;
4086 }
4087 }
4088 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4089rollback:
Daniel Veillard90700152005-01-08 22:05:09 +00004090 /*
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004091 * if we didn't yet rollback on the current input
4092 * store the current state as the error state.
Daniel Veillard90700152005-01-08 22:05:09 +00004093 */
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004094 if ((progress) && (exec->state != NULL) &&
4095 (exec->state->type != XML_REGEXP_SINK_STATE)) {
Daniel Veillard90700152005-01-08 22:05:09 +00004096 progress = 0;
4097 if (exec->errString != NULL)
4098 xmlFree(exec->errString);
4099 exec->errString = xmlStrdup(value);
4100 exec->errState = exec->state;
Nick Wellnhofer34e44562017-05-31 16:48:27 +02004101 if (exec->comp->nbCounters)
4102 memcpy(exec->errCounts, exec->counts,
4103 exec->comp->nbCounters * sizeof(int));
Daniel Veillard90700152005-01-08 22:05:09 +00004104 }
4105
Daniel Veillard4255d502002-04-16 15:50:10 +00004106 /*
4107 * Failed to find a way out
4108 */
4109 exec->determinist = 0;
4110 xmlFARegExecRollBack(exec);
Gaurav2671b012013-09-11 14:59:06 +08004111 if ((exec->inputStack != NULL ) && (exec->status == 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004112 value = exec->inputStack[exec->index].value;
4113 data = exec->inputStack[exec->index].data;
4114#ifdef DEBUG_PUSH
4115 printf("value loaded: %s\n", value);
4116#endif
4117 }
4118 }
Daniel Veillard90700152005-01-08 22:05:09 +00004119 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00004120progress:
Daniel Veillard90700152005-01-08 22:05:09 +00004121 progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004122 continue;
4123 }
4124 if (exec->status == 0) {
4125 return(exec->state->type == XML_REGEXP_FINAL_STATE);
4126 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004127#ifdef DEBUG_ERR
Daniel Veillard90700152005-01-08 22:05:09 +00004128 if (exec->status < 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004129 testerr(exec);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004130 }
Daniel Veillard90700152005-01-08 22:05:09 +00004131#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00004132 return(exec->status);
4133}
4134
Daniel Veillard52b48c72003-04-13 19:53:42 +00004135/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00004136 * xmlRegExecPushString:
4137 * @exec: a regexp execution context or NULL to indicate the end
4138 * @value: a string token input
4139 * @data: data associated to the token to reuse in callbacks
4140 *
4141 * Push one input token in the execution context
4142 *
4143 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4144 * a negative value in case of error.
4145 */
4146int
4147xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4148 void *data) {
4149 return(xmlRegExecPushStringInternal(exec, value, data, 0));
4150}
4151
4152/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00004153 * xmlRegExecPushString2:
4154 * @exec: a regexp execution context or NULL to indicate the end
4155 * @value: the first string token input
4156 * @value2: the second string token input
4157 * @data: data associated to the token to reuse in callbacks
4158 *
4159 * Push one input token in the execution context
4160 *
4161 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4162 * a negative value in case of error.
4163 */
4164int
4165xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4166 const xmlChar *value2, void *data) {
4167 xmlChar buf[150];
4168 int lenn, lenp, ret;
4169 xmlChar *str;
4170
4171 if (exec == NULL)
4172 return(-1);
4173 if (exec->comp == NULL)
4174 return(-1);
4175 if (exec->status != 0)
4176 return(exec->status);
4177
4178 if (value2 == NULL)
4179 return(xmlRegExecPushString(exec, value, data));
4180
4181 lenn = strlen((char *) value2);
4182 lenp = strlen((char *) value);
4183
4184 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00004185 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004186 if (str == NULL) {
4187 exec->status = -1;
4188 return(-1);
4189 }
4190 } else {
4191 str = buf;
4192 }
4193 memcpy(&str[0], value, lenp);
Daniel Veillardc0826a72004-08-10 14:17:33 +00004194 str[lenp] = XML_REG_STRING_SEPARATOR;
Daniel Veillard52b48c72003-04-13 19:53:42 +00004195 memcpy(&str[lenp + 1], value2, lenn);
4196 str[lenn + lenp + 1] = 0;
4197
4198 if (exec->comp->compact != NULL)
4199 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4200 else
Daniel Veillard6e65e152005-08-09 11:09:52 +00004201 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004202
4203 if (str != buf)
Daniel Veillard0b1ff142005-12-28 21:13:33 +00004204 xmlFree(str);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004205 return(ret);
4206}
4207
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004208/**
Daniel Veillard77005e62005-07-19 16:26:18 +00004209 * xmlRegExecGetValues:
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004210 * @exec: a regexp execution context
4211 * @err: error extraction or normal one
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004212 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004213 * @nbneg: return number of negative transitions
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004214 * @values: pointer to the array of acceptable values
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004215 * @terminal: return value if this was a terminal state
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004216 *
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004217 * Extract informations from the regexp execution, internal routine to
4218 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004219 *
4220 * Returns: 0 in case of success or -1 in case of error.
4221 */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004222static int
4223xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004224 int *nbval, int *nbneg,
4225 xmlChar **values, int *terminal) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004226 int maxval;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004227 int nb = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004228
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004229 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004230 (values == NULL) || (*nbval <= 0))
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004231 return(-1);
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004232
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004233 maxval = *nbval;
4234 *nbval = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004235 *nbneg = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004236 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4237 xmlRegexpPtr comp;
4238 int target, i, state;
4239
4240 comp = exec->comp;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004241
4242 if (err) {
4243 if (exec->errStateNo == -1) return(-1);
4244 state = exec->errStateNo;
4245 } else {
4246 state = exec->index;
4247 }
4248 if (terminal != NULL) {
4249 if (comp->compact[state * (comp->nbstrings + 1)] ==
4250 XML_REGEXP_FINAL_STATE)
4251 *terminal = 1;
4252 else
4253 *terminal = 0;
4254 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004255 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004256 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004257 if ((target > 0) && (target <= comp->nbstates) &&
4258 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4259 XML_REGEXP_SINK_STATE)) {
4260 values[nb++] = comp->stringMap[i];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004261 (*nbval)++;
4262 }
4263 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004264 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4265 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4266 if ((target > 0) && (target <= comp->nbstates) &&
4267 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4268 XML_REGEXP_SINK_STATE)) {
4269 values[nb++] = comp->stringMap[i];
4270 (*nbneg)++;
4271 }
4272 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004273 } else {
4274 int transno;
4275 xmlRegTransPtr trans;
4276 xmlRegAtomPtr atom;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004277 xmlRegStatePtr state;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004278
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004279 if (terminal != NULL) {
4280 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4281 *terminal = 1;
4282 else
4283 *terminal = 0;
4284 }
4285
4286 if (err) {
4287 if (exec->errState == NULL) return(-1);
4288 state = exec->errState;
4289 } else {
4290 if (exec->state == NULL) return(-1);
4291 state = exec->state;
4292 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004293 for (transno = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004294 (transno < state->nbTrans) && (nb < maxval);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004295 transno++) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004296 trans = &state->trans[transno];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004297 if (trans->to < 0)
4298 continue;
4299 atom = trans->atom;
4300 if ((atom == NULL) || (atom->valuep == NULL))
4301 continue;
4302 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004303 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004304 TODO;
4305 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004306 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004307 TODO;
4308 } else if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00004309 xmlRegCounterPtr counter = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004310 int count;
4311
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004312 if (err)
4313 count = exec->errCounts[trans->counter];
4314 else
4315 count = exec->counts[trans->counter];
Daniel Veillard11ce4002006-03-10 00:36:23 +00004316 if (exec->comp != NULL)
4317 counter = &exec->comp->counters[trans->counter];
4318 if ((counter == NULL) || (count < counter->max)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004319 if (atom->neg)
4320 values[nb++] = (xmlChar *) atom->valuep2;
4321 else
4322 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004323 (*nbval)++;
4324 }
4325 } else {
Gaurav2671b012013-09-11 14:59:06 +08004326 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004327 (exec->comp->states[trans->to]->type !=
4328 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004329 if (atom->neg)
4330 values[nb++] = (xmlChar *) atom->valuep2;
4331 else
4332 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004333 (*nbval)++;
4334 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004335 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004336 }
4337 for (transno = 0;
4338 (transno < state->nbTrans) && (nb < maxval);
4339 transno++) {
4340 trans = &state->trans[transno];
4341 if (trans->to < 0)
4342 continue;
4343 atom = trans->atom;
4344 if ((atom == NULL) || (atom->valuep == NULL))
4345 continue;
4346 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4347 continue;
4348 } else if (trans->count == REGEXP_ALL_COUNTER) {
4349 continue;
4350 } else if (trans->counter >= 0) {
4351 continue;
4352 } else {
4353 if ((exec->comp->states[trans->to] != NULL) &&
4354 (exec->comp->states[trans->to]->type ==
4355 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004356 if (atom->neg)
4357 values[nb++] = (xmlChar *) atom->valuep2;
4358 else
4359 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004360 (*nbneg)++;
4361 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004362 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004363 }
4364 }
4365 return(0);
4366}
4367
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004368/**
4369 * xmlRegExecNextValues:
4370 * @exec: a regexp execution context
4371 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004372 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004373 * @values: pointer to the array of acceptable values
4374 * @terminal: return value if this was a terminal state
4375 *
4376 * Extract informations from the regexp execution,
4377 * the parameter @values must point to an array of @nbval string pointers
4378 * on return nbval will contain the number of possible strings in that
4379 * state and the @values array will be updated with them. The string values
4380 * returned will be freed with the @exec context and don't need to be
4381 * deallocated.
4382 *
4383 * Returns: 0 in case of success or -1 in case of error.
4384 */
4385int
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004386xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4387 xmlChar **values, int *terminal) {
4388 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004389}
4390
4391/**
4392 * xmlRegExecErrInfo:
4393 * @exec: a regexp execution context generating an error
4394 * @string: return value for the error string
4395 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004396 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004397 * @values: pointer to the array of acceptable values
4398 * @terminal: return value if this was a terminal state
4399 *
4400 * Extract error informations from the regexp execution, the parameter
4401 * @string will be updated with the value pushed and not accepted,
4402 * the parameter @values must point to an array of @nbval string pointers
4403 * on return nbval will contain the number of possible strings in that
4404 * state and the @values array will be updated with them. The string values
4405 * returned will be freed with the @exec context and don't need to be
4406 * deallocated.
4407 *
4408 * Returns: 0 in case of success or -1 in case of error.
4409 */
4410int
4411xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004412 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004413 if (exec == NULL)
4414 return(-1);
4415 if (string != NULL) {
4416 if (exec->status != 0)
4417 *string = exec->errString;
4418 else
4419 *string = NULL;
4420 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004421 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004422}
4423
#ifdef DEBUG_ERR
/*
 * testerr:
 * Debug-only helper: calls xmlRegExecErrInfo() with room for five
 * expected values; the results are not used.
 */
static void testerr(xmlRegExecCtxtPtr exec) {
    const xmlChar *rejected;
    xmlChar *expected[5];
    int capacity = 5;
    int negatives;
    int isFinal;

    xmlRegExecErrInfo(exec, &rejected, &capacity, &negatives, expected,
                      &isFinal);
}
#endif
4434
#if 0
/*
 * xmlRegExecPushChar:
 * @exec: a regexp execution context
 * @UCS: not referenced anywhere in the body below
 *
 * Disabled (compiled out by "#if 0") character-level execution loop.
 * It walks the transitions of the current state against
 * exec->inputString, saving alternatives with xmlFARegExecSave() and
 * backtracking with xmlFARegExecRollBack().
 *
 * NOTE(review): the function is declared to return int but has no
 * return statement after the main loop; this must be fixed before the
 * code is ever re-enabled.
 */
static int
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
    xmlRegTransPtr trans;
    xmlRegAtomPtr atom;
    int ret;
    int codepoint, len;

    if (exec == NULL)
        return(-1);
    if (exec->status != 0)
        return(exec->status);

    /* loop until an error is flagged or input ends on a final state */
    while ((exec->status == 0) &&
           ((exec->inputString[exec->index] != 0) ||
            (exec->state->type != XML_REGEXP_FINAL_STATE))) {

        /*
         * End of input on non-terminal state, rollback, however we may
         * still have epsilon like transition for counted transitions
         * on counters, in that case don't break too early.
         */
        if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
            goto rollback;

        exec->transcount = 0;
        for (;exec->transno < exec->state->nbTrans;exec->transno++) {
            trans = &exec->state->trans[exec->transno];
            if (trans->to < 0)
                continue;
            atom = trans->atom;
            ret = 0;
            if (trans->count >= 0) {
                int count;
                xmlRegCounterPtr counter;

                /*
                 * A counted transition.
                 */

                count = exec->counts[trans->count];
                counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
                printf("testing count %d: val %d, min %d, max %d\n",
                       trans->count, count, counter->min, counter->max);
#endif
                ret = ((count >= counter->min) && (count <= counter->max));
            } else if (atom == NULL) {
                fprintf(stderr, "epsilon transition left at runtime\n");
                exec->status = -2;
                break;
            } else if (exec->inputString[exec->index] != 0) {
                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
                ret = xmlRegCheckCharacter(atom, codepoint);
                if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
                    xmlRegStatePtr to = exec->comp->states[trans->to];

                    /*
                     * this is a multiple input sequence
                     */
                    if (exec->state->nbTrans > exec->transno + 1) {
                        xmlFARegExecSave(exec);
                    }
                    exec->transcount = 1;
                    do {
                        /*
                         * Try to progress as much as possible on the input
                         */
                        if (exec->transcount == atom->max) {
                            break;
                        }
                        exec->index += len;
                        /*
                         * End of input: stop here
                         */
                        if (exec->inputString[exec->index] == 0) {
                            exec->index -= len;
                            break;
                        }
                        if (exec->transcount >= atom->min) {
                            int transno = exec->transno;
                            xmlRegStatePtr state = exec->state;

                            /*
                             * The transition is acceptable save it
                             */
                            exec->transno = -1; /* trick */
                            exec->state = to;
                            xmlFARegExecSave(exec);
                            exec->transno = transno;
                            exec->state = state;
                        }
                        codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
                                              len);
                        ret = xmlRegCheckCharacter(atom, codepoint);
                        exec->transcount++;
                    } while (ret == 1);
                    if (exec->transcount < atom->min)
                        ret = 0;

                    /*
                     * If the last check failed but one transition was found
                     * possible, rollback
                     */
                    if (ret < 0)
                        ret = 0;
                    if (ret == 0) {
                        goto rollback;
                    }
                }
            }
            if (ret == 1) {
                /* keep the remaining transitions as a backtracking point */
                if (exec->state->nbTrans > exec->transno + 1) {
                    xmlFARegExecSave(exec);
                }
                /*
                 * restart count for expressions like this ((abc){2})*
                 */
                if (trans->count >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Reset count %d\n", trans->count);
#endif
                    exec->counts[trans->count] = 0;
                }
                if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Increasing count %d\n", trans->counter);
#endif
                    exec->counts[trans->counter]++;
                }
#ifdef DEBUG_REGEXP_EXEC
                printf("entering state %d\n", trans->to);
#endif
                exec->state = exec->comp->states[trans->to];
                exec->transno = 0;
                /* NOTE(review): len looks unset here when the counted branch
                 * above was taken with a non-NULL atom -- confirm */
                if (trans->atom != NULL) {
                    exec->index += len;
                }
                goto progress;
            } else if (ret < 0) {
                exec->status = -4;
                break;
            }
        }
        if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
            /*
             * Failed to find a way out
             */
            exec->determinist = 0;
            xmlFARegExecRollBack(exec);
        }
progress:
        continue;
    }
}
#endif
4592/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004593 * *
William M. Brackddf71d62004-05-06 04:17:26 +00004594 * Parser for the Schemas Datatype Regular Expressions *
Daniel Veillard4255d502002-04-16 15:50:10 +00004595 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004596 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00004597 ************************************************************************/
4598
4599/**
4600 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00004601 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004602 *
4603 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4604 */
4605static int
4606xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4607 int cur;
4608 int len;
4609
4610 cur = CUR_SCHAR(ctxt->cur, len);
4611 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4612 (cur == '*') || (cur == '+') || (cur == '(') ||
4613 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4614 (cur == 0x5D) || (cur == 0))
4615 return(-1);
4616 return(cur);
4617}
4618
4619/**
4620 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00004621 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004622 *
4623 * [27] charProp ::= IsCategory | IsBlock
4624 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004625 * Separators | Symbols | Others
Daniel Veillard4255d502002-04-16 15:50:10 +00004626 * [29] Letters ::= 'L' [ultmo]?
4627 * [30] Marks ::= 'M' [nce]?
4628 * [31] Numbers ::= 'N' [dlo]?
4629 * [32] Punctuation ::= 'P' [cdseifo]?
4630 * [33] Separators ::= 'Z' [slp]?
4631 * [34] Symbols ::= 'S' [mcko]?
4632 * [35] Others ::= 'C' [cfon]?
4633 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4634 */
4635static void
4636xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4637 int cur;
William M. Brack779af002003-08-01 15:55:39 +00004638 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00004639 xmlChar *blockName = NULL;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004640
Daniel Veillard4255d502002-04-16 15:50:10 +00004641 cur = CUR;
4642 if (cur == 'L') {
4643 NEXT;
4644 cur = CUR;
4645 if (cur == 'u') {
4646 NEXT;
4647 type = XML_REGEXP_LETTER_UPPERCASE;
4648 } else if (cur == 'l') {
4649 NEXT;
4650 type = XML_REGEXP_LETTER_LOWERCASE;
4651 } else if (cur == 't') {
4652 NEXT;
4653 type = XML_REGEXP_LETTER_TITLECASE;
4654 } else if (cur == 'm') {
4655 NEXT;
4656 type = XML_REGEXP_LETTER_MODIFIER;
4657 } else if (cur == 'o') {
4658 NEXT;
4659 type = XML_REGEXP_LETTER_OTHERS;
4660 } else {
4661 type = XML_REGEXP_LETTER;
4662 }
4663 } else if (cur == 'M') {
4664 NEXT;
4665 cur = CUR;
4666 if (cur == 'n') {
4667 NEXT;
4668 /* nonspacing */
4669 type = XML_REGEXP_MARK_NONSPACING;
4670 } else if (cur == 'c') {
4671 NEXT;
4672 /* spacing combining */
4673 type = XML_REGEXP_MARK_SPACECOMBINING;
4674 } else if (cur == 'e') {
4675 NEXT;
4676 /* enclosing */
4677 type = XML_REGEXP_MARK_ENCLOSING;
4678 } else {
4679 /* all marks */
4680 type = XML_REGEXP_MARK;
4681 }
4682 } else if (cur == 'N') {
4683 NEXT;
4684 cur = CUR;
4685 if (cur == 'd') {
4686 NEXT;
4687 /* digital */
4688 type = XML_REGEXP_NUMBER_DECIMAL;
4689 } else if (cur == 'l') {
4690 NEXT;
4691 /* letter */
4692 type = XML_REGEXP_NUMBER_LETTER;
4693 } else if (cur == 'o') {
4694 NEXT;
4695 /* other */
4696 type = XML_REGEXP_NUMBER_OTHERS;
4697 } else {
4698 /* all numbers */
4699 type = XML_REGEXP_NUMBER;
4700 }
4701 } else if (cur == 'P') {
4702 NEXT;
4703 cur = CUR;
4704 if (cur == 'c') {
4705 NEXT;
4706 /* connector */
4707 type = XML_REGEXP_PUNCT_CONNECTOR;
4708 } else if (cur == 'd') {
4709 NEXT;
4710 /* dash */
4711 type = XML_REGEXP_PUNCT_DASH;
4712 } else if (cur == 's') {
4713 NEXT;
4714 /* open */
4715 type = XML_REGEXP_PUNCT_OPEN;
4716 } else if (cur == 'e') {
4717 NEXT;
4718 /* close */
4719 type = XML_REGEXP_PUNCT_CLOSE;
4720 } else if (cur == 'i') {
4721 NEXT;
4722 /* initial quote */
4723 type = XML_REGEXP_PUNCT_INITQUOTE;
4724 } else if (cur == 'f') {
4725 NEXT;
4726 /* final quote */
4727 type = XML_REGEXP_PUNCT_FINQUOTE;
4728 } else if (cur == 'o') {
4729 NEXT;
4730 /* other */
4731 type = XML_REGEXP_PUNCT_OTHERS;
4732 } else {
4733 /* all punctuation */
4734 type = XML_REGEXP_PUNCT;
4735 }
4736 } else if (cur == 'Z') {
4737 NEXT;
4738 cur = CUR;
4739 if (cur == 's') {
4740 NEXT;
4741 /* space */
4742 type = XML_REGEXP_SEPAR_SPACE;
4743 } else if (cur == 'l') {
4744 NEXT;
4745 /* line */
4746 type = XML_REGEXP_SEPAR_LINE;
4747 } else if (cur == 'p') {
4748 NEXT;
4749 /* paragraph */
4750 type = XML_REGEXP_SEPAR_PARA;
4751 } else {
4752 /* all separators */
4753 type = XML_REGEXP_SEPAR;
4754 }
4755 } else if (cur == 'S') {
4756 NEXT;
4757 cur = CUR;
4758 if (cur == 'm') {
4759 NEXT;
4760 type = XML_REGEXP_SYMBOL_MATH;
4761 /* math */
4762 } else if (cur == 'c') {
4763 NEXT;
4764 type = XML_REGEXP_SYMBOL_CURRENCY;
4765 /* currency */
4766 } else if (cur == 'k') {
4767 NEXT;
4768 type = XML_REGEXP_SYMBOL_MODIFIER;
4769 /* modifiers */
4770 } else if (cur == 'o') {
4771 NEXT;
4772 type = XML_REGEXP_SYMBOL_OTHERS;
4773 /* other */
4774 } else {
4775 /* all symbols */
4776 type = XML_REGEXP_SYMBOL;
4777 }
4778 } else if (cur == 'C') {
4779 NEXT;
4780 cur = CUR;
4781 if (cur == 'c') {
4782 NEXT;
4783 /* control */
4784 type = XML_REGEXP_OTHER_CONTROL;
4785 } else if (cur == 'f') {
4786 NEXT;
4787 /* format */
4788 type = XML_REGEXP_OTHER_FORMAT;
4789 } else if (cur == 'o') {
4790 NEXT;
4791 /* private use */
4792 type = XML_REGEXP_OTHER_PRIVATE;
4793 } else if (cur == 'n') {
4794 NEXT;
4795 /* not assigned */
4796 type = XML_REGEXP_OTHER_NA;
4797 } else {
4798 /* all others */
4799 type = XML_REGEXP_OTHER;
4800 }
4801 } else if (cur == 'I') {
4802 const xmlChar *start;
4803 NEXT;
4804 cur = CUR;
4805 if (cur != 's') {
4806 ERROR("IsXXXX expected");
4807 return;
4808 }
4809 NEXT;
4810 start = ctxt->cur;
4811 cur = CUR;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004812 if (((cur >= 'a') && (cur <= 'z')) ||
4813 ((cur >= 'A') && (cur <= 'Z')) ||
4814 ((cur >= '0') && (cur <= '9')) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00004815 (cur == 0x2D)) {
4816 NEXT;
4817 cur = CUR;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004818 while (((cur >= 'a') && (cur <= 'z')) ||
4819 ((cur >= 'A') && (cur <= 'Z')) ||
4820 ((cur >= '0') && (cur <= '9')) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00004821 (cur == 0x2D)) {
4822 NEXT;
4823 cur = CUR;
4824 }
4825 }
4826 type = XML_REGEXP_BLOCK_NAME;
4827 blockName = xmlStrndup(start, ctxt->cur - start);
4828 } else {
4829 ERROR("Unknown char property");
4830 return;
4831 }
4832 if (ctxt->atom == NULL) {
4833 ctxt->atom = xmlRegNewAtom(ctxt, type);
4834 if (ctxt->atom != NULL)
4835 ctxt->atom->valuep = blockName;
4836 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4837 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4838 type, 0, 0, blockName);
4839 }
4840}
4841
4842/**
4843 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00004844 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004845 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004846 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
Daniel Veillard4255d502002-04-16 15:50:10 +00004847 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4848 * [25] catEsc ::= '\p{' charProp '}'
4849 * [26] complEsc ::= '\P{' charProp '}'
4850 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4851 */
4852static void
4853xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4854 int cur;
4855
4856 if (CUR == '.') {
4857 if (ctxt->atom == NULL) {
4858 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4859 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4860 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4861 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4862 }
4863 NEXT;
4864 return;
4865 }
4866 if (CUR != '\\') {
4867 ERROR("Escaped sequence: expecting \\");
4868 return;
4869 }
4870 NEXT;
4871 cur = CUR;
4872 if (cur == 'p') {
4873 NEXT;
4874 if (CUR != '{') {
4875 ERROR("Expecting '{'");
4876 return;
4877 }
4878 NEXT;
4879 xmlFAParseCharProp(ctxt);
4880 if (CUR != '}') {
4881 ERROR("Expecting '}'");
4882 return;
4883 }
4884 NEXT;
4885 } else if (cur == 'P') {
4886 NEXT;
4887 if (CUR != '{') {
4888 ERROR("Expecting '{'");
4889 return;
4890 }
4891 NEXT;
4892 xmlFAParseCharProp(ctxt);
Nick Wellnhofer8a0c6692017-07-04 17:13:06 +02004893 if (ctxt->atom != NULL)
4894 ctxt->atom->neg = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004895 if (CUR != '}') {
4896 ERROR("Expecting '}'");
4897 return;
4898 }
4899 NEXT;
4900 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4901 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4902 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4903 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4904 (cur == 0x5E)) {
4905 if (ctxt->atom == NULL) {
4906 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
Daniel Veillard99c394d2005-07-14 12:58:49 +00004907 if (ctxt->atom != NULL) {
4908 switch (cur) {
4909 case 'n':
4910 ctxt->atom->codepoint = '\n';
4911 break;
4912 case 'r':
4913 ctxt->atom->codepoint = '\r';
4914 break;
4915 case 't':
4916 ctxt->atom->codepoint = '\t';
4917 break;
4918 default:
4919 ctxt->atom->codepoint = cur;
4920 }
4921 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004922 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
Daniel Veillard9543aee2010-03-15 11:13:39 +01004923 switch (cur) {
4924 case 'n':
4925 cur = '\n';
4926 break;
4927 case 'r':
4928 cur = '\r';
4929 break;
4930 case 't':
4931 cur = '\t';
4932 break;
4933 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004934 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4935 XML_REGEXP_CHARVAL, cur, cur, NULL);
4936 }
4937 NEXT;
4938 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4939 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4940 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00004941 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00004942
4943 switch (cur) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004944 case 's':
Daniel Veillard4255d502002-04-16 15:50:10 +00004945 type = XML_REGEXP_ANYSPACE;
4946 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004947 case 'S':
Daniel Veillard4255d502002-04-16 15:50:10 +00004948 type = XML_REGEXP_NOTSPACE;
4949 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004950 case 'i':
Daniel Veillard4255d502002-04-16 15:50:10 +00004951 type = XML_REGEXP_INITNAME;
4952 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004953 case 'I':
Daniel Veillard4255d502002-04-16 15:50:10 +00004954 type = XML_REGEXP_NOTINITNAME;
4955 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004956 case 'c':
Daniel Veillard4255d502002-04-16 15:50:10 +00004957 type = XML_REGEXP_NAMECHAR;
4958 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004959 case 'C':
Daniel Veillard4255d502002-04-16 15:50:10 +00004960 type = XML_REGEXP_NOTNAMECHAR;
4961 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004962 case 'd':
Daniel Veillard4255d502002-04-16 15:50:10 +00004963 type = XML_REGEXP_DECIMAL;
4964 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004965 case 'D':
Daniel Veillard4255d502002-04-16 15:50:10 +00004966 type = XML_REGEXP_NOTDECIMAL;
4967 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004968 case 'w':
Daniel Veillard4255d502002-04-16 15:50:10 +00004969 type = XML_REGEXP_REALCHAR;
4970 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004971 case 'W':
Daniel Veillard4255d502002-04-16 15:50:10 +00004972 type = XML_REGEXP_NOTREALCHAR;
4973 break;
4974 }
4975 NEXT;
4976 if (ctxt->atom == NULL) {
4977 ctxt->atom = xmlRegNewAtom(ctxt, type);
4978 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4979 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4980 type, 0, 0, NULL);
4981 }
Daniel Veillardcb4284e2007-04-25 13:55:20 +00004982 } else {
4983 ERROR("Wrong escape sequence, misuse of character '\\'");
Daniel Veillard4255d502002-04-16 15:50:10 +00004984 }
4985}
4986
4987/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004988 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00004989 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004990 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004991 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
Daniel Veillard4255d502002-04-16 15:50:10 +00004992 * [18] seRange ::= charOrEsc '-' charOrEsc
4993 * [20] charOrEsc ::= XmlChar | SingleCharEsc
4994 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
4995 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
4996 */
4997static void
4998xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
William M. Brackdc99df92003-12-27 01:54:25 +00004999 int cur, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00005000 int start = -1;
5001 int end = -1;
5002
Daniel Veillard777737e2006-10-17 21:23:17 +00005003 if (CUR == '\0') {
5004 ERROR("Expecting ']'");
5005 return;
5006 }
5007
Daniel Veillard4255d502002-04-16 15:50:10 +00005008 cur = CUR;
5009 if (cur == '\\') {
5010 NEXT;
5011 cur = CUR;
5012 switch (cur) {
5013 case 'n': start = 0xA; break;
5014 case 'r': start = 0xD; break;
5015 case 't': start = 0x9; break;
5016 case '\\': case '|': case '.': case '-': case '^': case '?':
5017 case '*': case '+': case '{': case '}': case '(': case ')':
5018 case '[': case ']':
5019 start = cur; break;
5020 default:
5021 ERROR("Invalid escape value");
5022 return;
5023 }
5024 end = start;
William M. Brackdc99df92003-12-27 01:54:25 +00005025 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00005026 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00005027 end = start = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005028 } else {
5029 ERROR("Expecting a char range");
5030 return;
5031 }
William M. Bracka9cbf282007-03-21 13:16:33 +00005032 /*
5033 * Since we are "inside" a range, we can assume ctxt->cur is past
5034 * the start of ctxt->string, and PREV should be safe
5035 */
5036 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5037 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005038 return;
5039 }
William M. Bracka9cbf282007-03-21 13:16:33 +00005040 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005041 cur = CUR;
William M. Brack10f1ef42004-03-20 14:51:25 +00005042 if ((cur != '-') || (NXT(1) == ']')) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005043 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5044 XML_REGEXP_CHARVAL, start, end, NULL);
5045 return;
5046 }
5047 NEXT;
5048 cur = CUR;
5049 if (cur == '\\') {
5050 NEXT;
5051 cur = CUR;
5052 switch (cur) {
5053 case 'n': end = 0xA; break;
5054 case 'r': end = 0xD; break;
5055 case 't': end = 0x9; break;
5056 case '\\': case '|': case '.': case '-': case '^': case '?':
5057 case '*': case '+': case '{': case '}': case '(': case ')':
5058 case '[': case ']':
5059 end = cur; break;
5060 default:
5061 ERROR("Invalid escape value");
5062 return;
5063 }
William M. Brackdc99df92003-12-27 01:54:25 +00005064 len = 1;
David Kilzerfb56f802017-07-04 18:38:03 +02005065 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00005066 end = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005067 } else {
5068 ERROR("Expecting the end of a char range");
5069 return;
5070 }
Pranjal Jumdecbb27162016-03-07 06:34:26 -08005071
Daniel Veillard4255d502002-04-16 15:50:10 +00005072 /* TODO check that the values are acceptable character ranges for XML */
5073 if (end < start) {
5074 ERROR("End of range is before start of range");
5075 } else {
Pranjal Jumdecbb27162016-03-07 06:34:26 -08005076 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005077 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5078 XML_REGEXP_CHARVAL, start, end, NULL);
5079 }
5080 return;
5081}
5082
5083/**
5084 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00005085 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005086 *
5087 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5088 */
5089static void
5090xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5091 do {
Daniel Veillard041b6872008-02-08 10:37:18 +00005092 if (CUR == '\\') {
Daniel Veillard4255d502002-04-16 15:50:10 +00005093 xmlFAParseCharClassEsc(ctxt);
5094 } else {
5095 xmlFAParseCharRange(ctxt);
5096 }
5097 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
Daniel Veillard777737e2006-10-17 21:23:17 +00005098 (CUR != 0) && (ctxt->error == 0));
Daniel Veillard4255d502002-04-16 15:50:10 +00005099}
5100
5101/**
5102 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00005103 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005104 *
5105 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5106 * [15] negCharGroup ::= '^' posCharGroup
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005107 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
Daniel Veillard4255d502002-04-16 15:50:10 +00005108 * [12] charClassExpr ::= '[' charGroup ']'
5109 */
5110static void
5111xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5112 int n = ctxt->neg;
5113 while ((CUR != ']') && (ctxt->error == 0)) {
5114 if (CUR == '^') {
5115 int neg = ctxt->neg;
5116
5117 NEXT;
5118 ctxt->neg = !ctxt->neg;
5119 xmlFAParsePosCharGroup(ctxt);
5120 ctxt->neg = neg;
William M. Brack10f1ef42004-03-20 14:51:25 +00005121 } else if ((CUR == '-') && (NXT(1) == '[')) {
Daniel Veillardf8b9de32003-11-24 14:27:26 +00005122 int neg = ctxt->neg;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00005123 ctxt->neg = 2;
William M. Brack10f1ef42004-03-20 14:51:25 +00005124 NEXT; /* eat the '-' */
5125 NEXT; /* eat the '[' */
Daniel Veillard4255d502002-04-16 15:50:10 +00005126 xmlFAParseCharGroup(ctxt);
5127 if (CUR == ']') {
5128 NEXT;
5129 } else {
5130 ERROR("charClassExpr: ']' expected");
5131 break;
5132 }
Daniel Veillardf8b9de32003-11-24 14:27:26 +00005133 ctxt->neg = neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00005134 break;
5135 } else if (CUR != ']') {
5136 xmlFAParsePosCharGroup(ctxt);
5137 }
5138 }
5139 ctxt->neg = n;
5140}
5141
5142/**
5143 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00005144 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005145 *
5146 * [11] charClass ::= charClassEsc | charClassExpr
5147 * [12] charClassExpr ::= '[' charGroup ']'
5148 */
5149static void
5150xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5151 if (CUR == '[') {
5152 NEXT;
5153 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5154 if (ctxt->atom == NULL)
5155 return;
5156 xmlFAParseCharGroup(ctxt);
5157 if (CUR == ']') {
5158 NEXT;
5159 } else {
5160 ERROR("xmlFAParseCharClass: ']' expected");
5161 }
5162 } else {
5163 xmlFAParseCharClassEsc(ctxt);
5164 }
5165}
5166
5167/**
5168 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00005169 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005170 *
5171 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005172 *
5173 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00005174 */
5175static int
5176xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5177 int ret = 0;
5178 int ok = 0;
5179
5180 while ((CUR >= '0') && (CUR <= '9')) {
5181 ret = ret * 10 + (CUR - '0');
5182 ok = 1;
5183 NEXT;
5184 }
5185 if (ok != 1) {
5186 return(-1);
5187 }
5188 return(ret);
5189}
5190
5191/**
5192 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00005193 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005194 *
5195 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5196 * [5] quantity ::= quantRange | quantMin | QuantExact
5197 * [6] quantRange ::= QuantExact ',' QuantExact
5198 * [7] quantMin ::= QuantExact ','
5199 * [8] QuantExact ::= [0-9]+
5200 */
5201static int
5202xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5203 int cur;
5204
5205 cur = CUR;
5206 if ((cur == '?') || (cur == '*') || (cur == '+')) {
5207 if (ctxt->atom != NULL) {
5208 if (cur == '?')
5209 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5210 else if (cur == '*')
5211 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5212 else if (cur == '+')
5213 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5214 }
5215 NEXT;
5216 return(1);
5217 }
5218 if (cur == '{') {
5219 int min = 0, max = 0;
5220
5221 NEXT;
5222 cur = xmlFAParseQuantExact(ctxt);
5223 if (cur >= 0)
5224 min = cur;
5225 if (CUR == ',') {
5226 NEXT;
Daniel Veillardebe48c62003-12-03 12:12:27 +00005227 if (CUR == '}')
5228 max = INT_MAX;
5229 else {
5230 cur = xmlFAParseQuantExact(ctxt);
5231 if (cur >= 0)
5232 max = cur;
5233 else {
5234 ERROR("Improper quantifier");
5235 }
5236 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005237 }
5238 if (CUR == '}') {
5239 NEXT;
5240 } else {
5241 ERROR("Unterminated quantifier");
5242 }
5243 if (max == 0)
5244 max = min;
5245 if (ctxt->atom != NULL) {
5246 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5247 ctxt->atom->min = min;
5248 ctxt->atom->max = max;
5249 }
5250 return(1);
5251 }
5252 return(0);
5253}
5254
5255/**
5256 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00005257 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005258 *
5259 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5260 */
5261static int
5262xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5263 int codepoint, len;
5264
5265 codepoint = xmlFAIsChar(ctxt);
5266 if (codepoint > 0) {
5267 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5268 if (ctxt->atom == NULL)
5269 return(-1);
5270 codepoint = CUR_SCHAR(ctxt->cur, len);
5271 ctxt->atom->codepoint = codepoint;
5272 NEXTL(len);
5273 return(1);
5274 } else if (CUR == '|') {
5275 return(0);
5276 } else if (CUR == 0) {
5277 return(0);
5278 } else if (CUR == ')') {
5279 return(0);
5280 } else if (CUR == '(') {
Daniel Veillard76d59b62007-08-22 16:29:21 +00005281 xmlRegStatePtr start, oldend, start0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005282
5283 NEXT;
Daniel Veillard76d59b62007-08-22 16:29:21 +00005284 /*
5285 * this extra Epsilon transition is needed if we count with 0 allowed
5286 * unfortunately this can't be known at that point
5287 */
5288 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5289 start0 = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00005290 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5291 start = ctxt->state;
5292 oldend = ctxt->end;
5293 ctxt->end = NULL;
5294 ctxt->atom = NULL;
5295 xmlFAParseRegExp(ctxt, 0);
5296 if (CUR == ')') {
5297 NEXT;
5298 } else {
5299 ERROR("xmlFAParseAtom: expecting ')'");
5300 }
5301 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5302 if (ctxt->atom == NULL)
5303 return(-1);
5304 ctxt->atom->start = start;
Daniel Veillard76d59b62007-08-22 16:29:21 +00005305 ctxt->atom->start0 = start0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005306 ctxt->atom->stop = ctxt->state;
5307 ctxt->end = oldend;
5308 return(1);
5309 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5310 xmlFAParseCharClass(ctxt);
5311 return(1);
5312 }
5313 return(0);
5314}
5315
5316/**
5317 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00005318 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005319 *
5320 * [3] piece ::= atom quantifier?
5321 */
5322static int
5323xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5324 int ret;
5325
5326 ctxt->atom = NULL;
5327 ret = xmlFAParseAtom(ctxt);
5328 if (ret == 0)
5329 return(0);
5330 if (ctxt->atom == NULL) {
5331 ERROR("internal: no atom generated");
5332 }
5333 xmlFAParseQuantifier(ctxt);
5334 return(1);
5335}
5336
5337/**
5338 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00005339 * @ctxt: a regexp parser context
Daniel Veillard54eb0242006-03-21 23:17:57 +00005340 * @to: optional target to the end of the branch
5341 *
5342 * @to is used to optimize by removing duplicate path in automata
5343 * in expressions like (a|b)(c|d)
Daniel Veillard4255d502002-04-16 15:50:10 +00005344 *
5345 * [2] branch ::= piece*
5346 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005347static int
Daniel Veillard54eb0242006-03-21 23:17:57 +00005348xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005349 xmlRegStatePtr previous;
Daniel Veillard4255d502002-04-16 15:50:10 +00005350 int ret;
5351
5352 previous = ctxt->state;
5353 ret = xmlFAParsePiece(ctxt);
5354 if (ret != 0) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005355 if (xmlFAGenerateTransitions(ctxt, previous,
Daniel Veillard54eb0242006-03-21 23:17:57 +00005356 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005357 return(-1);
5358 previous = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00005359 ctxt->atom = NULL;
5360 }
5361 while ((ret != 0) && (ctxt->error == 0)) {
5362 ret = xmlFAParsePiece(ctxt);
5363 if (ret != 0) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005364 if (xmlFAGenerateTransitions(ctxt, previous,
Daniel Veillard54eb0242006-03-21 23:17:57 +00005365 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005366 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00005367 previous = ctxt->state;
5368 ctxt->atom = NULL;
5369 }
5370 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005371 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00005372}
5373
5374/**
5375 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00005376 * @ctxt: a regexp parser context
William M. Brackddf71d62004-05-06 04:17:26 +00005377 * @top: is this the top-level expression ?
Daniel Veillard4255d502002-04-16 15:50:10 +00005378 *
5379 * [1] regExp ::= branch ( '|' branch )*
5380 */
5381static void
5382xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
Daniel Veillardc7e3cc42004-09-28 12:33:52 +00005383 xmlRegStatePtr start, end;
Daniel Veillard4255d502002-04-16 15:50:10 +00005384
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005385 /* if not top start should have been generated by an epsilon trans */
Daniel Veillard4255d502002-04-16 15:50:10 +00005386 start = ctxt->state;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005387 ctxt->end = NULL;
Daniel Veillard54eb0242006-03-21 23:17:57 +00005388 xmlFAParseBranch(ctxt, NULL);
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005389 if (top) {
5390#ifdef DEBUG_REGEXP_GRAPH
5391 printf("State %d is final\n", ctxt->state->no);
5392#endif
5393 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5394 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005395 if (CUR != '|') {
5396 ctxt->end = ctxt->state;
5397 return;
5398 }
5399 end = ctxt->state;
5400 while ((CUR == '|') && (ctxt->error == 0)) {
5401 NEXT;
Daniel Veillard40851d02012-08-17 20:34:05 +08005402 if (CUR == 0) {
5403 ERROR("expecting a branch after |")
5404 return;
5405 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005406 ctxt->state = start;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005407 ctxt->end = NULL;
Daniel Veillard54eb0242006-03-21 23:17:57 +00005408 xmlFAParseBranch(ctxt, end);
Daniel Veillard4255d502002-04-16 15:50:10 +00005409 }
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005410 if (!top) {
5411 ctxt->state = end;
5412 ctxt->end = end;
5413 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005414}
5415
5416/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005417 * *
5418 * The basic API *
5419 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00005420 ************************************************************************/
5421
5422/**
5423 * xmlRegexpPrint:
5424 * @output: the file for the output debug
5425 * @regexp: the compiled regexp
5426 *
5427 * Print the content of the compiled regular expression
5428 */
5429void
5430xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5431 int i;
5432
Daniel Veillarda82b1822004-11-08 16:24:57 +00005433 if (output == NULL)
5434 return;
Daniel Veillard4255d502002-04-16 15:50:10 +00005435 fprintf(output, " regexp: ");
5436 if (regexp == NULL) {
5437 fprintf(output, "NULL\n");
5438 return;
5439 }
5440 fprintf(output, "'%s' ", regexp->string);
5441 fprintf(output, "\n");
5442 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5443 for (i = 0;i < regexp->nbAtoms; i++) {
5444 fprintf(output, " %02d ", i);
5445 xmlRegPrintAtom(output, regexp->atoms[i]);
5446 }
5447 fprintf(output, "%d states:", regexp->nbStates);
5448 fprintf(output, "\n");
5449 for (i = 0;i < regexp->nbStates; i++) {
5450 xmlRegPrintState(output, regexp->states[i]);
5451 }
5452 fprintf(output, "%d counters:\n", regexp->nbCounters);
5453 for (i = 0;i < regexp->nbCounters; i++) {
5454 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5455 regexp->counters[i].max);
5456 }
5457}
5458
5459/**
5460 * xmlRegexpCompile:
5461 * @regexp: a regular expression string
5462 *
5463 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
William M. Brackddf71d62004-05-06 04:17:26 +00005464 * Appendix F and builds an automata suitable for testing strings against
Daniel Veillard4255d502002-04-16 15:50:10 +00005465 * that regular expression
5466 *
5467 * Returns the compiled expression or NULL in case of error
5468 */
5469xmlRegexpPtr
5470xmlRegexpCompile(const xmlChar *regexp) {
5471 xmlRegexpPtr ret;
5472 xmlRegParserCtxtPtr ctxt;
5473
5474 ctxt = xmlRegNewParserCtxt(regexp);
5475 if (ctxt == NULL)
5476 return(NULL);
5477
5478 /* initialize the parser */
5479 ctxt->end = NULL;
5480 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5481 xmlRegStatePush(ctxt, ctxt->start);
5482
5483 /* parse the expression building an automata */
5484 xmlFAParseRegExp(ctxt, 1);
5485 if (CUR != 0) {
5486 ERROR("xmlFAParseRegExp: extra characters");
5487 }
Daniel Veillardcb4284e2007-04-25 13:55:20 +00005488 if (ctxt->error != 0) {
5489 xmlRegFreeParserCtxt(ctxt);
5490 return(NULL);
5491 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005492 ctxt->end = ctxt->state;
5493 ctxt->start->type = XML_REGEXP_START_STATE;
5494 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5495
5496 /* remove the Epsilon except for counted transitions */
5497 xmlFAEliminateEpsilonTransitions(ctxt);
5498
5499
5500 if (ctxt->error != 0) {
5501 xmlRegFreeParserCtxt(ctxt);
5502 return(NULL);
5503 }
5504 ret = xmlRegEpxFromParse(ctxt);
5505 xmlRegFreeParserCtxt(ctxt);
5506 return(ret);
5507}
5508
5509/**
5510 * xmlRegexpExec:
5511 * @comp: the compiled regular expression
5512 * @content: the value to check against the regular expression
5513 *
William M. Brackddf71d62004-05-06 04:17:26 +00005514 * Check if the regular expression generates the value
Daniel Veillard4255d502002-04-16 15:50:10 +00005515 *
William M. Brackddf71d62004-05-06 04:17:26 +00005516 * Returns 1 if it matches, 0 if not and a negative value in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00005517 */
5518int
5519xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5520 if ((comp == NULL) || (content == NULL))
5521 return(-1);
5522 return(xmlFARegExec(comp, content));
5523}
5524
5525/**
Daniel Veillard23e73572002-09-19 19:56:43 +00005526 * xmlRegexpIsDeterminist:
5527 * @comp: the compiled regular expression
5528 *
5529 * Check if the regular expression is determinist
5530 *
William M. Brackddf71d62004-05-06 04:17:26 +00005531 * Returns 1 if it yes, 0 if not and a negative value in case of error
Daniel Veillard23e73572002-09-19 19:56:43 +00005532 */
5533int
5534xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5535 xmlAutomataPtr am;
5536 int ret;
5537
5538 if (comp == NULL)
5539 return(-1);
5540 if (comp->determinist != -1)
5541 return(comp->determinist);
5542
5543 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00005544 if (am->states != NULL) {
5545 int i;
5546
5547 for (i = 0;i < am->nbStates;i++)
5548 xmlRegFreeState(am->states[i]);
5549 xmlFree(am->states);
5550 }
Daniel Veillard23e73572002-09-19 19:56:43 +00005551 am->nbAtoms = comp->nbAtoms;
5552 am->atoms = comp->atoms;
5553 am->nbStates = comp->nbStates;
5554 am->states = comp->states;
5555 am->determinist = -1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005556 am->flags = comp->flags;
Daniel Veillard23e73572002-09-19 19:56:43 +00005557 ret = xmlFAComputesDeterminism(am);
5558 am->atoms = NULL;
5559 am->states = NULL;
5560 xmlFreeAutomata(am);
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005561 comp->determinist = ret;
Daniel Veillard23e73572002-09-19 19:56:43 +00005562 return(ret);
5563}
5564
5565/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005566 * xmlRegFreeRegexp:
5567 * @regexp: the regexp
5568 *
5569 * Free a regexp
5570 */
5571void
5572xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5573 int i;
5574 if (regexp == NULL)
5575 return;
5576
5577 if (regexp->string != NULL)
5578 xmlFree(regexp->string);
5579 if (regexp->states != NULL) {
5580 for (i = 0;i < regexp->nbStates;i++)
5581 xmlRegFreeState(regexp->states[i]);
5582 xmlFree(regexp->states);
5583 }
5584 if (regexp->atoms != NULL) {
5585 for (i = 0;i < regexp->nbAtoms;i++)
5586 xmlRegFreeAtom(regexp->atoms[i]);
5587 xmlFree(regexp->atoms);
5588 }
5589 if (regexp->counters != NULL)
5590 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00005591 if (regexp->compact != NULL)
5592 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00005593 if (regexp->transdata != NULL)
5594 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00005595 if (regexp->stringMap != NULL) {
5596 for (i = 0; i < regexp->nbstrings;i++)
5597 xmlFree(regexp->stringMap[i]);
5598 xmlFree(regexp->stringMap);
5599 }
5600
Daniel Veillard4255d502002-04-16 15:50:10 +00005601 xmlFree(regexp);
5602}
5603
5604#ifdef LIBXML_AUTOMATA_ENABLED
5605/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005606 * *
5607 * The Automata interface *
5608 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00005609 ************************************************************************/
5610
5611/**
5612 * xmlNewAutomata:
5613 *
5614 * Create a new automata
5615 *
5616 * Returns the new object or NULL in case of failure
5617 */
5618xmlAutomataPtr
5619xmlNewAutomata(void) {
5620 xmlAutomataPtr ctxt;
5621
5622 ctxt = xmlRegNewParserCtxt(NULL);
5623 if (ctxt == NULL)
5624 return(NULL);
5625
5626 /* initialize the parser */
5627 ctxt->end = NULL;
5628 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005629 if (ctxt->start == NULL) {
5630 xmlFreeAutomata(ctxt);
5631 return(NULL);
5632 }
Daniel Veillardd0271472006-01-02 10:22:02 +00005633 ctxt->start->type = XML_REGEXP_START_STATE;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005634 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5635 xmlRegFreeState(ctxt->start);
5636 xmlFreeAutomata(ctxt);
5637 return(NULL);
5638 }
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005639 ctxt->flags = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005640
5641 return(ctxt);
5642}
5643
5644/**
5645 * xmlFreeAutomata:
5646 * @am: an automata
5647 *
5648 * Free an automata
5649 */
5650void
5651xmlFreeAutomata(xmlAutomataPtr am) {
5652 if (am == NULL)
5653 return;
5654 xmlRegFreeParserCtxt(am);
5655}
5656
5657/**
Daniel Veillard29341682009-09-10 18:23:39 +02005658 * xmlAutomataSetFlags:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005659 * @am: an automata
5660 * @flags: a set of internal flags
5661 *
5662 * Set some flags on the automata
5663 */
5664void
5665xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5666 if (am == NULL)
5667 return;
5668 am->flags |= flags;
5669}
5670
5671/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005672 * xmlAutomataGetInitState:
5673 * @am: an automata
5674 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005675 * Initial state lookup
5676 *
Daniel Veillard4255d502002-04-16 15:50:10 +00005677 * Returns the initial state of the automata
5678 */
5679xmlAutomataStatePtr
5680xmlAutomataGetInitState(xmlAutomataPtr am) {
5681 if (am == NULL)
5682 return(NULL);
5683 return(am->start);
5684}
5685
5686/**
5687 * xmlAutomataSetFinalState:
5688 * @am: an automata
5689 * @state: a state in this automata
5690 *
5691 * Makes that state a final state
5692 *
5693 * Returns 0 or -1 in case of error
5694 */
5695int
5696xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5697 if ((am == NULL) || (state == NULL))
5698 return(-1);
5699 state->type = XML_REGEXP_FINAL_STATE;
5700 return(0);
5701}
5702
5703/**
5704 * xmlAutomataNewTransition:
5705 * @am: an automata
5706 * @from: the starting point of the transition
5707 * @to: the target point of the transition or NULL
5708 * @token: the input string associated to that transition
5709 * @data: data passed to the callback function if the transition is activated
5710 *
William M. Brackddf71d62004-05-06 04:17:26 +00005711 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00005712 * and then adds a transition from the @from state to the target state
5713 * activated by the value of @token
5714 *
5715 * Returns the target state or NULL in case of error
5716 */
5717xmlAutomataStatePtr
5718xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5719 xmlAutomataStatePtr to, const xmlChar *token,
5720 void *data) {
5721 xmlRegAtomPtr atom;
5722
5723 if ((am == NULL) || (from == NULL) || (token == NULL))
5724 return(NULL);
5725 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005726 if (atom == NULL)
5727 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00005728 atom->data = data;
Daniel Veillard4255d502002-04-16 15:50:10 +00005729 atom->valuep = xmlStrdup(token);
5730
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005731 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5732 xmlRegFreeAtom(atom);
5733 return(NULL);
5734 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005735 if (to == NULL)
5736 return(am->state);
5737 return(to);
5738}
5739
5740/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00005741 * xmlAutomataNewTransition2:
5742 * @am: an automata
5743 * @from: the starting point of the transition
5744 * @to: the target point of the transition or NULL
5745 * @token: the first input string associated to that transition
5746 * @token2: the second input string associated to that transition
5747 * @data: data passed to the callback function if the transition is activated
5748 *
William M. Brackddf71d62004-05-06 04:17:26 +00005749 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard52b48c72003-04-13 19:53:42 +00005750 * and then adds a transition from the @from state to the target state
5751 * activated by the value of @token
5752 *
5753 * Returns the target state or NULL in case of error
5754 */
5755xmlAutomataStatePtr
5756xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5757 xmlAutomataStatePtr to, const xmlChar *token,
5758 const xmlChar *token2, void *data) {
5759 xmlRegAtomPtr atom;
5760
5761 if ((am == NULL) || (from == NULL) || (token == NULL))
5762 return(NULL);
5763 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005764 if (atom == NULL)
5765 return(NULL);
Daniel Veillard11ce4002006-03-10 00:36:23 +00005766 atom->data = data;
Daniel Veillard52b48c72003-04-13 19:53:42 +00005767 if ((token2 == NULL) || (*token2 == 0)) {
5768 atom->valuep = xmlStrdup(token);
5769 } else {
5770 int lenn, lenp;
5771 xmlChar *str;
5772
5773 lenn = strlen((char *) token2);
5774 lenp = strlen((char *) token);
5775
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005776 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005777 if (str == NULL) {
5778 xmlRegFreeAtom(atom);
5779 return(NULL);
5780 }
5781 memcpy(&str[0], token, lenp);
5782 str[lenp] = '|';
5783 memcpy(&str[lenp + 1], token2, lenn);
5784 str[lenn + lenp + 1] = 0;
5785
5786 atom->valuep = str;
5787 }
5788
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005789 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5790 xmlRegFreeAtom(atom);
5791 return(NULL);
5792 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00005793 if (to == NULL)
5794 return(am->state);
5795 return(to);
5796}
5797
5798/**
Daniel Veillard9efc4762005-07-19 14:33:55 +00005799 * xmlAutomataNewNegTrans:
5800 * @am: an automata
5801 * @from: the starting point of the transition
5802 * @to: the target point of the transition or NULL
5803 * @token: the first input string associated to that transition
5804 * @token2: the second input string associated to that transition
5805 * @data: data passed to the callback function if the transition is activated
5806 *
5807 * If @to is NULL, this creates first a new target state in the automata
5808 * and then adds a transition from the @from state to the target state
5809 * activated by any value except (@token,@token2)
Daniel Veillard6e65e152005-08-09 11:09:52 +00005810 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5811 # the semantic of XSD ##other
Daniel Veillard9efc4762005-07-19 14:33:55 +00005812 *
5813 * Returns the target state or NULL in case of error
5814 */
5815xmlAutomataStatePtr
5816xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5817 xmlAutomataStatePtr to, const xmlChar *token,
5818 const xmlChar *token2, void *data) {
5819 xmlRegAtomPtr atom;
Daniel Veillard77005e62005-07-19 16:26:18 +00005820 xmlChar err_msg[200];
Daniel Veillard9efc4762005-07-19 14:33:55 +00005821
5822 if ((am == NULL) || (from == NULL) || (token == NULL))
5823 return(NULL);
5824 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5825 if (atom == NULL)
5826 return(NULL);
5827 atom->data = data;
5828 atom->neg = 1;
5829 if ((token2 == NULL) || (*token2 == 0)) {
5830 atom->valuep = xmlStrdup(token);
5831 } else {
5832 int lenn, lenp;
5833 xmlChar *str;
5834
5835 lenn = strlen((char *) token2);
5836 lenp = strlen((char *) token);
5837
5838 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5839 if (str == NULL) {
5840 xmlRegFreeAtom(atom);
5841 return(NULL);
5842 }
5843 memcpy(&str[0], token, lenp);
5844 str[lenp] = '|';
5845 memcpy(&str[lenp + 1], token2, lenn);
5846 str[lenn + lenp + 1] = 0;
5847
5848 atom->valuep = str;
5849 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00005850 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +00005851 err_msg[199] = 0;
5852 atom->valuep2 = xmlStrdup(err_msg);
Daniel Veillard9efc4762005-07-19 14:33:55 +00005853
5854 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5855 xmlRegFreeAtom(atom);
5856 return(NULL);
5857 }
Daniel Veillard6e65e152005-08-09 11:09:52 +00005858 am->negs++;
Daniel Veillard9efc4762005-07-19 14:33:55 +00005859 if (to == NULL)
5860 return(am->state);
5861 return(to);
5862}
5863
5864/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005865 * xmlAutomataNewCountTrans2:
5866 * @am: an automata
5867 * @from: the starting point of the transition
5868 * @to: the target point of the transition or NULL
5869 * @token: the input string associated to that transition
5870 * @token2: the second input string associated to that transition
5871 * @min: the minimum successive occurences of token
5872 * @max: the maximum successive occurences of token
5873 * @data: data associated to the transition
5874 *
5875 * If @to is NULL, this creates first a new target state in the automata
5876 * and then adds a transition from the @from state to the target state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005877 * activated by a succession of input of value @token and @token2 and
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005878 * whose number is between @min and @max
5879 *
5880 * Returns the target state or NULL in case of error
5881 */
5882xmlAutomataStatePtr
5883xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5884 xmlAutomataStatePtr to, const xmlChar *token,
5885 const xmlChar *token2,
5886 int min, int max, void *data) {
5887 xmlRegAtomPtr atom;
5888 int counter;
5889
5890 if ((am == NULL) || (from == NULL) || (token == NULL))
5891 return(NULL);
5892 if (min < 0)
5893 return(NULL);
5894 if ((max < min) || (max < 1))
5895 return(NULL);
5896 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5897 if (atom == NULL)
5898 return(NULL);
5899 if ((token2 == NULL) || (*token2 == 0)) {
5900 atom->valuep = xmlStrdup(token);
5901 } else {
5902 int lenn, lenp;
5903 xmlChar *str;
5904
5905 lenn = strlen((char *) token2);
5906 lenp = strlen((char *) token);
5907
5908 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5909 if (str == NULL) {
5910 xmlRegFreeAtom(atom);
5911 return(NULL);
5912 }
5913 memcpy(&str[0], token, lenp);
5914 str[lenp] = '|';
5915 memcpy(&str[lenp + 1], token2, lenn);
5916 str[lenn + lenp + 1] = 0;
5917
5918 atom->valuep = str;
5919 }
5920 atom->data = data;
5921 if (min == 0)
5922 atom->min = 1;
5923 else
5924 atom->min = min;
5925 atom->max = max;
5926
5927 /*
5928 * associate a counter to the transition.
5929 */
5930 counter = xmlRegGetCounter(am);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01005931 if (counter < 0)
5932 goto error;
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005933 am->counters[counter].min = min;
5934 am->counters[counter].max = max;
5935
5936 /* xmlFAGenerateTransitions(am, from, to, atom); */
5937 if (to == NULL) {
5938 to = xmlRegNewState(am);
5939 xmlRegStatePush(am, to);
5940 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005941 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005942 xmlRegAtomPush(am, atom);
5943 am->state = to;
5944
5945 if (to == NULL)
5946 to = am->state;
5947 if (to == NULL)
5948 return(NULL);
5949 if (min == 0)
5950 xmlFAGenerateEpsilonTransition(am, from, to);
5951 return(to);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01005952
5953error:
5954 xmlRegFreeAtom(atom);
5955 return(NULL);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005956}
5957
5958/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005959 * xmlAutomataNewCountTrans:
5960 * @am: an automata
5961 * @from: the starting point of the transition
5962 * @to: the target point of the transition or NULL
5963 * @token: the input string associated to that transition
5964 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005965 * @max: the maximum successive occurences of token
5966 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00005967 *
William M. Brackddf71d62004-05-06 04:17:26 +00005968 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00005969 * and then adds a transition from the @from state to the target state
5970 * activated by a succession of input of value @token and whose number
5971 * is between @min and @max
5972 *
5973 * Returns the target state or NULL in case of error
5974 */
5975xmlAutomataStatePtr
5976xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5977 xmlAutomataStatePtr to, const xmlChar *token,
5978 int min, int max, void *data) {
5979 xmlRegAtomPtr atom;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00005980 int counter;
Daniel Veillard4255d502002-04-16 15:50:10 +00005981
5982 if ((am == NULL) || (from == NULL) || (token == NULL))
5983 return(NULL);
5984 if (min < 0)
5985 return(NULL);
5986 if ((max < min) || (max < 1))
5987 return(NULL);
5988 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5989 if (atom == NULL)
5990 return(NULL);
5991 atom->valuep = xmlStrdup(token);
5992 atom->data = data;
5993 if (min == 0)
5994 atom->min = 1;
5995 else
5996 atom->min = min;
5997 atom->max = max;
5998
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00005999 /*
6000 * associate a counter to the transition.
6001 */
6002 counter = xmlRegGetCounter(am);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01006003 if (counter < 0)
6004 goto error;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006005 am->counters[counter].min = min;
6006 am->counters[counter].max = max;
6007
6008 /* xmlFAGenerateTransitions(am, from, to, atom); */
6009 if (to == NULL) {
6010 to = xmlRegNewState(am);
6011 xmlRegStatePush(am, to);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00006012 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006013 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006014 xmlRegAtomPush(am, atom);
6015 am->state = to;
6016
Daniel Veillard4255d502002-04-16 15:50:10 +00006017 if (to == NULL)
6018 to = am->state;
6019 if (to == NULL)
6020 return(NULL);
6021 if (min == 0)
6022 xmlFAGenerateEpsilonTransition(am, from, to);
6023 return(to);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01006024
6025error:
6026 xmlRegFreeAtom(atom);
6027 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00006028}
6029
6030/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006031 * xmlAutomataNewOnceTrans2:
6032 * @am: an automata
6033 * @from: the starting point of the transition
6034 * @to: the target point of the transition or NULL
6035 * @token: the input string associated to that transition
6036 * @token2: the second input string associated to that transition
6037 * @min: the minimum successive occurences of token
6038 * @max: the maximum successive occurences of token
6039 * @data: data associated to the transition
6040 *
6041 * If @to is NULL, this creates first a new target state in the automata
6042 * and then adds a transition from the @from state to the target state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006043 * activated by a succession of input of value @token and @token2 and whose
6044 * number is between @min and @max, moreover that transition can only be
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006045 * crossed once.
6046 *
6047 * Returns the target state or NULL in case of error
6048 */
6049xmlAutomataStatePtr
6050xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6051 xmlAutomataStatePtr to, const xmlChar *token,
6052 const xmlChar *token2,
6053 int min, int max, void *data) {
6054 xmlRegAtomPtr atom;
6055 int counter;
6056
6057 if ((am == NULL) || (from == NULL) || (token == NULL))
6058 return(NULL);
6059 if (min < 1)
6060 return(NULL);
6061 if ((max < min) || (max < 1))
6062 return(NULL);
6063 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6064 if (atom == NULL)
6065 return(NULL);
6066 if ((token2 == NULL) || (*token2 == 0)) {
6067 atom->valuep = xmlStrdup(token);
6068 } else {
6069 int lenn, lenp;
6070 xmlChar *str;
6071
6072 lenn = strlen((char *) token2);
6073 lenp = strlen((char *) token);
6074
6075 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6076 if (str == NULL) {
6077 xmlRegFreeAtom(atom);
6078 return(NULL);
6079 }
6080 memcpy(&str[0], token, lenp);
6081 str[lenp] = '|';
6082 memcpy(&str[lenp + 1], token2, lenn);
6083 str[lenn + lenp + 1] = 0;
6084
6085 atom->valuep = str;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006086 }
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006087 atom->data = data;
6088 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
Daniel Veillard11ce4002006-03-10 00:36:23 +00006089 atom->min = min;
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006090 atom->max = max;
6091 /*
6092 * associate a counter to the transition.
6093 */
6094 counter = xmlRegGetCounter(am);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01006095 if (counter < 0)
6096 goto error;
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006097 am->counters[counter].min = 1;
6098 am->counters[counter].max = 1;
6099
6100 /* xmlFAGenerateTransitions(am, from, to, atom); */
6101 if (to == NULL) {
6102 to = xmlRegNewState(am);
6103 xmlRegStatePush(am, to);
6104 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006105 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006106 xmlRegAtomPush(am, atom);
6107 am->state = to;
6108 return(to);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01006109
6110error:
6111 xmlRegFreeAtom(atom);
6112 return(NULL);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006113}
6114
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006115
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006116
6117/**
Daniel Veillard7646b182002-04-20 06:41:40 +00006118 * xmlAutomataNewOnceTrans:
6119 * @am: an automata
6120 * @from: the starting point of the transition
6121 * @to: the target point of the transition or NULL
6122 * @token: the input string associated to that transition
6123 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006124 * @max: the maximum successive occurences of token
6125 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00006126 *
William M. Brackddf71d62004-05-06 04:17:26 +00006127 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00006128 * and then adds a transition from the @from state to the target state
6129 * activated by a succession of input of value @token and whose number
William M. Brackddf71d62004-05-06 04:17:26 +00006130 * is between @min and @max, moreover that transition can only be crossed
Daniel Veillard7646b182002-04-20 06:41:40 +00006131 * once.
6132 *
6133 * Returns the target state or NULL in case of error
6134 */
6135xmlAutomataStatePtr
6136xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6137 xmlAutomataStatePtr to, const xmlChar *token,
6138 int min, int max, void *data) {
6139 xmlRegAtomPtr atom;
6140 int counter;
6141
6142 if ((am == NULL) || (from == NULL) || (token == NULL))
6143 return(NULL);
6144 if (min < 1)
6145 return(NULL);
6146 if ((max < min) || (max < 1))
6147 return(NULL);
6148 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6149 if (atom == NULL)
6150 return(NULL);
6151 atom->valuep = xmlStrdup(token);
6152 atom->data = data;
6153 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
Daniel Veillard11ce4002006-03-10 00:36:23 +00006154 atom->min = min;
Daniel Veillard7646b182002-04-20 06:41:40 +00006155 atom->max = max;
6156 /*
6157 * associate a counter to the transition.
6158 */
6159 counter = xmlRegGetCounter(am);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01006160 if (counter < 0)
6161 goto error;
Daniel Veillard7646b182002-04-20 06:41:40 +00006162 am->counters[counter].min = 1;
6163 am->counters[counter].max = 1;
6164
6165 /* xmlFAGenerateTransitions(am, from, to, atom); */
6166 if (to == NULL) {
6167 to = xmlRegNewState(am);
6168 xmlRegStatePush(am, to);
6169 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006170 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard7646b182002-04-20 06:41:40 +00006171 xmlRegAtomPush(am, atom);
6172 am->state = to;
Daniel Veillard7646b182002-04-20 06:41:40 +00006173 return(to);
Nick Wellnhoferdc858b32023-02-17 15:53:07 +01006174
6175error:
6176 xmlRegFreeAtom(atom);
6177 return(NULL);
Daniel Veillard7646b182002-04-20 06:41:40 +00006178}
6179
6180/**
Daniel Veillard4255d502002-04-16 15:50:10 +00006181 * xmlAutomataNewState:
6182 * @am: an automata
6183 *
6184 * Create a new disconnected state in the automata
6185 *
6186 * Returns the new state or NULL in case of error
6187 */
6188xmlAutomataStatePtr
6189xmlAutomataNewState(xmlAutomataPtr am) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006190 xmlAutomataStatePtr to;
Daniel Veillard4255d502002-04-16 15:50:10 +00006191
6192 if (am == NULL)
6193 return(NULL);
6194 to = xmlRegNewState(am);
6195 xmlRegStatePush(am, to);
6196 return(to);
6197}
6198
6199/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006200 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00006201 * @am: an automata
6202 * @from: the starting point of the transition
6203 * @to: the target point of the transition or NULL
6204 *
William M. Brackddf71d62004-05-06 04:17:26 +00006205 * If @to is NULL, this creates first a new target state in the automata
6206 * and then adds an epsilon transition from the @from state to the
Daniel Veillard4255d502002-04-16 15:50:10 +00006207 * target state
6208 *
6209 * Returns the target state or NULL in case of error
6210 */
6211xmlAutomataStatePtr
6212xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6213 xmlAutomataStatePtr to) {
6214 if ((am == NULL) || (from == NULL))
6215 return(NULL);
6216 xmlFAGenerateEpsilonTransition(am, from, to);
6217 if (to == NULL)
6218 return(am->state);
6219 return(to);
6220}
6221
Daniel Veillardb509f152002-04-17 16:28:10 +00006222/**
Daniel Veillard7646b182002-04-20 06:41:40 +00006223 * xmlAutomataNewAllTrans:
6224 * @am: an automata
6225 * @from: the starting point of the transition
6226 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006227 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00006228 *
William M. Brackddf71d62004-05-06 04:17:26 +00006229 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00006230 * and then adds a an ALL transition from the @from state to the
6231 * target state. That transition is an epsilon transition allowed only when
6232 * all transitions from the @from node have been activated.
6233 *
6234 * Returns the target state or NULL in case of error
6235 */
6236xmlAutomataStatePtr
6237xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00006238 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00006239 if ((am == NULL) || (from == NULL))
6240 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00006241 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00006242 if (to == NULL)
6243 return(am->state);
6244 return(to);
6245}
6246
6247/**
Daniel Veillardb509f152002-04-17 16:28:10 +00006248 * xmlAutomataNewCounter:
6249 * @am: an automata
6250 * @min: the minimal value on the counter
6251 * @max: the maximal value on the counter
6252 *
6253 * Create a new counter
6254 *
6255 * Returns the counter number or -1 in case of error
6256 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006257int
Daniel Veillardb509f152002-04-17 16:28:10 +00006258xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6259 int ret;
6260
6261 if (am == NULL)
6262 return(-1);
6263
6264 ret = xmlRegGetCounter(am);
6265 if (ret < 0)
6266 return(-1);
6267 am->counters[ret].min = min;
6268 am->counters[ret].max = max;
6269 return(ret);
6270}
6271
6272/**
6273 * xmlAutomataNewCountedTrans:
6274 * @am: an automata
6275 * @from: the starting point of the transition
6276 * @to: the target point of the transition or NULL
6277 * @counter: the counter associated to that transition
6278 *
William M. Brackddf71d62004-05-06 04:17:26 +00006279 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006280 * and then adds an epsilon transition from the @from state to the target state
6281 * which will increment the counter provided
6282 *
6283 * Returns the target state or NULL in case of error
6284 */
6285xmlAutomataStatePtr
6286xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6287 xmlAutomataStatePtr to, int counter) {
6288 if ((am == NULL) || (from == NULL) || (counter < 0))
6289 return(NULL);
6290 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6291 if (to == NULL)
6292 return(am->state);
6293 return(to);
6294}
6295
6296/**
6297 * xmlAutomataNewCounterTrans:
6298 * @am: an automata
6299 * @from: the starting point of the transition
6300 * @to: the target point of the transition or NULL
6301 * @counter: the counter associated to that transition
6302 *
William M. Brackddf71d62004-05-06 04:17:26 +00006303 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006304 * and then adds an epsilon transition from the @from state to the target state
6305 * which will be allowed only if the counter is within the right range.
6306 *
6307 * Returns the target state or NULL in case of error
6308 */
6309xmlAutomataStatePtr
6310xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6311 xmlAutomataStatePtr to, int counter) {
6312 if ((am == NULL) || (from == NULL) || (counter < 0))
6313 return(NULL);
6314 xmlFAGenerateCountedTransition(am, from, to, counter);
6315 if (to == NULL)
6316 return(am->state);
6317 return(to);
6318}
Daniel Veillard4255d502002-04-16 15:50:10 +00006319
6320/**
6321 * xmlAutomataCompile:
6322 * @am: an automata
6323 *
6324 * Compile the automata into a Reg Exp ready for being executed.
6325 * The automata should be free after this point.
6326 *
6327 * Returns the compiled regexp or NULL in case of error
6328 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006329xmlRegexpPtr
Daniel Veillard4255d502002-04-16 15:50:10 +00006330xmlAutomataCompile(xmlAutomataPtr am) {
6331 xmlRegexpPtr ret;
6332
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00006333 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00006334 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00006335 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00006336 ret = xmlRegEpxFromParse(am);
6337
6338 return(ret);
6339}
Daniel Veillarde19fc232002-04-22 16:01:24 +00006340
6341/**
6342 * xmlAutomataIsDeterminist:
6343 * @am: an automata
6344 *
6345 * Checks if an automata is determinist.
6346 *
6347 * Returns 1 if true, 0 if not, and -1 in case of error
6348 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006349int
Daniel Veillarde19fc232002-04-22 16:01:24 +00006350xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6351 int ret;
6352
6353 if (am == NULL)
6354 return(-1);
6355
6356 ret = xmlFAComputesDeterminism(am);
6357 return(ret);
6358}
Daniel Veillard4255d502002-04-16 15:50:10 +00006359#endif /* LIBXML_AUTOMATA_ENABLED */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006360
6361#ifdef LIBXML_EXPR_ENABLED
6362/************************************************************************
6363 * *
6364 * Formal Expression handling code *
6365 * *
6366 ************************************************************************/
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006367/************************************************************************
6368 * *
6369 * Expression handling context *
6370 * *
6371 ************************************************************************/
6372
6373struct _xmlExpCtxt {
6374 xmlDictPtr dict;
6375 xmlExpNodePtr *table;
6376 int size;
6377 int nbElems;
6378 int nb_nodes;
Daniel Veillard594e5df2009-09-07 14:58:47 +02006379 int maxNodes;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006380 const char *expr;
6381 const char *cur;
6382 int nb_cons;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006383 int tabSize;
6384};
6385
6386/**
6387 * xmlExpNewCtxt:
6388 * @maxNodes: the maximum number of nodes
Jan Pokornýbb654fe2016-04-13 16:56:07 +02006389 * @dict: optional dictionary to use internally
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006390 *
6391 * Creates a new context for manipulating expressions
6392 *
6393 * Returns the context or NULL in case of error
6394 */
6395xmlExpCtxtPtr
6396xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6397 xmlExpCtxtPtr ret;
6398 int size = 256;
6399
6400 if (maxNodes <= 4096)
6401 maxNodes = 4096;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006402
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006403 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6404 if (ret == NULL)
6405 return(NULL);
6406 memset(ret, 0, sizeof(xmlExpCtxt));
6407 ret->size = size;
6408 ret->nbElems = 0;
Daniel Veillard594e5df2009-09-07 14:58:47 +02006409 ret->maxNodes = maxNodes;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006410 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6411 if (ret->table == NULL) {
6412 xmlFree(ret);
6413 return(NULL);
6414 }
6415 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6416 if (dict == NULL) {
6417 ret->dict = xmlDictCreate();
6418 if (ret->dict == NULL) {
6419 xmlFree(ret->table);
6420 xmlFree(ret);
6421 return(NULL);
6422 }
6423 } else {
6424 ret->dict = dict;
6425 xmlDictReference(ret->dict);
6426 }
6427 return(ret);
6428}
6429
6430/**
6431 * xmlExpFreeCtxt:
6432 * @ctxt: an expression context
6433 *
6434 * Free an expression context
6435 */
6436void
6437xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6438 if (ctxt == NULL)
6439 return;
6440 xmlDictFree(ctxt->dict);
6441 if (ctxt->table != NULL)
6442 xmlFree(ctxt->table);
6443 xmlFree(ctxt);
6444}
6445
6446/************************************************************************
6447 * *
6448 * Structure associated to an expression node *
6449 * *
6450 ************************************************************************/
/*
 * Upper bound on the number of live nodes per context, enforced by
 * xmlExpNewNode(). NOTE(review): the maxNodes value stored in the
 * context is not what xmlExpNewNode() checks.
 */
#define MAX_NODES 10000
6452
6453/* #define DEBUG_DERIV */
6454
6455/*
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006456 * TODO:
Daniel Veillard465a0002005-08-22 12:07:04 +00006457 * - Wildcards
6458 * - public API for creation
6459 *
6460 * Started
6461 * - regression testing
6462 *
6463 * Done
6464 * - split into module and test tool
6465 * - memleaks
6466 */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006467
/* Flags OR-ed into the info field of an expression node */
typedef enum {
    XML_EXP_NILABLE = (1 << 0)  /* set on nodes flagged as nillable */
} xmlExpNodeInfo;

/* non-zero when the XML_EXP_NILABLE flag is set on @node */
#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6473
6474struct _xmlExpNode {
6475 unsigned char type;/* xmlExpNodeType */
6476 unsigned char info;/* OR of xmlExpNodeInfo */
6477 unsigned short key; /* the hash key */
6478 unsigned int ref; /* The number of references */
6479 int c_max; /* the maximum length it can consume */
6480 xmlExpNodePtr exp_left;
6481 xmlExpNodePtr next;/* the next node in the hash table or free list */
6482 union {
6483 struct {
6484 int f_min;
6485 int f_max;
6486 } count;
6487 struct {
6488 xmlExpNodePtr f_right;
6489 } children;
6490 const xmlChar *f_str;
6491 } field;
6492};
6493
6494#define exp_min field.count.f_min
6495#define exp_max field.count.f_max
6496/* #define exp_left field.children.f_left */
6497#define exp_right field.children.f_right
6498#define exp_str field.f_str
6499
6500static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6501static xmlExpNode forbiddenExpNode = {
6502 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6503};
6504xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6505static xmlExpNode emptyExpNode = {
6506 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6507};
6508xmlExpNodePtr emptyExp = &emptyExpNode;
6509
6510/************************************************************************
6511 * *
6512 * The custom hash table for unicity and canonicalization *
6513 * of sub-expressions pointers *
6514 * *
6515 ************************************************************************/
6516/*
6517 * xmlExpHashNameComputeKey:
6518 * Calculate the hash key for a token
6519 */
6520static unsigned short
6521xmlExpHashNameComputeKey(const xmlChar *name) {
6522 unsigned short value = 0L;
6523 char ch;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006524
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006525 if (name != NULL) {
6526 value += 30 * (*name);
6527 while ((ch = *name++) != 0) {
6528 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6529 }
6530 }
6531 return (value);
6532}
6533
6534/*
6535 * xmlExpHashComputeKey:
6536 * Calculate the hash key for a compound expression
6537 */
6538static unsigned short
6539xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6540 xmlExpNodePtr right) {
6541 unsigned long value;
6542 unsigned short ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006543
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006544 switch (type) {
6545 case XML_EXP_SEQ:
6546 value = left->key;
6547 value += right->key;
6548 value *= 3;
6549 ret = (unsigned short) value;
6550 break;
6551 case XML_EXP_OR:
6552 value = left->key;
6553 value += right->key;
6554 value *= 7;
6555 ret = (unsigned short) value;
6556 break;
6557 case XML_EXP_COUNT:
6558 value = left->key;
6559 value += right->key;
6560 ret = (unsigned short) value;
6561 break;
6562 default:
6563 ret = 0;
6564 }
6565 return(ret);
6566}
6567
6568
6569static xmlExpNodePtr
6570xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6571 xmlExpNodePtr ret;
6572
6573 if (ctxt->nb_nodes >= MAX_NODES)
6574 return(NULL);
6575 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6576 if (ret == NULL)
6577 return(NULL);
6578 memset(ret, 0, sizeof(xmlExpNode));
6579 ret->type = type;
6580 ret->next = NULL;
6581 ctxt->nb_nodes++;
6582 ctxt->nb_cons++;
6583 return(ret);
6584}
6585
6586/**
6587 * xmlExpHashGetEntry:
6588 * @table: the hash table
6589 *
6590 * Get the unique entry from the hash table. The entry is created if
6591 * needed. @left and @right are consumed, i.e. their ref count will
6592 * be decremented by the operation.
6593 *
6594 * Returns the pointer or NULL in case of error
6595 */
6596static xmlExpNodePtr
6597xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6598 xmlExpNodePtr left, xmlExpNodePtr right,
6599 const xmlChar *name, int min, int max) {
6600 unsigned short kbase, key;
6601 xmlExpNodePtr entry;
6602 xmlExpNodePtr insert;
6603
6604 if (ctxt == NULL)
6605 return(NULL);
6606
6607 /*
6608 * Check for duplicate and insertion location.
6609 */
6610 if (type == XML_EXP_ATOM) {
6611 kbase = xmlExpHashNameComputeKey(name);
6612 } else if (type == XML_EXP_COUNT) {
6613 /* COUNT reduction rule 1 */
6614 /* a{1} -> a */
6615 if (min == max) {
6616 if (min == 1) {
6617 return(left);
6618 }
6619 if (min == 0) {
6620 xmlExpFree(ctxt, left);
6621 return(emptyExp);
6622 }
6623 }
6624 if (min < 0) {
6625 xmlExpFree(ctxt, left);
6626 return(forbiddenExp);
6627 }
6628 if (max == -1)
6629 kbase = min + 79;
6630 else
6631 kbase = max - min;
6632 kbase += left->key;
6633 } else if (type == XML_EXP_OR) {
6634 /* Forbid reduction rules */
6635 if (left->type == XML_EXP_FORBID) {
6636 xmlExpFree(ctxt, left);
6637 return(right);
6638 }
6639 if (right->type == XML_EXP_FORBID) {
6640 xmlExpFree(ctxt, right);
6641 return(left);
6642 }
6643
6644 /* OR reduction rule 1 */
6645 /* a | a reduced to a */
6646 if (left == right) {
6647 left->ref--;
6648 return(left);
6649 }
6650 /* OR canonicalization rule 1 */
6651 /* linearize (a | b) | c into a | (b | c) */
6652 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6653 xmlExpNodePtr tmp = left;
6654 left = right;
6655 right = tmp;
6656 }
6657 /* OR reduction rule 2 */
6658 /* a | (a | b) and b | (a | b) are reduced to a | b */
6659 if (right->type == XML_EXP_OR) {
6660 if ((left == right->exp_left) ||
6661 (left == right->exp_right)) {
6662 xmlExpFree(ctxt, left);
6663 return(right);
6664 }
6665 }
6666 /* OR canonicalization rule 2 */
6667 /* linearize (a | b) | c into a | (b | c) */
6668 if (left->type == XML_EXP_OR) {
6669 xmlExpNodePtr tmp;
6670
6671 /* OR canonicalization rule 2 */
6672 if ((left->exp_right->type != XML_EXP_OR) &&
6673 (left->exp_right->key < left->exp_left->key)) {
6674 tmp = left->exp_right;
6675 left->exp_right = left->exp_left;
6676 left->exp_left = tmp;
6677 }
6678 left->exp_right->ref++;
6679 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6680 NULL, 0, 0);
6681 left->exp_left->ref++;
6682 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6683 NULL, 0, 0);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006684
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006685 xmlExpFree(ctxt, left);
6686 return(tmp);
6687 }
6688 if (right->type == XML_EXP_OR) {
6689 /* Ordering in the tree */
6690 /* C | (A | B) -> A | (B | C) */
6691 if (left->key > right->exp_right->key) {
6692 xmlExpNodePtr tmp;
6693 right->exp_right->ref++;
6694 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6695 left, NULL, 0, 0);
6696 right->exp_left->ref++;
6697 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6698 tmp, NULL, 0, 0);
6699 xmlExpFree(ctxt, right);
6700 return(tmp);
6701 }
6702 /* Ordering in the tree */
6703 /* B | (A | C) -> A | (B | C) */
6704 if (left->key > right->exp_left->key) {
6705 xmlExpNodePtr tmp;
6706 right->exp_right->ref++;
6707 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6708 right->exp_right, NULL, 0, 0);
6709 right->exp_left->ref++;
6710 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6711 tmp, NULL, 0, 0);
6712 xmlExpFree(ctxt, right);
6713 return(tmp);
6714 }
6715 }
6716 /* we know both types are != XML_EXP_OR here */
6717 else if (left->key > right->key) {
6718 xmlExpNodePtr tmp = left;
6719 left = right;
6720 right = tmp;
6721 }
6722 kbase = xmlExpHashComputeKey(type, left, right);
6723 } else if (type == XML_EXP_SEQ) {
6724 /* Forbid reduction rules */
6725 if (left->type == XML_EXP_FORBID) {
6726 xmlExpFree(ctxt, right);
6727 return(left);
6728 }
6729 if (right->type == XML_EXP_FORBID) {
6730 xmlExpFree(ctxt, left);
6731 return(right);
6732 }
6733 /* Empty reduction rules */
6734 if (right->type == XML_EXP_EMPTY) {
6735 return(left);
6736 }
6737 if (left->type == XML_EXP_EMPTY) {
6738 return(right);
6739 }
6740 kbase = xmlExpHashComputeKey(type, left, right);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006741 } else
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006742 return(NULL);
6743
6744 key = kbase % ctxt->size;
6745 if (ctxt->table[key] != NULL) {
6746 for (insert = ctxt->table[key]; insert != NULL;
6747 insert = insert->next) {
6748 if ((insert->key == kbase) &&
6749 (insert->type == type)) {
6750 if (type == XML_EXP_ATOM) {
6751 if (name == insert->exp_str) {
6752 insert->ref++;
6753 return(insert);
6754 }
6755 } else if (type == XML_EXP_COUNT) {
6756 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6757 (insert->exp_left == left)) {
6758 insert->ref++;
6759 left->ref--;
6760 return(insert);
6761 }
6762 } else if ((insert->exp_left == left) &&
6763 (insert->exp_right == right)) {
6764 insert->ref++;
6765 left->ref--;
6766 right->ref--;
6767 return(insert);
6768 }
6769 }
6770 }
6771 }
6772
6773 entry = xmlExpNewNode(ctxt, type);
6774 if (entry == NULL)
6775 return(NULL);
6776 entry->key = kbase;
6777 if (type == XML_EXP_ATOM) {
6778 entry->exp_str = name;
6779 entry->c_max = 1;
6780 } else if (type == XML_EXP_COUNT) {
6781 entry->exp_min = min;
6782 entry->exp_max = max;
6783 entry->exp_left = left;
6784 if ((min == 0) || (IS_NILLABLE(left)))
6785 entry->info |= XML_EXP_NILABLE;
6786 if (max < 0)
6787 entry->c_max = -1;
6788 else
6789 entry->c_max = max * entry->exp_left->c_max;
6790 } else {
6791 entry->exp_left = left;
6792 entry->exp_right = right;
6793 if (type == XML_EXP_OR) {
6794 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6795 entry->info |= XML_EXP_NILABLE;
6796 if ((entry->exp_left->c_max == -1) ||
6797 (entry->exp_right->c_max == -1))
6798 entry->c_max = -1;
6799 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6800 entry->c_max = entry->exp_left->c_max;
6801 else
6802 entry->c_max = entry->exp_right->c_max;
6803 } else {
6804 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6805 entry->info |= XML_EXP_NILABLE;
6806 if ((entry->exp_left->c_max == -1) ||
6807 (entry->exp_right->c_max == -1))
6808 entry->c_max = -1;
6809 else
6810 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6811 }
6812 }
6813 entry->ref = 1;
6814 if (ctxt->table[key] != NULL)
6815 entry->next = ctxt->table[key];
6816
6817 ctxt->table[key] = entry;
6818 ctxt->nbElems++;
6819
6820 return(entry);
6821}
6822
6823/**
6824 * xmlExpFree:
6825 * @ctxt: the expression context
6826 * @exp: the expression
6827 *
6828 * Dereference the expression
6829 */
6830void
6831xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6832 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6833 return;
6834 exp->ref--;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006835 if (exp->ref == 0) {
6836 unsigned short key;
6837
6838 /* Unlink it first from the hash table */
6839 key = exp->key % ctxt->size;
6840 if (ctxt->table[key] == exp) {
6841 ctxt->table[key] = exp->next;
6842 } else {
6843 xmlExpNodePtr tmp;
6844
6845 tmp = ctxt->table[key];
6846 while (tmp != NULL) {
6847 if (tmp->next == exp) {
6848 tmp->next = exp->next;
6849 break;
6850 }
6851 tmp = tmp->next;
6852 }
6853 }
6854
6855 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6856 xmlExpFree(ctxt, exp->exp_left);
6857 xmlExpFree(ctxt, exp->exp_right);
6858 } else if (exp->type == XML_EXP_COUNT) {
6859 xmlExpFree(ctxt, exp->exp_left);
6860 }
6861 xmlFree(exp);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006862 ctxt->nb_nodes--;
6863 }
6864}
6865
6866/**
6867 * xmlExpRef:
6868 * @exp: the expression
6869 *
6870 * Increase the reference count of the expression
6871 */
6872void
6873xmlExpRef(xmlExpNodePtr exp) {
6874 if (exp != NULL)
6875 exp->ref++;
6876}
6877
Daniel Veillardccb4d412005-08-23 13:41:17 +00006878/**
6879 * xmlExpNewAtom:
6880 * @ctxt: the expression context
6881 * @name: the atom name
Michael Woodfb27e2c2012-09-28 08:59:33 +02006882 * @len: the atom name length in byte (or -1);
Daniel Veillardccb4d412005-08-23 13:41:17 +00006883 *
6884 * Get the atom associated to this name from that context
6885 *
6886 * Returns the node or NULL in case of error
6887 */
6888xmlExpNodePtr
6889xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6890 if ((ctxt == NULL) || (name == NULL))
6891 return(NULL);
6892 name = xmlDictLookup(ctxt->dict, name, len);
6893 if (name == NULL)
6894 return(NULL);
6895 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6896}
6897
6898/**
6899 * xmlExpNewOr:
6900 * @ctxt: the expression context
6901 * @left: left expression
6902 * @right: right expression
6903 *
6904 * Get the atom associated to the choice @left | @right
6905 * Note that @left and @right are consumed in the operation, to keep
6906 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6907 * this is true even in case of failure (unless ctxt == NULL).
6908 *
6909 * Returns the node or NULL in case of error
6910 */
6911xmlExpNodePtr
6912xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006913 if (ctxt == NULL)
6914 return(NULL);
6915 if ((left == NULL) || (right == NULL)) {
Daniel Veillardccb4d412005-08-23 13:41:17 +00006916 xmlExpFree(ctxt, left);
6917 xmlExpFree(ctxt, right);
6918 return(NULL);
6919 }
6920 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6921}
6922
6923/**
6924 * xmlExpNewSeq:
6925 * @ctxt: the expression context
6926 * @left: left expression
6927 * @right: right expression
6928 *
6929 * Get the atom associated to the sequence @left , @right
6930 * Note that @left and @right are consumed in the operation, to keep
6931 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6932 * this is true even in case of failure (unless ctxt == NULL).
6933 *
6934 * Returns the node or NULL in case of error
6935 */
6936xmlExpNodePtr
6937xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006938 if (ctxt == NULL)
6939 return(NULL);
6940 if ((left == NULL) || (right == NULL)) {
Daniel Veillardccb4d412005-08-23 13:41:17 +00006941 xmlExpFree(ctxt, left);
6942 xmlExpFree(ctxt, right);
6943 return(NULL);
6944 }
6945 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6946}
6947
6948/**
6949 * xmlExpNewRange:
6950 * @ctxt: the expression context
6951 * @subset: the expression to be repeated
6952 * @min: the lower bound for the repetition
6953 * @max: the upper bound for the repetition, -1 means infinite
6954 *
6955 * Get the atom associated to the range (@subset){@min, @max}
6956 * Note that @subset is consumed in the operation, to keep
6957 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6958 * this is true even in case of failure (unless ctxt == NULL).
6959 *
6960 * Returns the node or NULL in case of error
6961 */
6962xmlExpNodePtr
6963xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006964 if (ctxt == NULL)
6965 return(NULL);
6966 if ((subset == NULL) || (min < 0) || (max < -1) ||
Daniel Veillardccb4d412005-08-23 13:41:17 +00006967 ((max >= 0) && (min > max))) {
6968 xmlExpFree(ctxt, subset);
6969 return(NULL);
6970 }
6971 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6972 NULL, NULL, min, max));
6973}
6974
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006975/************************************************************************
6976 * *
6977 * Public API for operations on expressions *
6978 * *
6979 ************************************************************************/
6980
6981static int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006982xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006983 const xmlChar**list, int len, int nb) {
6984 int tmp, tmp2;
6985tail:
6986 switch (exp->type) {
6987 case XML_EXP_EMPTY:
6988 return(0);
6989 case XML_EXP_ATOM:
6990 for (tmp = 0;tmp < nb;tmp++)
6991 if (list[tmp] == exp->exp_str)
6992 return(0);
6993 if (nb >= len)
6994 return(-2);
Daniel Veillard13cee4e2009-09-05 14:52:55 +02006995 list[nb] = exp->exp_str;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006996 return(1);
6997 case XML_EXP_COUNT:
6998 exp = exp->exp_left;
6999 goto tail;
7000 case XML_EXP_SEQ:
7001 case XML_EXP_OR:
7002 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7003 if (tmp < 0)
7004 return(tmp);
7005 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7006 nb + tmp);
7007 if (tmp2 < 0)
7008 return(tmp2);
7009 return(tmp + tmp2);
7010 }
7011 return(-1);
7012}
7013
7014/**
7015 * xmlExpGetLanguage:
7016 * @ctxt: the expression context
7017 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00007018 * @langList: where to store the tokens
Michael Woodfb27e2c2012-09-28 08:59:33 +02007019 * @len: the allocated length of @list
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007020 *
7021 * Find all the strings used in @exp and store them in @list
7022 *
7023 * Returns the number of unique strings found, -1 in case of errors and
7024 * -2 if there is more than @len strings
7025 */
7026int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007027xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00007028 const xmlChar**langList, int len) {
7029 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007030 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00007031 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007032}
7033
7034static int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007035xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007036 const xmlChar**list, int len, int nb) {
7037 int tmp, tmp2;
7038tail:
7039 switch (exp->type) {
7040 case XML_EXP_FORBID:
7041 return(0);
7042 case XML_EXP_EMPTY:
7043 return(0);
7044 case XML_EXP_ATOM:
7045 for (tmp = 0;tmp < nb;tmp++)
7046 if (list[tmp] == exp->exp_str)
7047 return(0);
7048 if (nb >= len)
7049 return(-2);
Daniel Veillard13cee4e2009-09-05 14:52:55 +02007050 list[nb] = exp->exp_str;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007051 return(1);
7052 case XML_EXP_COUNT:
7053 exp = exp->exp_left;
7054 goto tail;
7055 case XML_EXP_SEQ:
7056 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7057 if (tmp < 0)
7058 return(tmp);
7059 if (IS_NILLABLE(exp->exp_left)) {
7060 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7061 nb + tmp);
7062 if (tmp2 < 0)
7063 return(tmp2);
7064 tmp += tmp2;
7065 }
7066 return(tmp);
7067 case XML_EXP_OR:
7068 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7069 if (tmp < 0)
7070 return(tmp);
7071 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7072 nb + tmp);
7073 if (tmp2 < 0)
7074 return(tmp2);
7075 return(tmp + tmp2);
7076 }
7077 return(-1);
7078}
7079
7080/**
7081 * xmlExpGetStart:
7082 * @ctxt: the expression context
7083 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00007084 * @tokList: where to store the tokens
Michael Woodfb27e2c2012-09-28 08:59:33 +02007085 * @len: the allocated length of @list
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007086 *
7087 * Find all the strings that appears at the start of the languages
7088 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7089 * it will return the list [a, c]
7090 *
7091 * Returns the number of unique strings found, -1 in case of errors and
7092 * -2 if there is more than @len strings
7093 */
7094int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007095xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00007096 const xmlChar**tokList, int len) {
7097 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007098 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00007099 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007100}
7101
7102/**
7103 * xmlExpIsNillable:
7104 * @exp: the expression
7105 *
7106 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce
7107 *
7108 * Returns 1 if nillable, 0 if not and -1 in case of error
7109 */
7110int
7111xmlExpIsNillable(xmlExpNodePtr exp) {
7112 if (exp == NULL)
7113 return(-1);
7114 return(IS_NILLABLE(exp) != 0);
7115}
7116
7117static xmlExpNodePtr
7118xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7119{
7120 xmlExpNodePtr ret;
7121
7122 switch (exp->type) {
7123 case XML_EXP_EMPTY:
7124 return(forbiddenExp);
7125 case XML_EXP_FORBID:
7126 return(forbiddenExp);
7127 case XML_EXP_ATOM:
7128 if (exp->exp_str == str) {
7129#ifdef DEBUG_DERIV
7130 printf("deriv atom: equal => Empty\n");
7131#endif
7132 ret = emptyExp;
7133 } else {
7134#ifdef DEBUG_DERIV
7135 printf("deriv atom: mismatch => forbid\n");
7136#endif
7137 /* TODO wildcards here */
7138 ret = forbiddenExp;
7139 }
7140 return(ret);
7141 case XML_EXP_OR: {
7142 xmlExpNodePtr tmp;
7143
7144#ifdef DEBUG_DERIV
7145 printf("deriv or: => or(derivs)\n");
7146#endif
7147 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7148 if (tmp == NULL) {
7149 return(NULL);
7150 }
7151 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7152 if (ret == NULL) {
7153 xmlExpFree(ctxt, tmp);
7154 return(NULL);
7155 }
7156 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7157 NULL, 0, 0);
7158 return(ret);
7159 }
7160 case XML_EXP_SEQ:
7161#ifdef DEBUG_DERIV
7162 printf("deriv seq: starting with left\n");
7163#endif
7164 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7165 if (ret == NULL) {
7166 return(NULL);
7167 } else if (ret == forbiddenExp) {
7168 if (IS_NILLABLE(exp->exp_left)) {
7169#ifdef DEBUG_DERIV
7170 printf("deriv seq: left failed but nillable\n");
7171#endif
7172 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7173 }
7174 } else {
7175#ifdef DEBUG_DERIV
7176 printf("deriv seq: left match => sequence\n");
7177#endif
7178 exp->exp_right->ref++;
7179 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7180 NULL, 0, 0);
7181 }
7182 return(ret);
7183 case XML_EXP_COUNT: {
7184 int min, max;
7185 xmlExpNodePtr tmp;
7186
7187 if (exp->exp_max == 0)
7188 return(forbiddenExp);
7189 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7190 if (ret == NULL)
7191 return(NULL);
7192 if (ret == forbiddenExp) {
7193#ifdef DEBUG_DERIV
7194 printf("deriv count: pattern mismatch => forbid\n");
7195#endif
7196 return(ret);
7197 }
7198 if (exp->exp_max == 1)
7199 return(ret);
7200 if (exp->exp_max < 0) /* unbounded */
7201 max = -1;
7202 else
7203 max = exp->exp_max - 1;
7204 if (exp->exp_min > 0)
7205 min = exp->exp_min - 1;
7206 else
7207 min = 0;
7208 exp->exp_left->ref++;
7209 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7210 NULL, min, max);
7211 if (ret == emptyExp) {
7212#ifdef DEBUG_DERIV
7213 printf("deriv count: match to empty => new count\n");
7214#endif
7215 return(tmp);
7216 }
7217#ifdef DEBUG_DERIV
7218 printf("deriv count: match => sequence with new count\n");
7219#endif
7220 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7221 NULL, 0, 0));
7222 }
7223 }
7224 return(NULL);
7225}
7226
7227/**
7228 * xmlExpStringDerive:
7229 * @ctxt: the expression context
7230 * @exp: the expression
7231 * @str: the string
7232 * @len: the string len in bytes if available
7233 *
7234 * Do one step of Brzozowski derivation of the expression @exp with
7235 * respect to the input string
7236 *
7237 * Returns the resulting expression or NULL in case of internal error
7238 */
7239xmlExpNodePtr
7240xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7241 const xmlChar *str, int len) {
7242 const xmlChar *input;
7243
7244 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7245 return(NULL);
7246 }
7247 /*
Jan Pokornýbb654fe2016-04-13 16:56:07 +02007248 * check the string is in the dictionary, if yes use an interned
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007249 * copy, otherwise we know it's not an acceptable input
7250 */
7251 input = xmlDictExists(ctxt->dict, str, len);
7252 if (input == NULL) {
7253 return(forbiddenExp);
7254 }
7255 return(xmlExpStringDeriveInt(ctxt, exp, input));
7256}
7257
7258static int
7259xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7260 int ret = 1;
7261
7262 if (sub->c_max == -1) {
7263 if (exp->c_max != -1)
7264 ret = 0;
7265 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7266 ret = 0;
7267 }
7268#if 0
7269 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7270 ret = 0;
7271#endif
7272 return(ret);
7273}
7274
7275static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7276 xmlExpNodePtr sub);
7277/**
7278 * xmlExpDivide:
7279 * @ctxt: the expressions context
7280 * @exp: the englobing expression
7281 * @sub: the subexpression
7282 * @mult: the multiple expression
7283 * @remain: the remain from the derivation of the multiple
7284 *
7285 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7286 * so that sub{n} subsume exp
7287 *
7288 * Returns the multiple value if successful, 0 if it is not a multiple
7289 * and -1 in case of internel error.
7290 */
7291
7292static int
7293xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7294 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7295 int i;
7296 xmlExpNodePtr tmp, tmp2;
7297
7298 if (mult != NULL) *mult = NULL;
7299 if (remain != NULL) *remain = NULL;
7300 if (exp->c_max == -1) return(0);
7301 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7302
7303 for (i = 1;i <= exp->c_max;i++) {
7304 sub->ref++;
7305 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7306 sub, NULL, NULL, i, i);
7307 if (tmp == NULL) {
7308 return(-1);
7309 }
7310 if (!xmlExpCheckCard(tmp, exp)) {
7311 xmlExpFree(ctxt, tmp);
7312 continue;
7313 }
7314 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7315 if (tmp2 == NULL) {
7316 xmlExpFree(ctxt, tmp);
7317 return(-1);
7318 }
7319 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7320 if (remain != NULL)
7321 *remain = tmp2;
7322 else
7323 xmlExpFree(ctxt, tmp2);
7324 if (mult != NULL)
7325 *mult = tmp;
7326 else
7327 xmlExpFree(ctxt, tmp);
7328#ifdef DEBUG_DERIV
7329 printf("Divide succeeded %d\n", i);
7330#endif
7331 return(i);
7332 }
7333 xmlExpFree(ctxt, tmp);
7334 xmlExpFree(ctxt, tmp2);
7335 }
7336#ifdef DEBUG_DERIV
7337 printf("Divide failed\n");
7338#endif
7339 return(0);
7340}
7341
7342/**
7343 * xmlExpExpDeriveInt:
7344 * @ctxt: the expressions context
7345 * @exp: the englobing expression
7346 * @sub: the subexpression
7347 *
7348 * Try to do a step of Brzozowski derivation but at a higher level
7349 * the input being a subexpression.
7350 *
7351 * Returns the resulting expression or NULL in case of internal error
7352 */
7353static xmlExpNodePtr
7354xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7355 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7356 const xmlChar **tab;
7357 int len, i;
7358
7359 /*
7360 * In case of equality and if the expression can only consume a finite
7361 * amount, then the derivation is empty
7362 */
7363 if ((exp == sub) && (exp->c_max >= 0)) {
7364#ifdef DEBUG_DERIV
7365 printf("Equal(exp, sub) and finite -> Empty\n");
7366#endif
7367 return(emptyExp);
7368 }
7369 /*
7370 * decompose sub sequence first
7371 */
7372 if (sub->type == XML_EXP_EMPTY) {
7373#ifdef DEBUG_DERIV
7374 printf("Empty(sub) -> Empty\n");
7375#endif
7376 exp->ref++;
7377 return(exp);
7378 }
7379 if (sub->type == XML_EXP_SEQ) {
7380#ifdef DEBUG_DERIV
7381 printf("Seq(sub) -> decompose\n");
7382#endif
7383 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7384 if (tmp == NULL)
7385 return(NULL);
7386 if (tmp == forbiddenExp)
7387 return(tmp);
7388 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7389 xmlExpFree(ctxt, tmp);
7390 return(ret);
7391 }
7392 if (sub->type == XML_EXP_OR) {
7393#ifdef DEBUG_DERIV
7394 printf("Or(sub) -> decompose\n");
7395#endif
7396 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7397 if (tmp == forbiddenExp)
7398 return(tmp);
7399 if (tmp == NULL)
7400 return(NULL);
7401 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7402 if ((ret == NULL) || (ret == forbiddenExp)) {
7403 xmlExpFree(ctxt, tmp);
7404 return(ret);
7405 }
7406 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7407 }
7408 if (!xmlExpCheckCard(exp, sub)) {
7409#ifdef DEBUG_DERIV
7410 printf("CheckCard(exp, sub) failed -> Forbid\n");
7411#endif
7412 return(forbiddenExp);
7413 }
7414 switch (exp->type) {
7415 case XML_EXP_EMPTY:
7416 if (sub == emptyExp)
7417 return(emptyExp);
7418#ifdef DEBUG_DERIV
7419 printf("Empty(exp) -> Forbid\n");
7420#endif
7421 return(forbiddenExp);
7422 case XML_EXP_FORBID:
7423#ifdef DEBUG_DERIV
7424 printf("Forbid(exp) -> Forbid\n");
7425#endif
7426 return(forbiddenExp);
7427 case XML_EXP_ATOM:
7428 if (sub->type == XML_EXP_ATOM) {
7429 /* TODO: handle wildcards */
7430 if (exp->exp_str == sub->exp_str) {
7431#ifdef DEBUG_DERIV
7432 printf("Atom match -> Empty\n");
7433#endif
7434 return(emptyExp);
7435 }
7436#ifdef DEBUG_DERIV
7437 printf("Atom mismatch -> Forbid\n");
7438#endif
7439 return(forbiddenExp);
7440 }
7441 if ((sub->type == XML_EXP_COUNT) &&
7442 (sub->exp_max == 1) &&
7443 (sub->exp_left->type == XML_EXP_ATOM)) {
7444 /* TODO: handle wildcards */
7445 if (exp->exp_str == sub->exp_left->exp_str) {
7446#ifdef DEBUG_DERIV
7447 printf("Atom match -> Empty\n");
7448#endif
7449 return(emptyExp);
7450 }
7451#ifdef DEBUG_DERIV
7452 printf("Atom mismatch -> Forbid\n");
7453#endif
7454 return(forbiddenExp);
7455 }
7456#ifdef DEBUG_DERIV
7457 printf("Compex exp vs Atom -> Forbid\n");
7458#endif
7459 return(forbiddenExp);
7460 case XML_EXP_SEQ:
7461 /* try to get the sequence consumed only if possible */
7462 if (xmlExpCheckCard(exp->exp_left, sub)) {
7463 /* See if the sequence can be consumed directly */
7464#ifdef DEBUG_DERIV
7465 printf("Seq trying left only\n");
7466#endif
7467 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7468 if ((ret != forbiddenExp) && (ret != NULL)) {
7469#ifdef DEBUG_DERIV
7470 printf("Seq trying left only worked\n");
7471#endif
7472 /*
7473 * TODO: assumption here that we are determinist
7474 * i.e. we won't get to a nillable exp left
7475 * subset which could be matched by the right
7476 * part too.
7477 * e.g.: (a | b)+,(a | c) and 'a+,a'
7478 */
7479 exp->exp_right->ref++;
7480 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7481 exp->exp_right, NULL, 0, 0));
7482 }
7483#ifdef DEBUG_DERIV
7484 } else {
7485 printf("Seq: left too short\n");
7486#endif
7487 }
7488 /* Try instead to decompose */
7489 if (sub->type == XML_EXP_COUNT) {
7490 int min, max;
7491
7492#ifdef DEBUG_DERIV
7493 printf("Seq: sub is a count\n");
7494#endif
7495 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7496 if (ret == NULL)
7497 return(NULL);
7498 if (ret != forbiddenExp) {
7499#ifdef DEBUG_DERIV
7500 printf("Seq , Count match on left\n");
7501#endif
7502 if (sub->exp_max < 0)
7503 max = -1;
7504 else
7505 max = sub->exp_max -1;
7506 if (sub->exp_min > 0)
7507 min = sub->exp_min -1;
7508 else
7509 min = 0;
7510 exp->exp_right->ref++;
7511 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7512 exp->exp_right, NULL, 0, 0);
7513 if (tmp == NULL)
7514 return(NULL);
7515
7516 sub->exp_left->ref++;
7517 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7518 sub->exp_left, NULL, NULL, min, max);
7519 if (tmp2 == NULL) {
7520 xmlExpFree(ctxt, tmp);
7521 return(NULL);
7522 }
7523 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7524 xmlExpFree(ctxt, tmp);
7525 xmlExpFree(ctxt, tmp2);
7526 return(ret);
7527 }
7528 }
7529 /* we made no progress on structured operations */
7530 break;
7531 case XML_EXP_OR:
7532#ifdef DEBUG_DERIV
7533 printf("Or , trying both side\n");
7534#endif
7535 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7536 if (ret == NULL)
7537 return(NULL);
7538 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7539 if (tmp == NULL) {
7540 xmlExpFree(ctxt, ret);
7541 return(NULL);
7542 }
7543 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7544 case XML_EXP_COUNT: {
7545 int min, max;
7546
7547 if (sub->type == XML_EXP_COUNT) {
7548 /*
7549 * Try to see if the loop is completely subsumed
7550 */
7551 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7552 if (tmp == NULL)
7553 return(NULL);
7554 if (tmp == forbiddenExp) {
7555 int mult;
7556
7557#ifdef DEBUG_DERIV
7558 printf("Count, Count inner don't subsume\n");
7559#endif
7560 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7561 NULL, &tmp);
7562 if (mult <= 0) {
7563#ifdef DEBUG_DERIV
7564 printf("Count, Count not multiple => forbidden\n");
7565#endif
7566 return(forbiddenExp);
7567 }
7568 if (sub->exp_max == -1) {
7569 max = -1;
7570 if (exp->exp_max == -1) {
7571 if (exp->exp_min <= sub->exp_min * mult)
7572 min = 0;
7573 else
7574 min = exp->exp_min - sub->exp_min * mult;
7575 } else {
7576#ifdef DEBUG_DERIV
7577 printf("Count, Count finite can't subsume infinite\n");
7578#endif
7579 xmlExpFree(ctxt, tmp);
7580 return(forbiddenExp);
7581 }
7582 } else {
7583 if (exp->exp_max == -1) {
7584#ifdef DEBUG_DERIV
7585 printf("Infinite loop consume mult finite loop\n");
7586#endif
7587 if (exp->exp_min > sub->exp_min * mult) {
7588 max = -1;
7589 min = exp->exp_min - sub->exp_min * mult;
7590 } else {
7591 max = -1;
7592 min = 0;
7593 }
7594 } else {
7595 if (exp->exp_max < sub->exp_max * mult) {
7596#ifdef DEBUG_DERIV
7597 printf("loops max mult mismatch => forbidden\n");
7598#endif
7599 xmlExpFree(ctxt, tmp);
7600 return(forbiddenExp);
7601 }
7602 if (sub->exp_max * mult > exp->exp_min)
7603 min = 0;
7604 else
7605 min = exp->exp_min - sub->exp_max * mult;
7606 max = exp->exp_max - sub->exp_max * mult;
7607 }
7608 }
7609 } else if (!IS_NILLABLE(tmp)) {
7610 /*
7611 * TODO: loop here to try to grow if working on finite
7612 * blocks.
7613 */
7614#ifdef DEBUG_DERIV
7615 printf("Count, Count remain not nillable => forbidden\n");
7616#endif
7617 xmlExpFree(ctxt, tmp);
7618 return(forbiddenExp);
7619 } else if (sub->exp_max == -1) {
7620 if (exp->exp_max == -1) {
7621 if (exp->exp_min <= sub->exp_min) {
7622#ifdef DEBUG_DERIV
7623 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7624#endif
7625 max = -1;
7626 min = 0;
7627 } else {
7628#ifdef DEBUG_DERIV
7629 printf("Infinite loops min => Count(X,Inf)\n");
7630#endif
7631 max = -1;
7632 min = exp->exp_min - sub->exp_min;
7633 }
7634 } else if (exp->exp_min > sub->exp_min) {
7635#ifdef DEBUG_DERIV
7636 printf("loops min mismatch 1 => forbidden ???\n");
7637#endif
7638 xmlExpFree(ctxt, tmp);
7639 return(forbiddenExp);
7640 } else {
7641 max = -1;
7642 min = 0;
7643 }
7644 } else {
7645 if (exp->exp_max == -1) {
7646#ifdef DEBUG_DERIV
7647 printf("Infinite loop consume finite loop\n");
7648#endif
7649 if (exp->exp_min > sub->exp_min) {
7650 max = -1;
7651 min = exp->exp_min - sub->exp_min;
7652 } else {
7653 max = -1;
7654 min = 0;
7655 }
7656 } else {
7657 if (exp->exp_max < sub->exp_max) {
7658#ifdef DEBUG_DERIV
7659 printf("loops max mismatch => forbidden\n");
7660#endif
7661 xmlExpFree(ctxt, tmp);
7662 return(forbiddenExp);
7663 }
7664 if (sub->exp_max > exp->exp_min)
7665 min = 0;
7666 else
7667 min = exp->exp_min - sub->exp_max;
7668 max = exp->exp_max - sub->exp_max;
7669 }
7670 }
7671#ifdef DEBUG_DERIV
7672 printf("loops match => SEQ(COUNT())\n");
7673#endif
7674 exp->exp_left->ref++;
7675 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7676 NULL, NULL, min, max);
7677 if (tmp2 == NULL) {
7678 return(NULL);
7679 }
7680 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7681 NULL, 0, 0);
7682 return(ret);
7683 }
7684 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7685 if (tmp == NULL)
7686 return(NULL);
7687 if (tmp == forbiddenExp) {
7688#ifdef DEBUG_DERIV
7689 printf("loop mismatch => forbidden\n");
7690#endif
7691 return(forbiddenExp);
7692 }
7693 if (exp->exp_min > 0)
7694 min = exp->exp_min - 1;
7695 else
7696 min = 0;
7697 if (exp->exp_max < 0)
7698 max = -1;
7699 else
7700 max = exp->exp_max - 1;
7701
7702#ifdef DEBUG_DERIV
7703 printf("loop match => SEQ(COUNT())\n");
7704#endif
7705 exp->exp_left->ref++;
7706 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7707 NULL, NULL, min, max);
7708 if (tmp2 == NULL)
7709 return(NULL);
7710 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7711 NULL, 0, 0);
7712 return(ret);
7713 }
7714 }
7715
Daniel Veillardccb4d412005-08-23 13:41:17 +00007716#ifdef DEBUG_DERIV
7717 printf("Fallback to derivative\n");
7718#endif
7719 if (IS_NILLABLE(sub)) {
7720 if (!(IS_NILLABLE(exp)))
7721 return(forbiddenExp);
7722 else
7723 ret = emptyExp;
7724 } else
7725 ret = NULL;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007726 /*
7727 * here the structured derivation made no progress so
7728 * we use the default token based derivation to force one more step
7729 */
7730 if (ctxt->tabSize == 0)
7731 ctxt->tabSize = 40;
7732
7733 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7734 sizeof(const xmlChar *));
7735 if (tab == NULL) {
7736 return(NULL);
7737 }
7738
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007739 /*
7740 * collect all the strings accepted by the subexpression on input
7741 */
7742 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7743 while (len < 0) {
7744 const xmlChar **temp;
Rob Richards54a8f672005-10-07 02:33:00 +00007745 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007746 sizeof(const xmlChar *));
7747 if (temp == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007748 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007749 return(NULL);
7750 }
7751 tab = temp;
7752 ctxt->tabSize *= 2;
7753 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7754 }
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007755 for (i = 0;i < len;i++) {
7756 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7757 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7758 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007759 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007760 return(tmp);
7761 }
7762 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7763 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7764 xmlExpFree(ctxt, tmp);
7765 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007766 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007767 return(tmp);
7768 }
7769 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7770 xmlExpFree(ctxt, tmp);
7771 xmlExpFree(ctxt, tmp2);
7772
7773 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7774 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007775 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007776 return(tmp3);
7777 }
7778
7779 if (ret == NULL)
7780 ret = tmp3;
7781 else {
7782 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7783 if (ret == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007784 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007785 return(NULL);
7786 }
7787 }
7788 }
Rob Richards54a8f672005-10-07 02:33:00 +00007789 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007790 return(ret);
7791}
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007792
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007793/**
Daniel Veillard0090bd52005-08-22 14:43:43 +00007794 * xmlExpExpDerive:
7795 * @ctxt: the expressions context
7796 * @exp: the englobing expression
7797 * @sub: the subexpression
7798 *
7799 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7800 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7801 * it usually tatkes less than linear time and can handle expressions generating
7802 * infinite languages.
7803 *
7804 * Returns the resulting expression or NULL in case of internal error, the
7805 * result must be freed
7806 */
7807xmlExpNodePtr
7808xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7809 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7810 return(NULL);
7811
7812 /*
7813 * O(1) speedups
7814 */
7815 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7816#ifdef DEBUG_DERIV
7817 printf("Sub nillable and not exp : can't subsume\n");
7818#endif
7819 return(forbiddenExp);
7820 }
7821 if (xmlExpCheckCard(exp, sub) == 0) {
7822#ifdef DEBUG_DERIV
7823 printf("sub generate longuer sequances than exp : can't subsume\n");
7824#endif
7825 return(forbiddenExp);
7826 }
7827 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7828}
7829
7830/**
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007831 * xmlExpSubsume:
7832 * @ctxt: the expressions context
7833 * @exp: the englobing expression
7834 * @sub: the subexpression
7835 *
7836 * Check whether @exp accepts all the languages accexpted by @sub
7837 * the input being a subexpression.
7838 *
7839 * Returns 1 if true 0 if false and -1 in case of failure.
7840 */
7841int
7842xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7843 xmlExpNodePtr tmp;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007844
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007845 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7846 return(-1);
7847
7848 /*
7849 * TODO: speedup by checking the language of sub is a subset of the
7850 * language of exp
7851 */
7852 /*
7853 * O(1) speedups
7854 */
7855 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7856#ifdef DEBUG_DERIV
7857 printf("Sub nillable and not exp : can't subsume\n");
7858#endif
7859 return(0);
7860 }
7861 if (xmlExpCheckCard(exp, sub) == 0) {
7862#ifdef DEBUG_DERIV
7863 printf("sub generate longuer sequances than exp : can't subsume\n");
7864#endif
7865 return(0);
7866 }
7867 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7868#ifdef DEBUG_DERIV
7869 printf("Result derivation :\n");
7870 PRINT_EXP(tmp);
7871#endif
7872 if (tmp == NULL)
7873 return(-1);
7874 if (tmp == forbiddenExp)
7875 return(0);
7876 if (tmp == emptyExp)
7877 return(1);
7878 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7879 xmlExpFree(ctxt, tmp);
7880 return(1);
7881 }
7882 xmlExpFree(ctxt, tmp);
7883 return(0);
7884}
Daniel Veillard465a0002005-08-22 12:07:04 +00007885
7886/************************************************************************
7887 * *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007888 * Parsing expression *
Daniel Veillard465a0002005-08-22 12:07:04 +00007889 * *
7890 ************************************************************************/
7891
7892static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7893
/*
 * Cursor helpers for the expression parser below. CUR, NEXT and
 * SKIP_BLANKS all expect a variable named ctxt to be in scope.
 * NEXT and SKIP_BLANKS carry their own trailing semicolon.
 */
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
/* the argument is parenthesized so that any expression can be passed */
#define IS_BLANK(c) (((c) == ' ') || ((c) == '\n') || ((c) == '\r') || ((c) == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7901
7902static int
7903xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7904 int ret = 0;
7905
7906 SKIP_BLANKS
7907 if (CUR == '*') {
7908 NEXT
7909 return(-1);
7910 }
7911 if ((CUR < '0') || (CUR > '9'))
7912 return(-1);
7913 while ((CUR >= '0') && (CUR <= '9')) {
7914 ret = ret * 10 + (CUR - '0');
7915 NEXT
7916 }
7917 return(ret);
7918}
7919
7920static xmlExpNodePtr
7921xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7922 const char *base;
7923 xmlExpNodePtr ret;
7924 const xmlChar *val;
7925
7926 SKIP_BLANKS
7927 base = ctxt->cur;
7928 if (*ctxt->cur == '(') {
7929 NEXT
7930 ret = xmlExpParseExpr(ctxt);
7931 SKIP_BLANKS
7932 if (*ctxt->cur != ')') {
7933 fprintf(stderr, "unbalanced '(' : %s\n", base);
7934 xmlExpFree(ctxt, ret);
7935 return(NULL);
7936 }
7937 NEXT;
7938 SKIP_BLANKS
7939 goto parse_quantifier;
7940 }
7941 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7942 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7943 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7944 NEXT;
7945 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7946 if (val == NULL)
7947 return(NULL);
7948 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7949 if (ret == NULL)
7950 return(NULL);
7951 SKIP_BLANKS
7952parse_quantifier:
7953 if (CUR == '{') {
7954 int min, max;
7955
7956 NEXT
7957 min = xmlExpParseNumber(ctxt);
7958 if (min < 0) {
7959 xmlExpFree(ctxt, ret);
7960 return(NULL);
7961 }
7962 SKIP_BLANKS
7963 if (CUR == ',') {
7964 NEXT
7965 max = xmlExpParseNumber(ctxt);
7966 SKIP_BLANKS
7967 } else
7968 max = min;
7969 if (CUR != '}') {
7970 xmlExpFree(ctxt, ret);
7971 return(NULL);
7972 }
7973 NEXT
7974 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7975 min, max);
7976 SKIP_BLANKS
7977 } else if (CUR == '?') {
7978 NEXT
7979 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7980 0, 1);
7981 SKIP_BLANKS
7982 } else if (CUR == '+') {
7983 NEXT
7984 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7985 1, -1);
7986 SKIP_BLANKS
7987 } else if (CUR == '*') {
7988 NEXT
7989 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7990 0, -1);
7991 SKIP_BLANKS
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007992 }
Daniel Veillard465a0002005-08-22 12:07:04 +00007993 return(ret);
7994}
7995
7996
7997static xmlExpNodePtr
7998xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7999 xmlExpNodePtr ret, right;
8000
8001 ret = xmlExpParseOr(ctxt);
8002 SKIP_BLANKS
8003 while (CUR == '|') {
8004 NEXT
8005 right = xmlExpParseOr(ctxt);
8006 if (right == NULL) {
8007 xmlExpFree(ctxt, ret);
8008 return(NULL);
8009 }
8010 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8011 if (ret == NULL)
8012 return(NULL);
8013 }
8014 return(ret);
8015}
8016
8017static xmlExpNodePtr
8018xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8019 xmlExpNodePtr ret, right;
8020
8021 ret = xmlExpParseSeq(ctxt);
8022 SKIP_BLANKS
8023 while (CUR == ',') {
8024 NEXT
8025 right = xmlExpParseSeq(ctxt);
8026 if (right == NULL) {
8027 xmlExpFree(ctxt, ret);
8028 return(NULL);
8029 }
8030 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8031 if (ret == NULL)
8032 return(NULL);
8033 }
8034 return(ret);
8035}
8036
8037/**
8038 * xmlExpParse:
8039 * @ctxt: the expressions context
8040 * @expr: the 0 terminated string
8041 *
8042 * Minimal parser for regexps, it understand the following constructs
8043 * - string terminals
8044 * - choice operator |
8045 * - sequence operator ,
8046 * - subexpressions (...)
8047 * - usual cardinality operators + * and ?
8048 * - finite sequences { min, max }
8049 * - infinite sequences { min, * }
8050 * There is minimal checkings made especially no checking on strings values
8051 *
8052 * Returns a new expression or NULL in case of failure
8053 */
8054xmlExpNodePtr
8055xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8056 xmlExpNodePtr ret;
8057
8058 ctxt->expr = expr;
8059 ctxt->cur = expr;
8060
8061 ret = xmlExpParseExpr(ctxt);
8062 SKIP_BLANKS
8063 if (*ctxt->cur != 0) {
8064 xmlExpFree(ctxt, ret);
8065 return(NULL);
8066 }
8067 return(ret);
8068}
8069
8070static void
8071xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8072 xmlExpNodePtr c;
8073
8074 if (expr == NULL) return;
8075 if (glob) xmlBufferWriteChar(buf, "(");
8076 switch (expr->type) {
8077 case XML_EXP_EMPTY:
8078 xmlBufferWriteChar(buf, "empty");
8079 break;
8080 case XML_EXP_FORBID:
8081 xmlBufferWriteChar(buf, "forbidden");
8082 break;
8083 case XML_EXP_ATOM:
8084 xmlBufferWriteCHAR(buf, expr->exp_str);
8085 break;
8086 case XML_EXP_SEQ:
8087 c = expr->exp_left;
8088 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8089 xmlExpDumpInt(buf, c, 1);
8090 else
8091 xmlExpDumpInt(buf, c, 0);
8092 xmlBufferWriteChar(buf, " , ");
8093 c = expr->exp_right;
8094 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8095 xmlExpDumpInt(buf, c, 1);
8096 else
8097 xmlExpDumpInt(buf, c, 0);
8098 break;
8099 case XML_EXP_OR:
8100 c = expr->exp_left;
8101 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8102 xmlExpDumpInt(buf, c, 1);
8103 else
8104 xmlExpDumpInt(buf, c, 0);
8105 xmlBufferWriteChar(buf, " | ");
8106 c = expr->exp_right;
8107 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8108 xmlExpDumpInt(buf, c, 1);
8109 else
8110 xmlExpDumpInt(buf, c, 0);
8111 break;
8112 case XML_EXP_COUNT: {
8113 char rep[40];
Daniel Veillardf8e3db02012-09-11 13:26:36 +08008114
Daniel Veillard465a0002005-08-22 12:07:04 +00008115 c = expr->exp_left;
8116 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8117 xmlExpDumpInt(buf, c, 1);
8118 else
8119 xmlExpDumpInt(buf, c, 0);
8120 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8121 rep[0] = '?';
8122 rep[1] = 0;
8123 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8124 rep[0] = '*';
8125 rep[1] = 0;
8126 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8127 rep[0] = '+';
8128 rep[1] = 0;
8129 } else if (expr->exp_max == expr->exp_min) {
8130 snprintf(rep, 39, "{%d}", expr->exp_min);
8131 } else if (expr->exp_max < 0) {
8132 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8133 } else {
8134 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8135 }
8136 rep[39] = 0;
8137 xmlBufferWriteChar(buf, rep);
8138 break;
8139 }
8140 default:
8141 fprintf(stderr, "Error in tree\n");
8142 }
8143 if (glob)
8144 xmlBufferWriteChar(buf, ")");
8145}
8146/**
8147 * xmlExpDump:
8148 * @buf: a buffer to receive the output
8149 * @expr: the compiled expression
8150 *
8151 * Serialize the expression as compiled to the buffer
8152 */
8153void
Daniel Veillard5eee7672005-08-22 21:22:27 +00008154xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8155 if ((buf == NULL) || (expr == NULL))
Daniel Veillard465a0002005-08-22 12:07:04 +00008156 return;
Daniel Veillard5eee7672005-08-22 21:22:27 +00008157 xmlExpDumpInt(buf, expr, 0);
Daniel Veillard465a0002005-08-22 12:07:04 +00008158}
8159
8160/**
8161 * xmlExpMaxToken:
8162 * @expr: a compiled expression
8163 *
8164 * Indicate the maximum number of input a expression can accept
8165 *
8166 * Returns the maximum length or -1 in case of error
8167 */
8168int
8169xmlExpMaxToken(xmlExpNodePtr expr) {
8170 if (expr == NULL)
8171 return(-1);
8172 return(expr->c_max);
8173}
8174
8175/**
8176 * xmlExpCtxtNbNodes:
8177 * @ctxt: an expression context
8178 *
8179 * Debugging facility provides the number of allocated nodes at a that point
8180 *
8181 * Returns the number of nodes in use or -1 in case of error
8182 */
8183int
8184xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8185 if (ctxt == NULL)
8186 return(-1);
8187 return(ctxt->nb_nodes);
8188}
8189
8190/**
8191 * xmlExpCtxtNbCons:
8192 * @ctxt: an expression context
8193 *
8194 * Debugging facility provides the number of allocated nodes over lifetime
8195 *
8196 * Returns the number of nodes ever allocated or -1 in case of error
8197 */
8198int
8199xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8200 if (ctxt == NULL)
8201 return(-1);
8202 return(ctxt->nb_cons);
8203}
8204
Daniel Veillard81a8ec62005-08-22 00:20:58 +00008205#endif /* LIBXML_EXPR_ENABLED */
Daniel Veillard5d4644e2005-04-01 13:11:58 +00008206#define bottom_xmlregexp
8207#include "elfgcchack.h"
Daniel Veillard4255d502002-04-16 15:50:10 +00008208#endif /* LIBXML_REGEXP_ENABLED */