blob: ce09b2216f580c3ef30bcaca8f4a8ce7e9a728f9 [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004 * Basically designed with the purpose of compiling regexps for
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005 * the variety of validation/schemas mechanisms now available in
William M. Brackddf71d62004-05-06 04:17:26 +00006 * XML related specifications these include:
Daniel Veillard4255d502002-04-16 15:50:10 +00007 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
Daniel Veillardcee2b3a2005-01-25 00:22:52 +000022/* #define DEBUG_ERR */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +000023
Daniel Veillard4255d502002-04-16 15:50:10 +000024#include <stdio.h>
25#include <string.h>
Daniel Veillardebe48c62003-12-03 12:12:27 +000026#ifdef HAVE_LIMITS_H
27#include <limits.h>
28#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -070029#ifdef HAVE_STDINT_H
30#include <stdint.h>
31#endif
Daniel Veillardebe48c62003-12-03 12:12:27 +000032
Daniel Veillard4255d502002-04-16 15:50:10 +000033#include <libxml/tree.h>
34#include <libxml/parserInternals.h>
35#include <libxml/xmlregexp.h>
36#include <libxml/xmlautomata.h>
37#include <libxml/xmlunicode.h>
38
Daniel Veillardebe48c62003-12-03 12:12:27 +000039#ifndef INT_MAX
40#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
41#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -070042#ifndef SIZE_MAX
43#define SIZE_MAX ((size_t) -1)
44#endif
Daniel Veillardebe48c62003-12-03 12:12:27 +000045
Daniel Veillardc0826a72004-08-10 14:17:33 +000046/* #define DEBUG_REGEXP_GRAPH */
Daniel Veillard10752282005-08-08 13:05:13 +000047/* #define DEBUG_REGEXP_EXEC */
Daniel Veillard4255d502002-04-16 15:50:10 +000048/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000049/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000050
Daniel Veillard567a45b2005-10-18 19:11:55 +000051#define MAX_PUSH 10000000
Daniel Veillard94cc1032005-09-15 13:09:00 +000052
Patrick R. Gansterer204f1f12012-05-10 20:24:00 +080053#ifdef ERROR
54#undef ERROR
55#endif
Daniel Veillardff46a042003-10-08 08:53:17 +000056#define ERROR(str) \
57 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
58 xmlRegexpErrCompile(ctxt, str);
Daniel Veillard4255d502002-04-16 15:50:10 +000059#define NEXT ctxt->cur++
60#define CUR (*(ctxt->cur))
61#define NXT(index) (ctxt->cur[index])
62
63#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
64#define NEXTL(l) ctxt->cur += l;
Daniel Veillardc0826a72004-08-10 14:17:33 +000065#define XML_REG_STRING_SEPARATOR '|'
William M. Bracka9cbf282007-03-21 13:16:33 +000066/*
67 * Need PREV to check on a '-' within a Character Group. May only be used
68 * when it's guaranteed that cur is not at the beginning of ctxt->string!
69 */
70#define PREV (ctxt->cur[-1])
Daniel Veillard4255d502002-04-16 15:50:10 +000071
Daniel Veillarde19fc232002-04-22 16:01:24 +000072/**
73 * TODO:
74 *
75 * macro to flag unimplemented blocks
76 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +080077#define TODO \
Daniel Veillarde19fc232002-04-22 16:01:24 +000078 xmlGenericError(xmlGenericErrorContext, \
79 "Unimplemented block at %s:%d\n", \
80 __FILE__, __LINE__);
81
Daniel Veillard4255d502002-04-16 15:50:10 +000082/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +080083 * *
84 * Datatypes and structures *
85 * *
Daniel Veillard4255d502002-04-16 15:50:10 +000086 ************************************************************************/
87
/*
 * Kinds of atom understood by the engine. The same enumeration is used
 * as the type of a character range inside an atom.
 *
 * Note: the order of the enums below is significant, do not shuffle
 */
typedef enum {
    XML_REGEXP_EPSILON = 1,
    XML_REGEXP_CHARVAL,
    XML_REGEXP_RANGES,
    XML_REGEXP_SUBREG,          /* used for () sub regexps */
    XML_REGEXP_STRING,
    XML_REGEXP_ANYCHAR,         /* . */
    XML_REGEXP_ANYSPACE,        /* \s */
    XML_REGEXP_NOTSPACE,        /* \S */
    XML_REGEXP_INITNAME,        /* \l */
    XML_REGEXP_NOTINITNAME,     /* \L */
    XML_REGEXP_NAMECHAR,        /* \c */
    XML_REGEXP_NOTNAMECHAR,     /* \C */
    XML_REGEXP_DECIMAL,         /* \d */
    XML_REGEXP_NOTDECIMAL,      /* \D */
    XML_REGEXP_REALCHAR,        /* \w */
    XML_REGEXP_NOTREALCHAR,     /* \W */

    /* second group, numbered from 100 upwards */
    XML_REGEXP_LETTER = 100,
    XML_REGEXP_LETTER_UPPERCASE,
    XML_REGEXP_LETTER_LOWERCASE,
    XML_REGEXP_LETTER_TITLECASE,
    XML_REGEXP_LETTER_MODIFIER,
    XML_REGEXP_LETTER_OTHERS,
    XML_REGEXP_MARK,
    XML_REGEXP_MARK_NONSPACING,
    XML_REGEXP_MARK_SPACECOMBINING,
    XML_REGEXP_MARK_ENCLOSING,
    XML_REGEXP_NUMBER,
    XML_REGEXP_NUMBER_DECIMAL,
    XML_REGEXP_NUMBER_LETTER,
    XML_REGEXP_NUMBER_OTHERS,
    XML_REGEXP_PUNCT,
    XML_REGEXP_PUNCT_CONNECTOR,
    XML_REGEXP_PUNCT_DASH,
    XML_REGEXP_PUNCT_OPEN,
    XML_REGEXP_PUNCT_CLOSE,
    XML_REGEXP_PUNCT_INITQUOTE,
    XML_REGEXP_PUNCT_FINQUOTE,
    XML_REGEXP_PUNCT_OTHERS,
    XML_REGEXP_SEPAR,
    XML_REGEXP_SEPAR_SPACE,
    XML_REGEXP_SEPAR_LINE,
    XML_REGEXP_SEPAR_PARA,
    XML_REGEXP_SYMBOL,
    XML_REGEXP_SYMBOL_MATH,
    XML_REGEXP_SYMBOL_CURRENCY,
    XML_REGEXP_SYMBOL_MODIFIER,
    XML_REGEXP_SYMBOL_OTHERS,
    XML_REGEXP_OTHER,
    XML_REGEXP_OTHER_CONTROL,
    XML_REGEXP_OTHER_FORMAT,
    XML_REGEXP_OTHER_PRIVATE,
    XML_REGEXP_OTHER_NA,
    XML_REGEXP_BLOCK_NAME
} xmlRegAtomType;
146
/*
 * Quantifier attached to an atom (see the quant field of xmlRegAtom).
 * Values are consecutive, starting at 1.
 */
typedef enum {
    XML_REGEXP_QUANT_EPSILON = 1,
    XML_REGEXP_QUANT_ONCE,
    XML_REGEXP_QUANT_OPT,
    XML_REGEXP_QUANT_MULT,
    XML_REGEXP_QUANT_PLUS,
    XML_REGEXP_QUANT_ONCEONLY,
    XML_REGEXP_QUANT_ALL,
    XML_REGEXP_QUANT_RANGE
} xmlRegQuantType;
157
/*
 * Role of an automata state. The numeric value is what gets stored in
 * the first column of the compact transition table.
 */
typedef enum {
    XML_REGEXP_START_STATE = 1,
    XML_REGEXP_FINAL_STATE,
    XML_REGEXP_TRANS_STATE,
    XML_REGEXP_SINK_STATE,
    XML_REGEXP_UNREACH_STATE
} xmlRegStateType;
165
/*
 * Marker values used for the mark, markd and reached fields of a state.
 * NORMAL is zero, so a zero-filled state starts out unmarked.
 */
typedef enum {
    XML_REGEXP_MARK_NORMAL = 0,
    XML_REGEXP_MARK_START,
    XML_REGEXP_MARK_VISITED
} xmlRegMarkedType;
171
172typedef struct _xmlRegRange xmlRegRange;
173typedef xmlRegRange *xmlRegRangePtr;
174
175struct _xmlRegRange {
Daniel Veillardf8b9de32003-11-24 14:27:26 +0000176 int neg; /* 0 normal, 1 not, 2 exclude */
Daniel Veillard4255d502002-04-16 15:50:10 +0000177 xmlRegAtomType type;
178 int start;
179 int end;
180 xmlChar *blockName;
181};
182
183typedef struct _xmlRegAtom xmlRegAtom;
184typedef xmlRegAtom *xmlRegAtomPtr;
185
186typedef struct _xmlAutomataState xmlRegState;
187typedef xmlRegState *xmlRegStatePtr;
188
189struct _xmlRegAtom {
190 int no;
191 xmlRegAtomType type;
192 xmlRegQuantType quant;
193 int min;
194 int max;
195
196 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000197 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000198 int neg;
199 int codepoint;
200 xmlRegStatePtr start;
Daniel Veillard76d59b62007-08-22 16:29:21 +0000201 xmlRegStatePtr start0;
Daniel Veillard4255d502002-04-16 15:50:10 +0000202 xmlRegStatePtr stop;
203 int maxRanges;
204 int nbRanges;
205 xmlRegRangePtr *ranges;
206 void *data;
207};
208
typedef struct _xmlRegCounter xmlRegCounter;
typedef xmlRegCounter *xmlRegCounterPtr;

/*
 * A counter of the automata, described by its two bounds.
 */
struct _xmlRegCounter {
    int min;
    int max;
};
216
217typedef struct _xmlRegTrans xmlRegTrans;
218typedef xmlRegTrans *xmlRegTransPtr;
219
220struct _xmlRegTrans {
221 xmlRegAtomPtr atom;
222 int to;
223 int counter;
224 int count;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000225 int nd;
Daniel Veillard4255d502002-04-16 15:50:10 +0000226};
227
228struct _xmlAutomataState {
229 xmlRegStateType type;
230 xmlRegMarkedType mark;
Daniel Veillard466fcda2012-08-27 12:03:40 +0800231 xmlRegMarkedType markd;
Daniel Veillard23e73572002-09-19 19:56:43 +0000232 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000233 int no;
Daniel Veillard4255d502002-04-16 15:50:10 +0000234 int maxTrans;
235 int nbTrans;
236 xmlRegTrans *trans;
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700237 /* knowing states pointing to us can speed things up */
Daniel Veillarddb68b742005-07-30 13:18:24 +0000238 int maxTransTo;
239 int nbTransTo;
240 int *transTo;
Daniel Veillard4255d502002-04-16 15:50:10 +0000241};
242
243typedef struct _xmlAutomata xmlRegParserCtxt;
244typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
245
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200246#define AM_AUTOMATA_RNG 1
247
Daniel Veillard4255d502002-04-16 15:50:10 +0000248struct _xmlAutomata {
249 xmlChar *string;
250 xmlChar *cur;
251
252 int error;
253 int neg;
254
255 xmlRegStatePtr start;
256 xmlRegStatePtr end;
257 xmlRegStatePtr state;
258
259 xmlRegAtomPtr atom;
260
261 int maxAtoms;
262 int nbAtoms;
263 xmlRegAtomPtr *atoms;
264
265 int maxStates;
266 int nbStates;
267 xmlRegStatePtr *states;
268
269 int maxCounters;
270 int nbCounters;
271 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000272
273 int determinist;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000274 int negs;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200275 int flags;
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700276
277 int depth;
Daniel Veillard4255d502002-04-16 15:50:10 +0000278};
279
280struct _xmlRegexp {
281 xmlChar *string;
282 int nbStates;
283 xmlRegStatePtr *states;
284 int nbAtoms;
285 xmlRegAtomPtr *atoms;
286 int nbCounters;
287 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000288 int determinist;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200289 int flags;
Daniel Veillard23e73572002-09-19 19:56:43 +0000290 /*
291 * That's the compact form for determinists automatas
292 */
293 int nbstates;
294 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000295 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000296 int nbstrings;
297 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000298};
299
300typedef struct _xmlRegExecRollback xmlRegExecRollback;
301typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
302
303struct _xmlRegExecRollback {
304 xmlRegStatePtr state;/* the current state */
305 int index; /* the index in the input stack */
306 int nextbranch; /* the next transition to explore in that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000307 int *counts; /* save the automata state if it has some */
Daniel Veillard4255d502002-04-16 15:50:10 +0000308};
309
310typedef struct _xmlRegInputToken xmlRegInputToken;
311typedef xmlRegInputToken *xmlRegInputTokenPtr;
312
313struct _xmlRegInputToken {
314 xmlChar *value;
315 void *data;
316};
317
318struct _xmlRegExecCtxt {
319 int status; /* execution status != 0 indicate an error */
William M. Brackddf71d62004-05-06 04:17:26 +0000320 int determinist; /* did we find an indeterministic behaviour */
Daniel Veillard4255d502002-04-16 15:50:10 +0000321 xmlRegexpPtr comp; /* the compiled regexp */
322 xmlRegExecCallbacks callback;
323 void *data;
324
325 xmlRegStatePtr state;/* the current state */
326 int transno; /* the current transition on that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000327 int transcount; /* the number of chars in char counted transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +0000328
329 /*
330 * A stack of rollback states
331 */
332 int maxRollbacks;
333 int nbRollbacks;
334 xmlRegExecRollback *rollbacks;
335
336 /*
337 * The state of the automata if any
338 */
339 int *counts;
340
341 /*
342 * The input stack
343 */
344 int inputStackMax;
345 int inputStackNr;
346 int index;
347 int *charStack;
348 const xmlChar *inputString; /* when operating on characters */
349 xmlRegInputTokenPtr inputStack;/* when operating on strings */
350
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +0000351 /*
352 * error handling
353 */
354 int errStateNo; /* the error state number */
355 xmlRegStatePtr errState; /* the error state */
356 xmlChar *errString; /* the string raising the error */
357 int *errCounts; /* counters at the error state */
Daniel Veillard94cc1032005-09-15 13:09:00 +0000358 int nbPush;
Daniel Veillard4255d502002-04-16 15:50:10 +0000359};
360
Daniel Veillard441bc322002-04-20 17:38:48 +0000361#define REGEXP_ALL_COUNTER 0x123456
362#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000363
Daniel Veillard4255d502002-04-16 15:50:10 +0000364static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000365static void xmlRegFreeState(xmlRegStatePtr state);
366static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard9efc4762005-07-19 14:33:55 +0000367static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
Daniel Veillard567a45b2005-10-18 19:11:55 +0000368static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
369static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
370 int neg, int start, int end, const xmlChar *blockName);
Daniel Veillard4255d502002-04-16 15:50:10 +0000371
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200372void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
373
Daniel Veillard4255d502002-04-16 15:50:10 +0000374/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000375 * *
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800376 * Regexp memory error handler *
Daniel Veillardff46a042003-10-08 08:53:17 +0000377 * *
378 ************************************************************************/
379/**
380 * xmlRegexpErrMemory:
William M. Brackddf71d62004-05-06 04:17:26 +0000381 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000382 *
383 * Handle an out of memory condition
384 */
385static void
386xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
387{
388 const char *regexp = NULL;
389 if (ctxt != NULL) {
390 regexp = (const char *) ctxt->string;
391 ctxt->error = XML_ERR_NO_MEMORY;
392 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000393 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000394 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
395 regexp, NULL, 0, 0,
396 "Memory allocation failed : %s\n", extra);
397}
398
399/**
400 * xmlRegexpErrCompile:
William M. Brackddf71d62004-05-06 04:17:26 +0000401 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000402 *
William M. Brackddf71d62004-05-06 04:17:26 +0000403 * Handle a compilation failure
Daniel Veillardff46a042003-10-08 08:53:17 +0000404 */
405static void
406xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
407{
408 const char *regexp = NULL;
409 int idx = 0;
410
411 if (ctxt != NULL) {
412 regexp = (const char *) ctxt->string;
413 idx = ctxt->cur - ctxt->string;
414 ctxt->error = XML_REGEXP_COMPILE_ERROR;
415 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000416 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000417 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
418 regexp, NULL, idx, 0,
419 "failed to compile: %s\n", extra);
420}
421
422/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800423 * *
424 * Allocation/Deallocation *
425 * *
Daniel Veillard4255d502002-04-16 15:50:10 +0000426 ************************************************************************/
427
Daniel Veillard23e73572002-09-19 19:56:43 +0000428static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700429
/**
 * xmlRegCalloc2:
 * @dim1:  size of first dimension
 * @dim2:  size of second dimension
 * @elemSize:  size of element
 *
 * Allocate a two-dimensional array and set all elements to zero.
 * The request is refused when @dim2 or @elemSize is zero, or when
 * @dim1 * @dim2 * @elemSize does not fit in a size_t.
 *
 * Returns the new array or NULL in case of error.
 */
static void*
xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
    size_t totalSize;
    void *ret;

    /* Both values are used as divisors below: zero must never get there */
    if ((dim2 == 0) || (elemSize == 0))
        return (NULL);
    /* Check for overflow */
    if (dim1 > SIZE_MAX / dim2 / elemSize)
        return (NULL);
    totalSize = dim1 * dim2 * elemSize;
    ret = xmlMalloc(totalSize);
    if (ret != NULL)
        memset(ret, 0, totalSize);
    return (ret);
}
454
Daniel Veillard4255d502002-04-16 15:50:10 +0000455/**
456 * xmlRegEpxFromParse:
457 * @ctxt: the parser context used to build it
458 *
William M. Brackddf71d62004-05-06 04:17:26 +0000459 * Allocate a new regexp and fill it with the result from the parser
Daniel Veillard4255d502002-04-16 15:50:10 +0000460 *
461 * Returns the new regexp or NULL in case of error
462 */
463static xmlRegexpPtr
464xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
465 xmlRegexpPtr ret;
466
467 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000468 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000469 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000470 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000471 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000472 memset(ret, 0, sizeof(xmlRegexp));
473 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000474 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000475 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000476 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000477 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000478 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000479 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000480 ret->determinist = ctxt->determinist;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200481 ret->flags = ctxt->flags;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000482 if (ret->determinist == -1) {
483 xmlRegexpIsDeterminist(ret);
484 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000485
486 if ((ret->determinist != 0) &&
487 (ret->nbCounters == 0) &&
Daniel Veillard6e65e152005-08-09 11:09:52 +0000488 (ctxt->negs == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000489 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000490 (ret->atoms[0] != NULL) &&
491 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
492 int i, j, nbstates = 0, nbatoms = 0;
493 int *stateRemap;
494 int *stringRemap;
495 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000496 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000497 xmlChar **stringMap;
498 xmlChar *value;
499
500 /*
501 * Switch to a compact representation
502 * 1/ counting the effective number of states left
William M. Brackddf71d62004-05-06 04:17:26 +0000503 * 2/ counting the unique number of atoms, and check that
Daniel Veillard23e73572002-09-19 19:56:43 +0000504 * they are all of the string type
505 * 3/ build a table state x atom for the transitions
506 */
507
508 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000509 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000510 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000511 xmlFree(ret);
512 return(NULL);
513 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000514 for (i = 0;i < ret->nbStates;i++) {
515 if (ret->states[i] != NULL) {
516 stateRemap[i] = nbstates;
517 nbstates++;
518 } else {
519 stateRemap[i] = -1;
520 }
521 }
522#ifdef DEBUG_COMPACTION
523 printf("Final: %d states\n", nbstates);
524#endif
525 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000526 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000527 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000528 xmlFree(stateRemap);
529 xmlFree(ret);
530 return(NULL);
531 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000532 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000533 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000534 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000535 xmlFree(stringMap);
536 xmlFree(stateRemap);
537 xmlFree(ret);
538 return(NULL);
539 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000540 for (i = 0;i < ret->nbAtoms;i++) {
541 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
542 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
543 value = ret->atoms[i]->valuep;
544 for (j = 0;j < nbatoms;j++) {
545 if (xmlStrEqual(stringMap[j], value)) {
546 stringRemap[i] = j;
547 break;
548 }
549 }
550 if (j >= nbatoms) {
551 stringRemap[i] = nbatoms;
552 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000553 if (stringMap[nbatoms] == NULL) {
554 for (i = 0;i < nbatoms;i++)
555 xmlFree(stringMap[i]);
556 xmlFree(stringRemap);
557 xmlFree(stringMap);
558 xmlFree(stateRemap);
559 xmlFree(ret);
560 return(NULL);
561 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000562 nbatoms++;
563 }
564 } else {
565 xmlFree(stateRemap);
566 xmlFree(stringRemap);
567 for (i = 0;i < nbatoms;i++)
568 xmlFree(stringMap[i]);
569 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000570 xmlFree(ret);
571 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000572 }
573 }
574#ifdef DEBUG_COMPACTION
575 printf("Final: %d atoms\n", nbatoms);
576#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700577 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
578 sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000579 if (transitions == NULL) {
580 xmlFree(stateRemap);
581 xmlFree(stringRemap);
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700582 for (i = 0;i < nbatoms;i++)
583 xmlFree(stringMap[i]);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000584 xmlFree(stringMap);
585 xmlFree(ret);
586 return(NULL);
587 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000588
589 /*
590 * Allocate the transition table. The first entry for each
William M. Brackddf71d62004-05-06 04:17:26 +0000591 * state corresponds to the state type.
Daniel Veillard23e73572002-09-19 19:56:43 +0000592 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000593 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000594
595 for (i = 0;i < ret->nbStates;i++) {
596 int stateno, atomno, targetno, prev;
597 xmlRegStatePtr state;
598 xmlRegTransPtr trans;
599
600 stateno = stateRemap[i];
601 if (stateno == -1)
602 continue;
603 state = ret->states[i];
604
605 transitions[stateno * (nbatoms + 1)] = state->type;
606
607 for (j = 0;j < state->nbTrans;j++) {
608 trans = &(state->trans[j]);
609 if ((trans->to == -1) || (trans->atom == NULL))
610 continue;
611 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000612 if ((trans->atom->data != NULL) && (transdata == NULL)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700613 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
614 sizeof(void *));
615 if (transdata == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000616 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000617 break;
618 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000619 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000620 targetno = stateRemap[trans->to];
621 /*
William M. Brackddf71d62004-05-06 04:17:26 +0000622 * if the same atom can generate transitions to 2 different
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700623 * states then it means the automata is not deterministic and
Daniel Veillard23e73572002-09-19 19:56:43 +0000624 * the compact form can't be used !
625 */
626 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
627 if (prev != 0) {
628 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000629 ret->determinist = 0;
630#ifdef DEBUG_COMPACTION
631 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
632 i, j, trans->atom->no, trans->to, atomno, targetno);
633 printf(" previous to is %d\n", prev);
634#endif
Daniel Veillard118aed72002-09-24 14:13:13 +0000635 if (transdata != NULL)
636 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000637 xmlFree(transitions);
638 xmlFree(stateRemap);
639 xmlFree(stringRemap);
640 for (i = 0;i < nbatoms;i++)
641 xmlFree(stringMap[i]);
642 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000643 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000644 }
645 } else {
646#if 0
647 printf("State %d trans %d: atom %d to %d : %d to %d\n",
648 i, j, trans->atom->no, trans->to, atomno, targetno);
649#endif
650 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000651 targetno + 1; /* to avoid 0 */
652 if (transdata != NULL)
653 transdata[stateno * nbatoms + atomno] =
654 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000655 }
656 }
657 }
658 ret->determinist = 1;
659#ifdef DEBUG_COMPACTION
660 /*
661 * Debug
662 */
663 for (i = 0;i < nbstates;i++) {
664 for (j = 0;j < nbatoms + 1;j++) {
665 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
666 }
667 printf("\n");
668 }
669 printf("\n");
670#endif
671 /*
672 * Cleanup of the old data
673 */
674 if (ret->states != NULL) {
675 for (i = 0;i < ret->nbStates;i++)
676 xmlRegFreeState(ret->states[i]);
677 xmlFree(ret->states);
678 }
679 ret->states = NULL;
680 ret->nbStates = 0;
681 if (ret->atoms != NULL) {
682 for (i = 0;i < ret->nbAtoms;i++)
683 xmlRegFreeAtom(ret->atoms[i]);
684 xmlFree(ret->atoms);
685 }
686 ret->atoms = NULL;
687 ret->nbAtoms = 0;
688
689 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000690 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000691 ret->stringMap = stringMap;
692 ret->nbstrings = nbatoms;
693 ret->nbstates = nbstates;
694 xmlFree(stateRemap);
695 xmlFree(stringRemap);
696 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000697not_determ:
698 ctxt->string = NULL;
699 ctxt->nbStates = 0;
700 ctxt->states = NULL;
701 ctxt->nbAtoms = 0;
702 ctxt->atoms = NULL;
703 ctxt->nbCounters = 0;
704 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000705 return(ret);
706}
707
708/**
709 * xmlRegNewParserCtxt:
710 * @string: the string to parse
711 *
712 * Allocate a new regexp parser context
713 *
714 * Returns the new context or NULL in case of error
715 */
716static xmlRegParserCtxtPtr
717xmlRegNewParserCtxt(const xmlChar *string) {
718 xmlRegParserCtxtPtr ret;
719
720 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
721 if (ret == NULL)
722 return(NULL);
723 memset(ret, 0, sizeof(xmlRegParserCtxt));
724 if (string != NULL)
725 ret->string = xmlStrdup(string);
726 ret->cur = ret->string;
727 ret->neg = 0;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000728 ret->negs = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +0000729 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000730 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000731 return(ret);
732}
733
734/**
735 * xmlRegNewRange:
736 * @ctxt: the regexp parser context
737 * @neg: is that negative
738 * @type: the type of range
739 * @start: the start codepoint
740 * @end: the end codepoint
741 *
742 * Allocate a new regexp range
743 *
744 * Returns the new range or NULL in case of error
745 */
746static xmlRegRangePtr
747xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
748 int neg, xmlRegAtomType type, int start, int end) {
749 xmlRegRangePtr ret;
750
751 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
752 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000753 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000754 return(NULL);
755 }
756 ret->neg = neg;
757 ret->type = type;
758 ret->start = start;
759 ret->end = end;
760 return(ret);
761}
762
763/**
764 * xmlRegFreeRange:
765 * @range: the regexp range
766 *
767 * Free a regexp range
768 */
769static void
770xmlRegFreeRange(xmlRegRangePtr range) {
771 if (range == NULL)
772 return;
773
774 if (range->blockName != NULL)
775 xmlFree(range->blockName);
776 xmlFree(range);
777}
778
779/**
Daniel Veillard76d59b62007-08-22 16:29:21 +0000780 * xmlRegCopyRange:
781 * @range: the regexp range
782 *
783 * Copy a regexp range
784 *
785 * Returns the new copy or NULL in case of error.
786 */
787static xmlRegRangePtr
788xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
789 xmlRegRangePtr ret;
790
791 if (range == NULL)
792 return(NULL);
793
794 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
795 range->end);
796 if (ret == NULL)
797 return(NULL);
798 if (range->blockName != NULL) {
799 ret->blockName = xmlStrdup(range->blockName);
800 if (ret->blockName == NULL) {
801 xmlRegexpErrMemory(ctxt, "allocating range");
802 xmlRegFreeRange(ret);
803 return(NULL);
804 }
805 }
806 return(ret);
807}
808
809/**
Daniel Veillard4255d502002-04-16 15:50:10 +0000810 * xmlRegNewAtom:
811 * @ctxt: the regexp parser context
812 * @type: the type of atom
813 *
Daniel Veillard76d59b62007-08-22 16:29:21 +0000814 * Allocate a new atom
Daniel Veillard4255d502002-04-16 15:50:10 +0000815 *
816 * Returns the new atom or NULL in case of error
817 */
818static xmlRegAtomPtr
819xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
820 xmlRegAtomPtr ret;
821
822 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
823 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000824 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000825 return(NULL);
826 }
827 memset(ret, 0, sizeof(xmlRegAtom));
828 ret->type = type;
829 ret->quant = XML_REGEXP_QUANT_ONCE;
830 ret->min = 0;
831 ret->max = 0;
832 return(ret);
833}
834
835/**
836 * xmlRegFreeAtom:
837 * @atom: the regexp atom
838 *
839 * Free a regexp atom
840 */
841static void
842xmlRegFreeAtom(xmlRegAtomPtr atom) {
843 int i;
844
845 if (atom == NULL)
846 return;
847
848 for (i = 0;i < atom->nbRanges;i++)
849 xmlRegFreeRange(atom->ranges[i]);
850 if (atom->ranges != NULL)
851 xmlFree(atom->ranges);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000852 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
853 xmlFree(atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +0000854 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
855 xmlFree(atom->valuep2);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000856 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +0000857 xmlFree(atom->valuep);
858 xmlFree(atom);
859}
860
Daniel Veillard76d59b62007-08-22 16:29:21 +0000861/**
862 * xmlRegCopyAtom:
863 * @ctxt: the regexp parser context
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700864 * @atom: the original atom
Daniel Veillard76d59b62007-08-22 16:29:21 +0000865 *
866 * Allocate a new regexp range
867 *
868 * Returns the new atom or NULL in case of error
869 */
870static xmlRegAtomPtr
871xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
872 xmlRegAtomPtr ret;
873
874 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
875 if (ret == NULL) {
876 xmlRegexpErrMemory(ctxt, "copying atom");
877 return(NULL);
878 }
879 memset(ret, 0, sizeof(xmlRegAtom));
880 ret->type = atom->type;
881 ret->quant = atom->quant;
882 ret->min = atom->min;
883 ret->max = atom->max;
884 if (atom->nbRanges > 0) {
885 int i;
886
887 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
888 atom->nbRanges);
889 if (ret->ranges == NULL) {
890 xmlRegexpErrMemory(ctxt, "copying atom");
891 goto error;
892 }
893 for (i = 0;i < atom->nbRanges;i++) {
894 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
895 if (ret->ranges[i] == NULL)
896 goto error;
897 ret->nbRanges = i + 1;
898 }
899 }
900 return(ret);
901
902error:
903 xmlRegFreeAtom(ret);
904 return(NULL);
905}
906
Daniel Veillard4255d502002-04-16 15:50:10 +0000907static xmlRegStatePtr
908xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
909 xmlRegStatePtr ret;
910
911 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
912 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000913 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000914 return(NULL);
915 }
916 memset(ret, 0, sizeof(xmlRegState));
917 ret->type = XML_REGEXP_TRANS_STATE;
918 ret->mark = XML_REGEXP_MARK_NORMAL;
919 return(ret);
920}
921
922/**
923 * xmlRegFreeState:
924 * @state: the regexp state
925 *
926 * Free a regexp state
927 */
928static void
929xmlRegFreeState(xmlRegStatePtr state) {
930 if (state == NULL)
931 return;
932
933 if (state->trans != NULL)
934 xmlFree(state->trans);
Daniel Veillarddb68b742005-07-30 13:18:24 +0000935 if (state->transTo != NULL)
936 xmlFree(state->transTo);
Daniel Veillard4255d502002-04-16 15:50:10 +0000937 xmlFree(state);
938}
939
940/**
941 * xmlRegFreeParserCtxt:
942 * @ctxt: the regexp parser context
943 *
944 * Free a regexp parser context
945 */
946static void
947xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
948 int i;
949 if (ctxt == NULL)
950 return;
951
952 if (ctxt->string != NULL)
953 xmlFree(ctxt->string);
954 if (ctxt->states != NULL) {
955 for (i = 0;i < ctxt->nbStates;i++)
956 xmlRegFreeState(ctxt->states[i]);
957 xmlFree(ctxt->states);
958 }
959 if (ctxt->atoms != NULL) {
960 for (i = 0;i < ctxt->nbAtoms;i++)
961 xmlRegFreeAtom(ctxt->atoms[i]);
962 xmlFree(ctxt->atoms);
963 }
964 if (ctxt->counters != NULL)
965 xmlFree(ctxt->counters);
966 xmlFree(ctxt);
967}
968
/************************************************************************
 *                                                                      *
 *                     Display of Data structures                       *
 *                                                                      *
 ************************************************************************/
974
975static void
976xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
977 switch (type) {
978 case XML_REGEXP_EPSILON:
979 fprintf(output, "epsilon "); break;
980 case XML_REGEXP_CHARVAL:
981 fprintf(output, "charval "); break;
982 case XML_REGEXP_RANGES:
983 fprintf(output, "ranges "); break;
984 case XML_REGEXP_SUBREG:
985 fprintf(output, "subexpr "); break;
986 case XML_REGEXP_STRING:
987 fprintf(output, "string "); break;
988 case XML_REGEXP_ANYCHAR:
989 fprintf(output, "anychar "); break;
990 case XML_REGEXP_ANYSPACE:
991 fprintf(output, "anyspace "); break;
992 case XML_REGEXP_NOTSPACE:
993 fprintf(output, "notspace "); break;
994 case XML_REGEXP_INITNAME:
995 fprintf(output, "initname "); break;
996 case XML_REGEXP_NOTINITNAME:
997 fprintf(output, "notinitname "); break;
998 case XML_REGEXP_NAMECHAR:
999 fprintf(output, "namechar "); break;
1000 case XML_REGEXP_NOTNAMECHAR:
1001 fprintf(output, "notnamechar "); break;
1002 case XML_REGEXP_DECIMAL:
1003 fprintf(output, "decimal "); break;
1004 case XML_REGEXP_NOTDECIMAL:
1005 fprintf(output, "notdecimal "); break;
1006 case XML_REGEXP_REALCHAR:
1007 fprintf(output, "realchar "); break;
1008 case XML_REGEXP_NOTREALCHAR:
1009 fprintf(output, "notrealchar "); break;
1010 case XML_REGEXP_LETTER:
1011 fprintf(output, "LETTER "); break;
1012 case XML_REGEXP_LETTER_UPPERCASE:
1013 fprintf(output, "LETTER_UPPERCASE "); break;
1014 case XML_REGEXP_LETTER_LOWERCASE:
1015 fprintf(output, "LETTER_LOWERCASE "); break;
1016 case XML_REGEXP_LETTER_TITLECASE:
1017 fprintf(output, "LETTER_TITLECASE "); break;
1018 case XML_REGEXP_LETTER_MODIFIER:
1019 fprintf(output, "LETTER_MODIFIER "); break;
1020 case XML_REGEXP_LETTER_OTHERS:
1021 fprintf(output, "LETTER_OTHERS "); break;
1022 case XML_REGEXP_MARK:
1023 fprintf(output, "MARK "); break;
1024 case XML_REGEXP_MARK_NONSPACING:
1025 fprintf(output, "MARK_NONSPACING "); break;
1026 case XML_REGEXP_MARK_SPACECOMBINING:
1027 fprintf(output, "MARK_SPACECOMBINING "); break;
1028 case XML_REGEXP_MARK_ENCLOSING:
1029 fprintf(output, "MARK_ENCLOSING "); break;
1030 case XML_REGEXP_NUMBER:
1031 fprintf(output, "NUMBER "); break;
1032 case XML_REGEXP_NUMBER_DECIMAL:
1033 fprintf(output, "NUMBER_DECIMAL "); break;
1034 case XML_REGEXP_NUMBER_LETTER:
1035 fprintf(output, "NUMBER_LETTER "); break;
1036 case XML_REGEXP_NUMBER_OTHERS:
1037 fprintf(output, "NUMBER_OTHERS "); break;
1038 case XML_REGEXP_PUNCT:
1039 fprintf(output, "PUNCT "); break;
1040 case XML_REGEXP_PUNCT_CONNECTOR:
1041 fprintf(output, "PUNCT_CONNECTOR "); break;
1042 case XML_REGEXP_PUNCT_DASH:
1043 fprintf(output, "PUNCT_DASH "); break;
1044 case XML_REGEXP_PUNCT_OPEN:
1045 fprintf(output, "PUNCT_OPEN "); break;
1046 case XML_REGEXP_PUNCT_CLOSE:
1047 fprintf(output, "PUNCT_CLOSE "); break;
1048 case XML_REGEXP_PUNCT_INITQUOTE:
1049 fprintf(output, "PUNCT_INITQUOTE "); break;
1050 case XML_REGEXP_PUNCT_FINQUOTE:
1051 fprintf(output, "PUNCT_FINQUOTE "); break;
1052 case XML_REGEXP_PUNCT_OTHERS:
1053 fprintf(output, "PUNCT_OTHERS "); break;
1054 case XML_REGEXP_SEPAR:
1055 fprintf(output, "SEPAR "); break;
1056 case XML_REGEXP_SEPAR_SPACE:
1057 fprintf(output, "SEPAR_SPACE "); break;
1058 case XML_REGEXP_SEPAR_LINE:
1059 fprintf(output, "SEPAR_LINE "); break;
1060 case XML_REGEXP_SEPAR_PARA:
1061 fprintf(output, "SEPAR_PARA "); break;
1062 case XML_REGEXP_SYMBOL:
1063 fprintf(output, "SYMBOL "); break;
1064 case XML_REGEXP_SYMBOL_MATH:
1065 fprintf(output, "SYMBOL_MATH "); break;
1066 case XML_REGEXP_SYMBOL_CURRENCY:
1067 fprintf(output, "SYMBOL_CURRENCY "); break;
1068 case XML_REGEXP_SYMBOL_MODIFIER:
1069 fprintf(output, "SYMBOL_MODIFIER "); break;
1070 case XML_REGEXP_SYMBOL_OTHERS:
1071 fprintf(output, "SYMBOL_OTHERS "); break;
1072 case XML_REGEXP_OTHER:
1073 fprintf(output, "OTHER "); break;
1074 case XML_REGEXP_OTHER_CONTROL:
1075 fprintf(output, "OTHER_CONTROL "); break;
1076 case XML_REGEXP_OTHER_FORMAT:
1077 fprintf(output, "OTHER_FORMAT "); break;
1078 case XML_REGEXP_OTHER_PRIVATE:
1079 fprintf(output, "OTHER_PRIVATE "); break;
1080 case XML_REGEXP_OTHER_NA:
1081 fprintf(output, "OTHER_NA "); break;
1082 case XML_REGEXP_BLOCK_NAME:
1083 fprintf(output, "BLOCK "); break;
1084 }
1085}
1086
1087static void
1088xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1089 switch (type) {
1090 case XML_REGEXP_QUANT_EPSILON:
1091 fprintf(output, "epsilon "); break;
1092 case XML_REGEXP_QUANT_ONCE:
1093 fprintf(output, "once "); break;
1094 case XML_REGEXP_QUANT_OPT:
1095 fprintf(output, "? "); break;
1096 case XML_REGEXP_QUANT_MULT:
1097 fprintf(output, "* "); break;
1098 case XML_REGEXP_QUANT_PLUS:
1099 fprintf(output, "+ "); break;
1100 case XML_REGEXP_QUANT_RANGE:
1101 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +00001102 case XML_REGEXP_QUANT_ONCEONLY:
1103 fprintf(output, "onceonly "); break;
1104 case XML_REGEXP_QUANT_ALL:
1105 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +00001106 }
1107}
1108static void
1109xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1110 fprintf(output, " range: ");
1111 if (range->neg)
1112 fprintf(output, "negative ");
1113 xmlRegPrintAtomType(output, range->type);
1114 fprintf(output, "%c - %c\n", range->start, range->end);
1115}
1116
1117static void
1118xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1119 fprintf(output, " atom: ");
1120 if (atom == NULL) {
1121 fprintf(output, "NULL\n");
1122 return;
1123 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00001124 if (atom->neg)
1125 fprintf(output, "not ");
Daniel Veillard4255d502002-04-16 15:50:10 +00001126 xmlRegPrintAtomType(output, atom->type);
1127 xmlRegPrintQuantType(output, atom->quant);
1128 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1129 fprintf(output, "%d-%d ", atom->min, atom->max);
1130 if (atom->type == XML_REGEXP_STRING)
1131 fprintf(output, "'%s' ", (char *) atom->valuep);
1132 if (atom->type == XML_REGEXP_CHARVAL)
1133 fprintf(output, "char %c\n", atom->codepoint);
1134 else if (atom->type == XML_REGEXP_RANGES) {
1135 int i;
1136 fprintf(output, "%d entries\n", atom->nbRanges);
1137 for (i = 0; i < atom->nbRanges;i++)
1138 xmlRegPrintRange(output, atom->ranges[i]);
1139 } else if (atom->type == XML_REGEXP_SUBREG) {
1140 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1141 } else {
1142 fprintf(output, "\n");
1143 }
1144}
1145
1146static void
1147xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1148 fprintf(output, " trans: ");
1149 if (trans == NULL) {
1150 fprintf(output, "NULL\n");
1151 return;
1152 }
1153 if (trans->to < 0) {
1154 fprintf(output, "removed\n");
1155 return;
1156 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001157 if (trans->nd != 0) {
1158 if (trans->nd == 2)
1159 fprintf(output, "last not determinist, ");
1160 else
1161 fprintf(output, "not determinist, ");
1162 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001163 if (trans->counter >= 0) {
1164 fprintf(output, "counted %d, ", trans->counter);
1165 }
Daniel Veillard8a001f62002-04-20 07:24:11 +00001166 if (trans->count == REGEXP_ALL_COUNTER) {
1167 fprintf(output, "all transition, ");
1168 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001169 fprintf(output, "count based %d, ", trans->count);
1170 }
1171 if (trans->atom == NULL) {
1172 fprintf(output, "epsilon to %d\n", trans->to);
1173 return;
1174 }
1175 if (trans->atom->type == XML_REGEXP_CHARVAL)
1176 fprintf(output, "char %c ", trans->atom->codepoint);
1177 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1178}
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001179
Daniel Veillard4255d502002-04-16 15:50:10 +00001180static void
1181xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1182 int i;
1183
1184 fprintf(output, " state: ");
1185 if (state == NULL) {
1186 fprintf(output, "NULL\n");
1187 return;
1188 }
1189 if (state->type == XML_REGEXP_START_STATE)
1190 fprintf(output, "START ");
1191 if (state->type == XML_REGEXP_FINAL_STATE)
1192 fprintf(output, "FINAL ");
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001193
Daniel Veillard4255d502002-04-16 15:50:10 +00001194 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1195 for (i = 0;i < state->nbTrans; i++) {
1196 xmlRegPrintTrans(output, &(state->trans[i]));
1197 }
1198}
1199
#ifdef DEBUG_REGEXP_GRAPH
/*
 * xmlRegPrintCtxt:
 * Dump a parser context to @output: the regexp string and flags, every
 * registered atom, the atom currently being built (if any), every
 * state and every counter. Only compiled when DEBUG_REGEXP_GRAPH is
 * defined.
 */
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
    int idx;

    fprintf(output, " ctxt: ");
    if (ctxt == NULL) {
        fprintf(output, "NULL\n");
        return;
    }

    /* header line: regexp string followed by the flags */
    fprintf(output, "'%s' ", ctxt->string);
    if (ctxt->error)
        fprintf(output, "error ");
    if (ctxt->neg)
        fprintf(output, "neg ");
    fprintf(output, "\n");

    /* atoms */
    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
    for (idx = 0; idx < ctxt->nbAtoms; idx++) {
        fprintf(output, " %02d ", idx);
        xmlRegPrintAtom(output, ctxt->atoms[idx]);
    }
    if (ctxt->atom != NULL) {
        fprintf(output, "current atom:\n");
        xmlRegPrintAtom(output, ctxt->atom);
    }

    /* states */
    fprintf(output, "%d states:", ctxt->nbStates);
    if (ctxt->start != NULL)
        fprintf(output, " start: %d", ctxt->start->no);
    if (ctxt->end != NULL)
        fprintf(output, " end: %d", ctxt->end->no);
    fprintf(output, "\n");
    for (idx = 0; idx < ctxt->nbStates; idx++)
        xmlRegPrintState(output, ctxt->states[idx]);

    /* counters */
    fprintf(output, "%d counters:\n", ctxt->nbCounters);
    for (idx = 0; idx < ctxt->nbCounters; idx++)
        fprintf(output, " %d: min %d max %d\n", idx,
                ctxt->counters[idx].min, ctxt->counters[idx].max);
}
#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001241
/************************************************************************
 *                                                                      *
 *              Finite Automata structures manipulations                *
 *                                                                      *
 ************************************************************************/
1247
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001248static void
Daniel Veillard4255d502002-04-16 15:50:10 +00001249xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1250 int neg, xmlRegAtomType type, int start, int end,
1251 xmlChar *blockName) {
1252 xmlRegRangePtr range;
1253
1254 if (atom == NULL) {
1255 ERROR("add range: atom is NULL");
1256 return;
1257 }
1258 if (atom->type != XML_REGEXP_RANGES) {
1259 ERROR("add range: atom is not ranges");
1260 return;
1261 }
1262 if (atom->maxRanges == 0) {
1263 atom->maxRanges = 4;
1264 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1265 sizeof(xmlRegRangePtr));
1266 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001267 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001268 atom->maxRanges = 0;
1269 return;
1270 }
1271 } else if (atom->nbRanges >= atom->maxRanges) {
1272 xmlRegRangePtr *tmp;
1273 atom->maxRanges *= 2;
1274 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1275 sizeof(xmlRegRangePtr));
1276 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001277 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001278 atom->maxRanges /= 2;
1279 return;
1280 }
1281 atom->ranges = tmp;
1282 }
1283 range = xmlRegNewRange(ctxt, neg, type, start, end);
1284 if (range == NULL)
1285 return;
1286 range->blockName = blockName;
1287 atom->ranges[atom->nbRanges++] = range;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001288
Daniel Veillard4255d502002-04-16 15:50:10 +00001289}
1290
1291static int
1292xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1293 if (ctxt->maxCounters == 0) {
1294 ctxt->maxCounters = 4;
1295 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1296 sizeof(xmlRegCounter));
1297 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001298 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001299 ctxt->maxCounters = 0;
1300 return(-1);
1301 }
1302 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1303 xmlRegCounter *tmp;
1304 ctxt->maxCounters *= 2;
1305 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1306 sizeof(xmlRegCounter));
1307 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001308 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001309 ctxt->maxCounters /= 2;
1310 return(-1);
1311 }
1312 ctxt->counters = tmp;
1313 }
1314 ctxt->counters[ctxt->nbCounters].min = -1;
1315 ctxt->counters[ctxt->nbCounters].max = -1;
1316 return(ctxt->nbCounters++);
1317}
1318
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001319static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001320xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1321 if (atom == NULL) {
1322 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001323 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001324 }
1325 if (ctxt->maxAtoms == 0) {
1326 ctxt->maxAtoms = 4;
1327 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1328 sizeof(xmlRegAtomPtr));
1329 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001330 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001331 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001332 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001333 }
1334 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1335 xmlRegAtomPtr *tmp;
1336 ctxt->maxAtoms *= 2;
1337 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1338 sizeof(xmlRegAtomPtr));
1339 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001340 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001341 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001342 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001343 }
1344 ctxt->atoms = tmp;
1345 }
1346 atom->no = ctxt->nbAtoms;
1347 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001348 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001349}
1350
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001351static void
Daniel Veillarddb68b742005-07-30 13:18:24 +00001352xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1353 int from) {
1354 if (target->maxTransTo == 0) {
1355 target->maxTransTo = 8;
1356 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1357 sizeof(int));
1358 if (target->transTo == NULL) {
1359 xmlRegexpErrMemory(ctxt, "adding transition");
1360 target->maxTransTo = 0;
1361 return;
1362 }
1363 } else if (target->nbTransTo >= target->maxTransTo) {
1364 int *tmp;
1365 target->maxTransTo *= 2;
1366 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1367 sizeof(int));
1368 if (tmp == NULL) {
1369 xmlRegexpErrMemory(ctxt, "adding transition");
1370 target->maxTransTo /= 2;
1371 return;
1372 }
1373 target->transTo = tmp;
1374 }
1375 target->transTo[target->nbTransTo] = from;
1376 target->nbTransTo++;
1377}
1378
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001379static void
Daniel Veillard4255d502002-04-16 15:50:10 +00001380xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1381 xmlRegAtomPtr atom, xmlRegStatePtr target,
Daniel Veillard5de09382005-09-26 17:18:17 +00001382 int counter, int count) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001383
1384 int nrtrans;
1385
Daniel Veillard4255d502002-04-16 15:50:10 +00001386 if (state == NULL) {
1387 ERROR("add state: state is NULL");
1388 return;
1389 }
1390 if (target == NULL) {
1391 ERROR("add state: target is NULL");
1392 return;
1393 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001394 /*
1395 * Other routines follow the philosophy 'When in doubt, add a transition'
1396 * so we check here whether such a transition is already present and, if
1397 * so, silently ignore this request.
1398 */
1399
Daniel Veillard5de09382005-09-26 17:18:17 +00001400 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1401 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1402 if ((trans->atom == atom) &&
1403 (trans->to == target->no) &&
1404 (trans->counter == counter) &&
1405 (trans->count == count)) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001406#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard5de09382005-09-26 17:18:17 +00001407 printf("Ignoring duplicate transition from %d to %d\n",
1408 state->no, target->no);
William M. Brackf9b5fa22004-05-10 07:52:15 +00001409#endif
Daniel Veillard5de09382005-09-26 17:18:17 +00001410 return;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001411 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001412 }
1413
Daniel Veillard4255d502002-04-16 15:50:10 +00001414 if (state->maxTrans == 0) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001415 state->maxTrans = 8;
Daniel Veillard4255d502002-04-16 15:50:10 +00001416 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1417 sizeof(xmlRegTrans));
1418 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001419 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001420 state->maxTrans = 0;
1421 return;
1422 }
1423 } else if (state->nbTrans >= state->maxTrans) {
1424 xmlRegTrans *tmp;
1425 state->maxTrans *= 2;
1426 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1427 sizeof(xmlRegTrans));
1428 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001429 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001430 state->maxTrans /= 2;
1431 return;
1432 }
1433 state->trans = tmp;
1434 }
1435#ifdef DEBUG_REGEXP_GRAPH
1436 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001437 if (count == REGEXP_ALL_COUNTER)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001438 printf("all transition\n");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001439 else if (count >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001440 printf("count based %d\n", count);
Daniel Veillard4255d502002-04-16 15:50:10 +00001441 else if (counter >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001442 printf("counted %d\n", counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001443 else if (atom == NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001444 printf("epsilon transition\n");
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001445 else if (atom != NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001446 xmlRegPrintAtom(stdout, atom);
Daniel Veillard4255d502002-04-16 15:50:10 +00001447#endif
1448
1449 state->trans[state->nbTrans].atom = atom;
1450 state->trans[state->nbTrans].to = target->no;
1451 state->trans[state->nbTrans].counter = counter;
1452 state->trans[state->nbTrans].count = count;
Daniel Veillard567a45b2005-10-18 19:11:55 +00001453 state->trans[state->nbTrans].nd = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00001454 state->nbTrans++;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001455 xmlRegStateAddTransTo(ctxt, target, state->no);
Daniel Veillard4255d502002-04-16 15:50:10 +00001456}
1457
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001458static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001459xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001460 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001461 if (ctxt->maxStates == 0) {
1462 ctxt->maxStates = 4;
1463 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1464 sizeof(xmlRegStatePtr));
1465 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001466 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001467 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001468 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001469 }
1470 } else if (ctxt->nbStates >= ctxt->maxStates) {
1471 xmlRegStatePtr *tmp;
1472 ctxt->maxStates *= 2;
1473 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1474 sizeof(xmlRegStatePtr));
1475 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001476 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001477 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001478 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001479 }
1480 ctxt->states = tmp;
1481 }
1482 state->no = ctxt->nbStates;
1483 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001484 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001485}
1486
1487/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001488 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001489 * @ctxt: a regexp parser context
1490 * @from: the from state
1491 * @to: the target state or NULL for building a new one
1492 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001493 *
1494 */
1495static void
1496xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001497 xmlRegStatePtr from, xmlRegStatePtr to,
1498 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001499 if (to == NULL) {
1500 to = xmlRegNewState(ctxt);
1501 xmlRegStatePush(ctxt, to);
1502 ctxt->state = to;
1503 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001504 if (lax)
Daniel Veillard5de09382005-09-26 17:18:17 +00001505 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
Daniel Veillard441bc322002-04-20 17:38:48 +00001506 else
Daniel Veillard5de09382005-09-26 17:18:17 +00001507 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001508}
1509
1510/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001511 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001512 * @ctxt: a regexp parser context
1513 * @from: the from state
1514 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001515 *
1516 */
1517static void
1518xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1519 xmlRegStatePtr from, xmlRegStatePtr to) {
1520 if (to == NULL) {
1521 to = xmlRegNewState(ctxt);
1522 xmlRegStatePush(ctxt, to);
1523 ctxt->state = to;
1524 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001525 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001526}
1527
1528/**
1529 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001530 * @ctxt: a regexp parser context
1531 * @from: the from state
1532 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001533 * counter: the counter for that transition
1534 *
1535 */
1536static void
1537xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1538 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1539 if (to == NULL) {
1540 to = xmlRegNewState(ctxt);
1541 xmlRegStatePush(ctxt, to);
1542 ctxt->state = to;
1543 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001544 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001545}
1546
1547/**
1548 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001549 * @ctxt: a regexp parser context
1550 * @from: the from state
1551 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001552 * counter: the counter for that transition
1553 *
1554 */
1555static void
1556xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1557 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1558 if (to == NULL) {
1559 to = xmlRegNewState(ctxt);
1560 xmlRegStatePush(ctxt, to);
1561 ctxt->state = to;
1562 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001563 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001564}
1565
1566/**
1567 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001568 * @ctxt: a regexp parser context
1569 * @from: the from state
1570 * @to: the target state or NULL for building a new one
1571 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001572 *
William M. Brackddf71d62004-05-06 04:17:26 +00001573 * Returns 0 if success and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001574 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001575static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001576xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1577 xmlRegStatePtr to, xmlRegAtomPtr atom) {
Daniel Veillard10bda622008-03-13 07:27:24 +00001578 xmlRegStatePtr end;
Daniel Veillard34b35002016-05-09 09:28:38 +08001579 int nullable = 0;
Daniel Veillard10bda622008-03-13 07:27:24 +00001580
Daniel Veillard4255d502002-04-16 15:50:10 +00001581 if (atom == NULL) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001582 ERROR("generate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001583 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001584 }
1585 if (atom->type == XML_REGEXP_SUBREG) {
1586 /*
1587 * this is a subexpression handling one should not need to
William M. Brackddf71d62004-05-06 04:17:26 +00001588 * create a new node except for XML_REGEXP_QUANT_RANGE.
Daniel Veillard4255d502002-04-16 15:50:10 +00001589 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001590 if (xmlRegAtomPush(ctxt, atom) < 0) {
1591 return(-1);
1592 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001593 if ((to != NULL) && (atom->stop != to) &&
1594 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1595 /*
1596 * Generate an epsilon transition to link to the target
1597 */
1598 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
Daniel Veillardaa622012005-10-20 15:55:25 +00001599#ifdef DV
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001600 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
Daniel Veillardaa622012005-10-20 15:55:25 +00001601 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1602 to = xmlRegNewState(ctxt);
1603 xmlRegStatePush(ctxt, to);
1604 ctxt->state = to;
1605 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1606#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001607 }
1608 switch (atom->quant) {
1609 case XML_REGEXP_QUANT_OPT:
1610 atom->quant = XML_REGEXP_QUANT_ONCE;
Daniel Veillard54eb0242006-03-21 23:17:57 +00001611 /*
1612 * transition done to the state after end of atom.
1613 * 1. set transition from atom start to new state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001614 * 2. set transition from atom end to this state.
Daniel Veillard54eb0242006-03-21 23:17:57 +00001615 */
Daniel Veillardd80d0722009-08-22 18:56:01 +02001616 if (to == NULL) {
1617 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1618 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1619 ctxt->state);
1620 } else {
1621 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1622 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001623 break;
1624 case XML_REGEXP_QUANT_MULT:
1625 atom->quant = XML_REGEXP_QUANT_ONCE;
1626 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1627 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1628 break;
1629 case XML_REGEXP_QUANT_PLUS:
1630 atom->quant = XML_REGEXP_QUANT_ONCE;
1631 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1632 break;
1633 case XML_REGEXP_QUANT_RANGE: {
1634 int counter;
Daniel Veillard76d59b62007-08-22 16:29:21 +00001635 xmlRegStatePtr inter, newstate;
Daniel Veillard4255d502002-04-16 15:50:10 +00001636
1637 /*
Daniel Veillard76d59b62007-08-22 16:29:21 +00001638 * create the final state now if needed
Daniel Veillard4255d502002-04-16 15:50:10 +00001639 */
Daniel Veillard4255d502002-04-16 15:50:10 +00001640 if (to != NULL) {
1641 newstate = to;
1642 } else {
1643 newstate = xmlRegNewState(ctxt);
1644 xmlRegStatePush(ctxt, newstate);
Daniel Veillard4255d502002-04-16 15:50:10 +00001645 }
Daniel Veillard76d59b62007-08-22 16:29:21 +00001646
1647 /*
1648 * The principle here is to use counted transition
1649 * to avoid explosion in the number of states in the
1650 * graph. This is clearly more complex but should not
1651 * be exploitable at runtime.
Daniel Veillard54eb0242006-03-21 23:17:57 +00001652 */
Daniel Veillard76d59b62007-08-22 16:29:21 +00001653 if ((atom->min == 0) && (atom->start0 == NULL)) {
1654 xmlRegAtomPtr copy;
1655 /*
1656 * duplicate a transition based on atom to count next
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001657 * occurrences after 1. We cannot loop to atom->start
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001658 * directly because we need an epsilon transition to
Daniel Veillard76d59b62007-08-22 16:29:21 +00001659 * newstate.
1660 */
1661 /* ???? For some reason it seems we never reach that
1662 case, I suppose this got optimized out before when
1663 building the automata */
Daniel Veillardc821e032007-08-28 17:33:45 +00001664 copy = xmlRegCopyAtom(ctxt, atom);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001665 if (copy == NULL)
1666 return(-1);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001667 copy->quant = XML_REGEXP_QUANT_ONCE;
1668 copy->min = 0;
1669 copy->max = 0;
1670
1671 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1672 < 0)
1673 return(-1);
1674 inter = ctxt->state;
1675 counter = xmlRegGetCounter(ctxt);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01001676 if (counter < 0)
1677 return(-1);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001678 ctxt->counters[counter].min = atom->min - 1;
1679 ctxt->counters[counter].max = atom->max - 1;
1680 /* count the number of times we see it again */
1681 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1682 atom->stop, counter);
1683 /* allow a way out based on the count */
1684 xmlFAGenerateCountedTransition(ctxt, inter,
1685 newstate, counter);
1686 /* and also allow a direct exit for 0 */
1687 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1688 newstate);
1689 } else {
1690 /*
1691 * either we need the atom at least once or there
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001692 * is an atom->start0 allowing to easily plug the
Daniel Veillard76d59b62007-08-22 16:29:21 +00001693 * epsilon transition.
1694 */
1695 counter = xmlRegGetCounter(ctxt);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01001696 if (counter < 0)
1697 return(-1);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001698 ctxt->counters[counter].min = atom->min - 1;
1699 ctxt->counters[counter].max = atom->max - 1;
1700 /* count the number of times we see it again */
1701 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1702 atom->start, counter);
1703 /* allow a way out based on the count */
1704 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1705 newstate, counter);
1706 /* and if needed allow a direct exit for 0 */
1707 if (atom->min == 0)
1708 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1709 newstate);
1710
1711 }
1712 atom->min = 0;
1713 atom->max = 0;
1714 atom->quant = XML_REGEXP_QUANT_ONCE;
1715 ctxt->state = newstate;
Daniel Veillard4255d502002-04-16 15:50:10 +00001716 }
1717 default:
1718 break;
1719 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001720 return(0);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001721 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001722 if ((atom->min == 0) && (atom->max == 0) &&
Daniel Veillard99c394d2005-07-14 12:58:49 +00001723 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1724 /*
1725 * we can discard the atom and generate an epsilon transition instead
1726 */
1727 if (to == NULL) {
1728 to = xmlRegNewState(ctxt);
1729 if (to != NULL)
1730 xmlRegStatePush(ctxt, to);
1731 else {
1732 return(-1);
1733 }
1734 }
1735 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1736 ctxt->state = to;
1737 xmlRegFreeAtom(atom);
1738 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00001739 }
1740 if (to == NULL) {
1741 to = xmlRegNewState(ctxt);
1742 if (to != NULL)
1743 xmlRegStatePush(ctxt, to);
1744 else {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001745 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001746 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001747 }
Daniel Veillard10bda622008-03-13 07:27:24 +00001748 end = to;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001749 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
Daniel Veillard10bda622008-03-13 07:27:24 +00001750 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1751 /*
1752 * Do not pollute the target state by adding transitions from
1753 * it as it is likely to be the shared target of multiple branches.
1754 * So isolate with an epsilon transition.
1755 */
1756 xmlRegStatePtr tmp;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001757
Daniel Veillard10bda622008-03-13 07:27:24 +00001758 tmp = xmlRegNewState(ctxt);
1759 if (tmp != NULL)
1760 xmlRegStatePush(ctxt, tmp);
1761 else {
1762 return(-1);
1763 }
1764 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1765 to = tmp;
Daniel Veillard4255d502002-04-16 15:50:10 +00001766 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001767 if (xmlRegAtomPush(ctxt, atom) < 0) {
1768 return(-1);
1769 }
Daniel Veillard34b35002016-05-09 09:28:38 +08001770 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1771 (atom->min == 0) && (atom->max > 0)) {
1772 nullable = 1;
1773 atom->min = 1;
1774 if (atom->max == 1)
1775 atom->quant = XML_REGEXP_QUANT_OPT;
1776 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001777 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
Daniel Veillard10bda622008-03-13 07:27:24 +00001778 ctxt->state = end;
Daniel Veillard4255d502002-04-16 15:50:10 +00001779 switch (atom->quant) {
1780 case XML_REGEXP_QUANT_OPT:
1781 atom->quant = XML_REGEXP_QUANT_ONCE;
1782 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1783 break;
1784 case XML_REGEXP_QUANT_MULT:
1785 atom->quant = XML_REGEXP_QUANT_ONCE;
1786 xmlFAGenerateEpsilonTransition(ctxt, from, to);
Daniel Veillard5de09382005-09-26 17:18:17 +00001787 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001788 break;
1789 case XML_REGEXP_QUANT_PLUS:
1790 atom->quant = XML_REGEXP_QUANT_ONCE;
Daniel Veillard5de09382005-09-26 17:18:17 +00001791 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001792 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001793 case XML_REGEXP_QUANT_RANGE:
Daniel Veillard34b35002016-05-09 09:28:38 +08001794 if (nullable)
William M. Brack56578372007-04-11 14:33:46 +00001795 xmlFAGenerateEpsilonTransition(ctxt, from, to);
William M. Brack56578372007-04-11 14:33:46 +00001796 break;
Daniel Veillard4255d502002-04-16 15:50:10 +00001797 default:
1798 break;
1799 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001800 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001801}
1802
1803/**
1804 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001805 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001806 * @fromnr: the from state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001807 * @tonr: the to state
William M. Brackddf71d62004-05-06 04:17:26 +00001808 * @counter: should that transition be associated to a counted
Daniel Veillard4255d502002-04-16 15:50:10 +00001809 *
1810 */
1811static void
1812xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1813 int tonr, int counter) {
1814 int transnr;
1815 xmlRegStatePtr from;
1816 xmlRegStatePtr to;
1817
1818#ifdef DEBUG_REGEXP_GRAPH
1819 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1820#endif
1821 from = ctxt->states[fromnr];
1822 if (from == NULL)
1823 return;
1824 to = ctxt->states[tonr];
1825 if (to == NULL)
1826 return;
1827 if ((to->mark == XML_REGEXP_MARK_START) ||
1828 (to->mark == XML_REGEXP_MARK_VISITED))
1829 return;
1830
1831 to->mark = XML_REGEXP_MARK_VISITED;
1832 if (to->type == XML_REGEXP_FINAL_STATE) {
1833#ifdef DEBUG_REGEXP_GRAPH
1834 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1835#endif
1836 from->type = XML_REGEXP_FINAL_STATE;
1837 }
1838 for (transnr = 0;transnr < to->nbTrans;transnr++) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001839 if (to->trans[transnr].to < 0)
1840 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00001841 if (to->trans[transnr].atom == NULL) {
1842 /*
1843 * Don't remove counted transitions
1844 * Don't loop either
1845 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001846 if (to->trans[transnr].to != fromnr) {
1847 if (to->trans[transnr].count >= 0) {
1848 int newto = to->trans[transnr].to;
1849
1850 xmlRegStateAddTrans(ctxt, from, NULL,
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001851 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001852 -1, to->trans[transnr].count);
Daniel Veillardb509f152002-04-17 16:28:10 +00001853 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001854#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001855 printf("Found epsilon trans %d from %d to %d\n",
1856 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001857#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001858 if (to->trans[transnr].counter >= 0) {
1859 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1860 to->trans[transnr].to,
1861 to->trans[transnr].counter);
1862 } else {
1863 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1864 to->trans[transnr].to,
1865 counter);
1866 }
1867 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001868 }
1869 } else {
1870 int newto = to->trans[transnr].to;
1871
Daniel Veillardb509f152002-04-17 16:28:10 +00001872 if (to->trans[transnr].counter >= 0) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001873 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1874 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001875 to->trans[transnr].counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001876 } else {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001877 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
Daniel Veillard5de09382005-09-26 17:18:17 +00001878 ctxt->states[newto], counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001879 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001880 }
1881 }
1882 to->mark = XML_REGEXP_MARK_NORMAL;
1883}
1884
1885/**
Daniel Veillarddb68b742005-07-30 13:18:24 +00001886 * xmlFAEliminateSimpleEpsilonTransitions:
1887 * @ctxt: a regexp parser context
1888 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001889 * Eliminating general epsilon transitions can get costly in the general
Daniel Veillarddb68b742005-07-30 13:18:24 +00001890 * algorithm due to the large amount of generated new transitions and
1891 * associated comparisons. However for simple epsilon transition used just
1892 * to separate building blocks when generating the automata this can be
1893 * reduced to state elimination:
1894 * - if there exists an epsilon from X to Y
1895 * - if there is no other transition from X
1896 * then X and Y are semantically equivalent and X can be eliminated
1897 * If X is the start state then make Y the start state, else replace the
1898 * target of all transitions to X by transitions to Y.
Elliott Hughes60f5c162021-08-20 17:09:52 -07001899 *
1900 * If X is a final state, skip it.
1901 * Otherwise it would be necessary to manipulate counters for this case when
1902 * eliminating state 2:
1903 * State 1 has a transition with an atom to state 2.
1904 * State 2 is final and has an epsilon transition to state 1.
Daniel Veillarddb68b742005-07-30 13:18:24 +00001905 */
1906static void
1907xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1908 int statenr, i, j, newto;
1909 xmlRegStatePtr state, tmp;
1910
1911 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1912 state = ctxt->states[statenr];
1913 if (state == NULL)
1914 continue;
1915 if (state->nbTrans != 1)
1916 continue;
Elliott Hughes60f5c162021-08-20 17:09:52 -07001917 if (state->type == XML_REGEXP_UNREACH_STATE ||
1918 state->type == XML_REGEXP_FINAL_STATE)
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001919 continue;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001920 /* is the only transition out a basic transition */
1921 if ((state->trans[0].atom == NULL) &&
1922 (state->trans[0].to >= 0) &&
1923 (state->trans[0].to != statenr) &&
1924 (state->trans[0].counter < 0) &&
1925 (state->trans[0].count < 0)) {
1926 newto = state->trans[0].to;
1927
1928 if (state->type == XML_REGEXP_START_STATE) {
1929#ifdef DEBUG_REGEXP_GRAPH
1930 printf("Found simple epsilon trans from start %d to %d\n",
1931 statenr, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001932#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00001933 } else {
1934#ifdef DEBUG_REGEXP_GRAPH
1935 printf("Found simple epsilon trans from %d to %d\n",
1936 statenr, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001937#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00001938 for (i = 0;i < state->nbTransTo;i++) {
1939 tmp = ctxt->states[state->transTo[i]];
1940 for (j = 0;j < tmp->nbTrans;j++) {
1941 if (tmp->trans[j].to == statenr) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001942#ifdef DEBUG_REGEXP_GRAPH
1943 printf("Changed transition %d on %d to go to %d\n",
1944 j, tmp->no, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001945#endif
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001946 tmp->trans[j].to = -1;
1947 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001948 ctxt->states[newto],
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001949 tmp->trans[j].counter,
1950 tmp->trans[j].count);
Daniel Veillarddb68b742005-07-30 13:18:24 +00001951 }
1952 }
1953 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00001954 if (state->type == XML_REGEXP_FINAL_STATE)
1955 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1956 /* eliminate the transition completely */
1957 state->nbTrans = 0;
1958
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001959 state->type = XML_REGEXP_UNREACH_STATE;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001960
1961 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001962
Daniel Veillarddb68b742005-07-30 13:18:24 +00001963 }
1964 }
1965}
1966/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001967 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001968 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001969 *
1970 */
1971static void
1972xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1973 int statenr, transnr;
1974 xmlRegStatePtr state;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001975 int has_epsilon;
Daniel Veillard4255d502002-04-16 15:50:10 +00001976
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001977 if (ctxt->states == NULL) return;
1978
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001979 /*
1980 * Eliminate simple epsilon transition and the associated unreachable
1981 * states.
1982 */
Daniel Veillarddb68b742005-07-30 13:18:24 +00001983 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001984 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1985 state = ctxt->states[statenr];
1986 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1987#ifdef DEBUG_REGEXP_GRAPH
1988 printf("Removed unreachable state %d\n", statenr);
1989#endif
1990 xmlRegFreeState(state);
1991 ctxt->states[statenr] = NULL;
1992 }
1993 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00001994
1995 has_epsilon = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001996
Daniel Veillard4255d502002-04-16 15:50:10 +00001997 /*
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00001998 * Build the completed transitions bypassing the epsilons
Daniel Veillard4255d502002-04-16 15:50:10 +00001999 * Use a marking algorithm to avoid loops
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00002000 * Mark sink states too.
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002001 * Process from the latest states backward to the start when
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00002002 * there is long cascading epsilon chains this minimize the
2003 * recursions and transition compares when adding the new ones
Daniel Veillard4255d502002-04-16 15:50:10 +00002004 */
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00002005 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002006 state = ctxt->states[statenr];
2007 if (state == NULL)
2008 continue;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00002009 if ((state->nbTrans == 0) &&
2010 (state->type != XML_REGEXP_FINAL_STATE)) {
2011 state->type = XML_REGEXP_SINK_STATE;
2012 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002013 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2014 if ((state->trans[transnr].atom == NULL) &&
2015 (state->trans[transnr].to >= 0)) {
2016 if (state->trans[transnr].to == statenr) {
2017 state->trans[transnr].to = -1;
2018#ifdef DEBUG_REGEXP_GRAPH
2019 printf("Removed loopback epsilon trans %d on %d\n",
2020 transnr, statenr);
2021#endif
2022 } else if (state->trans[transnr].count < 0) {
2023 int newto = state->trans[transnr].to;
2024
2025#ifdef DEBUG_REGEXP_GRAPH
2026 printf("Found epsilon trans %d from %d to %d\n",
2027 transnr, statenr, newto);
2028#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00002029 has_epsilon = 1;
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00002030 state->trans[transnr].to = -2;
2031 state->mark = XML_REGEXP_MARK_START;
Daniel Veillard4255d502002-04-16 15:50:10 +00002032 xmlFAReduceEpsilonTransitions(ctxt, statenr,
2033 newto, state->trans[transnr].counter);
2034 state->mark = XML_REGEXP_MARK_NORMAL;
2035#ifdef DEBUG_REGEXP_GRAPH
2036 } else {
2037 printf("Found counted transition %d on %d\n",
2038 transnr, statenr);
2039#endif
2040 }
2041 }
2042 }
2043 }
2044 /*
2045 * Eliminate the epsilon transitions
2046 */
Daniel Veillarddb68b742005-07-30 13:18:24 +00002047 if (has_epsilon) {
2048 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2049 state = ctxt->states[statenr];
2050 if (state == NULL)
2051 continue;
2052 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2053 xmlRegTransPtr trans = &(state->trans[transnr]);
2054 if ((trans->atom == NULL) &&
2055 (trans->count < 0) &&
2056 (trans->to >= 0)) {
2057 trans->to = -1;
2058 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002059 }
2060 }
2061 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002062
2063 /*
2064 * Use this pass to detect unreachable states too
2065 */
2066 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2067 state = ctxt->states[statenr];
2068 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00002069 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00002070 }
2071 state = ctxt->states[0];
2072 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00002073 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00002074 while (state != NULL) {
2075 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00002076 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00002077 /*
William M. Brackddf71d62004-05-06 04:17:26 +00002078 * Mark all states reachable from the current reachable state
Daniel Veillard23e73572002-09-19 19:56:43 +00002079 */
2080 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2081 if ((state->trans[transnr].to >= 0) &&
2082 ((state->trans[transnr].atom != NULL) ||
2083 (state->trans[transnr].count >= 0))) {
2084 int newto = state->trans[transnr].to;
2085
2086 if (ctxt->states[newto] == NULL)
2087 continue;
William M. Brack779af002003-08-01 15:55:39 +00002088 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2089 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00002090 target = ctxt->states[newto];
2091 }
2092 }
2093 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00002094
Daniel Veillard23e73572002-09-19 19:56:43 +00002095 /*
2096 * find the next accessible state not explored
2097 */
2098 if (target == NULL) {
2099 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2100 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00002101 if ((state != NULL) && (state->reached ==
2102 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00002103 target = state;
2104 break;
2105 }
2106 }
2107 }
2108 state = target;
2109 }
2110 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2111 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00002112 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00002113#ifdef DEBUG_REGEXP_GRAPH
2114 printf("Removed unreachable state %d\n", statenr);
2115#endif
2116 xmlRegFreeState(state);
2117 ctxt->states[statenr] = NULL;
2118 }
2119 }
2120
Daniel Veillard4255d502002-04-16 15:50:10 +00002121}
2122
Daniel Veillard567a45b2005-10-18 19:11:55 +00002123static int
2124xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2125 int ret = 0;
2126
2127 if ((range1->type == XML_REGEXP_RANGES) ||
2128 (range2->type == XML_REGEXP_RANGES) ||
2129 (range2->type == XML_REGEXP_SUBREG) ||
2130 (range1->type == XML_REGEXP_SUBREG) ||
2131 (range1->type == XML_REGEXP_STRING) ||
2132 (range2->type == XML_REGEXP_STRING))
2133 return(-1);
2134
2135 /* put them in order */
2136 if (range1->type > range2->type) {
2137 xmlRegRangePtr tmp;
2138
2139 tmp = range1;
2140 range1 = range2;
2141 range2 = tmp;
2142 }
2143 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2144 (range2->type == XML_REGEXP_ANYCHAR)) {
2145 ret = 1;
2146 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2147 (range2->type == XML_REGEXP_EPSILON)) {
2148 return(0);
2149 } else if (range1->type == range2->type) {
Daniel Veillard9332b482009-09-23 18:28:43 +02002150 if (range1->type != XML_REGEXP_CHARVAL)
2151 ret = 1;
2152 else if ((range1->end < range2->start) ||
2153 (range2->end < range1->start))
Daniel Veillard567a45b2005-10-18 19:11:55 +00002154 ret = 0;
Daniel Veillard9332b482009-09-23 18:28:43 +02002155 else
2156 ret = 1;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002157 } else if (range1->type == XML_REGEXP_CHARVAL) {
2158 int codepoint;
2159 int neg = 0;
2160
2161 /*
2162 * just check all codepoints in the range for acceptance,
2163 * this is usually way cheaper since done only once at
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002164 * compilation than testing over and over at runtime or
Daniel Veillard567a45b2005-10-18 19:11:55 +00002165 * pushing too many states when evaluating.
2166 */
2167 if (((range1->neg == 0) && (range2->neg != 0)) ||
2168 ((range1->neg != 0) && (range2->neg == 0)))
2169 neg = 1;
2170
2171 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2172 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2173 0, range2->start, range2->end,
2174 range2->blockName);
2175 if (ret < 0)
2176 return(-1);
2177 if (((neg == 1) && (ret == 0)) ||
2178 ((neg == 0) && (ret == 1)))
2179 return(1);
2180 }
2181 return(0);
2182 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2183 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2184 if (range1->type == range2->type) {
2185 ret = xmlStrEqual(range1->blockName, range2->blockName);
2186 } else {
2187 /*
2188 * comparing a block range with anything else is way
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002189 * too costly, and maintaining the table is like too much
Daniel Veillard567a45b2005-10-18 19:11:55 +00002190 * memory too, so let's force the automata to save state
2191 * here.
2192 */
2193 return(1);
2194 }
2195 } else if ((range1->type < XML_REGEXP_LETTER) ||
2196 (range2->type < XML_REGEXP_LETTER)) {
2197 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2198 (range2->type == XML_REGEXP_NOTSPACE))
2199 ret = 0;
2200 else if ((range1->type == XML_REGEXP_INITNAME) &&
2201 (range2->type == XML_REGEXP_NOTINITNAME))
2202 ret = 0;
2203 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2204 (range2->type == XML_REGEXP_NOTNAMECHAR))
2205 ret = 0;
2206 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2207 (range2->type == XML_REGEXP_NOTDECIMAL))
2208 ret = 0;
2209 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2210 (range2->type == XML_REGEXP_NOTREALCHAR))
2211 ret = 0;
2212 else {
2213 /* same thing to limit complexity */
2214 return(1);
2215 }
2216 } else {
2217 ret = 0;
2218 /* range1->type < range2->type here */
2219 switch (range1->type) {
2220 case XML_REGEXP_LETTER:
2221 /* all disjoint except in the subgroups */
2222 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2223 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2224 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2225 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2226 (range2->type == XML_REGEXP_LETTER_OTHERS))
2227 ret = 1;
2228 break;
2229 case XML_REGEXP_MARK:
2230 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2231 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2232 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2233 ret = 1;
2234 break;
2235 case XML_REGEXP_NUMBER:
2236 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2237 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2238 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2239 ret = 1;
2240 break;
2241 case XML_REGEXP_PUNCT:
2242 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2243 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2244 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2245 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2246 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2247 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2248 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2249 ret = 1;
2250 break;
2251 case XML_REGEXP_SEPAR:
2252 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2253 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2254 (range2->type == XML_REGEXP_SEPAR_PARA))
2255 ret = 1;
2256 break;
2257 case XML_REGEXP_SYMBOL:
2258 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2259 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2260 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2261 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2262 ret = 1;
2263 break;
2264 case XML_REGEXP_OTHER:
2265 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2266 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2267 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2268 ret = 1;
2269 break;
2270 default:
2271 if ((range2->type >= XML_REGEXP_LETTER) &&
2272 (range2->type < XML_REGEXP_BLOCK_NAME))
2273 ret = 0;
2274 else {
2275 /* safety net ! */
2276 return(1);
2277 }
2278 }
2279 }
2280 if (((range1->neg == 0) && (range2->neg != 0)) ||
2281 ((range1->neg != 0) && (range2->neg == 0)))
2282 ret = !ret;
Daniel Veillard594e5df2009-09-07 14:58:47 +02002283 return(ret);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002284}
2285
Daniel Veillarde19fc232002-04-22 16:01:24 +00002286/**
Daniel Veillardfc011b72006-02-12 19:14:15 +00002287 * xmlFACompareAtomTypes:
2288 * @type1: an atom type
2289 * @type2: an atom type
2290 *
2291 * Compares two atoms type to check whether they intersect in some ways,
2292 * this is used by xmlFACompareAtoms only
2293 *
2294 * Returns 1 if they may intersect and 0 otherwise
2295 */
2296static int
2297xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2298 if ((type1 == XML_REGEXP_EPSILON) ||
2299 (type1 == XML_REGEXP_CHARVAL) ||
2300 (type1 == XML_REGEXP_RANGES) ||
2301 (type1 == XML_REGEXP_SUBREG) ||
2302 (type1 == XML_REGEXP_STRING) ||
2303 (type1 == XML_REGEXP_ANYCHAR))
2304 return(1);
2305 if ((type2 == XML_REGEXP_EPSILON) ||
2306 (type2 == XML_REGEXP_CHARVAL) ||
2307 (type2 == XML_REGEXP_RANGES) ||
2308 (type2 == XML_REGEXP_SUBREG) ||
2309 (type2 == XML_REGEXP_STRING) ||
2310 (type2 == XML_REGEXP_ANYCHAR))
2311 return(1);
2312
2313 if (type1 == type2) return(1);
2314
2315 /* simplify subsequent compares by making sure type1 < type2 */
2316 if (type1 > type2) {
2317 xmlRegAtomType tmp = type1;
2318 type1 = type2;
2319 type2 = tmp;
2320 }
2321 switch (type1) {
2322 case XML_REGEXP_ANYSPACE: /* \s */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002323 /* can't be a letter, number, mark, punctuation, symbol */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002324 if ((type2 == XML_REGEXP_NOTSPACE) ||
2325 ((type2 >= XML_REGEXP_LETTER) &&
2326 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2327 ((type2 >= XML_REGEXP_NUMBER) &&
2328 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2329 ((type2 >= XML_REGEXP_MARK) &&
2330 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2331 ((type2 >= XML_REGEXP_PUNCT) &&
2332 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2333 ((type2 >= XML_REGEXP_SYMBOL) &&
2334 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2335 ) return(0);
2336 break;
2337 case XML_REGEXP_NOTSPACE: /* \S */
2338 break;
2339 case XML_REGEXP_INITNAME: /* \l */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002340 /* can't be a number, mark, separator, punctuation, symbol or other */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002341 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2342 ((type2 >= XML_REGEXP_NUMBER) &&
2343 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2344 ((type2 >= XML_REGEXP_MARK) &&
2345 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2346 ((type2 >= XML_REGEXP_SEPAR) &&
2347 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2348 ((type2 >= XML_REGEXP_PUNCT) &&
2349 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2350 ((type2 >= XML_REGEXP_SYMBOL) &&
2351 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2352 ((type2 >= XML_REGEXP_OTHER) &&
2353 (type2 <= XML_REGEXP_OTHER_NA))
2354 ) return(0);
2355 break;
2356 case XML_REGEXP_NOTINITNAME: /* \L */
2357 break;
2358 case XML_REGEXP_NAMECHAR: /* \c */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002359 /* can't be a mark, separator, punctuation, symbol or other */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002360 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2361 ((type2 >= XML_REGEXP_MARK) &&
2362 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2363 ((type2 >= XML_REGEXP_PUNCT) &&
2364 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2365 ((type2 >= XML_REGEXP_SEPAR) &&
2366 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2367 ((type2 >= XML_REGEXP_SYMBOL) &&
2368 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2369 ((type2 >= XML_REGEXP_OTHER) &&
2370 (type2 <= XML_REGEXP_OTHER_NA))
2371 ) return(0);
2372 break;
2373 case XML_REGEXP_NOTNAMECHAR: /* \C */
2374 break;
2375 case XML_REGEXP_DECIMAL: /* \d */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002376 /* can't be a letter, mark, separator, punctuation, symbol or other */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002377 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2378 (type2 == XML_REGEXP_REALCHAR) ||
2379 ((type2 >= XML_REGEXP_LETTER) &&
2380 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2381 ((type2 >= XML_REGEXP_MARK) &&
2382 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2383 ((type2 >= XML_REGEXP_PUNCT) &&
2384 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2385 ((type2 >= XML_REGEXP_SEPAR) &&
2386 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2387 ((type2 >= XML_REGEXP_SYMBOL) &&
2388 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2389 ((type2 >= XML_REGEXP_OTHER) &&
2390 (type2 <= XML_REGEXP_OTHER_NA))
2391 )return(0);
2392 break;
2393 case XML_REGEXP_NOTDECIMAL: /* \D */
2394 break;
2395 case XML_REGEXP_REALCHAR: /* \w */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002396 /* can't be a mark, separator, punctuation, symbol or other */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002397 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2398 ((type2 >= XML_REGEXP_MARK) &&
2399 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2400 ((type2 >= XML_REGEXP_PUNCT) &&
2401 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2402 ((type2 >= XML_REGEXP_SEPAR) &&
2403 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2404 ((type2 >= XML_REGEXP_SYMBOL) &&
2405 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2406 ((type2 >= XML_REGEXP_OTHER) &&
2407 (type2 <= XML_REGEXP_OTHER_NA))
2408 )return(0);
2409 break;
2410 case XML_REGEXP_NOTREALCHAR: /* \W */
2411 break;
2412 /*
2413 * at that point we know both type 1 and type2 are from
2414 * character categories are ordered and are different,
2415 * it becomes simple because this is a partition
2416 */
2417 case XML_REGEXP_LETTER:
2418 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2419 return(1);
2420 return(0);
2421 case XML_REGEXP_LETTER_UPPERCASE:
2422 case XML_REGEXP_LETTER_LOWERCASE:
2423 case XML_REGEXP_LETTER_TITLECASE:
2424 case XML_REGEXP_LETTER_MODIFIER:
2425 case XML_REGEXP_LETTER_OTHERS:
2426 return(0);
2427 case XML_REGEXP_MARK:
2428 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2429 return(1);
2430 return(0);
2431 case XML_REGEXP_MARK_NONSPACING:
2432 case XML_REGEXP_MARK_SPACECOMBINING:
2433 case XML_REGEXP_MARK_ENCLOSING:
2434 return(0);
2435 case XML_REGEXP_NUMBER:
2436 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2437 return(1);
2438 return(0);
2439 case XML_REGEXP_NUMBER_DECIMAL:
2440 case XML_REGEXP_NUMBER_LETTER:
2441 case XML_REGEXP_NUMBER_OTHERS:
2442 return(0);
2443 case XML_REGEXP_PUNCT:
2444 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2445 return(1);
2446 return(0);
2447 case XML_REGEXP_PUNCT_CONNECTOR:
2448 case XML_REGEXP_PUNCT_DASH:
2449 case XML_REGEXP_PUNCT_OPEN:
2450 case XML_REGEXP_PUNCT_CLOSE:
2451 case XML_REGEXP_PUNCT_INITQUOTE:
2452 case XML_REGEXP_PUNCT_FINQUOTE:
2453 case XML_REGEXP_PUNCT_OTHERS:
2454 return(0);
2455 case XML_REGEXP_SEPAR:
2456 if (type2 <= XML_REGEXP_SEPAR_PARA)
2457 return(1);
2458 return(0);
2459 case XML_REGEXP_SEPAR_SPACE:
2460 case XML_REGEXP_SEPAR_LINE:
2461 case XML_REGEXP_SEPAR_PARA:
2462 return(0);
2463 case XML_REGEXP_SYMBOL:
2464 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2465 return(1);
2466 return(0);
2467 case XML_REGEXP_SYMBOL_MATH:
2468 case XML_REGEXP_SYMBOL_CURRENCY:
2469 case XML_REGEXP_SYMBOL_MODIFIER:
2470 case XML_REGEXP_SYMBOL_OTHERS:
2471 return(0);
2472 case XML_REGEXP_OTHER:
2473 if (type2 <= XML_REGEXP_OTHER_NA)
2474 return(1);
2475 return(0);
2476 case XML_REGEXP_OTHER_CONTROL:
2477 case XML_REGEXP_OTHER_FORMAT:
2478 case XML_REGEXP_OTHER_PRIVATE:
2479 case XML_REGEXP_OTHER_NA:
2480 return(0);
2481 default:
2482 break;
2483 }
2484 return(1);
2485}
2486
2487/**
2488 * xmlFAEqualAtoms:
Daniel Veillarde19fc232002-04-22 16:01:24 +00002489 * @atom1: an atom
2490 * @atom2: an atom
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002491 * @deep: if not set only compare string pointers
Daniel Veillarde19fc232002-04-22 16:01:24 +00002492 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002493 * Compares two atoms to check whether they are the same exactly
2494 * this is used to remove equivalent transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002495 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002496 * Returns 1 if same and 0 otherwise
Daniel Veillarde19fc232002-04-22 16:01:24 +00002497 */
2498static int
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002499xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
Daniel Veillardfc011b72006-02-12 19:14:15 +00002500 int ret = 0;
Daniel Veillard9efc4762005-07-19 14:33:55 +00002501
Daniel Veillarde19fc232002-04-22 16:01:24 +00002502 if (atom1 == atom2)
2503 return(1);
2504 if ((atom1 == NULL) || (atom2 == NULL))
2505 return(0);
2506
Daniel Veillardfc011b72006-02-12 19:14:15 +00002507 if (atom1->type != atom2->type)
2508 return(0);
2509 switch (atom1->type) {
2510 case XML_REGEXP_EPSILON:
2511 ret = 0;
2512 break;
2513 case XML_REGEXP_STRING:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002514 if (!deep)
2515 ret = (atom1->valuep == atom2->valuep);
2516 else
2517 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2518 (xmlChar *)atom2->valuep);
Daniel Veillardfc011b72006-02-12 19:14:15 +00002519 break;
2520 case XML_REGEXP_CHARVAL:
2521 ret = (atom1->codepoint == atom2->codepoint);
2522 break;
2523 case XML_REGEXP_RANGES:
2524 /* too hard to do in the general case */
2525 ret = 0;
2526 default:
2527 break;
2528 }
2529 return(ret);
2530}
2531
2532/**
2533 * xmlFACompareAtoms:
2534 * @atom1: an atom
2535 * @atom2: an atom
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002536 * @deep: if not set only compare string pointers
Daniel Veillardfc011b72006-02-12 19:14:15 +00002537 *
2538 * Compares two atoms to check whether they intersect in some ways,
2539 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2540 *
2541 * Returns 1 if yes and 0 otherwise
2542 */
2543static int
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002544xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
Daniel Veillardfc011b72006-02-12 19:14:15 +00002545 int ret = 1;
2546
2547 if (atom1 == atom2)
2548 return(1);
2549 if ((atom1 == NULL) || (atom2 == NULL))
2550 return(0);
2551
2552 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2553 (atom2->type == XML_REGEXP_ANYCHAR))
2554 return(1);
2555
2556 if (atom1->type > atom2->type) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002557 xmlRegAtomPtr tmp;
2558 tmp = atom1;
2559 atom1 = atom2;
2560 atom2 = tmp;
Daniel Veillardfc011b72006-02-12 19:14:15 +00002561 }
2562 if (atom1->type != atom2->type) {
2563 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2564 /* if they can't intersect at the type level break now */
2565 if (ret == 0)
2566 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002567 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002568 switch (atom1->type) {
2569 case XML_REGEXP_STRING:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002570 if (!deep)
2571 ret = (atom1->valuep != atom2->valuep);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002572 else {
2573 xmlChar *val1 = (xmlChar *)atom1->valuep;
2574 xmlChar *val2 = (xmlChar *)atom2->valuep;
2575 int compound1 = (xmlStrchr(val1, '|') != NULL);
2576 int compound2 = (xmlStrchr(val2, '|') != NULL);
2577
2578 /* Ignore negative match flag for ##other namespaces */
2579 if (compound1 != compound2)
2580 return(0);
2581
2582 ret = xmlRegStrEqualWildcard(val1, val2);
2583 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00002584 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002585 case XML_REGEXP_EPSILON:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002586 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002587 case XML_REGEXP_CHARVAL:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002588 if (atom2->type == XML_REGEXP_CHARVAL) {
2589 ret = (atom1->codepoint == atom2->codepoint);
2590 } else {
2591 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2592 if (ret < 0)
2593 ret = 1;
2594 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00002595 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002596 case XML_REGEXP_RANGES:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002597 if (atom2->type == XML_REGEXP_RANGES) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002598 int i, j, res;
2599 xmlRegRangePtr r1, r2;
2600
2601 /*
2602 * need to check that none of the ranges eventually matches
2603 */
2604 for (i = 0;i < atom1->nbRanges;i++) {
2605 for (j = 0;j < atom2->nbRanges;j++) {
2606 r1 = atom1->ranges[i];
2607 r2 = atom2->ranges[j];
2608 res = xmlFACompareRanges(r1, r2);
2609 if (res == 1) {
2610 ret = 1;
2611 goto done;
2612 }
2613 }
2614 }
2615 ret = 0;
2616 }
2617 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002618 default:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002619 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002620 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002621done:
Daniel Veillard6e65e152005-08-09 11:09:52 +00002622 if (atom1->neg != atom2->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00002623 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00002624 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002625 if (ret == 0)
2626 return(0);
2627not_determinist:
2628 return(1);
Daniel Veillarde19fc232002-04-22 16:01:24 +00002629}
2630
2631/**
2632 * xmlFARecurseDeterminism:
2633 * @ctxt: a regexp parser context
2634 *
2635 * Check whether the associated regexp is determinist,
2636 * should be called after xmlFAEliminateEpsilonTransitions()
2637 *
2638 */
2639static int
2640xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2641 int to, xmlRegAtomPtr atom) {
2642 int ret = 1;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002643 int res;
Daniel Veillard5de09382005-09-26 17:18:17 +00002644 int transnr, nbTrans;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002645 xmlRegTransPtr t1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002646 int deep = 1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002647
2648 if (state == NULL)
2649 return(ret);
Daniel Veillard466fcda2012-08-27 12:03:40 +08002650 if (state->markd == XML_REGEXP_MARK_VISITED)
2651 return(ret);
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002652
2653 if (ctxt->flags & AM_AUTOMATA_RNG)
2654 deep = 0;
2655
Daniel Veillard5de09382005-09-26 17:18:17 +00002656 /*
2657 * don't recurse on transitions potentially added in the course of
2658 * the elimination.
2659 */
2660 nbTrans = state->nbTrans;
2661 for (transnr = 0;transnr < nbTrans;transnr++) {
Daniel Veillarde19fc232002-04-22 16:01:24 +00002662 t1 = &(state->trans[transnr]);
2663 /*
2664 * check transitions conflicting with the one looked at
2665 */
2666 if (t1->atom == NULL) {
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00002667 if (t1->to < 0)
Daniel Veillarde19fc232002-04-22 16:01:24 +00002668 continue;
Daniel Veillard466fcda2012-08-27 12:03:40 +08002669 state->markd = XML_REGEXP_MARK_VISITED;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002670 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
Daniel Veillarde19fc232002-04-22 16:01:24 +00002671 to, atom);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002672 if (res == 0) {
2673 ret = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00002674 /* t1->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002675 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002676 continue;
2677 }
2678 if (t1->to != to)
2679 continue;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002680 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002681 ret = 0;
2682 /* mark the transition as non-deterministic */
2683 t1->nd = 1;
2684 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002685 }
2686 return(ret);
2687}
2688
2689/**
Haibo Huang35812382020-07-31 15:23:22 -07002690 * xmlFAFinishRecurseDeterminism:
2691 * @ctxt: a regexp parser context
2692 *
2693 * Reset flags after checking determinism.
2694 */
2695static void
2696xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2697 int transnr, nbTrans;
2698
2699 if (state == NULL)
2700 return;
2701 if (state->markd != XML_REGEXP_MARK_VISITED)
2702 return;
2703 state->markd = 0;
2704
2705 nbTrans = state->nbTrans;
2706 for (transnr = 0; transnr < nbTrans; transnr++) {
2707 xmlRegTransPtr t1 = &state->trans[transnr];
2708 if ((t1->atom == NULL) && (t1->to >= 0))
2709 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2710 }
2711}
2712
2713/**
Daniel Veillarde19fc232002-04-22 16:01:24 +00002714 * xmlFAComputesDeterminism:
2715 * @ctxt: a regexp parser context
2716 *
2717 * Check whether the associated regexp is determinist,
2718 * should be called after xmlFAEliminateEpsilonTransitions()
2719 *
2720 */
2721static int
2722xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2723 int statenr, transnr;
2724 xmlRegStatePtr state;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002725 xmlRegTransPtr t1, t2, last;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002726 int i;
2727 int ret = 1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002728 int deep = 1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002729
Daniel Veillard4402ab42002-09-12 16:02:56 +00002730#ifdef DEBUG_REGEXP_GRAPH
2731 printf("xmlFAComputesDeterminism\n");
2732 xmlRegPrintCtxt(stdout, ctxt);
2733#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00002734 if (ctxt->determinist != -1)
2735 return(ctxt->determinist);
2736
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002737 if (ctxt->flags & AM_AUTOMATA_RNG)
2738 deep = 0;
2739
Daniel Veillarde19fc232002-04-22 16:01:24 +00002740 /*
Daniel Veillard567a45b2005-10-18 19:11:55 +00002741 * First cleanup the automata removing cancelled transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002742 */
2743 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2744 state = ctxt->states[statenr];
2745 if (state == NULL)
2746 continue;
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00002747 if (state->nbTrans < 2)
2748 continue;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002749 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2750 t1 = &(state->trans[transnr]);
2751 /*
2752 * Determinism checks in case of counted or all transitions
2753 * will have to be handled separately
2754 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002755 if (t1->atom == NULL) {
Daniel Veillardaa622012005-10-20 15:55:25 +00002756 /* t1->nd = 1; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002757 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002758 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002759 if (t1->to == -1) /* eliminated */
2760 continue;
2761 for (i = 0;i < transnr;i++) {
2762 t2 = &(state->trans[i]);
2763 if (t2->to == -1) /* eliminated */
2764 continue;
2765 if (t2->atom != NULL) {
2766 if (t1->to == t2->to) {
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002767 /*
2768 * Here we use deep because we want to keep the
2769 * transitions which indicate a conflict
2770 */
2771 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
Daniel Veillard11e28e42009-08-12 12:21:42 +02002772 (t1->counter == t2->counter) &&
2773 (t1->count == t2->count))
William M. Brackddf71d62004-05-06 04:17:26 +00002774 t2->to = -1; /* eliminated */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002775 }
2776 }
2777 }
2778 }
2779 }
2780
2781 /*
2782 * Check for all states that there aren't 2 transitions
2783 * with the same atom and a different target.
2784 */
2785 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2786 state = ctxt->states[statenr];
2787 if (state == NULL)
2788 continue;
2789 if (state->nbTrans < 2)
2790 continue;
2791 last = NULL;
2792 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2793 t1 = &(state->trans[transnr]);
2794 /*
2795 * Determinism checks in case of counted or all transitions
2796 * will have to be handled separately
2797 */
2798 if (t1->atom == NULL) {
2799 continue;
2800 }
2801 if (t1->to == -1) /* eliminated */
2802 continue;
2803 for (i = 0;i < transnr;i++) {
2804 t2 = &(state->trans[i]);
2805 if (t2->to == -1) /* eliminated */
2806 continue;
2807 if (t2->atom != NULL) {
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002808 /*
2809 * But here we don't use deep because we want to
2810 * find transitions which indicate a conflict
2811 */
2812 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002813 ret = 0;
2814 /* mark the transitions as non-deterministic ones */
2815 t1->nd = 1;
2816 t2->nd = 1;
2817 last = t1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002818 }
2819 } else if (t1->to != -1) {
2820 /*
2821 * do the closure in case of remaining specific
2822 * epsilon transitions like choices or all
2823 */
2824 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2825 t2->to, t2->atom);
Haibo Huang35812382020-07-31 15:23:22 -07002826 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002827 /* don't shortcut the computation so all non deterministic
2828 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002829 if (ret == 0)
Daniel Veillardaa622012005-10-20 15:55:25 +00002830 return(0);
2831 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002832 if (ret == 0) {
2833 t1->nd = 1;
Daniel Veillardaa622012005-10-20 15:55:25 +00002834 /* t2->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002835 last = t1;
2836 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002837 }
2838 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002839 /* don't shortcut the computation so all non deterministic
2840 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002841 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002842 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002843 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002844
2845 /*
2846 * mark specifically the last non-deterministic transition
2847 * from a state since there is no need to set-up rollback
2848 * from it
2849 */
2850 if (last != NULL) {
2851 last->nd = 2;
2852 }
2853
2854 /* don't shortcut the computation so all non deterministic
2855 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002856 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002857 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002858 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002859
Daniel Veillarde19fc232002-04-22 16:01:24 +00002860 ctxt->determinist = ret;
2861 return(ret);
2862}
2863
Daniel Veillard4255d502002-04-16 15:50:10 +00002864/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002865 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00002866 * Routines to check input against transition atoms *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002867 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00002868 ************************************************************************/
2869
2870static int
2871xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2872 int start, int end, const xmlChar *blockName) {
2873 int ret = 0;
2874
2875 switch (type) {
2876 case XML_REGEXP_STRING:
2877 case XML_REGEXP_SUBREG:
2878 case XML_REGEXP_RANGES:
2879 case XML_REGEXP_EPSILON:
2880 return(-1);
2881 case XML_REGEXP_ANYCHAR:
2882 ret = ((codepoint != '\n') && (codepoint != '\r'));
2883 break;
2884 case XML_REGEXP_CHARVAL:
2885 ret = ((codepoint >= start) && (codepoint <= end));
2886 break;
2887 case XML_REGEXP_NOTSPACE:
2888 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002889 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002890 case XML_REGEXP_ANYSPACE:
2891 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2892 (codepoint == '\t') || (codepoint == ' '));
2893 break;
2894 case XML_REGEXP_NOTINITNAME:
2895 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002896 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002897 case XML_REGEXP_INITNAME:
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002898 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002899 (codepoint == '_') || (codepoint == ':'));
2900 break;
2901 case XML_REGEXP_NOTNAMECHAR:
2902 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002903 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002904 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00002905 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002906 (codepoint == '.') || (codepoint == '-') ||
2907 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00002908 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00002909 break;
2910 case XML_REGEXP_NOTDECIMAL:
2911 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002912 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002913 case XML_REGEXP_DECIMAL:
2914 ret = xmlUCSIsCatNd(codepoint);
2915 break;
2916 case XML_REGEXP_REALCHAR:
2917 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002918 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002919 case XML_REGEXP_NOTREALCHAR:
2920 ret = xmlUCSIsCatP(codepoint);
2921 if (ret == 0)
2922 ret = xmlUCSIsCatZ(codepoint);
2923 if (ret == 0)
2924 ret = xmlUCSIsCatC(codepoint);
2925 break;
2926 case XML_REGEXP_LETTER:
2927 ret = xmlUCSIsCatL(codepoint);
2928 break;
2929 case XML_REGEXP_LETTER_UPPERCASE:
2930 ret = xmlUCSIsCatLu(codepoint);
2931 break;
2932 case XML_REGEXP_LETTER_LOWERCASE:
2933 ret = xmlUCSIsCatLl(codepoint);
2934 break;
2935 case XML_REGEXP_LETTER_TITLECASE:
2936 ret = xmlUCSIsCatLt(codepoint);
2937 break;
2938 case XML_REGEXP_LETTER_MODIFIER:
2939 ret = xmlUCSIsCatLm(codepoint);
2940 break;
2941 case XML_REGEXP_LETTER_OTHERS:
2942 ret = xmlUCSIsCatLo(codepoint);
2943 break;
2944 case XML_REGEXP_MARK:
2945 ret = xmlUCSIsCatM(codepoint);
2946 break;
2947 case XML_REGEXP_MARK_NONSPACING:
2948 ret = xmlUCSIsCatMn(codepoint);
2949 break;
2950 case XML_REGEXP_MARK_SPACECOMBINING:
2951 ret = xmlUCSIsCatMc(codepoint);
2952 break;
2953 case XML_REGEXP_MARK_ENCLOSING:
2954 ret = xmlUCSIsCatMe(codepoint);
2955 break;
2956 case XML_REGEXP_NUMBER:
2957 ret = xmlUCSIsCatN(codepoint);
2958 break;
2959 case XML_REGEXP_NUMBER_DECIMAL:
2960 ret = xmlUCSIsCatNd(codepoint);
2961 break;
2962 case XML_REGEXP_NUMBER_LETTER:
2963 ret = xmlUCSIsCatNl(codepoint);
2964 break;
2965 case XML_REGEXP_NUMBER_OTHERS:
2966 ret = xmlUCSIsCatNo(codepoint);
2967 break;
2968 case XML_REGEXP_PUNCT:
2969 ret = xmlUCSIsCatP(codepoint);
2970 break;
2971 case XML_REGEXP_PUNCT_CONNECTOR:
2972 ret = xmlUCSIsCatPc(codepoint);
2973 break;
2974 case XML_REGEXP_PUNCT_DASH:
2975 ret = xmlUCSIsCatPd(codepoint);
2976 break;
2977 case XML_REGEXP_PUNCT_OPEN:
2978 ret = xmlUCSIsCatPs(codepoint);
2979 break;
2980 case XML_REGEXP_PUNCT_CLOSE:
2981 ret = xmlUCSIsCatPe(codepoint);
2982 break;
2983 case XML_REGEXP_PUNCT_INITQUOTE:
2984 ret = xmlUCSIsCatPi(codepoint);
2985 break;
2986 case XML_REGEXP_PUNCT_FINQUOTE:
2987 ret = xmlUCSIsCatPf(codepoint);
2988 break;
2989 case XML_REGEXP_PUNCT_OTHERS:
2990 ret = xmlUCSIsCatPo(codepoint);
2991 break;
2992 case XML_REGEXP_SEPAR:
2993 ret = xmlUCSIsCatZ(codepoint);
2994 break;
2995 case XML_REGEXP_SEPAR_SPACE:
2996 ret = xmlUCSIsCatZs(codepoint);
2997 break;
2998 case XML_REGEXP_SEPAR_LINE:
2999 ret = xmlUCSIsCatZl(codepoint);
3000 break;
3001 case XML_REGEXP_SEPAR_PARA:
3002 ret = xmlUCSIsCatZp(codepoint);
3003 break;
3004 case XML_REGEXP_SYMBOL:
3005 ret = xmlUCSIsCatS(codepoint);
3006 break;
3007 case XML_REGEXP_SYMBOL_MATH:
3008 ret = xmlUCSIsCatSm(codepoint);
3009 break;
3010 case XML_REGEXP_SYMBOL_CURRENCY:
3011 ret = xmlUCSIsCatSc(codepoint);
3012 break;
3013 case XML_REGEXP_SYMBOL_MODIFIER:
3014 ret = xmlUCSIsCatSk(codepoint);
3015 break;
3016 case XML_REGEXP_SYMBOL_OTHERS:
3017 ret = xmlUCSIsCatSo(codepoint);
3018 break;
3019 case XML_REGEXP_OTHER:
3020 ret = xmlUCSIsCatC(codepoint);
3021 break;
3022 case XML_REGEXP_OTHER_CONTROL:
3023 ret = xmlUCSIsCatCc(codepoint);
3024 break;
3025 case XML_REGEXP_OTHER_FORMAT:
3026 ret = xmlUCSIsCatCf(codepoint);
3027 break;
3028 case XML_REGEXP_OTHER_PRIVATE:
3029 ret = xmlUCSIsCatCo(codepoint);
3030 break;
3031 case XML_REGEXP_OTHER_NA:
3032 /* ret = xmlUCSIsCatCn(codepoint); */
3033 /* Seems it doesn't exist anymore in recent Unicode releases */
3034 ret = 0;
3035 break;
3036 case XML_REGEXP_BLOCK_NAME:
3037 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3038 break;
3039 }
3040 if (neg)
3041 return(!ret);
3042 return(ret);
3043}
3044
3045static int
3046xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3047 int i, ret = 0;
3048 xmlRegRangePtr range;
3049
William M. Brack871611b2003-10-18 04:53:14 +00003050 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00003051 return(-1);
3052
3053 switch (atom->type) {
3054 case XML_REGEXP_SUBREG:
3055 case XML_REGEXP_EPSILON:
3056 return(-1);
3057 case XML_REGEXP_CHARVAL:
3058 return(codepoint == atom->codepoint);
3059 case XML_REGEXP_RANGES: {
3060 int accept = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00003061
Daniel Veillard4255d502002-04-16 15:50:10 +00003062 for (i = 0;i < atom->nbRanges;i++) {
3063 range = atom->ranges[i];
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003064 if (range->neg == 2) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003065 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3066 0, range->start, range->end,
3067 range->blockName);
3068 if (ret != 0)
3069 return(0); /* excluded char */
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003070 } else if (range->neg) {
3071 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3072 0, range->start, range->end,
3073 range->blockName);
3074 if (ret == 0)
Daniel Veillardf2a12832003-11-24 13:04:35 +00003075 accept = 1;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003076 else
3077 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00003078 } else {
3079 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3080 0, range->start, range->end,
3081 range->blockName);
3082 if (ret != 0)
3083 accept = 1; /* might still be excluded */
3084 }
3085 }
3086 return(accept);
3087 }
3088 case XML_REGEXP_STRING:
3089 printf("TODO: XML_REGEXP_STRING\n");
3090 return(-1);
3091 case XML_REGEXP_ANYCHAR:
3092 case XML_REGEXP_ANYSPACE:
3093 case XML_REGEXP_NOTSPACE:
3094 case XML_REGEXP_INITNAME:
3095 case XML_REGEXP_NOTINITNAME:
3096 case XML_REGEXP_NAMECHAR:
3097 case XML_REGEXP_NOTNAMECHAR:
3098 case XML_REGEXP_DECIMAL:
3099 case XML_REGEXP_NOTDECIMAL:
3100 case XML_REGEXP_REALCHAR:
3101 case XML_REGEXP_NOTREALCHAR:
3102 case XML_REGEXP_LETTER:
3103 case XML_REGEXP_LETTER_UPPERCASE:
3104 case XML_REGEXP_LETTER_LOWERCASE:
3105 case XML_REGEXP_LETTER_TITLECASE:
3106 case XML_REGEXP_LETTER_MODIFIER:
3107 case XML_REGEXP_LETTER_OTHERS:
3108 case XML_REGEXP_MARK:
3109 case XML_REGEXP_MARK_NONSPACING:
3110 case XML_REGEXP_MARK_SPACECOMBINING:
3111 case XML_REGEXP_MARK_ENCLOSING:
3112 case XML_REGEXP_NUMBER:
3113 case XML_REGEXP_NUMBER_DECIMAL:
3114 case XML_REGEXP_NUMBER_LETTER:
3115 case XML_REGEXP_NUMBER_OTHERS:
3116 case XML_REGEXP_PUNCT:
3117 case XML_REGEXP_PUNCT_CONNECTOR:
3118 case XML_REGEXP_PUNCT_DASH:
3119 case XML_REGEXP_PUNCT_OPEN:
3120 case XML_REGEXP_PUNCT_CLOSE:
3121 case XML_REGEXP_PUNCT_INITQUOTE:
3122 case XML_REGEXP_PUNCT_FINQUOTE:
3123 case XML_REGEXP_PUNCT_OTHERS:
3124 case XML_REGEXP_SEPAR:
3125 case XML_REGEXP_SEPAR_SPACE:
3126 case XML_REGEXP_SEPAR_LINE:
3127 case XML_REGEXP_SEPAR_PARA:
3128 case XML_REGEXP_SYMBOL:
3129 case XML_REGEXP_SYMBOL_MATH:
3130 case XML_REGEXP_SYMBOL_CURRENCY:
3131 case XML_REGEXP_SYMBOL_MODIFIER:
3132 case XML_REGEXP_SYMBOL_OTHERS:
3133 case XML_REGEXP_OTHER:
3134 case XML_REGEXP_OTHER_CONTROL:
3135 case XML_REGEXP_OTHER_FORMAT:
3136 case XML_REGEXP_OTHER_PRIVATE:
3137 case XML_REGEXP_OTHER_NA:
3138 case XML_REGEXP_BLOCK_NAME:
3139 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3140 (const xmlChar *)atom->valuep);
3141 if (atom->neg)
3142 ret = !ret;
3143 break;
3144 }
3145 return(ret);
3146}
3147
3148/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003149 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003150 * Saving and restoring state of an execution context *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003151 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00003152 ************************************************************************/
3153
#ifdef DEBUG_REGEXP_EXEC
/*
 * xmlFARegDebugExec:
 * @exec:  a regexp execution context
 *
 * Debugging aid, only built when DEBUG_REGEXP_EXEC is defined: prints
 * the current state number, transition number and input index to
 * stdout, followed by either up to three values from the top of the
 * input stack or the remaining input string.
 */
static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
    if (exec->inputStack == NULL) {
        printf(": %s", &(exec->inputString[exec->index]));
    } else {
        int shown;

        printf(": ");
        /* most recently pushed values first */
        for (shown = 0; (shown < 3) && (shown < exec->inputStackNr); shown++) {
            int pos = exec->inputStackNr - (shown + 1);

            printf("%s ", (const char *) exec->inputStack[pos].value);
        }
    }
    printf("\n");
}
#endif
3170
3171static void
3172xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3173#ifdef DEBUG_REGEXP_EXEC
3174 printf("saving ");
3175 exec->transno++;
3176 xmlFARegDebugExec(exec);
3177 exec->transno--;
3178#endif
Daniel Veillard94cc1032005-09-15 13:09:00 +00003179#ifdef MAX_PUSH
3180 if (exec->nbPush > MAX_PUSH) {
3181 return;
3182 }
3183 exec->nbPush++;
3184#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003185
3186 if (exec->maxRollbacks == 0) {
3187 exec->maxRollbacks = 4;
3188 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3189 sizeof(xmlRegExecRollback));
3190 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003191 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003192 exec->maxRollbacks = 0;
3193 return;
3194 }
3195 memset(exec->rollbacks, 0,
3196 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3197 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3198 xmlRegExecRollback *tmp;
3199 int len = exec->maxRollbacks;
3200
3201 exec->maxRollbacks *= 2;
3202 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3203 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3204 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003205 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003206 exec->maxRollbacks /= 2;
3207 return;
3208 }
3209 exec->rollbacks = tmp;
3210 tmp = &exec->rollbacks[len];
3211 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3212 }
3213 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3214 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3215 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3216 if (exec->comp->nbCounters > 0) {
3217 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3218 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3219 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3220 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003221 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003222 exec->status = -5;
3223 return;
3224 }
3225 }
3226 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3227 exec->comp->nbCounters * sizeof(int));
3228 }
3229 exec->nbRollbacks++;
3230}
3231
3232static void
3233xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3234 if (exec->nbRollbacks <= 0) {
3235 exec->status = -1;
3236#ifdef DEBUG_REGEXP_EXEC
3237 printf("rollback failed on empty stack\n");
3238#endif
3239 return;
3240 }
3241 exec->nbRollbacks--;
3242 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3243 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3244 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3245 if (exec->comp->nbCounters > 0) {
3246 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3247 fprintf(stderr, "exec save: allocation failed");
3248 exec->status = -6;
3249 return;
3250 }
Gaurav2671b012013-09-11 14:59:06 +08003251 if (exec->counts) {
3252 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
Daniel Veillard4255d502002-04-16 15:50:10 +00003253 exec->comp->nbCounters * sizeof(int));
Gaurav2671b012013-09-11 14:59:06 +08003254 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003255 }
3256
3257#ifdef DEBUG_REGEXP_EXEC
3258 printf("restored ");
3259 xmlFARegDebugExec(exec);
3260#endif
3261}
3262
3263/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003264 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003265 * Verifier, running an input against a compiled regexp *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003266 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00003267 ************************************************************************/
3268
3269static int
3270xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3271 xmlRegExecCtxt execval;
3272 xmlRegExecCtxtPtr exec = &execval;
Daniel Veillard567a45b2005-10-18 19:11:55 +00003273 int ret, codepoint = 0, len, deter;
Daniel Veillard4255d502002-04-16 15:50:10 +00003274
3275 exec->inputString = content;
3276 exec->index = 0;
Daniel Veillard94cc1032005-09-15 13:09:00 +00003277 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003278 exec->determinist = 1;
3279 exec->maxRollbacks = 0;
3280 exec->nbRollbacks = 0;
3281 exec->rollbacks = NULL;
3282 exec->status = 0;
3283 exec->comp = comp;
3284 exec->state = comp->states[0];
3285 exec->transno = 0;
3286 exec->transcount = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00003287 exec->inputStack = NULL;
3288 exec->inputStackMax = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003289 if (comp->nbCounters > 0) {
3290 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00003291 if (exec->counts == NULL) {
3292 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003293 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00003294 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003295 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3296 } else
3297 exec->counts = NULL;
Daniel Veillard40851d02012-08-17 20:34:05 +08003298 while ((exec->status == 0) && (exec->state != NULL) &&
Daniel Veillard4255d502002-04-16 15:50:10 +00003299 ((exec->inputString[exec->index] != 0) ||
Daniel Veillardad559982008-05-12 13:15:35 +00003300 ((exec->state != NULL) &&
3301 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003302 xmlRegTransPtr trans;
3303 xmlRegAtomPtr atom;
3304
3305 /*
William M. Brack0e00b282004-04-26 15:40:47 +00003306 * If end of input on non-terminal state, rollback, however we may
Daniel Veillard4255d502002-04-16 15:50:10 +00003307 * still have epsilon like transition for counted transitions
William M. Brack0e00b282004-04-26 15:40:47 +00003308 * on counters, in that case don't break too early. Additionally,
3309 * if we are working on a range like "AB{0,2}", where B is not present,
3310 * we don't want to break.
Daniel Veillard4255d502002-04-16 15:50:10 +00003311 */
Daniel Veillard11ce4002006-03-10 00:36:23 +00003312 len = 1;
William M. Brack0e00b282004-04-26 15:40:47 +00003313 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
William M. Brackddf71d62004-05-06 04:17:26 +00003314 /*
3315 * if there is a transition, we must check if
3316 * atom allows minOccurs of 0
3317 */
3318 if (exec->transno < exec->state->nbTrans) {
William M. Brack0e00b282004-04-26 15:40:47 +00003319 trans = &exec->state->trans[exec->transno];
3320 if (trans->to >=0) {
3321 atom = trans->atom;
3322 if (!((atom->min == 0) && (atom->max > 0)))
3323 goto rollback;
3324 }
3325 } else
3326 goto rollback;
3327 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003328
3329 exec->transcount = 0;
3330 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3331 trans = &exec->state->trans[exec->transno];
3332 if (trans->to < 0)
3333 continue;
3334 atom = trans->atom;
3335 ret = 0;
Daniel Veillard567a45b2005-10-18 19:11:55 +00003336 deter = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003337 if (trans->count >= 0) {
3338 int count;
3339 xmlRegCounterPtr counter;
3340
Daniel Veillard11ce4002006-03-10 00:36:23 +00003341 if (exec->counts == NULL) {
3342 exec->status = -1;
3343 goto error;
3344 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003345 /*
3346 * A counted transition.
3347 */
3348
3349 count = exec->counts[trans->count];
3350 counter = &exec->comp->counters[trans->count];
3351#ifdef DEBUG_REGEXP_EXEC
3352 printf("testing count %d: val %d, min %d, max %d\n",
3353 trans->count, count, counter->min, counter->max);
3354#endif
3355 ret = ((count >= counter->min) && (count <= counter->max));
Daniel Veillard567a45b2005-10-18 19:11:55 +00003356 if ((ret) && (counter->min != counter->max))
3357 deter = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003358 } else if (atom == NULL) {
3359 fprintf(stderr, "epsilon transition left at runtime\n");
3360 exec->status = -2;
3361 break;
3362 } else if (exec->inputString[exec->index] != 0) {
3363 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3364 ret = xmlRegCheckCharacter(atom, codepoint);
William M. Brack0e00b282004-04-26 15:40:47 +00003365 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003366 xmlRegStatePtr to = comp->states[trans->to];
3367
3368 /*
3369 * this is a multiple input sequence
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003370 * If there is a counter associated increment it now.
3371 * before potentially saving and rollback
Daniel Veillardc821e032007-08-28 17:33:45 +00003372 * do not increment if the counter is already over the
3373 * maximum limit in which case get to next transition
Daniel Veillard4255d502002-04-16 15:50:10 +00003374 */
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003375 if (trans->counter >= 0) {
Daniel Veillardc821e032007-08-28 17:33:45 +00003376 xmlRegCounterPtr counter;
3377
3378 if ((exec->counts == NULL) ||
3379 (exec->comp == NULL) ||
3380 (exec->comp->counters == NULL)) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003381 exec->status = -1;
3382 goto error;
3383 }
Daniel Veillardc821e032007-08-28 17:33:45 +00003384 counter = &exec->comp->counters[trans->counter];
3385 if (exec->counts[trans->counter] >= counter->max)
3386 continue; /* for loop on transitions */
3387
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003388#ifdef DEBUG_REGEXP_EXEC
3389 printf("Increasing count %d\n", trans->counter);
3390#endif
3391 exec->counts[trans->counter]++;
3392 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003393 if (exec->state->nbTrans > exec->transno + 1) {
3394 xmlFARegExecSave(exec);
3395 }
3396 exec->transcount = 1;
3397 do {
3398 /*
3399 * Try to progress as much as possible on the input
3400 */
3401 if (exec->transcount == atom->max) {
3402 break;
3403 }
3404 exec->index += len;
3405 /*
3406 * End of input: stop here
3407 */
3408 if (exec->inputString[exec->index] == 0) {
3409 exec->index -= len;
3410 break;
3411 }
3412 if (exec->transcount >= atom->min) {
3413 int transno = exec->transno;
3414 xmlRegStatePtr state = exec->state;
3415
3416 /*
3417 * The transition is acceptable save it
3418 */
3419 exec->transno = -1; /* trick */
3420 exec->state = to;
3421 xmlFARegExecSave(exec);
3422 exec->transno = transno;
3423 exec->state = state;
3424 }
3425 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3426 len);
3427 ret = xmlRegCheckCharacter(atom, codepoint);
3428 exec->transcount++;
3429 } while (ret == 1);
3430 if (exec->transcount < atom->min)
3431 ret = 0;
3432
3433 /*
3434 * If the last check failed but one transition was found
3435 * possible, rollback
3436 */
3437 if (ret < 0)
3438 ret = 0;
3439 if (ret == 0) {
3440 goto rollback;
3441 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003442 if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003443 if (exec->counts == NULL) {
3444 exec->status = -1;
3445 goto error;
3446 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003447#ifdef DEBUG_REGEXP_EXEC
3448 printf("Decreasing count %d\n", trans->counter);
3449#endif
3450 exec->counts[trans->counter]--;
3451 }
William M. Brack0e00b282004-04-26 15:40:47 +00003452 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3453 /*
3454 * we don't match on the codepoint, but minOccurs of 0
3455 * says that's ok. Setting len to 0 inhibits stepping
3456 * over the codepoint.
3457 */
3458 exec->transcount = 1;
3459 len = 0;
3460 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003461 }
William M. Brack0e00b282004-04-26 15:40:47 +00003462 } else if ((atom->min == 0) && (atom->max > 0)) {
3463 /* another spot to match when minOccurs is 0 */
3464 exec->transcount = 1;
3465 len = 0;
3466 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003467 }
3468 if (ret == 1) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00003469 if ((trans->nd == 1) ||
3470 ((trans->count >= 0) && (deter == 0) &&
3471 (exec->state->nbTrans > exec->transno + 1))) {
Daniel Veillardaa622012005-10-20 15:55:25 +00003472#ifdef DEBUG_REGEXP_EXEC
3473 if (trans->nd == 1)
3474 printf("Saving on nd transition atom %d for %c at %d\n",
3475 trans->atom->no, codepoint, exec->index);
3476 else
3477 printf("Saving on counted transition count %d for %c at %d\n",
3478 trans->count, codepoint, exec->index);
3479#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003480 xmlFARegExecSave(exec);
3481 }
3482 if (trans->counter >= 0) {
Daniel Veillardc821e032007-08-28 17:33:45 +00003483 xmlRegCounterPtr counter;
3484
3485 /* make sure we don't go over the counter maximum value */
3486 if ((exec->counts == NULL) ||
3487 (exec->comp == NULL) ||
3488 (exec->comp->counters == NULL)) {
3489 exec->status = -1;
Daniel Veillard11ce4002006-03-10 00:36:23 +00003490 goto error;
3491 }
Daniel Veillardc821e032007-08-28 17:33:45 +00003492 counter = &exec->comp->counters[trans->counter];
3493 if (exec->counts[trans->counter] >= counter->max)
3494 continue; /* for loop on transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +00003495#ifdef DEBUG_REGEXP_EXEC
3496 printf("Increasing count %d\n", trans->counter);
3497#endif
3498 exec->counts[trans->counter]++;
3499 }
Daniel Veillard10752282005-08-08 13:05:13 +00003500 if ((trans->count >= 0) &&
3501 (trans->count < REGEXP_ALL_COUNTER)) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003502 if (exec->counts == NULL) {
3503 exec->status = -1;
3504 goto error;
3505 }
Daniel Veillard10752282005-08-08 13:05:13 +00003506#ifdef DEBUG_REGEXP_EXEC
3507 printf("resetting count %d on transition\n",
3508 trans->count);
3509#endif
3510 exec->counts[trans->count] = 0;
3511 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003512#ifdef DEBUG_REGEXP_EXEC
3513 printf("entering state %d\n", trans->to);
3514#endif
3515 exec->state = comp->states[trans->to];
3516 exec->transno = 0;
3517 if (trans->atom != NULL) {
3518 exec->index += len;
3519 }
3520 goto progress;
3521 } else if (ret < 0) {
3522 exec->status = -4;
3523 break;
3524 }
3525 }
3526 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3527rollback:
3528 /*
3529 * Failed to find a way out
3530 */
3531 exec->determinist = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00003532#ifdef DEBUG_REGEXP_EXEC
3533 printf("rollback from state %d on %d:%c\n", exec->state->no,
3534 codepoint,codepoint);
3535#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003536 xmlFARegExecRollBack(exec);
3537 }
3538progress:
3539 continue;
3540 }
Daniel Veillard11ce4002006-03-10 00:36:23 +00003541error:
Daniel Veillard4255d502002-04-16 15:50:10 +00003542 if (exec->rollbacks != NULL) {
3543 if (exec->counts != NULL) {
3544 int i;
3545
3546 for (i = 0;i < exec->maxRollbacks;i++)
3547 if (exec->rollbacks[i].counts != NULL)
3548 xmlFree(exec->rollbacks[i].counts);
3549 }
3550 xmlFree(exec->rollbacks);
3551 }
Daniel Veillard40851d02012-08-17 20:34:05 +08003552 if (exec->state == NULL)
3553 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003554 if (exec->counts != NULL)
3555 xmlFree(exec->counts);
3556 if (exec->status == 0)
3557 return(1);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003558 if (exec->status == -1) {
3559 if (exec->nbPush > MAX_PUSH)
3560 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003561 return(0);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003562 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003563 return(exec->status);
3564}
3565
3566/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003567 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003568 * Progressive interface to the verifier one atom at a time *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003569 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00003570 ************************************************************************/
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003571#ifdef DEBUG_ERR
3572static void testerr(xmlRegExecCtxtPtr exec);
3573#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003574
3575/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003576 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00003577 * @comp: a precompiled regular expression
3578 * @callback: a callback function used for handling progresses in the
3579 * automata matching phase
3580 * @data: the context data associated to the callback in this context
3581 *
3582 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00003583 *
3584 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00003585 */
3586xmlRegExecCtxtPtr
3587xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3588 xmlRegExecCtxtPtr exec;
3589
3590 if (comp == NULL)
3591 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003592 if ((comp->compact == NULL) && (comp->states == NULL))
3593 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00003594 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3595 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003596 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003597 return(NULL);
3598 }
3599 memset(exec, 0, sizeof(xmlRegExecCtxt));
3600 exec->inputString = NULL;
3601 exec->index = 0;
3602 exec->determinist = 1;
3603 exec->maxRollbacks = 0;
3604 exec->nbRollbacks = 0;
3605 exec->rollbacks = NULL;
3606 exec->status = 0;
3607 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00003608 if (comp->compact == NULL)
3609 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00003610 exec->transno = 0;
3611 exec->transcount = 0;
3612 exec->callback = callback;
3613 exec->data = data;
3614 if (comp->nbCounters > 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003615 /*
3616 * For error handling, exec->counts is allocated twice the size
3617 * the second half is used to store the data in case of rollback
3618 */
3619 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3620 * 2);
Daniel Veillard4255d502002-04-16 15:50:10 +00003621 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003622 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003623 xmlFree(exec);
3624 return(NULL);
3625 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003626 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3627 exec->errCounts = &exec->counts[comp->nbCounters];
3628 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00003629 exec->counts = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003630 exec->errCounts = NULL;
3631 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003632 exec->inputStackMax = 0;
3633 exec->inputStackNr = 0;
3634 exec->inputStack = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003635 exec->errStateNo = -1;
3636 exec->errString = NULL;
Daniel Veillard94cc1032005-09-15 13:09:00 +00003637 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003638 return(exec);
3639}
3640
3641/**
3642 * xmlRegFreeExecCtxt:
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003643 * @exec: a regular expression evaluation context
Daniel Veillard4255d502002-04-16 15:50:10 +00003644 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003645 * Free the structures associated to a regular expression evaluation context.
Daniel Veillard4255d502002-04-16 15:50:10 +00003646 */
3647void
3648xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3649 if (exec == NULL)
3650 return;
3651
3652 if (exec->rollbacks != NULL) {
3653 if (exec->counts != NULL) {
3654 int i;
3655
3656 for (i = 0;i < exec->maxRollbacks;i++)
3657 if (exec->rollbacks[i].counts != NULL)
3658 xmlFree(exec->rollbacks[i].counts);
3659 }
3660 xmlFree(exec->rollbacks);
3661 }
3662 if (exec->counts != NULL)
3663 xmlFree(exec->counts);
3664 if (exec->inputStack != NULL) {
3665 int i;
3666
Daniel Veillard32370232002-10-16 14:08:14 +00003667 for (i = 0;i < exec->inputStackNr;i++) {
3668 if (exec->inputStack[i].value != NULL)
3669 xmlFree(exec->inputStack[i].value);
3670 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003671 xmlFree(exec->inputStack);
3672 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003673 if (exec->errString != NULL)
3674 xmlFree(exec->errString);
Daniel Veillard4255d502002-04-16 15:50:10 +00003675 xmlFree(exec);
3676}
3677
3678static void
3679xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3680 void *data) {
3681#ifdef DEBUG_PUSH
3682 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3683#endif
3684 if (exec->inputStackMax == 0) {
3685 exec->inputStackMax = 4;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003686 exec->inputStack = (xmlRegInputTokenPtr)
Daniel Veillard4255d502002-04-16 15:50:10 +00003687 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3688 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003689 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003690 exec->inputStackMax = 0;
3691 return;
3692 }
3693 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3694 xmlRegInputTokenPtr tmp;
3695
3696 exec->inputStackMax *= 2;
3697 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3698 exec->inputStackMax * sizeof(xmlRegInputToken));
3699 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003700 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003701 exec->inputStackMax /= 2;
3702 return;
3703 }
3704 exec->inputStack = tmp;
3705 }
3706 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3707 exec->inputStack[exec->inputStackNr].data = data;
3708 exec->inputStackNr++;
3709 exec->inputStack[exec->inputStackNr].value = NULL;
3710 exec->inputStack[exec->inputStackNr].data = NULL;
3711}
3712
Daniel Veillardc0826a72004-08-10 14:17:33 +00003713/**
3714 * xmlRegStrEqualWildcard:
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003715 * @expStr: the string to be evaluated
Daniel Veillardc0826a72004-08-10 14:17:33 +00003716 * @valStr: the validation string
3717 *
3718 * Checks if both strings are equal or have the same content. "*"
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003719 * can be used as a wildcard in @valStr; "|" is used as a separator of
Daniel Veillardc0826a72004-08-10 14:17:33 +00003720 * substrings in both @expStr and @valStr.
3721 *
3722 * Returns 1 if the comparison is satisfied and the number of substrings
3723 * is equal, 0 otherwise.
3724 */
3725
3726static int
3727xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3728 if (expStr == valStr) return(1);
3729 if (expStr == NULL) return(0);
3730 if (valStr == NULL) return(0);
3731 do {
3732 /*
3733 * Eval if we have a wildcard for the current item.
3734 */
3735 if (*expStr != *valStr) {
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00003736 /* if one of them starts with a wildcard make valStr be it */
3737 if (*valStr == '*') {
3738 const xmlChar *tmp;
3739
3740 tmp = valStr;
3741 valStr = expStr;
3742 expStr = tmp;
3743 }
Daniel Veillardc0826a72004-08-10 14:17:33 +00003744 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3745 do {
3746 if (*valStr == XML_REG_STRING_SEPARATOR)
3747 break;
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00003748 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00003749 } while (*valStr != 0);
3750 continue;
3751 } else
3752 return(0);
3753 }
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00003754 expStr++;
3755 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00003756 } while (*valStr != 0);
3757 if (*expStr != 0)
3758 return (0);
3759 else
3760 return (1);
3761}
Daniel Veillard4255d502002-04-16 15:50:10 +00003762
3763/**
Daniel Veillard23e73572002-09-19 19:56:43 +00003764 * xmlRegCompactPushString:
3765 * @exec: a regexp execution context
3766 * @comp: the precompiled exec with a compact table
3767 * @value: a string token input
3768 * @data: data associated to the token to reuse in callbacks
3769 *
3770 * Push one input token in the execution context
3771 *
3772 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3773 * a negative value in case of error.
3774 */
3775static int
3776xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3777 xmlRegexpPtr comp,
3778 const xmlChar *value,
3779 void *data) {
3780 int state = exec->index;
3781 int i, target;
3782
3783 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3784 return(-1);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003785
Daniel Veillard23e73572002-09-19 19:56:43 +00003786 if (value == NULL) {
3787 /*
3788 * are we at a final state ?
3789 */
3790 if (comp->compact[state * (comp->nbstrings + 1)] ==
3791 XML_REGEXP_FINAL_STATE)
3792 return(1);
3793 return(0);
3794 }
3795
3796#ifdef DEBUG_PUSH
3797 printf("value pushed: %s\n", value);
3798#endif
3799
3800 /*
William M. Brackddf71d62004-05-06 04:17:26 +00003801 * Examine all outside transitions from current state
Daniel Veillard23e73572002-09-19 19:56:43 +00003802 */
3803 for (i = 0;i < comp->nbstrings;i++) {
3804 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3805 if ((target > 0) && (target <= comp->nbstates)) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003806 target--; /* to avoid 0 */
Daniel Veillardc0826a72004-08-10 14:17:33 +00003807 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003808 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00003809 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3810 exec->callback(exec->data, value,
3811 comp->transdata[state * comp->nbstrings + i], data);
3812 }
Daniel Veillard23e73572002-09-19 19:56:43 +00003813#ifdef DEBUG_PUSH
3814 printf("entering state %d\n", target);
3815#endif
3816 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003817 XML_REGEXP_SINK_STATE)
3818 goto error;
3819
3820 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillard23e73572002-09-19 19:56:43 +00003821 XML_REGEXP_FINAL_STATE)
3822 return(1);
3823 return(0);
3824 }
3825 }
3826 }
3827 /*
3828 * Failed to find an exit transition out from current state for the
3829 * current token
3830 */
3831#ifdef DEBUG_PUSH
3832 printf("failed to find a transition for %s on state %d\n", value, state);
3833#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003834error:
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003835 if (exec->errString != NULL)
3836 xmlFree(exec->errString);
3837 exec->errString = xmlStrdup(value);
3838 exec->errStateNo = state;
Daniel Veillard23e73572002-09-19 19:56:43 +00003839 exec->status = -1;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003840#ifdef DEBUG_ERR
3841 testerr(exec);
3842#endif
Daniel Veillard23e73572002-09-19 19:56:43 +00003843 return(-1);
3844}
3845
3846/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00003847 * xmlRegExecPushStringInternal:
Daniel Veillardea7751d2002-12-20 00:16:24 +00003848 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00003849 * @value: a string token input
3850 * @data: data associated to the token to reuse in callbacks
Daniel Veillard6e65e152005-08-09 11:09:52 +00003851 * @compound: value was assembled from 2 strings
Daniel Veillard4255d502002-04-16 15:50:10 +00003852 *
3853 * Push one input token in the execution context
3854 *
3855 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3856 * a negative value in case of error.
3857 */
Daniel Veillard6e65e152005-08-09 11:09:52 +00003858static int
3859xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3860 void *data, int compound) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003861 xmlRegTransPtr trans;
3862 xmlRegAtomPtr atom;
3863 int ret;
3864 int final = 0;
Daniel Veillard90700152005-01-08 22:05:09 +00003865 int progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003866
3867 if (exec == NULL)
3868 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00003869 if (exec->comp == NULL)
3870 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003871 if (exec->status != 0)
3872 return(exec->status);
3873
Daniel Veillard23e73572002-09-19 19:56:43 +00003874 if (exec->comp->compact != NULL)
3875 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3876
Daniel Veillard4255d502002-04-16 15:50:10 +00003877 if (value == NULL) {
3878 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3879 return(1);
3880 final = 1;
3881 }
3882
3883#ifdef DEBUG_PUSH
3884 printf("value pushed: %s\n", value);
3885#endif
3886 /*
3887 * If we have an active rollback stack push the new value there
3888 * and get back to where we were left
3889 */
3890 if ((value != NULL) && (exec->inputStackNr > 0)) {
3891 xmlFARegExecSaveInputString(exec, value, data);
3892 value = exec->inputStack[exec->index].value;
3893 data = exec->inputStack[exec->index].data;
3894#ifdef DEBUG_PUSH
3895 printf("value loaded: %s\n", value);
3896#endif
3897 }
3898
3899 while ((exec->status == 0) &&
3900 ((value != NULL) ||
3901 ((final == 1) &&
3902 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3903
3904 /*
3905 * End of input on non-terminal state, rollback, however we may
3906 * still have epsilon like transition for counted transitions
3907 * on counters, in that case don't break too early.
3908 */
Daniel Veillardb509f152002-04-17 16:28:10 +00003909 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00003910 goto rollback;
3911
3912 exec->transcount = 0;
3913 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3914 trans = &exec->state->trans[exec->transno];
3915 if (trans->to < 0)
3916 continue;
3917 atom = trans->atom;
3918 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00003919 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3920 int i;
3921 int count;
3922 xmlRegTransPtr t;
3923 xmlRegCounterPtr counter;
3924
3925 ret = 0;
3926
3927#ifdef DEBUG_PUSH
3928 printf("testing all lax %d\n", trans->count);
3929#endif
3930 /*
3931 * Check all counted transitions from the current state
3932 */
3933 if ((value == NULL) && (final)) {
3934 ret = 1;
3935 } else if (value != NULL) {
3936 for (i = 0;i < exec->state->nbTrans;i++) {
3937 t = &exec->state->trans[i];
3938 if ((t->counter < 0) || (t == trans))
3939 continue;
3940 counter = &exec->comp->counters[t->counter];
3941 count = exec->counts[t->counter];
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003942 if ((count < counter->max) &&
Daniel Veillard441bc322002-04-20 17:38:48 +00003943 (t->atom != NULL) &&
3944 (xmlStrEqual(value, t->atom->valuep))) {
3945 ret = 0;
3946 break;
3947 }
3948 if ((count >= counter->min) &&
3949 (count < counter->max) &&
Daniel Veillard11ce4002006-03-10 00:36:23 +00003950 (t->atom != NULL) &&
Daniel Veillard441bc322002-04-20 17:38:48 +00003951 (xmlStrEqual(value, t->atom->valuep))) {
3952 ret = 1;
3953 break;
3954 }
3955 }
3956 }
3957 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00003958 int i;
3959 int count;
3960 xmlRegTransPtr t;
3961 xmlRegCounterPtr counter;
3962
3963 ret = 1;
3964
3965#ifdef DEBUG_PUSH
3966 printf("testing all %d\n", trans->count);
3967#endif
3968 /*
3969 * Check all counted transitions from the current state
3970 */
3971 for (i = 0;i < exec->state->nbTrans;i++) {
3972 t = &exec->state->trans[i];
3973 if ((t->counter < 0) || (t == trans))
3974 continue;
3975 counter = &exec->comp->counters[t->counter];
3976 count = exec->counts[t->counter];
3977 if ((count < counter->min) || (count > counter->max)) {
3978 ret = 0;
3979 break;
3980 }
3981 }
3982 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003983 int count;
3984 xmlRegCounterPtr counter;
3985
3986 /*
3987 * A counted transition.
3988 */
3989
3990 count = exec->counts[trans->count];
3991 counter = &exec->comp->counters[trans->count];
3992#ifdef DEBUG_PUSH
3993 printf("testing count %d: val %d, min %d, max %d\n",
3994 trans->count, count, counter->min, counter->max);
3995#endif
3996 ret = ((count >= counter->min) && (count <= counter->max));
3997 } else if (atom == NULL) {
3998 fprintf(stderr, "epsilon transition left at runtime\n");
3999 exec->status = -2;
4000 break;
4001 } else if (value != NULL) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00004002 ret = xmlRegStrEqualWildcard(atom->valuep, value);
Daniel Veillard6e65e152005-08-09 11:09:52 +00004003 if (atom->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00004004 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00004005 if (!compound)
4006 ret = 0;
4007 }
Daniel Veillard441bc322002-04-20 17:38:48 +00004008 if ((ret == 1) && (trans->counter >= 0)) {
4009 xmlRegCounterPtr counter;
4010 int count;
4011
4012 count = exec->counts[trans->counter];
4013 counter = &exec->comp->counters[trans->counter];
4014 if (count >= counter->max)
4015 ret = 0;
4016 }
4017
Daniel Veillard4255d502002-04-16 15:50:10 +00004018 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4019 xmlRegStatePtr to = exec->comp->states[trans->to];
4020
4021 /*
4022 * this is a multiple input sequence
4023 */
4024 if (exec->state->nbTrans > exec->transno + 1) {
4025 if (exec->inputStackNr <= 0) {
4026 xmlFARegExecSaveInputString(exec, value, data);
4027 }
4028 xmlFARegExecSave(exec);
4029 }
4030 exec->transcount = 1;
4031 do {
4032 /*
4033 * Try to progress as much as possible on the input
4034 */
4035 if (exec->transcount == atom->max) {
4036 break;
4037 }
4038 exec->index++;
4039 value = exec->inputStack[exec->index].value;
4040 data = exec->inputStack[exec->index].data;
4041#ifdef DEBUG_PUSH
4042 printf("value loaded: %s\n", value);
4043#endif
4044
4045 /*
4046 * End of input: stop here
4047 */
4048 if (value == NULL) {
4049 exec->index --;
4050 break;
4051 }
4052 if (exec->transcount >= atom->min) {
4053 int transno = exec->transno;
4054 xmlRegStatePtr state = exec->state;
4055
4056 /*
4057 * The transition is acceptable save it
4058 */
4059 exec->transno = -1; /* trick */
4060 exec->state = to;
4061 if (exec->inputStackNr <= 0) {
4062 xmlFARegExecSaveInputString(exec, value, data);
4063 }
4064 xmlFARegExecSave(exec);
4065 exec->transno = transno;
4066 exec->state = state;
4067 }
4068 ret = xmlStrEqual(value, atom->valuep);
4069 exec->transcount++;
4070 } while (ret == 1);
4071 if (exec->transcount < atom->min)
4072 ret = 0;
4073
4074 /*
4075 * If the last check failed but one transition was found
4076 * possible, rollback
4077 */
4078 if (ret < 0)
4079 ret = 0;
4080 if (ret == 0) {
4081 goto rollback;
4082 }
4083 }
4084 }
4085 if (ret == 1) {
William M. Brack98873952003-12-26 06:03:14 +00004086 if ((exec->callback != NULL) && (atom != NULL) &&
4087 (data != NULL)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004088 exec->callback(exec->data, atom->valuep,
4089 atom->data, data);
4090 }
4091 if (exec->state->nbTrans > exec->transno + 1) {
4092 if (exec->inputStackNr <= 0) {
4093 xmlFARegExecSaveInputString(exec, value, data);
4094 }
4095 xmlFARegExecSave(exec);
4096 }
4097 if (trans->counter >= 0) {
4098#ifdef DEBUG_PUSH
4099 printf("Increasing count %d\n", trans->counter);
4100#endif
4101 exec->counts[trans->counter]++;
4102 }
Daniel Veillard10752282005-08-08 13:05:13 +00004103 if ((trans->count >= 0) &&
4104 (trans->count < REGEXP_ALL_COUNTER)) {
4105#ifdef DEBUG_REGEXP_EXEC
4106 printf("resetting count %d on transition\n",
4107 trans->count);
4108#endif
4109 exec->counts[trans->count] = 0;
4110 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004111#ifdef DEBUG_PUSH
4112 printf("entering state %d\n", trans->to);
4113#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004114 if ((exec->comp->states[trans->to] != NULL) &&
4115 (exec->comp->states[trans->to]->type ==
4116 XML_REGEXP_SINK_STATE)) {
4117 /*
4118 * entering a sink state, save the current state as error
4119 * state.
4120 */
4121 if (exec->errString != NULL)
4122 xmlFree(exec->errString);
4123 exec->errString = xmlStrdup(value);
4124 exec->errState = exec->state;
4125 memcpy(exec->errCounts, exec->counts,
4126 exec->comp->nbCounters * sizeof(int));
4127 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004128 exec->state = exec->comp->states[trans->to];
4129 exec->transno = 0;
4130 if (trans->atom != NULL) {
4131 if (exec->inputStack != NULL) {
4132 exec->index++;
4133 if (exec->index < exec->inputStackNr) {
4134 value = exec->inputStack[exec->index].value;
4135 data = exec->inputStack[exec->index].data;
4136#ifdef DEBUG_PUSH
4137 printf("value loaded: %s\n", value);
4138#endif
4139 } else {
4140 value = NULL;
4141 data = NULL;
4142#ifdef DEBUG_PUSH
4143 printf("end of input\n");
4144#endif
4145 }
4146 } else {
4147 value = NULL;
4148 data = NULL;
4149#ifdef DEBUG_PUSH
4150 printf("end of input\n");
4151#endif
4152 }
4153 }
4154 goto progress;
4155 } else if (ret < 0) {
4156 exec->status = -4;
4157 break;
4158 }
4159 }
4160 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4161rollback:
Daniel Veillard90700152005-01-08 22:05:09 +00004162 /*
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004163 * if we didn't yet rollback on the current input
4164 * store the current state as the error state.
Daniel Veillard90700152005-01-08 22:05:09 +00004165 */
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004166 if ((progress) && (exec->state != NULL) &&
4167 (exec->state->type != XML_REGEXP_SINK_STATE)) {
Daniel Veillard90700152005-01-08 22:05:09 +00004168 progress = 0;
4169 if (exec->errString != NULL)
4170 xmlFree(exec->errString);
4171 exec->errString = xmlStrdup(value);
4172 exec->errState = exec->state;
Nick Wellnhofer34e44562017-05-31 16:48:27 +02004173 if (exec->comp->nbCounters)
4174 memcpy(exec->errCounts, exec->counts,
4175 exec->comp->nbCounters * sizeof(int));
Daniel Veillard90700152005-01-08 22:05:09 +00004176 }
4177
Daniel Veillard4255d502002-04-16 15:50:10 +00004178 /*
4179 * Failed to find a way out
4180 */
4181 exec->determinist = 0;
4182 xmlFARegExecRollBack(exec);
Gaurav2671b012013-09-11 14:59:06 +08004183 if ((exec->inputStack != NULL ) && (exec->status == 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004184 value = exec->inputStack[exec->index].value;
4185 data = exec->inputStack[exec->index].data;
4186#ifdef DEBUG_PUSH
4187 printf("value loaded: %s\n", value);
4188#endif
4189 }
4190 }
Daniel Veillard90700152005-01-08 22:05:09 +00004191 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00004192progress:
Daniel Veillard90700152005-01-08 22:05:09 +00004193 progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004194 continue;
4195 }
4196 if (exec->status == 0) {
4197 return(exec->state->type == XML_REGEXP_FINAL_STATE);
4198 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004199#ifdef DEBUG_ERR
Daniel Veillard90700152005-01-08 22:05:09 +00004200 if (exec->status < 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004201 testerr(exec);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004202 }
Daniel Veillard90700152005-01-08 22:05:09 +00004203#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00004204 return(exec->status);
4205}
4206
Daniel Veillard52b48c72003-04-13 19:53:42 +00004207/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00004208 * xmlRegExecPushString:
4209 * @exec: a regexp execution context or NULL to indicate the end
4210 * @value: a string token input
4211 * @data: data associated to the token to reuse in callbacks
4212 *
4213 * Push one input token in the execution context
4214 *
4215 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4216 * a negative value in case of error.
4217 */
4218int
4219xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4220 void *data) {
4221 return(xmlRegExecPushStringInternal(exec, value, data, 0));
4222}
4223
4224/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00004225 * xmlRegExecPushString2:
4226 * @exec: a regexp execution context or NULL to indicate the end
4227 * @value: the first string token input
4228 * @value2: the second string token input
4229 * @data: data associated to the token to reuse in callbacks
4230 *
4231 * Push one input token in the execution context
4232 *
4233 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4234 * a negative value in case of error.
4235 */
4236int
4237xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4238 const xmlChar *value2, void *data) {
4239 xmlChar buf[150];
4240 int lenn, lenp, ret;
4241 xmlChar *str;
4242
4243 if (exec == NULL)
4244 return(-1);
4245 if (exec->comp == NULL)
4246 return(-1);
4247 if (exec->status != 0)
4248 return(exec->status);
4249
4250 if (value2 == NULL)
4251 return(xmlRegExecPushString(exec, value, data));
4252
4253 lenn = strlen((char *) value2);
4254 lenp = strlen((char *) value);
4255
4256 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00004257 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004258 if (str == NULL) {
4259 exec->status = -1;
4260 return(-1);
4261 }
4262 } else {
4263 str = buf;
4264 }
4265 memcpy(&str[0], value, lenp);
Daniel Veillardc0826a72004-08-10 14:17:33 +00004266 str[lenp] = XML_REG_STRING_SEPARATOR;
Daniel Veillard52b48c72003-04-13 19:53:42 +00004267 memcpy(&str[lenp + 1], value2, lenn);
4268 str[lenn + lenp + 1] = 0;
4269
4270 if (exec->comp->compact != NULL)
4271 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4272 else
Daniel Veillard6e65e152005-08-09 11:09:52 +00004273 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004274
4275 if (str != buf)
Daniel Veillard0b1ff142005-12-28 21:13:33 +00004276 xmlFree(str);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004277 return(ret);
4278}
4279
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004280/**
Daniel Veillard77005e62005-07-19 16:26:18 +00004281 * xmlRegExecGetValues:
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004282 * @exec: a regexp execution context
4283 * @err: error extraction or normal one
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004284 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004285 * @nbneg: return number of negative transitions
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004286 * @values: pointer to the array of acceptable values
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004287 * @terminal: return value if this was a terminal state
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004288 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004289 * Extract information from the regexp execution, internal routine to
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004290 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004291 *
4292 * Returns: 0 in case of success or -1 in case of error.
4293 */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004294static int
4295xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004296 int *nbval, int *nbneg,
4297 xmlChar **values, int *terminal) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004298 int maxval;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004299 int nb = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004300
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004301 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004302 (values == NULL) || (*nbval <= 0))
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004303 return(-1);
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004304
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004305 maxval = *nbval;
4306 *nbval = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004307 *nbneg = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004308 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4309 xmlRegexpPtr comp;
4310 int target, i, state;
4311
4312 comp = exec->comp;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004313
4314 if (err) {
4315 if (exec->errStateNo == -1) return(-1);
4316 state = exec->errStateNo;
4317 } else {
4318 state = exec->index;
4319 }
4320 if (terminal != NULL) {
4321 if (comp->compact[state * (comp->nbstrings + 1)] ==
4322 XML_REGEXP_FINAL_STATE)
4323 *terminal = 1;
4324 else
4325 *terminal = 0;
4326 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004327 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004328 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004329 if ((target > 0) && (target <= comp->nbstates) &&
4330 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4331 XML_REGEXP_SINK_STATE)) {
4332 values[nb++] = comp->stringMap[i];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004333 (*nbval)++;
4334 }
4335 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004336 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4337 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4338 if ((target > 0) && (target <= comp->nbstates) &&
4339 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4340 XML_REGEXP_SINK_STATE)) {
4341 values[nb++] = comp->stringMap[i];
4342 (*nbneg)++;
4343 }
4344 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004345 } else {
4346 int transno;
4347 xmlRegTransPtr trans;
4348 xmlRegAtomPtr atom;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004349 xmlRegStatePtr state;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004350
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004351 if (terminal != NULL) {
4352 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4353 *terminal = 1;
4354 else
4355 *terminal = 0;
4356 }
4357
4358 if (err) {
4359 if (exec->errState == NULL) return(-1);
4360 state = exec->errState;
4361 } else {
4362 if (exec->state == NULL) return(-1);
4363 state = exec->state;
4364 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004365 for (transno = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004366 (transno < state->nbTrans) && (nb < maxval);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004367 transno++) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004368 trans = &state->trans[transno];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004369 if (trans->to < 0)
4370 continue;
4371 atom = trans->atom;
4372 if ((atom == NULL) || (atom->valuep == NULL))
4373 continue;
4374 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004375 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004376 TODO;
4377 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004378 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004379 TODO;
4380 } else if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00004381 xmlRegCounterPtr counter = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004382 int count;
4383
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004384 if (err)
4385 count = exec->errCounts[trans->counter];
4386 else
4387 count = exec->counts[trans->counter];
Daniel Veillard11ce4002006-03-10 00:36:23 +00004388 if (exec->comp != NULL)
4389 counter = &exec->comp->counters[trans->counter];
4390 if ((counter == NULL) || (count < counter->max)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004391 if (atom->neg)
4392 values[nb++] = (xmlChar *) atom->valuep2;
4393 else
4394 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004395 (*nbval)++;
4396 }
4397 } else {
Gaurav2671b012013-09-11 14:59:06 +08004398 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004399 (exec->comp->states[trans->to]->type !=
4400 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004401 if (atom->neg)
4402 values[nb++] = (xmlChar *) atom->valuep2;
4403 else
4404 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004405 (*nbval)++;
4406 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004407 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004408 }
4409 for (transno = 0;
4410 (transno < state->nbTrans) && (nb < maxval);
4411 transno++) {
4412 trans = &state->trans[transno];
4413 if (trans->to < 0)
4414 continue;
4415 atom = trans->atom;
4416 if ((atom == NULL) || (atom->valuep == NULL))
4417 continue;
4418 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4419 continue;
4420 } else if (trans->count == REGEXP_ALL_COUNTER) {
4421 continue;
4422 } else if (trans->counter >= 0) {
4423 continue;
4424 } else {
4425 if ((exec->comp->states[trans->to] != NULL) &&
4426 (exec->comp->states[trans->to]->type ==
4427 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004428 if (atom->neg)
4429 values[nb++] = (xmlChar *) atom->valuep2;
4430 else
4431 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004432 (*nbneg)++;
4433 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004434 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004435 }
4436 }
4437 return(0);
4438}
4439
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004440/**
4441 * xmlRegExecNextValues:
4442 * @exec: a regexp execution context
4443 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004444 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004445 * @values: pointer to the array of acceptable values
4446 * @terminal: return value if this was a terminal state
4447 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004448 * Extract information from the regexp execution,
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004449 * the parameter @values must point to an array of @nbval string pointers
4450 * on return nbval will contain the number of possible strings in that
4451 * state and the @values array will be updated with them. The string values
4452 * returned will be freed with the @exec context and don't need to be
4453 * deallocated.
4454 *
4455 * Returns: 0 in case of success or -1 in case of error.
4456 */
4457int
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004458xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4459 xmlChar **values, int *terminal) {
4460 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004461}
4462
4463/**
4464 * xmlRegExecErrInfo:
4465 * @exec: a regexp execution context generating an error
4466 * @string: return value for the error string
4467 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004468 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004469 * @values: pointer to the array of acceptable values
4470 * @terminal: return value if this was a terminal state
4471 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004472 * Extract error information from the regexp execution, the parameter
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004473 * @string will be updated with the value pushed and not accepted,
4474 * the parameter @values must point to an array of @nbval string pointers
4475 * on return nbval will contain the number of possible strings in that
4476 * state and the @values array will be updated with them. The string values
4477 * returned will be freed with the @exec context and don't need to be
4478 * deallocated.
4479 *
4480 * Returns: 0 in case of success or -1 in case of error.
4481 */
4482int
4483xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004484 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004485 if (exec == NULL)
4486 return(-1);
4487 if (string != NULL) {
4488 if (exec->status != 0)
4489 *string = exec->errString;
4490 else
4491 *string = NULL;
4492 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004493 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004494}
4495
#ifdef DEBUG_ERR
/*
 * testerr:
 * @exec: a regexp execution context
 *
 * Debug helper, only built with DEBUG_ERR: calls xmlRegExecErrInfo()
 * with room for 5 values and discards everything it reports.
 */
static void testerr(xmlRegExecCtxtPtr exec) {
    const xmlChar *rejected;
    xmlChar *accepted[5];
    int nbAccepted = 5;
    int nbNegative;
    int isTerminal;

    xmlRegExecErrInfo(exec, &rejected, &nbAccepted, &nbNegative,
                      accepted, &isTerminal);
}
#endif
4506
#if 0
/*
 * xmlRegExecPushChar:
 * @exec: a regexp execution context
 * @UCS: unused by the current body
 *
 * Disabled code (compiled out by the surrounding #if 0). Runs the
 * automata over exec->inputString starting at exec->index, saving
 * alternatives with xmlFARegExecSave() and backtracking with
 * xmlFARegExecRollBack().
 *
 * Returns: -1 if @exec is NULL, the context status if it is non-zero,
 *     otherwise 1 if the current state is final and 0 if not.
 */
static int
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
    xmlRegTransPtr trans;
    xmlRegAtomPtr atom;
    int ret;
    /* len is initialized: a counted transition can succeed before any
     * character has been decoded, and len is added to the index below */
    int codepoint, len = 0;

    if (exec == NULL)
        return(-1);
    if (exec->status != 0)
        return(exec->status);

    while ((exec->status == 0) &&
           ((exec->inputString[exec->index] != 0) ||
            (exec->state->type != XML_REGEXP_FINAL_STATE))) {

        /*
         * End of input on non-terminal state, rollback, however we may
         * still have epsilon like transition for counted transitions
         * on counters, in that case don't break too early.
         */
        if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
            goto rollback;

        exec->transcount = 0;
        for (;exec->transno < exec->state->nbTrans;exec->transno++) {
            trans = &exec->state->trans[exec->transno];
            if (trans->to < 0)
                continue;
            atom = trans->atom;
            ret = 0;
            if (trans->count >= 0) {
                int count;
                xmlRegCounterPtr counter;

                /*
                 * A counted transition.
                 */

                count = exec->counts[trans->count];
                counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
                printf("testing count %d: val %d, min %d, max %d\n",
                       trans->count, count, counter->min, counter->max);
#endif
                ret = ((count >= counter->min) && (count <= counter->max));
            } else if (atom == NULL) {
                fprintf(stderr, "epsilon transition left at runtime\n");
                exec->status = -2;
                break;
            } else if (exec->inputString[exec->index] != 0) {
                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
                ret = xmlRegCheckCharacter(atom, codepoint);
                if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
                    xmlRegStatePtr to = exec->comp->states[trans->to];

                    /*
                     * this is a multiple input sequence
                     */
                    if (exec->state->nbTrans > exec->transno + 1) {
                        xmlFARegExecSave(exec);
                    }
                    exec->transcount = 1;
                    do {
                        /*
                         * Try to progress as much as possible on the input
                         */
                        if (exec->transcount == atom->max) {
                            break;
                        }
                        exec->index += len;
                        /*
                         * End of input: stop here
                         */
                        if (exec->inputString[exec->index] == 0) {
                            exec->index -= len;
                            break;
                        }
                        if (exec->transcount >= atom->min) {
                            int transno = exec->transno;
                            xmlRegStatePtr state = exec->state;

                            /*
                             * The transition is acceptable save it
                             */
                            exec->transno = -1; /* trick */
                            exec->state = to;
                            xmlFARegExecSave(exec);
                            exec->transno = transno;
                            exec->state = state;
                        }
                        codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
                                              len);
                        ret = xmlRegCheckCharacter(atom, codepoint);
                        exec->transcount++;
                    } while (ret == 1);
                    if (exec->transcount < atom->min)
                        ret = 0;

                    /*
                     * If the last check failed but one transition was found
                     * possible, rollback
                     */
                    if (ret < 0)
                        ret = 0;
                    if (ret == 0) {
                        goto rollback;
                    }
                }
            }
            if (ret == 1) {
                if (exec->state->nbTrans > exec->transno + 1) {
                    xmlFARegExecSave(exec);
                }
                /*
                 * restart count for expressions like this ((abc){2})*
                 */
                if (trans->count >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Reset count %d\n", trans->count);
#endif
                    exec->counts[trans->count] = 0;
                }
                if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Increasing count %d\n", trans->counter);
#endif
                    exec->counts[trans->counter]++;
                }
#ifdef DEBUG_REGEXP_EXEC
                printf("entering state %d\n", trans->to);
#endif
                exec->state = exec->comp->states[trans->to];
                exec->transno = 0;
                if (trans->atom != NULL) {
                    exec->index += len;
                }
                goto progress;
            } else if (ret < 0) {
                exec->status = -4;
                break;
            }
        }
        if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
            /*
             * Failed to find a way out
             */
            exec->determinist = 0;
            xmlFARegExecRollBack(exec);
        }
progress:
        continue;
    }
    /*
     * A non-void function must not fall off its end: report the outcome
     * the same way xmlRegExecPushStringInternal() does.
     */
    if (exec->status == 0) {
        return(exec->state->type == XML_REGEXP_FINAL_STATE);
    }
    return(exec->status);
}
#endif
4664/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004665 * *
William M. Brackddf71d62004-05-06 04:17:26 +00004666 * Parser for the Schemas Datatype Regular Expressions *
Daniel Veillard4255d502002-04-16 15:50:10 +00004667 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004668 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00004669 ************************************************************************/
4670
4671/**
4672 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00004673 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004674 *
4675 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4676 */
4677static int
4678xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4679 int cur;
4680 int len;
4681
4682 cur = CUR_SCHAR(ctxt->cur, len);
4683 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4684 (cur == '*') || (cur == '+') || (cur == '(') ||
4685 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4686 (cur == 0x5D) || (cur == 0))
4687 return(-1);
4688 return(cur);
4689}
4690
4691/**
4692 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00004693 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004694 *
4695 * [27] charProp ::= IsCategory | IsBlock
4696 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004697 * Separators | Symbols | Others
Daniel Veillard4255d502002-04-16 15:50:10 +00004698 * [29] Letters ::= 'L' [ultmo]?
4699 * [30] Marks ::= 'M' [nce]?
4700 * [31] Numbers ::= 'N' [dlo]?
4701 * [32] Punctuation ::= 'P' [cdseifo]?
4702 * [33] Separators ::= 'Z' [slp]?
4703 * [34] Symbols ::= 'S' [mcko]?
4704 * [35] Others ::= 'C' [cfon]?
4705 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4706 */
4707static void
4708xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4709 int cur;
William M. Brack779af002003-08-01 15:55:39 +00004710 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00004711 xmlChar *blockName = NULL;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004712
Daniel Veillard4255d502002-04-16 15:50:10 +00004713 cur = CUR;
4714 if (cur == 'L') {
4715 NEXT;
4716 cur = CUR;
4717 if (cur == 'u') {
4718 NEXT;
4719 type = XML_REGEXP_LETTER_UPPERCASE;
4720 } else if (cur == 'l') {
4721 NEXT;
4722 type = XML_REGEXP_LETTER_LOWERCASE;
4723 } else if (cur == 't') {
4724 NEXT;
4725 type = XML_REGEXP_LETTER_TITLECASE;
4726 } else if (cur == 'm') {
4727 NEXT;
4728 type = XML_REGEXP_LETTER_MODIFIER;
4729 } else if (cur == 'o') {
4730 NEXT;
4731 type = XML_REGEXP_LETTER_OTHERS;
4732 } else {
4733 type = XML_REGEXP_LETTER;
4734 }
4735 } else if (cur == 'M') {
4736 NEXT;
4737 cur = CUR;
4738 if (cur == 'n') {
4739 NEXT;
4740 /* nonspacing */
4741 type = XML_REGEXP_MARK_NONSPACING;
4742 } else if (cur == 'c') {
4743 NEXT;
4744 /* spacing combining */
4745 type = XML_REGEXP_MARK_SPACECOMBINING;
4746 } else if (cur == 'e') {
4747 NEXT;
4748 /* enclosing */
4749 type = XML_REGEXP_MARK_ENCLOSING;
4750 } else {
4751 /* all marks */
4752 type = XML_REGEXP_MARK;
4753 }
4754 } else if (cur == 'N') {
4755 NEXT;
4756 cur = CUR;
4757 if (cur == 'd') {
4758 NEXT;
4759 /* digital */
4760 type = XML_REGEXP_NUMBER_DECIMAL;
4761 } else if (cur == 'l') {
4762 NEXT;
4763 /* letter */
4764 type = XML_REGEXP_NUMBER_LETTER;
4765 } else if (cur == 'o') {
4766 NEXT;
4767 /* other */
4768 type = XML_REGEXP_NUMBER_OTHERS;
4769 } else {
4770 /* all numbers */
4771 type = XML_REGEXP_NUMBER;
4772 }
4773 } else if (cur == 'P') {
4774 NEXT;
4775 cur = CUR;
4776 if (cur == 'c') {
4777 NEXT;
4778 /* connector */
4779 type = XML_REGEXP_PUNCT_CONNECTOR;
4780 } else if (cur == 'd') {
4781 NEXT;
4782 /* dash */
4783 type = XML_REGEXP_PUNCT_DASH;
4784 } else if (cur == 's') {
4785 NEXT;
4786 /* open */
4787 type = XML_REGEXP_PUNCT_OPEN;
4788 } else if (cur == 'e') {
4789 NEXT;
4790 /* close */
4791 type = XML_REGEXP_PUNCT_CLOSE;
4792 } else if (cur == 'i') {
4793 NEXT;
4794 /* initial quote */
4795 type = XML_REGEXP_PUNCT_INITQUOTE;
4796 } else if (cur == 'f') {
4797 NEXT;
4798 /* final quote */
4799 type = XML_REGEXP_PUNCT_FINQUOTE;
4800 } else if (cur == 'o') {
4801 NEXT;
4802 /* other */
4803 type = XML_REGEXP_PUNCT_OTHERS;
4804 } else {
4805 /* all punctuation */
4806 type = XML_REGEXP_PUNCT;
4807 }
4808 } else if (cur == 'Z') {
4809 NEXT;
4810 cur = CUR;
4811 if (cur == 's') {
4812 NEXT;
4813 /* space */
4814 type = XML_REGEXP_SEPAR_SPACE;
4815 } else if (cur == 'l') {
4816 NEXT;
4817 /* line */
4818 type = XML_REGEXP_SEPAR_LINE;
4819 } else if (cur == 'p') {
4820 NEXT;
4821 /* paragraph */
4822 type = XML_REGEXP_SEPAR_PARA;
4823 } else {
4824 /* all separators */
4825 type = XML_REGEXP_SEPAR;
4826 }
4827 } else if (cur == 'S') {
4828 NEXT;
4829 cur = CUR;
4830 if (cur == 'm') {
4831 NEXT;
4832 type = XML_REGEXP_SYMBOL_MATH;
4833 /* math */
4834 } else if (cur == 'c') {
4835 NEXT;
4836 type = XML_REGEXP_SYMBOL_CURRENCY;
4837 /* currency */
4838 } else if (cur == 'k') {
4839 NEXT;
4840 type = XML_REGEXP_SYMBOL_MODIFIER;
4841 /* modifiers */
4842 } else if (cur == 'o') {
4843 NEXT;
4844 type = XML_REGEXP_SYMBOL_OTHERS;
4845 /* other */
4846 } else {
4847 /* all symbols */
4848 type = XML_REGEXP_SYMBOL;
4849 }
4850 } else if (cur == 'C') {
4851 NEXT;
4852 cur = CUR;
4853 if (cur == 'c') {
4854 NEXT;
4855 /* control */
4856 type = XML_REGEXP_OTHER_CONTROL;
4857 } else if (cur == 'f') {
4858 NEXT;
4859 /* format */
4860 type = XML_REGEXP_OTHER_FORMAT;
4861 } else if (cur == 'o') {
4862 NEXT;
4863 /* private use */
4864 type = XML_REGEXP_OTHER_PRIVATE;
4865 } else if (cur == 'n') {
4866 NEXT;
4867 /* not assigned */
4868 type = XML_REGEXP_OTHER_NA;
4869 } else {
4870 /* all others */
4871 type = XML_REGEXP_OTHER;
4872 }
4873 } else if (cur == 'I') {
4874 const xmlChar *start;
4875 NEXT;
4876 cur = CUR;
4877 if (cur != 's') {
4878 ERROR("IsXXXX expected");
4879 return;
4880 }
4881 NEXT;
4882 start = ctxt->cur;
4883 cur = CUR;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004884 if (((cur >= 'a') && (cur <= 'z')) ||
4885 ((cur >= 'A') && (cur <= 'Z')) ||
4886 ((cur >= '0') && (cur <= '9')) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00004887 (cur == 0x2D)) {
4888 NEXT;
4889 cur = CUR;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004890 while (((cur >= 'a') && (cur <= 'z')) ||
4891 ((cur >= 'A') && (cur <= 'Z')) ||
4892 ((cur >= '0') && (cur <= '9')) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00004893 (cur == 0x2D)) {
4894 NEXT;
4895 cur = CUR;
4896 }
4897 }
4898 type = XML_REGEXP_BLOCK_NAME;
4899 blockName = xmlStrndup(start, ctxt->cur - start);
4900 } else {
4901 ERROR("Unknown char property");
4902 return;
4903 }
4904 if (ctxt->atom == NULL) {
4905 ctxt->atom = xmlRegNewAtom(ctxt, type);
4906 if (ctxt->atom != NULL)
4907 ctxt->atom->valuep = blockName;
4908 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4909 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4910 type, 0, 0, blockName);
4911 }
4912}
4913
4914/**
4915 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00004916 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004917 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004918 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
Daniel Veillard4255d502002-04-16 15:50:10 +00004919 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4920 * [25] catEsc ::= '\p{' charProp '}'
4921 * [26] complEsc ::= '\P{' charProp '}'
4922 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4923 */
4924static void
4925xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4926 int cur;
4927
4928 if (CUR == '.') {
4929 if (ctxt->atom == NULL) {
4930 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4931 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4932 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4933 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4934 }
4935 NEXT;
4936 return;
4937 }
4938 if (CUR != '\\') {
4939 ERROR("Escaped sequence: expecting \\");
4940 return;
4941 }
4942 NEXT;
4943 cur = CUR;
4944 if (cur == 'p') {
4945 NEXT;
4946 if (CUR != '{') {
4947 ERROR("Expecting '{'");
4948 return;
4949 }
4950 NEXT;
4951 xmlFAParseCharProp(ctxt);
4952 if (CUR != '}') {
4953 ERROR("Expecting '}'");
4954 return;
4955 }
4956 NEXT;
4957 } else if (cur == 'P') {
4958 NEXT;
4959 if (CUR != '{') {
4960 ERROR("Expecting '{'");
4961 return;
4962 }
4963 NEXT;
4964 xmlFAParseCharProp(ctxt);
Nick Wellnhofer8a0c6692017-07-04 17:13:06 +02004965 if (ctxt->atom != NULL)
4966 ctxt->atom->neg = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004967 if (CUR != '}') {
4968 ERROR("Expecting '}'");
4969 return;
4970 }
4971 NEXT;
4972 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4973 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4974 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4975 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4976 (cur == 0x5E)) {
4977 if (ctxt->atom == NULL) {
4978 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
Daniel Veillard99c394d2005-07-14 12:58:49 +00004979 if (ctxt->atom != NULL) {
4980 switch (cur) {
4981 case 'n':
4982 ctxt->atom->codepoint = '\n';
4983 break;
4984 case 'r':
4985 ctxt->atom->codepoint = '\r';
4986 break;
4987 case 't':
4988 ctxt->atom->codepoint = '\t';
4989 break;
4990 default:
4991 ctxt->atom->codepoint = cur;
4992 }
4993 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004994 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
Daniel Veillard9543aee2010-03-15 11:13:39 +01004995 switch (cur) {
4996 case 'n':
4997 cur = '\n';
4998 break;
4999 case 'r':
5000 cur = '\r';
5001 break;
5002 case 't':
5003 cur = '\t';
5004 break;
5005 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005006 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5007 XML_REGEXP_CHARVAL, cur, cur, NULL);
5008 }
5009 NEXT;
5010 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
5011 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
5012 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00005013 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00005014
5015 switch (cur) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005016 case 's':
Daniel Veillard4255d502002-04-16 15:50:10 +00005017 type = XML_REGEXP_ANYSPACE;
5018 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005019 case 'S':
Daniel Veillard4255d502002-04-16 15:50:10 +00005020 type = XML_REGEXP_NOTSPACE;
5021 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005022 case 'i':
Daniel Veillard4255d502002-04-16 15:50:10 +00005023 type = XML_REGEXP_INITNAME;
5024 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005025 case 'I':
Daniel Veillard4255d502002-04-16 15:50:10 +00005026 type = XML_REGEXP_NOTINITNAME;
5027 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005028 case 'c':
Daniel Veillard4255d502002-04-16 15:50:10 +00005029 type = XML_REGEXP_NAMECHAR;
5030 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005031 case 'C':
Daniel Veillard4255d502002-04-16 15:50:10 +00005032 type = XML_REGEXP_NOTNAMECHAR;
5033 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005034 case 'd':
Daniel Veillard4255d502002-04-16 15:50:10 +00005035 type = XML_REGEXP_DECIMAL;
5036 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005037 case 'D':
Daniel Veillard4255d502002-04-16 15:50:10 +00005038 type = XML_REGEXP_NOTDECIMAL;
5039 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005040 case 'w':
Daniel Veillard4255d502002-04-16 15:50:10 +00005041 type = XML_REGEXP_REALCHAR;
5042 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005043 case 'W':
Daniel Veillard4255d502002-04-16 15:50:10 +00005044 type = XML_REGEXP_NOTREALCHAR;
5045 break;
5046 }
5047 NEXT;
5048 if (ctxt->atom == NULL) {
5049 ctxt->atom = xmlRegNewAtom(ctxt, type);
5050 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5051 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5052 type, 0, 0, NULL);
5053 }
Daniel Veillardcb4284e2007-04-25 13:55:20 +00005054 } else {
5055 ERROR("Wrong escape sequence, misuse of character '\\'");
Daniel Veillard4255d502002-04-16 15:50:10 +00005056 }
5057}
5058
5059/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005060 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00005061 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005062 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005063 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
Daniel Veillard4255d502002-04-16 15:50:10 +00005064 * [18] seRange ::= charOrEsc '-' charOrEsc
5065 * [20] charOrEsc ::= XmlChar | SingleCharEsc
5066 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
5067 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
5068 */
5069static void
5070xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
William M. Brackdc99df92003-12-27 01:54:25 +00005071 int cur, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00005072 int start = -1;
5073 int end = -1;
5074
Daniel Veillard777737e2006-10-17 21:23:17 +00005075 if (CUR == '\0') {
5076 ERROR("Expecting ']'");
5077 return;
5078 }
5079
Daniel Veillard4255d502002-04-16 15:50:10 +00005080 cur = CUR;
5081 if (cur == '\\') {
5082 NEXT;
5083 cur = CUR;
5084 switch (cur) {
5085 case 'n': start = 0xA; break;
5086 case 'r': start = 0xD; break;
5087 case 't': start = 0x9; break;
5088 case '\\': case '|': case '.': case '-': case '^': case '?':
5089 case '*': case '+': case '{': case '}': case '(': case ')':
5090 case '[': case ']':
5091 start = cur; break;
5092 default:
5093 ERROR("Invalid escape value");
5094 return;
5095 }
5096 end = start;
William M. Brackdc99df92003-12-27 01:54:25 +00005097 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00005098 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00005099 end = start = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005100 } else {
5101 ERROR("Expecting a char range");
5102 return;
5103 }
William M. Bracka9cbf282007-03-21 13:16:33 +00005104 /*
5105 * Since we are "inside" a range, we can assume ctxt->cur is past
5106 * the start of ctxt->string, and PREV should be safe
5107 */
5108 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5109 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005110 return;
5111 }
William M. Bracka9cbf282007-03-21 13:16:33 +00005112 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005113 cur = CUR;
William M. Brack10f1ef42004-03-20 14:51:25 +00005114 if ((cur != '-') || (NXT(1) == ']')) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005115 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5116 XML_REGEXP_CHARVAL, start, end, NULL);
5117 return;
5118 }
5119 NEXT;
5120 cur = CUR;
5121 if (cur == '\\') {
5122 NEXT;
5123 cur = CUR;
5124 switch (cur) {
5125 case 'n': end = 0xA; break;
5126 case 'r': end = 0xD; break;
5127 case 't': end = 0x9; break;
5128 case '\\': case '|': case '.': case '-': case '^': case '?':
5129 case '*': case '+': case '{': case '}': case '(': case ')':
5130 case '[': case ']':
5131 end = cur; break;
5132 default:
5133 ERROR("Invalid escape value");
5134 return;
5135 }
William M. Brackdc99df92003-12-27 01:54:25 +00005136 len = 1;
David Kilzerfb56f802017-07-04 18:38:03 +02005137 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00005138 end = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005139 } else {
5140 ERROR("Expecting the end of a char range");
5141 return;
5142 }
Pranjal Jumdecbb27162016-03-07 06:34:26 -08005143
Daniel Veillard4255d502002-04-16 15:50:10 +00005144 /* TODO check that the values are acceptable character ranges for XML */
5145 if (end < start) {
5146 ERROR("End of range is before start of range");
5147 } else {
Pranjal Jumdecbb27162016-03-07 06:34:26 -08005148 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005149 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5150 XML_REGEXP_CHARVAL, start, end, NULL);
5151 }
5152 return;
5153}
5154
5155/**
5156 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00005157 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005158 *
5159 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5160 */
5161static void
5162xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5163 do {
Daniel Veillard041b6872008-02-08 10:37:18 +00005164 if (CUR == '\\') {
Daniel Veillard4255d502002-04-16 15:50:10 +00005165 xmlFAParseCharClassEsc(ctxt);
5166 } else {
5167 xmlFAParseCharRange(ctxt);
5168 }
Haibo Huangd23e46c2020-10-28 22:26:09 -07005169 } while ((CUR != ']') && (CUR != '-') &&
Daniel Veillard777737e2006-10-17 21:23:17 +00005170 (CUR != 0) && (ctxt->error == 0));
Daniel Veillard4255d502002-04-16 15:50:10 +00005171}
5172
5173/**
5174 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00005175 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005176 *
5177 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5178 * [15] negCharGroup ::= '^' posCharGroup
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005179 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
Daniel Veillard4255d502002-04-16 15:50:10 +00005180 * [12] charClassExpr ::= '[' charGroup ']'
5181 */
5182static void
5183xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
Haibo Huangd23e46c2020-10-28 22:26:09 -07005184 int neg = ctxt->neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00005185
Haibo Huangd23e46c2020-10-28 22:26:09 -07005186 if (CUR == '^') {
5187 NEXT;
5188 ctxt->neg = !ctxt->neg;
5189 xmlFAParsePosCharGroup(ctxt);
5190 ctxt->neg = neg;
5191 }
5192 while ((CUR != ']') && (ctxt->error == 0)) {
5193 if ((CUR == '-') && (NXT(1) == '[')) {
William M. Brack10f1ef42004-03-20 14:51:25 +00005194 NEXT; /* eat the '-' */
5195 NEXT; /* eat the '[' */
Haibo Huangd23e46c2020-10-28 22:26:09 -07005196 ctxt->neg = 2;
Daniel Veillard4255d502002-04-16 15:50:10 +00005197 xmlFAParseCharGroup(ctxt);
Haibo Huangd23e46c2020-10-28 22:26:09 -07005198 ctxt->neg = neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00005199 if (CUR == ']') {
5200 NEXT;
5201 } else {
5202 ERROR("charClassExpr: ']' expected");
Daniel Veillard4255d502002-04-16 15:50:10 +00005203 }
5204 break;
Haibo Huangd23e46c2020-10-28 22:26:09 -07005205 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00005206 xmlFAParsePosCharGroup(ctxt);
5207 }
5208 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005209}
5210
5211/**
5212 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00005213 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005214 *
5215 * [11] charClass ::= charClassEsc | charClassExpr
5216 * [12] charClassExpr ::= '[' charGroup ']'
5217 */
5218static void
5219xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5220 if (CUR == '[') {
5221 NEXT;
5222 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5223 if (ctxt->atom == NULL)
5224 return;
5225 xmlFAParseCharGroup(ctxt);
5226 if (CUR == ']') {
5227 NEXT;
5228 } else {
5229 ERROR("xmlFAParseCharClass: ']' expected");
5230 }
5231 } else {
5232 xmlFAParseCharClassEsc(ctxt);
5233 }
5234}
5235
5236/**
5237 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00005238 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005239 *
5240 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005241 *
5242 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00005243 */
5244static int
5245xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5246 int ret = 0;
5247 int ok = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005248 int overflow = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005249
5250 while ((CUR >= '0') && (CUR <= '9')) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005251 if (ret > INT_MAX / 10) {
5252 overflow = 1;
5253 } else {
5254 int digit = CUR - '0';
5255
5256 ret *= 10;
5257 if (ret > INT_MAX - digit)
5258 overflow = 1;
5259 else
5260 ret += digit;
5261 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005262 ok = 1;
5263 NEXT;
5264 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005265 if ((ok != 1) || (overflow == 1)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005266 return(-1);
5267 }
5268 return(ret);
5269}
5270
5271/**
5272 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00005273 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005274 *
5275 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5276 * [5] quantity ::= quantRange | quantMin | QuantExact
5277 * [6] quantRange ::= QuantExact ',' QuantExact
5278 * [7] quantMin ::= QuantExact ','
5279 * [8] QuantExact ::= [0-9]+
5280 */
5281static int
5282xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5283 int cur;
5284
5285 cur = CUR;
5286 if ((cur == '?') || (cur == '*') || (cur == '+')) {
5287 if (ctxt->atom != NULL) {
5288 if (cur == '?')
5289 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5290 else if (cur == '*')
5291 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5292 else if (cur == '+')
5293 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5294 }
5295 NEXT;
5296 return(1);
5297 }
5298 if (cur == '{') {
5299 int min = 0, max = 0;
5300
5301 NEXT;
5302 cur = xmlFAParseQuantExact(ctxt);
5303 if (cur >= 0)
5304 min = cur;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005305 else {
5306 ERROR("Improper quantifier");
5307 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005308 if (CUR == ',') {
5309 NEXT;
Daniel Veillardebe48c62003-12-03 12:12:27 +00005310 if (CUR == '}')
5311 max = INT_MAX;
5312 else {
5313 cur = xmlFAParseQuantExact(ctxt);
5314 if (cur >= 0)
5315 max = cur;
5316 else {
5317 ERROR("Improper quantifier");
5318 }
5319 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005320 }
5321 if (CUR == '}') {
5322 NEXT;
5323 } else {
5324 ERROR("Unterminated quantifier");
5325 }
5326 if (max == 0)
5327 max = min;
5328 if (ctxt->atom != NULL) {
5329 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5330 ctxt->atom->min = min;
5331 ctxt->atom->max = max;
5332 }
5333 return(1);
5334 }
5335 return(0);
5336}
5337
5338/**
5339 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00005340 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005341 *
5342 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5343 */
5344static int
5345xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5346 int codepoint, len;
5347
5348 codepoint = xmlFAIsChar(ctxt);
5349 if (codepoint > 0) {
5350 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5351 if (ctxt->atom == NULL)
5352 return(-1);
5353 codepoint = CUR_SCHAR(ctxt->cur, len);
5354 ctxt->atom->codepoint = codepoint;
5355 NEXTL(len);
5356 return(1);
5357 } else if (CUR == '|') {
5358 return(0);
5359 } else if (CUR == 0) {
5360 return(0);
5361 } else if (CUR == ')') {
5362 return(0);
5363 } else if (CUR == '(') {
Daniel Veillard76d59b62007-08-22 16:29:21 +00005364 xmlRegStatePtr start, oldend, start0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005365
5366 NEXT;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005367 if (ctxt->depth >= 50) {
5368 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5369 return(-1);
5370 }
Daniel Veillard76d59b62007-08-22 16:29:21 +00005371 /*
5372 * this extra Epsilon transition is needed if we count with 0 allowed
5373 * unfortunately this can't be known at that point
5374 */
5375 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5376 start0 = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00005377 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5378 start = ctxt->state;
5379 oldend = ctxt->end;
5380 ctxt->end = NULL;
5381 ctxt->atom = NULL;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005382 ctxt->depth++;
Daniel Veillard4255d502002-04-16 15:50:10 +00005383 xmlFAParseRegExp(ctxt, 0);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005384 ctxt->depth--;
Daniel Veillard4255d502002-04-16 15:50:10 +00005385 if (CUR == ')') {
5386 NEXT;
5387 } else {
5388 ERROR("xmlFAParseAtom: expecting ')'");
5389 }
5390 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5391 if (ctxt->atom == NULL)
5392 return(-1);
5393 ctxt->atom->start = start;
Daniel Veillard76d59b62007-08-22 16:29:21 +00005394 ctxt->atom->start0 = start0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005395 ctxt->atom->stop = ctxt->state;
5396 ctxt->end = oldend;
5397 return(1);
5398 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5399 xmlFAParseCharClass(ctxt);
5400 return(1);
5401 }
5402 return(0);
5403}
5404
5405/**
5406 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00005407 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005408 *
5409 * [3] piece ::= atom quantifier?
5410 */
5411static int
5412xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5413 int ret;
5414
5415 ctxt->atom = NULL;
5416 ret = xmlFAParseAtom(ctxt);
5417 if (ret == 0)
5418 return(0);
5419 if (ctxt->atom == NULL) {
5420 ERROR("internal: no atom generated");
5421 }
5422 xmlFAParseQuantifier(ctxt);
5423 return(1);
5424}
5425
5426/**
5427 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00005428 * @ctxt: a regexp parser context
Daniel Veillard54eb0242006-03-21 23:17:57 +00005429 * @to: optional target to the end of the branch
5430 *
5431 * @to is used to optimize by removing duplicate path in automata
5432 * in expressions like (a|b)(c|d)
Daniel Veillard4255d502002-04-16 15:50:10 +00005433 *
5434 * [2] branch ::= piece*
5435 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005436static int
Daniel Veillard54eb0242006-03-21 23:17:57 +00005437xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005438 xmlRegStatePtr previous;
Daniel Veillard4255d502002-04-16 15:50:10 +00005439 int ret;
5440
5441 previous = ctxt->state;
5442 ret = xmlFAParsePiece(ctxt);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005443 if (ret == 0) {
5444 /* Empty branch */
5445 xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5446 } else {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005447 if (xmlFAGenerateTransitions(ctxt, previous,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005448 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005449 return(-1);
5450 previous = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00005451 ctxt->atom = NULL;
5452 }
5453 while ((ret != 0) && (ctxt->error == 0)) {
5454 ret = xmlFAParsePiece(ctxt);
5455 if (ret != 0) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005456 if (xmlFAGenerateTransitions(ctxt, previous,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005457 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5458 ctxt->atom) < 0)
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005459 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00005460 previous = ctxt->state;
5461 ctxt->atom = NULL;
5462 }
5463 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005464 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00005465}
5466
5467/**
5468 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00005469 * @ctxt: a regexp parser context
William M. Brackddf71d62004-05-06 04:17:26 +00005470 * @top: is this the top-level expression ?
Daniel Veillard4255d502002-04-16 15:50:10 +00005471 *
5472 * [1] regExp ::= branch ( '|' branch )*
5473 */
5474static void
5475xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
Daniel Veillardc7e3cc42004-09-28 12:33:52 +00005476 xmlRegStatePtr start, end;
Daniel Veillard4255d502002-04-16 15:50:10 +00005477
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005478 /* if not top start should have been generated by an epsilon trans */
Daniel Veillard4255d502002-04-16 15:50:10 +00005479 start = ctxt->state;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005480 ctxt->end = NULL;
Daniel Veillard54eb0242006-03-21 23:17:57 +00005481 xmlFAParseBranch(ctxt, NULL);
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005482 if (top) {
5483#ifdef DEBUG_REGEXP_GRAPH
5484 printf("State %d is final\n", ctxt->state->no);
5485#endif
5486 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5487 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005488 if (CUR != '|') {
5489 ctxt->end = ctxt->state;
5490 return;
5491 }
5492 end = ctxt->state;
5493 while ((CUR == '|') && (ctxt->error == 0)) {
5494 NEXT;
5495 ctxt->state = start;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005496 ctxt->end = NULL;
Daniel Veillard54eb0242006-03-21 23:17:57 +00005497 xmlFAParseBranch(ctxt, end);
Daniel Veillard4255d502002-04-16 15:50:10 +00005498 }
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005499 if (!top) {
5500 ctxt->state = end;
5501 ctxt->end = end;
5502 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005503}
5504
5505/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005506 * *
5507 * The basic API *
5508 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00005509 ************************************************************************/
5510
5511/**
5512 * xmlRegexpPrint:
5513 * @output: the file for the output debug
5514 * @regexp: the compiled regexp
5515 *
5516 * Print the content of the compiled regular expression
5517 */
5518void
5519xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5520 int i;
5521
Daniel Veillarda82b1822004-11-08 16:24:57 +00005522 if (output == NULL)
5523 return;
Daniel Veillard4255d502002-04-16 15:50:10 +00005524 fprintf(output, " regexp: ");
5525 if (regexp == NULL) {
5526 fprintf(output, "NULL\n");
5527 return;
5528 }
5529 fprintf(output, "'%s' ", regexp->string);
5530 fprintf(output, "\n");
5531 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5532 for (i = 0;i < regexp->nbAtoms; i++) {
5533 fprintf(output, " %02d ", i);
5534 xmlRegPrintAtom(output, regexp->atoms[i]);
5535 }
5536 fprintf(output, "%d states:", regexp->nbStates);
5537 fprintf(output, "\n");
5538 for (i = 0;i < regexp->nbStates; i++) {
5539 xmlRegPrintState(output, regexp->states[i]);
5540 }
5541 fprintf(output, "%d counters:\n", regexp->nbCounters);
5542 for (i = 0;i < regexp->nbCounters; i++) {
5543 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5544 regexp->counters[i].max);
5545 }
5546}
5547
5548/**
5549 * xmlRegexpCompile:
5550 * @regexp: a regular expression string
5551 *
5552 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
William M. Brackddf71d62004-05-06 04:17:26 +00005553 * Appendix F and builds an automata suitable for testing strings against
Daniel Veillard4255d502002-04-16 15:50:10 +00005554 * that regular expression
5555 *
5556 * Returns the compiled expression or NULL in case of error
5557 */
5558xmlRegexpPtr
5559xmlRegexpCompile(const xmlChar *regexp) {
5560 xmlRegexpPtr ret;
5561 xmlRegParserCtxtPtr ctxt;
5562
5563 ctxt = xmlRegNewParserCtxt(regexp);
5564 if (ctxt == NULL)
5565 return(NULL);
5566
5567 /* initialize the parser */
5568 ctxt->end = NULL;
5569 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5570 xmlRegStatePush(ctxt, ctxt->start);
5571
5572 /* parse the expression building an automata */
5573 xmlFAParseRegExp(ctxt, 1);
5574 if (CUR != 0) {
5575 ERROR("xmlFAParseRegExp: extra characters");
5576 }
Daniel Veillardcb4284e2007-04-25 13:55:20 +00005577 if (ctxt->error != 0) {
5578 xmlRegFreeParserCtxt(ctxt);
5579 return(NULL);
5580 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005581 ctxt->end = ctxt->state;
5582 ctxt->start->type = XML_REGEXP_START_STATE;
5583 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5584
5585 /* remove the Epsilon except for counted transitions */
5586 xmlFAEliminateEpsilonTransitions(ctxt);
5587
5588
5589 if (ctxt->error != 0) {
5590 xmlRegFreeParserCtxt(ctxt);
5591 return(NULL);
5592 }
5593 ret = xmlRegEpxFromParse(ctxt);
5594 xmlRegFreeParserCtxt(ctxt);
5595 return(ret);
5596}
5597
5598/**
5599 * xmlRegexpExec:
5600 * @comp: the compiled regular expression
5601 * @content: the value to check against the regular expression
5602 *
William M. Brackddf71d62004-05-06 04:17:26 +00005603 * Check if the regular expression generates the value
Daniel Veillard4255d502002-04-16 15:50:10 +00005604 *
William M. Brackddf71d62004-05-06 04:17:26 +00005605 * Returns 1 if it matches, 0 if not and a negative value in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00005606 */
5607int
5608xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5609 if ((comp == NULL) || (content == NULL))
5610 return(-1);
5611 return(xmlFARegExec(comp, content));
5612}
5613
5614/**
Daniel Veillard23e73572002-09-19 19:56:43 +00005615 * xmlRegexpIsDeterminist:
5616 * @comp: the compiled regular expression
5617 *
5618 * Check if the regular expression is determinist
5619 *
William M. Brackddf71d62004-05-06 04:17:26 +00005620 * Returns 1 if it yes, 0 if not and a negative value in case of error
Daniel Veillard23e73572002-09-19 19:56:43 +00005621 */
5622int
5623xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5624 xmlAutomataPtr am;
5625 int ret;
5626
5627 if (comp == NULL)
5628 return(-1);
5629 if (comp->determinist != -1)
5630 return(comp->determinist);
5631
5632 am = xmlNewAutomata();
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005633 if (am == NULL)
5634 return(-1);
Daniel Veillardbd9afb52002-09-25 22:25:35 +00005635 if (am->states != NULL) {
5636 int i;
5637
5638 for (i = 0;i < am->nbStates;i++)
5639 xmlRegFreeState(am->states[i]);
5640 xmlFree(am->states);
5641 }
Daniel Veillard23e73572002-09-19 19:56:43 +00005642 am->nbAtoms = comp->nbAtoms;
5643 am->atoms = comp->atoms;
5644 am->nbStates = comp->nbStates;
5645 am->states = comp->states;
5646 am->determinist = -1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005647 am->flags = comp->flags;
Daniel Veillard23e73572002-09-19 19:56:43 +00005648 ret = xmlFAComputesDeterminism(am);
5649 am->atoms = NULL;
5650 am->states = NULL;
5651 xmlFreeAutomata(am);
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005652 comp->determinist = ret;
Daniel Veillard23e73572002-09-19 19:56:43 +00005653 return(ret);
5654}
5655
5656/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005657 * xmlRegFreeRegexp:
5658 * @regexp: the regexp
5659 *
5660 * Free a regexp
5661 */
5662void
5663xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5664 int i;
5665 if (regexp == NULL)
5666 return;
5667
5668 if (regexp->string != NULL)
5669 xmlFree(regexp->string);
5670 if (regexp->states != NULL) {
5671 for (i = 0;i < regexp->nbStates;i++)
5672 xmlRegFreeState(regexp->states[i]);
5673 xmlFree(regexp->states);
5674 }
5675 if (regexp->atoms != NULL) {
5676 for (i = 0;i < regexp->nbAtoms;i++)
5677 xmlRegFreeAtom(regexp->atoms[i]);
5678 xmlFree(regexp->atoms);
5679 }
5680 if (regexp->counters != NULL)
5681 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00005682 if (regexp->compact != NULL)
5683 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00005684 if (regexp->transdata != NULL)
5685 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00005686 if (regexp->stringMap != NULL) {
5687 for (i = 0; i < regexp->nbstrings;i++)
5688 xmlFree(regexp->stringMap[i]);
5689 xmlFree(regexp->stringMap);
5690 }
5691
Daniel Veillard4255d502002-04-16 15:50:10 +00005692 xmlFree(regexp);
5693}
5694
5695#ifdef LIBXML_AUTOMATA_ENABLED
5696/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005697 * *
5698 * The Automata interface *
5699 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00005700 ************************************************************************/
5701
5702/**
5703 * xmlNewAutomata:
5704 *
5705 * Create a new automata
5706 *
5707 * Returns the new object or NULL in case of failure
5708 */
5709xmlAutomataPtr
5710xmlNewAutomata(void) {
5711 xmlAutomataPtr ctxt;
5712
5713 ctxt = xmlRegNewParserCtxt(NULL);
5714 if (ctxt == NULL)
5715 return(NULL);
5716
5717 /* initialize the parser */
5718 ctxt->end = NULL;
5719 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005720 if (ctxt->start == NULL) {
5721 xmlFreeAutomata(ctxt);
5722 return(NULL);
5723 }
Daniel Veillardd0271472006-01-02 10:22:02 +00005724 ctxt->start->type = XML_REGEXP_START_STATE;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005725 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5726 xmlRegFreeState(ctxt->start);
5727 xmlFreeAutomata(ctxt);
5728 return(NULL);
5729 }
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005730 ctxt->flags = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005731
5732 return(ctxt);
5733}
5734
5735/**
5736 * xmlFreeAutomata:
5737 * @am: an automata
5738 *
5739 * Free an automata
5740 */
5741void
5742xmlFreeAutomata(xmlAutomataPtr am) {
5743 if (am == NULL)
5744 return;
5745 xmlRegFreeParserCtxt(am);
5746}
5747
5748/**
Daniel Veillard29341682009-09-10 18:23:39 +02005749 * xmlAutomataSetFlags:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005750 * @am: an automata
5751 * @flags: a set of internal flags
5752 *
5753 * Set some flags on the automata
5754 */
5755void
5756xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5757 if (am == NULL)
5758 return;
5759 am->flags |= flags;
5760}
5761
5762/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005763 * xmlAutomataGetInitState:
5764 * @am: an automata
5765 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005766 * Initial state lookup
5767 *
Daniel Veillard4255d502002-04-16 15:50:10 +00005768 * Returns the initial state of the automata
5769 */
5770xmlAutomataStatePtr
5771xmlAutomataGetInitState(xmlAutomataPtr am) {
5772 if (am == NULL)
5773 return(NULL);
5774 return(am->start);
5775}
5776
5777/**
5778 * xmlAutomataSetFinalState:
5779 * @am: an automata
5780 * @state: a state in this automata
5781 *
5782 * Makes that state a final state
5783 *
5784 * Returns 0 or -1 in case of error
5785 */
5786int
5787xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5788 if ((am == NULL) || (state == NULL))
5789 return(-1);
5790 state->type = XML_REGEXP_FINAL_STATE;
5791 return(0);
5792}
5793
5794/**
5795 * xmlAutomataNewTransition:
5796 * @am: an automata
5797 * @from: the starting point of the transition
5798 * @to: the target point of the transition or NULL
5799 * @token: the input string associated to that transition
5800 * @data: data passed to the callback function if the transition is activated
5801 *
William M. Brackddf71d62004-05-06 04:17:26 +00005802 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00005803 * and then adds a transition from the @from state to the target state
5804 * activated by the value of @token
5805 *
5806 * Returns the target state or NULL in case of error
5807 */
5808xmlAutomataStatePtr
5809xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5810 xmlAutomataStatePtr to, const xmlChar *token,
5811 void *data) {
5812 xmlRegAtomPtr atom;
5813
5814 if ((am == NULL) || (from == NULL) || (token == NULL))
5815 return(NULL);
5816 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005817 if (atom == NULL)
5818 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00005819 atom->data = data;
Daniel Veillard4255d502002-04-16 15:50:10 +00005820 atom->valuep = xmlStrdup(token);
5821
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005822 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5823 xmlRegFreeAtom(atom);
5824 return(NULL);
5825 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005826 if (to == NULL)
5827 return(am->state);
5828 return(to);
5829}
5830
5831/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00005832 * xmlAutomataNewTransition2:
5833 * @am: an automata
5834 * @from: the starting point of the transition
5835 * @to: the target point of the transition or NULL
5836 * @token: the first input string associated to that transition
5837 * @token2: the second input string associated to that transition
5838 * @data: data passed to the callback function if the transition is activated
5839 *
William M. Brackddf71d62004-05-06 04:17:26 +00005840 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard52b48c72003-04-13 19:53:42 +00005841 * and then adds a transition from the @from state to the target state
5842 * activated by the value of @token
5843 *
5844 * Returns the target state or NULL in case of error
5845 */
5846xmlAutomataStatePtr
5847xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5848 xmlAutomataStatePtr to, const xmlChar *token,
5849 const xmlChar *token2, void *data) {
5850 xmlRegAtomPtr atom;
5851
5852 if ((am == NULL) || (from == NULL) || (token == NULL))
5853 return(NULL);
5854 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005855 if (atom == NULL)
5856 return(NULL);
Daniel Veillard11ce4002006-03-10 00:36:23 +00005857 atom->data = data;
Daniel Veillard52b48c72003-04-13 19:53:42 +00005858 if ((token2 == NULL) || (*token2 == 0)) {
5859 atom->valuep = xmlStrdup(token);
5860 } else {
5861 int lenn, lenp;
5862 xmlChar *str;
5863
5864 lenn = strlen((char *) token2);
5865 lenp = strlen((char *) token);
5866
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005867 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005868 if (str == NULL) {
5869 xmlRegFreeAtom(atom);
5870 return(NULL);
5871 }
5872 memcpy(&str[0], token, lenp);
5873 str[lenp] = '|';
5874 memcpy(&str[lenp + 1], token2, lenn);
5875 str[lenn + lenp + 1] = 0;
5876
5877 atom->valuep = str;
5878 }
5879
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005880 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5881 xmlRegFreeAtom(atom);
5882 return(NULL);
5883 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00005884 if (to == NULL)
5885 return(am->state);
5886 return(to);
5887}
5888
5889/**
Daniel Veillard9efc4762005-07-19 14:33:55 +00005890 * xmlAutomataNewNegTrans:
5891 * @am: an automata
5892 * @from: the starting point of the transition
5893 * @to: the target point of the transition or NULL
5894 * @token: the first input string associated to that transition
5895 * @token2: the second input string associated to that transition
5896 * @data: data passed to the callback function if the transition is activated
5897 *
5898 * If @to is NULL, this creates first a new target state in the automata
5899 * and then adds a transition from the @from state to the target state
5900 * activated by any value except (@token,@token2)
Daniel Veillard6e65e152005-08-09 11:09:52 +00005901 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5902 # the semantic of XSD ##other
Daniel Veillard9efc4762005-07-19 14:33:55 +00005903 *
5904 * Returns the target state or NULL in case of error
5905 */
5906xmlAutomataStatePtr
5907xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5908 xmlAutomataStatePtr to, const xmlChar *token,
5909 const xmlChar *token2, void *data) {
5910 xmlRegAtomPtr atom;
Daniel Veillard77005e62005-07-19 16:26:18 +00005911 xmlChar err_msg[200];
Daniel Veillard9efc4762005-07-19 14:33:55 +00005912
5913 if ((am == NULL) || (from == NULL) || (token == NULL))
5914 return(NULL);
5915 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5916 if (atom == NULL)
5917 return(NULL);
5918 atom->data = data;
5919 atom->neg = 1;
5920 if ((token2 == NULL) || (*token2 == 0)) {
5921 atom->valuep = xmlStrdup(token);
5922 } else {
5923 int lenn, lenp;
5924 xmlChar *str;
5925
5926 lenn = strlen((char *) token2);
5927 lenp = strlen((char *) token);
5928
5929 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5930 if (str == NULL) {
5931 xmlRegFreeAtom(atom);
5932 return(NULL);
5933 }
5934 memcpy(&str[0], token, lenp);
5935 str[lenp] = '|';
5936 memcpy(&str[lenp + 1], token2, lenn);
5937 str[lenn + lenp + 1] = 0;
5938
5939 atom->valuep = str;
5940 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00005941 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +00005942 err_msg[199] = 0;
5943 atom->valuep2 = xmlStrdup(err_msg);
Daniel Veillard9efc4762005-07-19 14:33:55 +00005944
5945 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5946 xmlRegFreeAtom(atom);
5947 return(NULL);
5948 }
Daniel Veillard6e65e152005-08-09 11:09:52 +00005949 am->negs++;
Daniel Veillard9efc4762005-07-19 14:33:55 +00005950 if (to == NULL)
5951 return(am->state);
5952 return(to);
5953}
5954
5955/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005956 * xmlAutomataNewCountTrans2:
5957 * @am: an automata
5958 * @from: the starting point of the transition
5959 * @to: the target point of the transition or NULL
5960 * @token: the input string associated to that transition
5961 * @token2: the second input string associated to that transition
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005962 * @min: the minimum successive occurrences of token
5963 * @max: the maximum successive occurrences of token
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005964 * @data: data associated to the transition
5965 *
5966 * If @to is NULL, this creates first a new target state in the automata
5967 * and then adds a transition from the @from state to the target state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005968 * activated by a succession of input of value @token and @token2 and
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005969 * whose number is between @min and @max
5970 *
5971 * Returns the target state or NULL in case of error
5972 */
5973xmlAutomataStatePtr
5974xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5975 xmlAutomataStatePtr to, const xmlChar *token,
5976 const xmlChar *token2,
5977 int min, int max, void *data) {
5978 xmlRegAtomPtr atom;
5979 int counter;
5980
5981 if ((am == NULL) || (from == NULL) || (token == NULL))
5982 return(NULL);
5983 if (min < 0)
5984 return(NULL);
5985 if ((max < min) || (max < 1))
5986 return(NULL);
5987 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5988 if (atom == NULL)
5989 return(NULL);
5990 if ((token2 == NULL) || (*token2 == 0)) {
5991 atom->valuep = xmlStrdup(token);
5992 } else {
5993 int lenn, lenp;
5994 xmlChar *str;
5995
5996 lenn = strlen((char *) token2);
5997 lenp = strlen((char *) token);
5998
5999 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6000 if (str == NULL) {
6001 xmlRegFreeAtom(atom);
6002 return(NULL);
6003 }
6004 memcpy(&str[0], token, lenp);
6005 str[lenp] = '|';
6006 memcpy(&str[lenp + 1], token2, lenn);
6007 str[lenn + lenp + 1] = 0;
6008
6009 atom->valuep = str;
6010 }
6011 atom->data = data;
6012 if (min == 0)
6013 atom->min = 1;
6014 else
6015 atom->min = min;
6016 atom->max = max;
6017
6018 /*
6019 * associate a counter to the transition.
6020 */
6021 counter = xmlRegGetCounter(am);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01006022 if (counter < 0)
6023 goto error;
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006024 am->counters[counter].min = min;
6025 am->counters[counter].max = max;
6026
6027 /* xmlFAGenerateTransitions(am, from, to, atom); */
6028 if (to == NULL) {
6029 to = xmlRegNewState(am);
6030 xmlRegStatePush(am, to);
6031 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006032 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006033 xmlRegAtomPush(am, atom);
6034 am->state = to;
6035
6036 if (to == NULL)
6037 to = am->state;
6038 if (to == NULL)
6039 return(NULL);
6040 if (min == 0)
6041 xmlFAGenerateEpsilonTransition(am, from, to);
6042 return(to);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01006043
6044error:
6045 xmlRegFreeAtom(atom);
6046 return(NULL);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006047}
6048
6049/**
Daniel Veillard4255d502002-04-16 15:50:10 +00006050 * xmlAutomataNewCountTrans:
6051 * @am: an automata
6052 * @from: the starting point of the transition
6053 * @to: the target point of the transition or NULL
6054 * @token: the input string associated to that transition
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006055 * @min: the minimum successive occurrences of token
6056 * @max: the maximum successive occurrences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006057 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00006058 *
William M. Brackddf71d62004-05-06 04:17:26 +00006059 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00006060 * and then adds a transition from the @from state to the target state
6061 * activated by a succession of input of value @token and whose number
6062 * is between @min and @max
6063 *
6064 * Returns the target state or NULL in case of error
6065 */
6066xmlAutomataStatePtr
6067xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6068 xmlAutomataStatePtr to, const xmlChar *token,
6069 int min, int max, void *data) {
6070 xmlRegAtomPtr atom;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006071 int counter;
Daniel Veillard4255d502002-04-16 15:50:10 +00006072
6073 if ((am == NULL) || (from == NULL) || (token == NULL))
6074 return(NULL);
6075 if (min < 0)
6076 return(NULL);
6077 if ((max < min) || (max < 1))
6078 return(NULL);
6079 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6080 if (atom == NULL)
6081 return(NULL);
6082 atom->valuep = xmlStrdup(token);
6083 atom->data = data;
6084 if (min == 0)
6085 atom->min = 1;
6086 else
6087 atom->min = min;
6088 atom->max = max;
6089
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006090 /*
6091 * associate a counter to the transition.
6092 */
6093 counter = xmlRegGetCounter(am);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01006094 if (counter < 0)
6095 goto error;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006096 am->counters[counter].min = min;
6097 am->counters[counter].max = max;
6098
6099 /* xmlFAGenerateTransitions(am, from, to, atom); */
6100 if (to == NULL) {
6101 to = xmlRegNewState(am);
6102 xmlRegStatePush(am, to);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00006103 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006104 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006105 xmlRegAtomPush(am, atom);
6106 am->state = to;
6107
Daniel Veillard4255d502002-04-16 15:50:10 +00006108 if (to == NULL)
6109 to = am->state;
6110 if (to == NULL)
6111 return(NULL);
6112 if (min == 0)
6113 xmlFAGenerateEpsilonTransition(am, from, to);
6114 return(to);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01006115
6116error:
6117 xmlRegFreeAtom(atom);
6118 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00006119}
6120
6121/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006122 * xmlAutomataNewOnceTrans2:
6123 * @am: an automata
6124 * @from: the starting point of the transition
6125 * @to: the target point of the transition or NULL
6126 * @token: the input string associated to that transition
6127 * @token2: the second input string associated to that transition
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006128 * @min: the minimum successive occurrences of token
6129 * @max: the maximum successive occurrences of token
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006130 * @data: data associated to the transition
6131 *
6132 * If @to is NULL, this creates first a new target state in the automata
6133 * and then adds a transition from the @from state to the target state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006134 * activated by a succession of input of value @token and @token2 and whose
6135 * number is between @min and @max, moreover that transition can only be
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006136 * crossed once.
6137 *
6138 * Returns the target state or NULL in case of error
6139 */
6140xmlAutomataStatePtr
6141xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6142 xmlAutomataStatePtr to, const xmlChar *token,
6143 const xmlChar *token2,
6144 int min, int max, void *data) {
6145 xmlRegAtomPtr atom;
6146 int counter;
6147
6148 if ((am == NULL) || (from == NULL) || (token == NULL))
6149 return(NULL);
6150 if (min < 1)
6151 return(NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006152 if (max < min)
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006153 return(NULL);
6154 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6155 if (atom == NULL)
6156 return(NULL);
6157 if ((token2 == NULL) || (*token2 == 0)) {
6158 atom->valuep = xmlStrdup(token);
6159 } else {
6160 int lenn, lenp;
6161 xmlChar *str;
6162
6163 lenn = strlen((char *) token2);
6164 lenp = strlen((char *) token);
6165
6166 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6167 if (str == NULL) {
6168 xmlRegFreeAtom(atom);
6169 return(NULL);
6170 }
6171 memcpy(&str[0], token, lenp);
6172 str[lenp] = '|';
6173 memcpy(&str[lenp + 1], token2, lenn);
6174 str[lenn + lenp + 1] = 0;
6175
6176 atom->valuep = str;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006177 }
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006178 atom->data = data;
6179 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
Daniel Veillard11ce4002006-03-10 00:36:23 +00006180 atom->min = min;
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006181 atom->max = max;
6182 /*
6183 * associate a counter to the transition.
6184 */
6185 counter = xmlRegGetCounter(am);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01006186 if (counter < 0)
6187 goto error;
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006188 am->counters[counter].min = 1;
6189 am->counters[counter].max = 1;
6190
6191 /* xmlFAGenerateTransitions(am, from, to, atom); */
6192 if (to == NULL) {
6193 to = xmlRegNewState(am);
6194 xmlRegStatePush(am, to);
6195 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006196 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006197 xmlRegAtomPush(am, atom);
6198 am->state = to;
6199 return(to);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01006200
6201error:
6202 xmlRegFreeAtom(atom);
6203 return(NULL);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006204}
6205
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006206
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006207
6208/**
Daniel Veillard7646b182002-04-20 06:41:40 +00006209 * xmlAutomataNewOnceTrans:
6210 * @am: an automata
6211 * @from: the starting point of the transition
6212 * @to: the target point of the transition or NULL
6213 * @token: the input string associated to that transition
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006214 * @min: the minimum successive occurrences of token
6215 * @max: the maximum successive occurrences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006216 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00006217 *
William M. Brackddf71d62004-05-06 04:17:26 +00006218 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00006219 * and then adds a transition from the @from state to the target state
6220 * activated by a succession of input of value @token and whose number
William M. Brackddf71d62004-05-06 04:17:26 +00006221 * is between @min and @max, moreover that transition can only be crossed
Daniel Veillard7646b182002-04-20 06:41:40 +00006222 * once.
6223 *
6224 * Returns the target state or NULL in case of error
6225 */
6226xmlAutomataStatePtr
6227xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6228 xmlAutomataStatePtr to, const xmlChar *token,
6229 int min, int max, void *data) {
6230 xmlRegAtomPtr atom;
6231 int counter;
6232
6233 if ((am == NULL) || (from == NULL) || (token == NULL))
6234 return(NULL);
6235 if (min < 1)
6236 return(NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006237 if (max < min)
Daniel Veillard7646b182002-04-20 06:41:40 +00006238 return(NULL);
6239 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6240 if (atom == NULL)
6241 return(NULL);
6242 atom->valuep = xmlStrdup(token);
6243 atom->data = data;
6244 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
Daniel Veillard11ce4002006-03-10 00:36:23 +00006245 atom->min = min;
Daniel Veillard7646b182002-04-20 06:41:40 +00006246 atom->max = max;
6247 /*
6248 * associate a counter to the transition.
6249 */
6250 counter = xmlRegGetCounter(am);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01006251 if (counter < 0)
6252 goto error;
Daniel Veillard7646b182002-04-20 06:41:40 +00006253 am->counters[counter].min = 1;
6254 am->counters[counter].max = 1;
6255
6256 /* xmlFAGenerateTransitions(am, from, to, atom); */
6257 if (to == NULL) {
6258 to = xmlRegNewState(am);
6259 xmlRegStatePush(am, to);
6260 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006261 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard7646b182002-04-20 06:41:40 +00006262 xmlRegAtomPush(am, atom);
6263 am->state = to;
Daniel Veillard7646b182002-04-20 06:41:40 +00006264 return(to);
Nick Wellnhofer1ccf89b2023-02-17 15:53:07 +01006265
6266error:
6267 xmlRegFreeAtom(atom);
6268 return(NULL);
Daniel Veillard7646b182002-04-20 06:41:40 +00006269}
6270
6271/**
Daniel Veillard4255d502002-04-16 15:50:10 +00006272 * xmlAutomataNewState:
6273 * @am: an automata
6274 *
6275 * Create a new disconnected state in the automata
6276 *
6277 * Returns the new state or NULL in case of error
6278 */
6279xmlAutomataStatePtr
6280xmlAutomataNewState(xmlAutomataPtr am) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006281 xmlAutomataStatePtr to;
Daniel Veillard4255d502002-04-16 15:50:10 +00006282
6283 if (am == NULL)
6284 return(NULL);
6285 to = xmlRegNewState(am);
6286 xmlRegStatePush(am, to);
6287 return(to);
6288}
6289
6290/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006291 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00006292 * @am: an automata
6293 * @from: the starting point of the transition
6294 * @to: the target point of the transition or NULL
6295 *
William M. Brackddf71d62004-05-06 04:17:26 +00006296 * If @to is NULL, this creates first a new target state in the automata
6297 * and then adds an epsilon transition from the @from state to the
Daniel Veillard4255d502002-04-16 15:50:10 +00006298 * target state
6299 *
6300 * Returns the target state or NULL in case of error
6301 */
6302xmlAutomataStatePtr
6303xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6304 xmlAutomataStatePtr to) {
6305 if ((am == NULL) || (from == NULL))
6306 return(NULL);
6307 xmlFAGenerateEpsilonTransition(am, from, to);
6308 if (to == NULL)
6309 return(am->state);
6310 return(to);
6311}
6312
Daniel Veillardb509f152002-04-17 16:28:10 +00006313/**
Daniel Veillard7646b182002-04-20 06:41:40 +00006314 * xmlAutomataNewAllTrans:
6315 * @am: an automata
6316 * @from: the starting point of the transition
6317 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006318 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00006319 *
William M. Brackddf71d62004-05-06 04:17:26 +00006320 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00006321 * and then adds a an ALL transition from the @from state to the
6322 * target state. That transition is an epsilon transition allowed only when
6323 * all transitions from the @from node have been activated.
6324 *
6325 * Returns the target state or NULL in case of error
6326 */
6327xmlAutomataStatePtr
6328xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00006329 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00006330 if ((am == NULL) || (from == NULL))
6331 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00006332 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00006333 if (to == NULL)
6334 return(am->state);
6335 return(to);
6336}
6337
6338/**
Daniel Veillardb509f152002-04-17 16:28:10 +00006339 * xmlAutomataNewCounter:
6340 * @am: an automata
6341 * @min: the minimal value on the counter
6342 * @max: the maximal value on the counter
6343 *
6344 * Create a new counter
6345 *
6346 * Returns the counter number or -1 in case of error
6347 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006348int
Daniel Veillardb509f152002-04-17 16:28:10 +00006349xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6350 int ret;
6351
6352 if (am == NULL)
6353 return(-1);
6354
6355 ret = xmlRegGetCounter(am);
6356 if (ret < 0)
6357 return(-1);
6358 am->counters[ret].min = min;
6359 am->counters[ret].max = max;
6360 return(ret);
6361}
6362
6363/**
6364 * xmlAutomataNewCountedTrans:
6365 * @am: an automata
6366 * @from: the starting point of the transition
6367 * @to: the target point of the transition or NULL
6368 * @counter: the counter associated to that transition
6369 *
William M. Brackddf71d62004-05-06 04:17:26 +00006370 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006371 * and then adds an epsilon transition from the @from state to the target state
6372 * which will increment the counter provided
6373 *
6374 * Returns the target state or NULL in case of error
6375 */
6376xmlAutomataStatePtr
6377xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6378 xmlAutomataStatePtr to, int counter) {
6379 if ((am == NULL) || (from == NULL) || (counter < 0))
6380 return(NULL);
6381 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6382 if (to == NULL)
6383 return(am->state);
6384 return(to);
6385}
6386
6387/**
6388 * xmlAutomataNewCounterTrans:
6389 * @am: an automata
6390 * @from: the starting point of the transition
6391 * @to: the target point of the transition or NULL
6392 * @counter: the counter associated to that transition
6393 *
William M. Brackddf71d62004-05-06 04:17:26 +00006394 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006395 * and then adds an epsilon transition from the @from state to the target state
6396 * which will be allowed only if the counter is within the right range.
6397 *
6398 * Returns the target state or NULL in case of error
6399 */
6400xmlAutomataStatePtr
6401xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6402 xmlAutomataStatePtr to, int counter) {
6403 if ((am == NULL) || (from == NULL) || (counter < 0))
6404 return(NULL);
6405 xmlFAGenerateCountedTransition(am, from, to, counter);
6406 if (to == NULL)
6407 return(am->state);
6408 return(to);
6409}
Daniel Veillard4255d502002-04-16 15:50:10 +00006410
6411/**
6412 * xmlAutomataCompile:
6413 * @am: an automata
6414 *
6415 * Compile the automata into a Reg Exp ready for being executed.
6416 * The automata should be free after this point.
6417 *
6418 * Returns the compiled regexp or NULL in case of error
6419 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006420xmlRegexpPtr
Daniel Veillard4255d502002-04-16 15:50:10 +00006421xmlAutomataCompile(xmlAutomataPtr am) {
6422 xmlRegexpPtr ret;
6423
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00006424 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00006425 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00006426 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00006427 ret = xmlRegEpxFromParse(am);
6428
6429 return(ret);
6430}
Daniel Veillarde19fc232002-04-22 16:01:24 +00006431
6432/**
6433 * xmlAutomataIsDeterminist:
6434 * @am: an automata
6435 *
6436 * Checks if an automata is determinist.
6437 *
6438 * Returns 1 if true, 0 if not, and -1 in case of error
6439 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006440int
Daniel Veillarde19fc232002-04-22 16:01:24 +00006441xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6442 int ret;
6443
6444 if (am == NULL)
6445 return(-1);
6446
6447 ret = xmlFAComputesDeterminism(am);
6448 return(ret);
6449}
Daniel Veillard4255d502002-04-16 15:50:10 +00006450#endif /* LIBXML_AUTOMATA_ENABLED */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006451
6452#ifdef LIBXML_EXPR_ENABLED
6453/************************************************************************
6454 * *
6455 * Formal Expression handling code *
6456 * *
6457 ************************************************************************/
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006458/************************************************************************
6459 * *
6460 * Expression handling context *
6461 * *
6462 ************************************************************************/
6463
6464struct _xmlExpCtxt {
6465 xmlDictPtr dict;
6466 xmlExpNodePtr *table;
6467 int size;
6468 int nbElems;
6469 int nb_nodes;
Daniel Veillard594e5df2009-09-07 14:58:47 +02006470 int maxNodes;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006471 const char *expr;
6472 const char *cur;
6473 int nb_cons;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006474 int tabSize;
6475};
6476
6477/**
6478 * xmlExpNewCtxt:
6479 * @maxNodes: the maximum number of nodes
Jan Pokornýbb654fe2016-04-13 16:56:07 +02006480 * @dict: optional dictionary to use internally
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006481 *
6482 * Creates a new context for manipulating expressions
6483 *
6484 * Returns the context or NULL in case of error
6485 */
6486xmlExpCtxtPtr
6487xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6488 xmlExpCtxtPtr ret;
6489 int size = 256;
6490
6491 if (maxNodes <= 4096)
6492 maxNodes = 4096;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006493
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006494 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6495 if (ret == NULL)
6496 return(NULL);
6497 memset(ret, 0, sizeof(xmlExpCtxt));
6498 ret->size = size;
6499 ret->nbElems = 0;
Daniel Veillard594e5df2009-09-07 14:58:47 +02006500 ret->maxNodes = maxNodes;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006501 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6502 if (ret->table == NULL) {
6503 xmlFree(ret);
6504 return(NULL);
6505 }
6506 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6507 if (dict == NULL) {
6508 ret->dict = xmlDictCreate();
6509 if (ret->dict == NULL) {
6510 xmlFree(ret->table);
6511 xmlFree(ret);
6512 return(NULL);
6513 }
6514 } else {
6515 ret->dict = dict;
6516 xmlDictReference(ret->dict);
6517 }
6518 return(ret);
6519}
6520
6521/**
6522 * xmlExpFreeCtxt:
6523 * @ctxt: an expression context
6524 *
6525 * Free an expression context
6526 */
6527void
6528xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6529 if (ctxt == NULL)
6530 return;
6531 xmlDictFree(ctxt->dict);
6532 if (ctxt->table != NULL)
6533 xmlFree(ctxt->table);
6534 xmlFree(ctxt);
6535}
6536
6537/************************************************************************
6538 * *
6539 * Structure associated to an expression node *
6540 * *
6541 ************************************************************************/
Daniel Veillard465a0002005-08-22 12:07:04 +00006542#define MAX_NODES 10000
6543
6544/* #define DEBUG_DERIV */
6545
6546/*
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006547 * TODO:
Daniel Veillard465a0002005-08-22 12:07:04 +00006548 * - Wildcards
6549 * - public API for creation
6550 *
6551 * Started
6552 * - regression testing
6553 *
6554 * Done
6555 * - split into module and test tool
6556 * - memleaks
6557 */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006558
6559typedef enum {
6560 XML_EXP_NILABLE = (1 << 0)
6561} xmlExpNodeInfo;
6562
6563#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6564
6565struct _xmlExpNode {
6566 unsigned char type;/* xmlExpNodeType */
6567 unsigned char info;/* OR of xmlExpNodeInfo */
6568 unsigned short key; /* the hash key */
6569 unsigned int ref; /* The number of references */
6570 int c_max; /* the maximum length it can consume */
6571 xmlExpNodePtr exp_left;
6572 xmlExpNodePtr next;/* the next node in the hash table or free list */
6573 union {
6574 struct {
6575 int f_min;
6576 int f_max;
6577 } count;
6578 struct {
6579 xmlExpNodePtr f_right;
6580 } children;
6581 const xmlChar *f_str;
6582 } field;
6583};
6584
6585#define exp_min field.count.f_min
6586#define exp_max field.count.f_max
6587/* #define exp_left field.children.f_left */
6588#define exp_right field.children.f_right
6589#define exp_str field.f_str
6590
6591static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6592static xmlExpNode forbiddenExpNode = {
6593 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6594};
6595xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6596static xmlExpNode emptyExpNode = {
6597 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6598};
6599xmlExpNodePtr emptyExp = &emptyExpNode;
6600
6601/************************************************************************
6602 * *
6603 * The custom hash table for unicity and canonicalization *
6604 * of sub-expressions pointers *
6605 * *
6606 ************************************************************************/
6607/*
6608 * xmlExpHashNameComputeKey:
6609 * Calculate the hash key for a token
6610 */
6611static unsigned short
6612xmlExpHashNameComputeKey(const xmlChar *name) {
6613 unsigned short value = 0L;
6614 char ch;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006615
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006616 if (name != NULL) {
6617 value += 30 * (*name);
6618 while ((ch = *name++) != 0) {
6619 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6620 }
6621 }
6622 return (value);
6623}
6624
6625/*
6626 * xmlExpHashComputeKey:
6627 * Calculate the hash key for a compound expression
6628 */
6629static unsigned short
6630xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6631 xmlExpNodePtr right) {
6632 unsigned long value;
6633 unsigned short ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006634
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006635 switch (type) {
6636 case XML_EXP_SEQ:
6637 value = left->key;
6638 value += right->key;
6639 value *= 3;
6640 ret = (unsigned short) value;
6641 break;
6642 case XML_EXP_OR:
6643 value = left->key;
6644 value += right->key;
6645 value *= 7;
6646 ret = (unsigned short) value;
6647 break;
6648 case XML_EXP_COUNT:
6649 value = left->key;
6650 value += right->key;
6651 ret = (unsigned short) value;
6652 break;
6653 default:
6654 ret = 0;
6655 }
6656 return(ret);
6657}
6658
6659
6660static xmlExpNodePtr
6661xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6662 xmlExpNodePtr ret;
6663
6664 if (ctxt->nb_nodes >= MAX_NODES)
6665 return(NULL);
6666 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6667 if (ret == NULL)
6668 return(NULL);
6669 memset(ret, 0, sizeof(xmlExpNode));
6670 ret->type = type;
6671 ret->next = NULL;
6672 ctxt->nb_nodes++;
6673 ctxt->nb_cons++;
6674 return(ret);
6675}
6676
6677/**
6678 * xmlExpHashGetEntry:
6679 * @table: the hash table
6680 *
6681 * Get the unique entry from the hash table. The entry is created if
6682 * needed. @left and @right are consumed, i.e. their ref count will
6683 * be decremented by the operation.
6684 *
6685 * Returns the pointer or NULL in case of error
6686 */
6687static xmlExpNodePtr
6688xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6689 xmlExpNodePtr left, xmlExpNodePtr right,
6690 const xmlChar *name, int min, int max) {
6691 unsigned short kbase, key;
6692 xmlExpNodePtr entry;
6693 xmlExpNodePtr insert;
6694
6695 if (ctxt == NULL)
6696 return(NULL);
6697
6698 /*
6699 * Check for duplicate and insertion location.
6700 */
6701 if (type == XML_EXP_ATOM) {
6702 kbase = xmlExpHashNameComputeKey(name);
6703 } else if (type == XML_EXP_COUNT) {
6704 /* COUNT reduction rule 1 */
6705 /* a{1} -> a */
6706 if (min == max) {
6707 if (min == 1) {
6708 return(left);
6709 }
6710 if (min == 0) {
6711 xmlExpFree(ctxt, left);
6712 return(emptyExp);
6713 }
6714 }
6715 if (min < 0) {
6716 xmlExpFree(ctxt, left);
6717 return(forbiddenExp);
6718 }
6719 if (max == -1)
6720 kbase = min + 79;
6721 else
6722 kbase = max - min;
6723 kbase += left->key;
6724 } else if (type == XML_EXP_OR) {
6725 /* Forbid reduction rules */
6726 if (left->type == XML_EXP_FORBID) {
6727 xmlExpFree(ctxt, left);
6728 return(right);
6729 }
6730 if (right->type == XML_EXP_FORBID) {
6731 xmlExpFree(ctxt, right);
6732 return(left);
6733 }
6734
6735 /* OR reduction rule 1 */
6736 /* a | a reduced to a */
6737 if (left == right) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006738 xmlExpFree(ctxt, right);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006739 return(left);
6740 }
6741 /* OR canonicalization rule 1 */
6742 /* linearize (a | b) | c into a | (b | c) */
6743 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6744 xmlExpNodePtr tmp = left;
6745 left = right;
6746 right = tmp;
6747 }
6748 /* OR reduction rule 2 */
6749 /* a | (a | b) and b | (a | b) are reduced to a | b */
6750 if (right->type == XML_EXP_OR) {
6751 if ((left == right->exp_left) ||
6752 (left == right->exp_right)) {
6753 xmlExpFree(ctxt, left);
6754 return(right);
6755 }
6756 }
6757 /* OR canonicalization rule 2 */
6758 /* linearize (a | b) | c into a | (b | c) */
6759 if (left->type == XML_EXP_OR) {
6760 xmlExpNodePtr tmp;
6761
6762 /* OR canonicalization rule 2 */
6763 if ((left->exp_right->type != XML_EXP_OR) &&
6764 (left->exp_right->key < left->exp_left->key)) {
6765 tmp = left->exp_right;
6766 left->exp_right = left->exp_left;
6767 left->exp_left = tmp;
6768 }
6769 left->exp_right->ref++;
6770 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6771 NULL, 0, 0);
6772 left->exp_left->ref++;
6773 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6774 NULL, 0, 0);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006775
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006776 xmlExpFree(ctxt, left);
6777 return(tmp);
6778 }
6779 if (right->type == XML_EXP_OR) {
6780 /* Ordering in the tree */
6781 /* C | (A | B) -> A | (B | C) */
6782 if (left->key > right->exp_right->key) {
6783 xmlExpNodePtr tmp;
6784 right->exp_right->ref++;
6785 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6786 left, NULL, 0, 0);
6787 right->exp_left->ref++;
6788 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6789 tmp, NULL, 0, 0);
6790 xmlExpFree(ctxt, right);
6791 return(tmp);
6792 }
6793 /* Ordering in the tree */
6794 /* B | (A | C) -> A | (B | C) */
6795 if (left->key > right->exp_left->key) {
6796 xmlExpNodePtr tmp;
6797 right->exp_right->ref++;
6798 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6799 right->exp_right, NULL, 0, 0);
6800 right->exp_left->ref++;
6801 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6802 tmp, NULL, 0, 0);
6803 xmlExpFree(ctxt, right);
6804 return(tmp);
6805 }
6806 }
6807 /* we know both types are != XML_EXP_OR here */
6808 else if (left->key > right->key) {
6809 xmlExpNodePtr tmp = left;
6810 left = right;
6811 right = tmp;
6812 }
6813 kbase = xmlExpHashComputeKey(type, left, right);
6814 } else if (type == XML_EXP_SEQ) {
6815 /* Forbid reduction rules */
6816 if (left->type == XML_EXP_FORBID) {
6817 xmlExpFree(ctxt, right);
6818 return(left);
6819 }
6820 if (right->type == XML_EXP_FORBID) {
6821 xmlExpFree(ctxt, left);
6822 return(right);
6823 }
6824 /* Empty reduction rules */
6825 if (right->type == XML_EXP_EMPTY) {
6826 return(left);
6827 }
6828 if (left->type == XML_EXP_EMPTY) {
6829 return(right);
6830 }
6831 kbase = xmlExpHashComputeKey(type, left, right);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006832 } else
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006833 return(NULL);
6834
6835 key = kbase % ctxt->size;
6836 if (ctxt->table[key] != NULL) {
6837 for (insert = ctxt->table[key]; insert != NULL;
6838 insert = insert->next) {
6839 if ((insert->key == kbase) &&
6840 (insert->type == type)) {
6841 if (type == XML_EXP_ATOM) {
6842 if (name == insert->exp_str) {
6843 insert->ref++;
6844 return(insert);
6845 }
6846 } else if (type == XML_EXP_COUNT) {
6847 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6848 (insert->exp_left == left)) {
6849 insert->ref++;
6850 left->ref--;
6851 return(insert);
6852 }
6853 } else if ((insert->exp_left == left) &&
6854 (insert->exp_right == right)) {
6855 insert->ref++;
6856 left->ref--;
6857 right->ref--;
6858 return(insert);
6859 }
6860 }
6861 }
6862 }
6863
6864 entry = xmlExpNewNode(ctxt, type);
6865 if (entry == NULL)
6866 return(NULL);
6867 entry->key = kbase;
6868 if (type == XML_EXP_ATOM) {
6869 entry->exp_str = name;
6870 entry->c_max = 1;
6871 } else if (type == XML_EXP_COUNT) {
6872 entry->exp_min = min;
6873 entry->exp_max = max;
6874 entry->exp_left = left;
6875 if ((min == 0) || (IS_NILLABLE(left)))
6876 entry->info |= XML_EXP_NILABLE;
6877 if (max < 0)
6878 entry->c_max = -1;
6879 else
6880 entry->c_max = max * entry->exp_left->c_max;
6881 } else {
6882 entry->exp_left = left;
6883 entry->exp_right = right;
6884 if (type == XML_EXP_OR) {
6885 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6886 entry->info |= XML_EXP_NILABLE;
6887 if ((entry->exp_left->c_max == -1) ||
6888 (entry->exp_right->c_max == -1))
6889 entry->c_max = -1;
6890 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6891 entry->c_max = entry->exp_left->c_max;
6892 else
6893 entry->c_max = entry->exp_right->c_max;
6894 } else {
6895 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6896 entry->info |= XML_EXP_NILABLE;
6897 if ((entry->exp_left->c_max == -1) ||
6898 (entry->exp_right->c_max == -1))
6899 entry->c_max = -1;
6900 else
6901 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6902 }
6903 }
6904 entry->ref = 1;
6905 if (ctxt->table[key] != NULL)
6906 entry->next = ctxt->table[key];
6907
6908 ctxt->table[key] = entry;
6909 ctxt->nbElems++;
6910
6911 return(entry);
6912}
6913
6914/**
6915 * xmlExpFree:
6916 * @ctxt: the expression context
6917 * @exp: the expression
6918 *
6919 * Dereference the expression
6920 */
6921void
6922xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6923 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6924 return;
6925 exp->ref--;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006926 if (exp->ref == 0) {
6927 unsigned short key;
6928
6929 /* Unlink it first from the hash table */
6930 key = exp->key % ctxt->size;
6931 if (ctxt->table[key] == exp) {
6932 ctxt->table[key] = exp->next;
6933 } else {
6934 xmlExpNodePtr tmp;
6935
6936 tmp = ctxt->table[key];
6937 while (tmp != NULL) {
6938 if (tmp->next == exp) {
6939 tmp->next = exp->next;
6940 break;
6941 }
6942 tmp = tmp->next;
6943 }
6944 }
6945
6946 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6947 xmlExpFree(ctxt, exp->exp_left);
6948 xmlExpFree(ctxt, exp->exp_right);
6949 } else if (exp->type == XML_EXP_COUNT) {
6950 xmlExpFree(ctxt, exp->exp_left);
6951 }
6952 xmlFree(exp);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006953 ctxt->nb_nodes--;
6954 }
6955}
6956
6957/**
6958 * xmlExpRef:
6959 * @exp: the expression
6960 *
6961 * Increase the reference count of the expression
6962 */
6963void
6964xmlExpRef(xmlExpNodePtr exp) {
6965 if (exp != NULL)
6966 exp->ref++;
6967}
6968
Daniel Veillardccb4d412005-08-23 13:41:17 +00006969/**
6970 * xmlExpNewAtom:
6971 * @ctxt: the expression context
6972 * @name: the atom name
Michael Woodfb27e2c2012-09-28 08:59:33 +02006973 * @len: the atom name length in byte (or -1);
Daniel Veillardccb4d412005-08-23 13:41:17 +00006974 *
6975 * Get the atom associated to this name from that context
6976 *
6977 * Returns the node or NULL in case of error
6978 */
6979xmlExpNodePtr
6980xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6981 if ((ctxt == NULL) || (name == NULL))
6982 return(NULL);
6983 name = xmlDictLookup(ctxt->dict, name, len);
6984 if (name == NULL)
6985 return(NULL);
6986 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6987}
6988
6989/**
6990 * xmlExpNewOr:
6991 * @ctxt: the expression context
6992 * @left: left expression
6993 * @right: right expression
6994 *
6995 * Get the atom associated to the choice @left | @right
6996 * Note that @left and @right are consumed in the operation, to keep
6997 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6998 * this is true even in case of failure (unless ctxt == NULL).
6999 *
7000 * Returns the node or NULL in case of error
7001 */
7002xmlExpNodePtr
7003xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00007004 if (ctxt == NULL)
7005 return(NULL);
7006 if ((left == NULL) || (right == NULL)) {
Daniel Veillardccb4d412005-08-23 13:41:17 +00007007 xmlExpFree(ctxt, left);
7008 xmlExpFree(ctxt, right);
7009 return(NULL);
7010 }
7011 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
7012}
7013
7014/**
7015 * xmlExpNewSeq:
7016 * @ctxt: the expression context
7017 * @left: left expression
7018 * @right: right expression
7019 *
7020 * Get the atom associated to the sequence @left , @right
7021 * Note that @left and @right are consumed in the operation, to keep
7022 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7023 * this is true even in case of failure (unless ctxt == NULL).
7024 *
7025 * Returns the node or NULL in case of error
7026 */
7027xmlExpNodePtr
7028xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00007029 if (ctxt == NULL)
7030 return(NULL);
7031 if ((left == NULL) || (right == NULL)) {
Daniel Veillardccb4d412005-08-23 13:41:17 +00007032 xmlExpFree(ctxt, left);
7033 xmlExpFree(ctxt, right);
7034 return(NULL);
7035 }
7036 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
7037}
7038
7039/**
7040 * xmlExpNewRange:
7041 * @ctxt: the expression context
7042 * @subset: the expression to be repeated
7043 * @min: the lower bound for the repetition
7044 * @max: the upper bound for the repetition, -1 means infinite
7045 *
7046 * Get the atom associated to the range (@subset){@min, @max}
7047 * Note that @subset is consumed in the operation, to keep
7048 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
7049 * this is true even in case of failure (unless ctxt == NULL).
7050 *
7051 * Returns the node or NULL in case of error
7052 */
7053xmlExpNodePtr
7054xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00007055 if (ctxt == NULL)
7056 return(NULL);
7057 if ((subset == NULL) || (min < 0) || (max < -1) ||
Daniel Veillardccb4d412005-08-23 13:41:17 +00007058 ((max >= 0) && (min > max))) {
7059 xmlExpFree(ctxt, subset);
7060 return(NULL);
7061 }
7062 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
7063 NULL, NULL, min, max));
7064}
7065
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007066/************************************************************************
7067 * *
7068 * Public API for operations on expressions *
7069 * *
7070 ************************************************************************/
7071
7072static int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007073xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007074 const xmlChar**list, int len, int nb) {
7075 int tmp, tmp2;
7076tail:
7077 switch (exp->type) {
7078 case XML_EXP_EMPTY:
7079 return(0);
7080 case XML_EXP_ATOM:
7081 for (tmp = 0;tmp < nb;tmp++)
7082 if (list[tmp] == exp->exp_str)
7083 return(0);
7084 if (nb >= len)
7085 return(-2);
Daniel Veillard13cee4e2009-09-05 14:52:55 +02007086 list[nb] = exp->exp_str;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007087 return(1);
7088 case XML_EXP_COUNT:
7089 exp = exp->exp_left;
7090 goto tail;
7091 case XML_EXP_SEQ:
7092 case XML_EXP_OR:
7093 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7094 if (tmp < 0)
7095 return(tmp);
7096 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7097 nb + tmp);
7098 if (tmp2 < 0)
7099 return(tmp2);
7100 return(tmp + tmp2);
7101 }
7102 return(-1);
7103}
7104
7105/**
7106 * xmlExpGetLanguage:
7107 * @ctxt: the expression context
7108 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00007109 * @langList: where to store the tokens
Michael Woodfb27e2c2012-09-28 08:59:33 +02007110 * @len: the allocated length of @list
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007111 *
7112 * Find all the strings used in @exp and store them in @list
7113 *
7114 * Returns the number of unique strings found, -1 in case of errors and
7115 * -2 if there is more than @len strings
7116 */
7117int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007118xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00007119 const xmlChar**langList, int len) {
7120 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007121 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00007122 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007123}
7124
7125static int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007126xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007127 const xmlChar**list, int len, int nb) {
7128 int tmp, tmp2;
7129tail:
7130 switch (exp->type) {
7131 case XML_EXP_FORBID:
7132 return(0);
7133 case XML_EXP_EMPTY:
7134 return(0);
7135 case XML_EXP_ATOM:
7136 for (tmp = 0;tmp < nb;tmp++)
7137 if (list[tmp] == exp->exp_str)
7138 return(0);
7139 if (nb >= len)
7140 return(-2);
Daniel Veillard13cee4e2009-09-05 14:52:55 +02007141 list[nb] = exp->exp_str;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007142 return(1);
7143 case XML_EXP_COUNT:
7144 exp = exp->exp_left;
7145 goto tail;
7146 case XML_EXP_SEQ:
7147 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7148 if (tmp < 0)
7149 return(tmp);
7150 if (IS_NILLABLE(exp->exp_left)) {
7151 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7152 nb + tmp);
7153 if (tmp2 < 0)
7154 return(tmp2);
7155 tmp += tmp2;
7156 }
7157 return(tmp);
7158 case XML_EXP_OR:
7159 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7160 if (tmp < 0)
7161 return(tmp);
7162 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7163 nb + tmp);
7164 if (tmp2 < 0)
7165 return(tmp2);
7166 return(tmp + tmp2);
7167 }
7168 return(-1);
7169}
7170
7171/**
7172 * xmlExpGetStart:
7173 * @ctxt: the expression context
7174 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00007175 * @tokList: where to store the tokens
Michael Woodfb27e2c2012-09-28 08:59:33 +02007176 * @len: the allocated length of @list
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007177 *
7178 * Find all the strings that appears at the start of the languages
7179 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7180 * it will return the list [a, c]
7181 *
7182 * Returns the number of unique strings found, -1 in case of errors and
7183 * -2 if there is more than @len strings
7184 */
7185int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007186xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00007187 const xmlChar**tokList, int len) {
7188 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007189 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00007190 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007191}
7192
7193/**
7194 * xmlExpIsNillable:
7195 * @exp: the expression
7196 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007197 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007198 *
7199 * Returns 1 if nillable, 0 if not and -1 in case of error
7200 */
7201int
7202xmlExpIsNillable(xmlExpNodePtr exp) {
7203 if (exp == NULL)
7204 return(-1);
7205 return(IS_NILLABLE(exp) != 0);
7206}
7207
7208static xmlExpNodePtr
7209xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7210{
7211 xmlExpNodePtr ret;
7212
7213 switch (exp->type) {
7214 case XML_EXP_EMPTY:
7215 return(forbiddenExp);
7216 case XML_EXP_FORBID:
7217 return(forbiddenExp);
7218 case XML_EXP_ATOM:
7219 if (exp->exp_str == str) {
7220#ifdef DEBUG_DERIV
7221 printf("deriv atom: equal => Empty\n");
7222#endif
7223 ret = emptyExp;
7224 } else {
7225#ifdef DEBUG_DERIV
7226 printf("deriv atom: mismatch => forbid\n");
7227#endif
7228 /* TODO wildcards here */
7229 ret = forbiddenExp;
7230 }
7231 return(ret);
7232 case XML_EXP_OR: {
7233 xmlExpNodePtr tmp;
7234
7235#ifdef DEBUG_DERIV
7236 printf("deriv or: => or(derivs)\n");
7237#endif
7238 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7239 if (tmp == NULL) {
7240 return(NULL);
7241 }
7242 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7243 if (ret == NULL) {
7244 xmlExpFree(ctxt, tmp);
7245 return(NULL);
7246 }
7247 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7248 NULL, 0, 0);
7249 return(ret);
7250 }
7251 case XML_EXP_SEQ:
7252#ifdef DEBUG_DERIV
7253 printf("deriv seq: starting with left\n");
7254#endif
7255 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7256 if (ret == NULL) {
7257 return(NULL);
7258 } else if (ret == forbiddenExp) {
7259 if (IS_NILLABLE(exp->exp_left)) {
7260#ifdef DEBUG_DERIV
7261 printf("deriv seq: left failed but nillable\n");
7262#endif
7263 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7264 }
7265 } else {
7266#ifdef DEBUG_DERIV
7267 printf("deriv seq: left match => sequence\n");
7268#endif
7269 exp->exp_right->ref++;
7270 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7271 NULL, 0, 0);
7272 }
7273 return(ret);
7274 case XML_EXP_COUNT: {
7275 int min, max;
7276 xmlExpNodePtr tmp;
7277
7278 if (exp->exp_max == 0)
7279 return(forbiddenExp);
7280 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7281 if (ret == NULL)
7282 return(NULL);
7283 if (ret == forbiddenExp) {
7284#ifdef DEBUG_DERIV
7285 printf("deriv count: pattern mismatch => forbid\n");
7286#endif
7287 return(ret);
7288 }
7289 if (exp->exp_max == 1)
7290 return(ret);
7291 if (exp->exp_max < 0) /* unbounded */
7292 max = -1;
7293 else
7294 max = exp->exp_max - 1;
7295 if (exp->exp_min > 0)
7296 min = exp->exp_min - 1;
7297 else
7298 min = 0;
7299 exp->exp_left->ref++;
7300 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7301 NULL, min, max);
7302 if (ret == emptyExp) {
7303#ifdef DEBUG_DERIV
7304 printf("deriv count: match to empty => new count\n");
7305#endif
7306 return(tmp);
7307 }
7308#ifdef DEBUG_DERIV
7309 printf("deriv count: match => sequence with new count\n");
7310#endif
7311 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7312 NULL, 0, 0));
7313 }
7314 }
7315 return(NULL);
7316}
7317
7318/**
7319 * xmlExpStringDerive:
7320 * @ctxt: the expression context
7321 * @exp: the expression
7322 * @str: the string
7323 * @len: the string len in bytes if available
7324 *
7325 * Do one step of Brzozowski derivation of the expression @exp with
7326 * respect to the input string
7327 *
7328 * Returns the resulting expression or NULL in case of internal error
7329 */
7330xmlExpNodePtr
7331xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7332 const xmlChar *str, int len) {
7333 const xmlChar *input;
7334
7335 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7336 return(NULL);
7337 }
7338 /*
Jan Pokornýbb654fe2016-04-13 16:56:07 +02007339 * check the string is in the dictionary, if yes use an interned
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007340 * copy, otherwise we know it's not an acceptable input
7341 */
7342 input = xmlDictExists(ctxt->dict, str, len);
7343 if (input == NULL) {
7344 return(forbiddenExp);
7345 }
7346 return(xmlExpStringDeriveInt(ctxt, exp, input));
7347}
7348
7349static int
7350xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7351 int ret = 1;
7352
7353 if (sub->c_max == -1) {
7354 if (exp->c_max != -1)
7355 ret = 0;
7356 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7357 ret = 0;
7358 }
7359#if 0
7360 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7361 ret = 0;
7362#endif
7363 return(ret);
7364}
7365
7366static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7367 xmlExpNodePtr sub);
7368/**
7369 * xmlExpDivide:
7370 * @ctxt: the expressions context
7371 * @exp: the englobing expression
7372 * @sub: the subexpression
7373 * @mult: the multiple expression
7374 * @remain: the remain from the derivation of the multiple
7375 *
7376 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7377 * so that sub{n} subsume exp
7378 *
7379 * Returns the multiple value if successful, 0 if it is not a multiple
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007380 * and -1 in case of internal error.
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007381 */
7382
7383static int
7384xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7385 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7386 int i;
7387 xmlExpNodePtr tmp, tmp2;
7388
7389 if (mult != NULL) *mult = NULL;
7390 if (remain != NULL) *remain = NULL;
7391 if (exp->c_max == -1) return(0);
7392 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7393
7394 for (i = 1;i <= exp->c_max;i++) {
7395 sub->ref++;
7396 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7397 sub, NULL, NULL, i, i);
7398 if (tmp == NULL) {
7399 return(-1);
7400 }
7401 if (!xmlExpCheckCard(tmp, exp)) {
7402 xmlExpFree(ctxt, tmp);
7403 continue;
7404 }
7405 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7406 if (tmp2 == NULL) {
7407 xmlExpFree(ctxt, tmp);
7408 return(-1);
7409 }
7410 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7411 if (remain != NULL)
7412 *remain = tmp2;
7413 else
7414 xmlExpFree(ctxt, tmp2);
7415 if (mult != NULL)
7416 *mult = tmp;
7417 else
7418 xmlExpFree(ctxt, tmp);
7419#ifdef DEBUG_DERIV
7420 printf("Divide succeeded %d\n", i);
7421#endif
7422 return(i);
7423 }
7424 xmlExpFree(ctxt, tmp);
7425 xmlExpFree(ctxt, tmp2);
7426 }
7427#ifdef DEBUG_DERIV
7428 printf("Divide failed\n");
7429#endif
7430 return(0);
7431}
7432
7433/**
7434 * xmlExpExpDeriveInt:
7435 * @ctxt: the expressions context
7436 * @exp: the englobing expression
7437 * @sub: the subexpression
7438 *
7439 * Try to do a step of Brzozowski derivation but at a higher level
7440 * the input being a subexpression.
7441 *
7442 * Returns the resulting expression or NULL in case of internal error
7443 */
7444static xmlExpNodePtr
7445xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7446 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7447 const xmlChar **tab;
7448 int len, i;
7449
7450 /*
7451 * In case of equality and if the expression can only consume a finite
7452 * amount, then the derivation is empty
7453 */
7454 if ((exp == sub) && (exp->c_max >= 0)) {
7455#ifdef DEBUG_DERIV
7456 printf("Equal(exp, sub) and finite -> Empty\n");
7457#endif
7458 return(emptyExp);
7459 }
7460 /*
7461 * decompose sub sequence first
7462 */
7463 if (sub->type == XML_EXP_EMPTY) {
7464#ifdef DEBUG_DERIV
7465 printf("Empty(sub) -> Empty\n");
7466#endif
7467 exp->ref++;
7468 return(exp);
7469 }
7470 if (sub->type == XML_EXP_SEQ) {
7471#ifdef DEBUG_DERIV
7472 printf("Seq(sub) -> decompose\n");
7473#endif
7474 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7475 if (tmp == NULL)
7476 return(NULL);
7477 if (tmp == forbiddenExp)
7478 return(tmp);
7479 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7480 xmlExpFree(ctxt, tmp);
7481 return(ret);
7482 }
7483 if (sub->type == XML_EXP_OR) {
7484#ifdef DEBUG_DERIV
7485 printf("Or(sub) -> decompose\n");
7486#endif
7487 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7488 if (tmp == forbiddenExp)
7489 return(tmp);
7490 if (tmp == NULL)
7491 return(NULL);
7492 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7493 if ((ret == NULL) || (ret == forbiddenExp)) {
7494 xmlExpFree(ctxt, tmp);
7495 return(ret);
7496 }
7497 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7498 }
7499 if (!xmlExpCheckCard(exp, sub)) {
7500#ifdef DEBUG_DERIV
7501 printf("CheckCard(exp, sub) failed -> Forbid\n");
7502#endif
7503 return(forbiddenExp);
7504 }
7505 switch (exp->type) {
7506 case XML_EXP_EMPTY:
7507 if (sub == emptyExp)
7508 return(emptyExp);
7509#ifdef DEBUG_DERIV
7510 printf("Empty(exp) -> Forbid\n");
7511#endif
7512 return(forbiddenExp);
7513 case XML_EXP_FORBID:
7514#ifdef DEBUG_DERIV
7515 printf("Forbid(exp) -> Forbid\n");
7516#endif
7517 return(forbiddenExp);
7518 case XML_EXP_ATOM:
7519 if (sub->type == XML_EXP_ATOM) {
7520 /* TODO: handle wildcards */
7521 if (exp->exp_str == sub->exp_str) {
7522#ifdef DEBUG_DERIV
7523 printf("Atom match -> Empty\n");
7524#endif
7525 return(emptyExp);
7526 }
7527#ifdef DEBUG_DERIV
7528 printf("Atom mismatch -> Forbid\n");
7529#endif
7530 return(forbiddenExp);
7531 }
7532 if ((sub->type == XML_EXP_COUNT) &&
7533 (sub->exp_max == 1) &&
7534 (sub->exp_left->type == XML_EXP_ATOM)) {
7535 /* TODO: handle wildcards */
7536 if (exp->exp_str == sub->exp_left->exp_str) {
7537#ifdef DEBUG_DERIV
7538 printf("Atom match -> Empty\n");
7539#endif
7540 return(emptyExp);
7541 }
7542#ifdef DEBUG_DERIV
7543 printf("Atom mismatch -> Forbid\n");
7544#endif
7545 return(forbiddenExp);
7546 }
7547#ifdef DEBUG_DERIV
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007548 printf("Complex exp vs Atom -> Forbid\n");
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007549#endif
7550 return(forbiddenExp);
7551 case XML_EXP_SEQ:
7552 /* try to get the sequence consumed only if possible */
7553 if (xmlExpCheckCard(exp->exp_left, sub)) {
7554 /* See if the sequence can be consumed directly */
7555#ifdef DEBUG_DERIV
7556 printf("Seq trying left only\n");
7557#endif
7558 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7559 if ((ret != forbiddenExp) && (ret != NULL)) {
7560#ifdef DEBUG_DERIV
7561 printf("Seq trying left only worked\n");
7562#endif
7563 /*
7564 * TODO: assumption here that we are determinist
7565 * i.e. we won't get to a nillable exp left
7566 * subset which could be matched by the right
7567 * part too.
7568 * e.g.: (a | b)+,(a | c) and 'a+,a'
7569 */
7570 exp->exp_right->ref++;
7571 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7572 exp->exp_right, NULL, 0, 0));
7573 }
7574#ifdef DEBUG_DERIV
7575 } else {
7576 printf("Seq: left too short\n");
7577#endif
7578 }
7579 /* Try instead to decompose */
7580 if (sub->type == XML_EXP_COUNT) {
7581 int min, max;
7582
7583#ifdef DEBUG_DERIV
7584 printf("Seq: sub is a count\n");
7585#endif
7586 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7587 if (ret == NULL)
7588 return(NULL);
7589 if (ret != forbiddenExp) {
7590#ifdef DEBUG_DERIV
7591 printf("Seq , Count match on left\n");
7592#endif
7593 if (sub->exp_max < 0)
7594 max = -1;
7595 else
7596 max = sub->exp_max -1;
7597 if (sub->exp_min > 0)
7598 min = sub->exp_min -1;
7599 else
7600 min = 0;
7601 exp->exp_right->ref++;
7602 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7603 exp->exp_right, NULL, 0, 0);
7604 if (tmp == NULL)
7605 return(NULL);
7606
7607 sub->exp_left->ref++;
7608 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7609 sub->exp_left, NULL, NULL, min, max);
7610 if (tmp2 == NULL) {
7611 xmlExpFree(ctxt, tmp);
7612 return(NULL);
7613 }
7614 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7615 xmlExpFree(ctxt, tmp);
7616 xmlExpFree(ctxt, tmp2);
7617 return(ret);
7618 }
7619 }
7620 /* we made no progress on structured operations */
7621 break;
7622 case XML_EXP_OR:
7623#ifdef DEBUG_DERIV
7624 printf("Or , trying both side\n");
7625#endif
7626 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7627 if (ret == NULL)
7628 return(NULL);
7629 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7630 if (tmp == NULL) {
7631 xmlExpFree(ctxt, ret);
7632 return(NULL);
7633 }
7634 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7635 case XML_EXP_COUNT: {
7636 int min, max;
7637
7638 if (sub->type == XML_EXP_COUNT) {
7639 /*
7640 * Try to see if the loop is completely subsumed
7641 */
7642 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7643 if (tmp == NULL)
7644 return(NULL);
7645 if (tmp == forbiddenExp) {
7646 int mult;
7647
7648#ifdef DEBUG_DERIV
7649 printf("Count, Count inner don't subsume\n");
7650#endif
7651 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7652 NULL, &tmp);
7653 if (mult <= 0) {
7654#ifdef DEBUG_DERIV
7655 printf("Count, Count not multiple => forbidden\n");
7656#endif
7657 return(forbiddenExp);
7658 }
7659 if (sub->exp_max == -1) {
7660 max = -1;
7661 if (exp->exp_max == -1) {
7662 if (exp->exp_min <= sub->exp_min * mult)
7663 min = 0;
7664 else
7665 min = exp->exp_min - sub->exp_min * mult;
7666 } else {
7667#ifdef DEBUG_DERIV
7668 printf("Count, Count finite can't subsume infinite\n");
7669#endif
7670 xmlExpFree(ctxt, tmp);
7671 return(forbiddenExp);
7672 }
7673 } else {
7674 if (exp->exp_max == -1) {
7675#ifdef DEBUG_DERIV
7676 printf("Infinite loop consume mult finite loop\n");
7677#endif
7678 if (exp->exp_min > sub->exp_min * mult) {
7679 max = -1;
7680 min = exp->exp_min - sub->exp_min * mult;
7681 } else {
7682 max = -1;
7683 min = 0;
7684 }
7685 } else {
7686 if (exp->exp_max < sub->exp_max * mult) {
7687#ifdef DEBUG_DERIV
7688 printf("loops max mult mismatch => forbidden\n");
7689#endif
7690 xmlExpFree(ctxt, tmp);
7691 return(forbiddenExp);
7692 }
7693 if (sub->exp_max * mult > exp->exp_min)
7694 min = 0;
7695 else
7696 min = exp->exp_min - sub->exp_max * mult;
7697 max = exp->exp_max - sub->exp_max * mult;
7698 }
7699 }
7700 } else if (!IS_NILLABLE(tmp)) {
7701 /*
7702 * TODO: loop here to try to grow if working on finite
7703 * blocks.
7704 */
7705#ifdef DEBUG_DERIV
7706 printf("Count, Count remain not nillable => forbidden\n");
7707#endif
7708 xmlExpFree(ctxt, tmp);
7709 return(forbiddenExp);
7710 } else if (sub->exp_max == -1) {
7711 if (exp->exp_max == -1) {
7712 if (exp->exp_min <= sub->exp_min) {
7713#ifdef DEBUG_DERIV
7714 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7715#endif
7716 max = -1;
7717 min = 0;
7718 } else {
7719#ifdef DEBUG_DERIV
7720 printf("Infinite loops min => Count(X,Inf)\n");
7721#endif
7722 max = -1;
7723 min = exp->exp_min - sub->exp_min;
7724 }
7725 } else if (exp->exp_min > sub->exp_min) {
7726#ifdef DEBUG_DERIV
7727 printf("loops min mismatch 1 => forbidden ???\n");
7728#endif
7729 xmlExpFree(ctxt, tmp);
7730 return(forbiddenExp);
7731 } else {
7732 max = -1;
7733 min = 0;
7734 }
7735 } else {
7736 if (exp->exp_max == -1) {
7737#ifdef DEBUG_DERIV
7738 printf("Infinite loop consume finite loop\n");
7739#endif
7740 if (exp->exp_min > sub->exp_min) {
7741 max = -1;
7742 min = exp->exp_min - sub->exp_min;
7743 } else {
7744 max = -1;
7745 min = 0;
7746 }
7747 } else {
7748 if (exp->exp_max < sub->exp_max) {
7749#ifdef DEBUG_DERIV
7750 printf("loops max mismatch => forbidden\n");
7751#endif
7752 xmlExpFree(ctxt, tmp);
7753 return(forbiddenExp);
7754 }
7755 if (sub->exp_max > exp->exp_min)
7756 min = 0;
7757 else
7758 min = exp->exp_min - sub->exp_max;
7759 max = exp->exp_max - sub->exp_max;
7760 }
7761 }
7762#ifdef DEBUG_DERIV
7763 printf("loops match => SEQ(COUNT())\n");
7764#endif
7765 exp->exp_left->ref++;
7766 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7767 NULL, NULL, min, max);
7768 if (tmp2 == NULL) {
7769 return(NULL);
7770 }
7771 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7772 NULL, 0, 0);
7773 return(ret);
7774 }
7775 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7776 if (tmp == NULL)
7777 return(NULL);
7778 if (tmp == forbiddenExp) {
7779#ifdef DEBUG_DERIV
7780 printf("loop mismatch => forbidden\n");
7781#endif
7782 return(forbiddenExp);
7783 }
7784 if (exp->exp_min > 0)
7785 min = exp->exp_min - 1;
7786 else
7787 min = 0;
7788 if (exp->exp_max < 0)
7789 max = -1;
7790 else
7791 max = exp->exp_max - 1;
7792
7793#ifdef DEBUG_DERIV
7794 printf("loop match => SEQ(COUNT())\n");
7795#endif
7796 exp->exp_left->ref++;
7797 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7798 NULL, NULL, min, max);
7799 if (tmp2 == NULL)
7800 return(NULL);
7801 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7802 NULL, 0, 0);
7803 return(ret);
7804 }
7805 }
7806
Daniel Veillardccb4d412005-08-23 13:41:17 +00007807#ifdef DEBUG_DERIV
7808 printf("Fallback to derivative\n");
7809#endif
7810 if (IS_NILLABLE(sub)) {
7811 if (!(IS_NILLABLE(exp)))
7812 return(forbiddenExp);
7813 else
7814 ret = emptyExp;
7815 } else
7816 ret = NULL;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007817 /*
7818 * here the structured derivation made no progress so
7819 * we use the default token based derivation to force one more step
7820 */
7821 if (ctxt->tabSize == 0)
7822 ctxt->tabSize = 40;
7823
7824 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7825 sizeof(const xmlChar *));
7826 if (tab == NULL) {
7827 return(NULL);
7828 }
7829
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007830 /*
7831 * collect all the strings accepted by the subexpression on input
7832 */
7833 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7834 while (len < 0) {
7835 const xmlChar **temp;
Rob Richards54a8f672005-10-07 02:33:00 +00007836 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007837 sizeof(const xmlChar *));
7838 if (temp == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007839 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007840 return(NULL);
7841 }
7842 tab = temp;
7843 ctxt->tabSize *= 2;
7844 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7845 }
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007846 for (i = 0;i < len;i++) {
7847 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7848 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7849 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007850 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007851 return(tmp);
7852 }
7853 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7854 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7855 xmlExpFree(ctxt, tmp);
7856 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007857 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007858 return(tmp);
7859 }
7860 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7861 xmlExpFree(ctxt, tmp);
7862 xmlExpFree(ctxt, tmp2);
7863
7864 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7865 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007866 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007867 return(tmp3);
7868 }
7869
7870 if (ret == NULL)
7871 ret = tmp3;
7872 else {
7873 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7874 if (ret == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007875 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007876 return(NULL);
7877 }
7878 }
7879 }
Rob Richards54a8f672005-10-07 02:33:00 +00007880 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007881 return(ret);
7882}
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007883
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007884/**
Daniel Veillard0090bd52005-08-22 14:43:43 +00007885 * xmlExpExpDerive:
7886 * @ctxt: the expressions context
7887 * @exp: the englobing expression
7888 * @sub: the subexpression
7889 *
7890 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7891 * Based on algebraic derivation and sometimes direct Brzozowski derivation
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007892 * it usually takes less than linear time and can handle expressions generating
Daniel Veillard0090bd52005-08-22 14:43:43 +00007893 * infinite languages.
7894 *
7895 * Returns the resulting expression or NULL in case of internal error, the
7896 * result must be freed
7897 */
7898xmlExpNodePtr
7899xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7900 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7901 return(NULL);
7902
7903 /*
7904 * O(1) speedups
7905 */
7906 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7907#ifdef DEBUG_DERIV
7908 printf("Sub nillable and not exp : can't subsume\n");
7909#endif
7910 return(forbiddenExp);
7911 }
7912 if (xmlExpCheckCard(exp, sub) == 0) {
7913#ifdef DEBUG_DERIV
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007914 printf("sub generate longer sequences than exp : can't subsume\n");
Daniel Veillard0090bd52005-08-22 14:43:43 +00007915#endif
7916 return(forbiddenExp);
7917 }
7918 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7919}
7920
7921/**
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007922 * xmlExpSubsume:
7923 * @ctxt: the expressions context
7924 * @exp: the englobing expression
7925 * @sub: the subexpression
7926 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007927 * Check whether @exp accepts all the languages accepted by @sub
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007928 * the input being a subexpression.
7929 *
7930 * Returns 1 if true 0 if false and -1 in case of failure.
7931 */
7932int
7933xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7934 xmlExpNodePtr tmp;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007935
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007936 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7937 return(-1);
7938
7939 /*
7940 * TODO: speedup by checking the language of sub is a subset of the
7941 * language of exp
7942 */
7943 /*
7944 * O(1) speedups
7945 */
7946 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7947#ifdef DEBUG_DERIV
7948 printf("Sub nillable and not exp : can't subsume\n");
7949#endif
7950 return(0);
7951 }
7952 if (xmlExpCheckCard(exp, sub) == 0) {
7953#ifdef DEBUG_DERIV
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007954 printf("sub generate longer sequences than exp : can't subsume\n");
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007955#endif
7956 return(0);
7957 }
7958 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7959#ifdef DEBUG_DERIV
7960 printf("Result derivation :\n");
7961 PRINT_EXP(tmp);
7962#endif
7963 if (tmp == NULL)
7964 return(-1);
7965 if (tmp == forbiddenExp)
7966 return(0);
7967 if (tmp == emptyExp)
7968 return(1);
7969 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7970 xmlExpFree(ctxt, tmp);
7971 return(1);
7972 }
7973 xmlExpFree(ctxt, tmp);
7974 return(0);
7975}
Daniel Veillard465a0002005-08-22 12:07:04 +00007976
7977/************************************************************************
7978 * *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007979 * Parsing expression *
Daniel Veillard465a0002005-08-22 12:07:04 +00007980 * *
7981 ************************************************************************/
7982
7983static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7984
/*
 * Cursor helpers for the expression parser. They all operate on a
 * variable named `ctxt` that must be in scope where they are used.
 *
 * NOTE: NEXT and SKIP_BLANKS expand to complete statements including the
 * trailing ';' since call sites use them without one.
 */
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
/* the argument is parenthesized so that any expression can be passed */
#define IS_BLANK(c) (((c) == ' ') || ((c) == '\n') || ((c) == '\r') || ((c) == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7992
7993static int
7994xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7995 int ret = 0;
7996
7997 SKIP_BLANKS
7998 if (CUR == '*') {
7999 NEXT
8000 return(-1);
8001 }
8002 if ((CUR < '0') || (CUR > '9'))
8003 return(-1);
8004 while ((CUR >= '0') && (CUR <= '9')) {
8005 ret = ret * 10 + (CUR - '0');
8006 NEXT
8007 }
8008 return(ret);
8009}
8010
8011static xmlExpNodePtr
8012xmlExpParseOr(xmlExpCtxtPtr ctxt) {
8013 const char *base;
8014 xmlExpNodePtr ret;
8015 const xmlChar *val;
8016
8017 SKIP_BLANKS
8018 base = ctxt->cur;
8019 if (*ctxt->cur == '(') {
8020 NEXT
8021 ret = xmlExpParseExpr(ctxt);
8022 SKIP_BLANKS
8023 if (*ctxt->cur != ')') {
8024 fprintf(stderr, "unbalanced '(' : %s\n", base);
8025 xmlExpFree(ctxt, ret);
8026 return(NULL);
8027 }
8028 NEXT;
8029 SKIP_BLANKS
8030 goto parse_quantifier;
8031 }
8032 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
8033 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
8034 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
8035 NEXT;
8036 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
8037 if (val == NULL)
8038 return(NULL);
8039 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
8040 if (ret == NULL)
8041 return(NULL);
8042 SKIP_BLANKS
8043parse_quantifier:
8044 if (CUR == '{') {
8045 int min, max;
8046
8047 NEXT
8048 min = xmlExpParseNumber(ctxt);
8049 if (min < 0) {
8050 xmlExpFree(ctxt, ret);
8051 return(NULL);
8052 }
8053 SKIP_BLANKS
8054 if (CUR == ',') {
8055 NEXT
8056 max = xmlExpParseNumber(ctxt);
8057 SKIP_BLANKS
8058 } else
8059 max = min;
8060 if (CUR != '}') {
8061 xmlExpFree(ctxt, ret);
8062 return(NULL);
8063 }
8064 NEXT
8065 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8066 min, max);
8067 SKIP_BLANKS
8068 } else if (CUR == '?') {
8069 NEXT
8070 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8071 0, 1);
8072 SKIP_BLANKS
8073 } else if (CUR == '+') {
8074 NEXT
8075 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8076 1, -1);
8077 SKIP_BLANKS
8078 } else if (CUR == '*') {
8079 NEXT
8080 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8081 0, -1);
8082 SKIP_BLANKS
Daniel Veillardf8e3db02012-09-11 13:26:36 +08008083 }
Daniel Veillard465a0002005-08-22 12:07:04 +00008084 return(ret);
8085}
8086
8087
8088static xmlExpNodePtr
8089xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
8090 xmlExpNodePtr ret, right;
8091
8092 ret = xmlExpParseOr(ctxt);
8093 SKIP_BLANKS
8094 while (CUR == '|') {
8095 NEXT
8096 right = xmlExpParseOr(ctxt);
8097 if (right == NULL) {
8098 xmlExpFree(ctxt, ret);
8099 return(NULL);
8100 }
8101 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8102 if (ret == NULL)
8103 return(NULL);
8104 }
8105 return(ret);
8106}
8107
8108static xmlExpNodePtr
8109xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8110 xmlExpNodePtr ret, right;
8111
8112 ret = xmlExpParseSeq(ctxt);
8113 SKIP_BLANKS
8114 while (CUR == ',') {
8115 NEXT
8116 right = xmlExpParseSeq(ctxt);
8117 if (right == NULL) {
8118 xmlExpFree(ctxt, ret);
8119 return(NULL);
8120 }
8121 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8122 if (ret == NULL)
8123 return(NULL);
8124 }
8125 return(ret);
8126}
8127
8128/**
8129 * xmlExpParse:
8130 * @ctxt: the expressions context
8131 * @expr: the 0 terminated string
8132 *
8133 * Minimal parser for regexps, it understand the following constructs
8134 * - string terminals
8135 * - choice operator |
8136 * - sequence operator ,
8137 * - subexpressions (...)
8138 * - usual cardinality operators + * and ?
8139 * - finite sequences { min, max }
8140 * - infinite sequences { min, * }
8141 * There is minimal checkings made especially no checking on strings values
8142 *
8143 * Returns a new expression or NULL in case of failure
8144 */
8145xmlExpNodePtr
8146xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8147 xmlExpNodePtr ret;
8148
8149 ctxt->expr = expr;
8150 ctxt->cur = expr;
8151
8152 ret = xmlExpParseExpr(ctxt);
8153 SKIP_BLANKS
8154 if (*ctxt->cur != 0) {
8155 xmlExpFree(ctxt, ret);
8156 return(NULL);
8157 }
8158 return(ret);
8159}
8160
8161static void
8162xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8163 xmlExpNodePtr c;
8164
8165 if (expr == NULL) return;
8166 if (glob) xmlBufferWriteChar(buf, "(");
8167 switch (expr->type) {
8168 case XML_EXP_EMPTY:
8169 xmlBufferWriteChar(buf, "empty");
8170 break;
8171 case XML_EXP_FORBID:
8172 xmlBufferWriteChar(buf, "forbidden");
8173 break;
8174 case XML_EXP_ATOM:
8175 xmlBufferWriteCHAR(buf, expr->exp_str);
8176 break;
8177 case XML_EXP_SEQ:
8178 c = expr->exp_left;
8179 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8180 xmlExpDumpInt(buf, c, 1);
8181 else
8182 xmlExpDumpInt(buf, c, 0);
8183 xmlBufferWriteChar(buf, " , ");
8184 c = expr->exp_right;
8185 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8186 xmlExpDumpInt(buf, c, 1);
8187 else
8188 xmlExpDumpInt(buf, c, 0);
8189 break;
8190 case XML_EXP_OR:
8191 c = expr->exp_left;
8192 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8193 xmlExpDumpInt(buf, c, 1);
8194 else
8195 xmlExpDumpInt(buf, c, 0);
8196 xmlBufferWriteChar(buf, " | ");
8197 c = expr->exp_right;
8198 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8199 xmlExpDumpInt(buf, c, 1);
8200 else
8201 xmlExpDumpInt(buf, c, 0);
8202 break;
8203 case XML_EXP_COUNT: {
8204 char rep[40];
Daniel Veillardf8e3db02012-09-11 13:26:36 +08008205
Daniel Veillard465a0002005-08-22 12:07:04 +00008206 c = expr->exp_left;
8207 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8208 xmlExpDumpInt(buf, c, 1);
8209 else
8210 xmlExpDumpInt(buf, c, 0);
8211 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8212 rep[0] = '?';
8213 rep[1] = 0;
8214 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8215 rep[0] = '*';
8216 rep[1] = 0;
8217 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8218 rep[0] = '+';
8219 rep[1] = 0;
8220 } else if (expr->exp_max == expr->exp_min) {
8221 snprintf(rep, 39, "{%d}", expr->exp_min);
8222 } else if (expr->exp_max < 0) {
8223 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8224 } else {
8225 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8226 }
8227 rep[39] = 0;
8228 xmlBufferWriteChar(buf, rep);
8229 break;
8230 }
8231 default:
8232 fprintf(stderr, "Error in tree\n");
8233 }
8234 if (glob)
8235 xmlBufferWriteChar(buf, ")");
8236}
8237/**
8238 * xmlExpDump:
8239 * @buf: a buffer to receive the output
8240 * @expr: the compiled expression
8241 *
8242 * Serialize the expression as compiled to the buffer
8243 */
8244void
Daniel Veillard5eee7672005-08-22 21:22:27 +00008245xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8246 if ((buf == NULL) || (expr == NULL))
Daniel Veillard465a0002005-08-22 12:07:04 +00008247 return;
Daniel Veillard5eee7672005-08-22 21:22:27 +00008248 xmlExpDumpInt(buf, expr, 0);
Daniel Veillard465a0002005-08-22 12:07:04 +00008249}
8250
8251/**
8252 * xmlExpMaxToken:
8253 * @expr: a compiled expression
8254 *
8255 * Indicate the maximum number of input a expression can accept
8256 *
8257 * Returns the maximum length or -1 in case of error
8258 */
8259int
8260xmlExpMaxToken(xmlExpNodePtr expr) {
8261 if (expr == NULL)
8262 return(-1);
8263 return(expr->c_max);
8264}
8265
8266/**
8267 * xmlExpCtxtNbNodes:
8268 * @ctxt: an expression context
8269 *
8270 * Debugging facility provides the number of allocated nodes at a that point
8271 *
8272 * Returns the number of nodes in use or -1 in case of error
8273 */
8274int
8275xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8276 if (ctxt == NULL)
8277 return(-1);
8278 return(ctxt->nb_nodes);
8279}
8280
8281/**
8282 * xmlExpCtxtNbCons:
8283 * @ctxt: an expression context
8284 *
8285 * Debugging facility provides the number of allocated nodes over lifetime
8286 *
8287 * Returns the number of nodes ever allocated or -1 in case of error
8288 */
8289int
8290xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8291 if (ctxt == NULL)
8292 return(-1);
8293 return(ctxt->nb_cons);
8294}
8295
Daniel Veillard81a8ec62005-08-22 00:20:58 +00008296#endif /* LIBXML_EXPR_ENABLED */
Elliott Hughesecdab2a2022-02-23 14:33:50 -08008297
Daniel Veillard4255d502002-04-16 15:50:10 +00008298#endif /* LIBXML_REGEXP_ENABLED */