blob: dbf3bf2c207a9ba19944e2d8c6b07a38dabc3e71 [file] [log] [blame]
/*
 * regexp.c: generic and extensible Regular Expression engine
 *
 * Basically designed with the purpose of compiling regexps for
 * the variety of validation/schemas mechanisms now available in
 * XML related specifications. These include:
 *   - XML-1.0 DTD validation
 *   - XML Schemas structure part 1
 *   - XML Schemas Datatypes part 2 especially Appendix F
 *   - RELAX-NG/TREX i.e. the counter proposal
 *
 * See Copyright for the status of this software.
 *
 * Daniel Veillard <veillard@redhat.com>
 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
Daniel Veillardcee2b3a2005-01-25 00:22:52 +000022/* #define DEBUG_ERR */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +000023
Daniel Veillard4255d502002-04-16 15:50:10 +000024#include <stdio.h>
25#include <string.h>
Daniel Veillardebe48c62003-12-03 12:12:27 +000026#ifdef HAVE_LIMITS_H
27#include <limits.h>
28#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -070029#ifdef HAVE_STDINT_H
30#include <stdint.h>
31#endif
Daniel Veillardebe48c62003-12-03 12:12:27 +000032
Daniel Veillard4255d502002-04-16 15:50:10 +000033#include <libxml/tree.h>
34#include <libxml/parserInternals.h>
35#include <libxml/xmlregexp.h>
36#include <libxml/xmlautomata.h>
37#include <libxml/xmlunicode.h>
38
Daniel Veillardebe48c62003-12-03 12:12:27 +000039#ifndef INT_MAX
40#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
41#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -070042#ifndef SIZE_MAX
43#define SIZE_MAX ((size_t) -1)
44#endif
Daniel Veillardebe48c62003-12-03 12:12:27 +000045
Daniel Veillardc0826a72004-08-10 14:17:33 +000046/* #define DEBUG_REGEXP_GRAPH */
Daniel Veillard10752282005-08-08 13:05:13 +000047/* #define DEBUG_REGEXP_EXEC */
Daniel Veillard4255d502002-04-16 15:50:10 +000048/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000049/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000050
Daniel Veillard567a45b2005-10-18 19:11:55 +000051#define MAX_PUSH 10000000
Daniel Veillard94cc1032005-09-15 13:09:00 +000052
#ifdef ERROR
#undef ERROR
#endif
/*
 * Parser helper macros. All of them except CUR_SCHAR and
 * XML_REG_STRING_SEPARATOR expect a variable named ctxt (the regexp
 * parser context) to be in scope at the point of use.
 *
 * ERROR(str): flag a compilation error on ctxt and report @str.
 *   NOTE: expands to two statements followed by a semicolon, so it must
 *   be enclosed in braces when used as the body of an if/else/loop.
 * NEXT: advance the parsing position by one byte.
 * CUR: the byte at the parsing position.
 * NXT(index): the byte @index bytes after the parsing position.
 */
#define ERROR(str)							\
    ctxt->error = XML_REGEXP_COMPILE_ERROR;				\
    xmlRegexpErrCompile(ctxt, str);
#define NEXT ctxt->cur++
#define CUR (*(ctxt->cur))
#define NXT(index) (ctxt->cur[index])

/*
 * CUR_SCHAR(s, l): current character of string @s, obtained through
 *   xmlStringCurrentChar(); @l must be an lvalue, its address is passed
 *   to that function.
 * NEXTL(l): advance the parsing position by @l bytes.
 *   NOTE: the expansion already ends with a semicolon.
 */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
#define NEXTL(l) ctxt->cur += l;
#define XML_REG_STRING_SEPARATOR '|'
/*
 * Need PREV to check on a '-' within a Character Group. May only be used
 * when it's guaranteed that cur is not at the beginning of ctxt->string!
 */
#define PREV (ctxt->cur[-1])
Daniel Veillard4255d502002-04-16 15:50:10 +000071
/**
 * TODO:
 *
 * macro to flag unimplemented blocks: reports the source file and line
 * of the expansion site through xmlGenericError().
 * NOTE: the expansion already ends with a semicolon.
 */
#define TODO								\
    xmlGenericError(xmlGenericErrorContext,				\
	    "Unimplemented block at %s:%d\n",				\
            __FILE__, __LINE__);
81
Daniel Veillard4255d502002-04-16 15:50:10 +000082/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +080083 * *
84 * Datatypes and structures *
85 * *
Daniel Veillard4255d502002-04-16 15:50:10 +000086 ************************************************************************/
87
/*
 * Kinds of atoms and ranges handled by the engine.
 *
 * Note: the order of the enums below is significant, do not shuffle
 *
 * The first group is numbered from 1; the group starting at
 * XML_REGEXP_LETTER is explicitly renumbered from 100.
 */
typedef enum {
    XML_REGEXP_EPSILON = 1,
    XML_REGEXP_CHARVAL,
    XML_REGEXP_RANGES,
    XML_REGEXP_SUBREG,  /* used for () sub regexps */
    XML_REGEXP_STRING,
    XML_REGEXP_ANYCHAR, /* . */
    XML_REGEXP_ANYSPACE, /* \s */
    XML_REGEXP_NOTSPACE, /* \S */
    XML_REGEXP_INITNAME, /* \l */
    XML_REGEXP_NOTINITNAME, /* \L */
    XML_REGEXP_NAMECHAR, /* \c */
    XML_REGEXP_NOTNAMECHAR, /* \C */
    XML_REGEXP_DECIMAL, /* \d */
    XML_REGEXP_NOTDECIMAL, /* \D */
    XML_REGEXP_REALCHAR, /* \w */
    XML_REGEXP_NOTREALCHAR, /* \W */
    XML_REGEXP_LETTER = 100,
    XML_REGEXP_LETTER_UPPERCASE,
    XML_REGEXP_LETTER_LOWERCASE,
    XML_REGEXP_LETTER_TITLECASE,
    XML_REGEXP_LETTER_MODIFIER,
    XML_REGEXP_LETTER_OTHERS,
    XML_REGEXP_MARK,
    XML_REGEXP_MARK_NONSPACING,
    XML_REGEXP_MARK_SPACECOMBINING,
    XML_REGEXP_MARK_ENCLOSING,
    XML_REGEXP_NUMBER,
    XML_REGEXP_NUMBER_DECIMAL,
    XML_REGEXP_NUMBER_LETTER,
    XML_REGEXP_NUMBER_OTHERS,
    XML_REGEXP_PUNCT,
    XML_REGEXP_PUNCT_CONNECTOR,
    XML_REGEXP_PUNCT_DASH,
    XML_REGEXP_PUNCT_OPEN,
    XML_REGEXP_PUNCT_CLOSE,
    XML_REGEXP_PUNCT_INITQUOTE,
    XML_REGEXP_PUNCT_FINQUOTE,
    XML_REGEXP_PUNCT_OTHERS,
    XML_REGEXP_SEPAR,
    XML_REGEXP_SEPAR_SPACE,
    XML_REGEXP_SEPAR_LINE,
    XML_REGEXP_SEPAR_PARA,
    XML_REGEXP_SYMBOL,
    XML_REGEXP_SYMBOL_MATH,
    XML_REGEXP_SYMBOL_CURRENCY,
    XML_REGEXP_SYMBOL_MODIFIER,
    XML_REGEXP_SYMBOL_OTHERS,
    XML_REGEXP_OTHER,
    XML_REGEXP_OTHER_CONTROL,
    XML_REGEXP_OTHER_FORMAT,
    XML_REGEXP_OTHER_PRIVATE,
    XML_REGEXP_OTHER_NA,
    XML_REGEXP_BLOCK_NAME
} xmlRegAtomType;
146
/*
 * Quantifier attached to an atom (see the quant field of xmlRegAtom).
 * New atoms are created with XML_REGEXP_QUANT_ONCE.
 */
typedef enum {
    XML_REGEXP_QUANT_EPSILON = 1,
    XML_REGEXP_QUANT_ONCE,
    XML_REGEXP_QUANT_OPT,
    XML_REGEXP_QUANT_MULT,
    XML_REGEXP_QUANT_PLUS,
    XML_REGEXP_QUANT_ONCEONLY,
    XML_REGEXP_QUANT_ALL,
    XML_REGEXP_QUANT_RANGE
} xmlRegQuantType;
157
/*
 * Type of an automata state (see the type field of xmlRegState).
 * New states are created as XML_REGEXP_TRANS_STATE.
 */
typedef enum {
    XML_REGEXP_START_STATE = 1,
    XML_REGEXP_FINAL_STATE,
    XML_REGEXP_TRANS_STATE,
    XML_REGEXP_SINK_STATE,
    XML_REGEXP_UNREACH_STATE
} xmlRegStateType;
165
/*
 * Marker values stored in the mark, markd and reached fields of a state.
 * XML_REGEXP_MARK_NORMAL is zero, i.e. the value of a freshly zeroed state.
 */
typedef enum {
    XML_REGEXP_MARK_NORMAL = 0,
    XML_REGEXP_MARK_START,
    XML_REGEXP_MARK_VISITED
} xmlRegMarkedType;
171
typedef struct _xmlRegRange xmlRegRange;
typedef xmlRegRange *xmlRegRangePtr;

/*
 * One entry of the ranges array of an atom.
 */
struct _xmlRegRange {
    int neg;		/* 0 normal, 1 not, 2 exclude */
    xmlRegAtomType type;	/* the type of range */
    int start;		/* the start codepoint */
    int end;		/* the end codepoint */
    xmlChar *blockName;	/* optional name, released by xmlRegFreeRange() */
};
182
typedef struct _xmlRegAtom xmlRegAtom;
typedef xmlRegAtom *xmlRegAtomPtr;

typedef struct _xmlAutomataState xmlRegState;
typedef xmlRegState *xmlRegStatePtr;

/*
 * An atom of the regular expression / automata.
 */
struct _xmlRegAtom {
    int no;			/* used as an index when remapping atoms
				   in xmlRegEpxFromParse() */
    xmlRegAtomType type;
    xmlRegQuantType quant;
    int min;
    int max;

    void *valuep;		/* released by xmlRegFreeAtom() for
				   XML_REGEXP_STRING and
				   XML_REGEXP_BLOCK_NAME atoms */
    void *valuep2;		/* released by xmlRegFreeAtom() for
				   XML_REGEXP_STRING atoms */
    int neg;
    int codepoint;
    xmlRegStatePtr start;
    xmlRegStatePtr start0;
    xmlRegStatePtr stop;
    int maxRanges;
    int nbRanges;		/* number of valid entries in ranges */
    xmlRegRangePtr *ranges;	/* array of ranges, each one released by
				   xmlRegFreeAtom() */
    void *data;			/* copied into the transdata table of the
				   compact form by xmlRegEpxFromParse() */
};
208
typedef struct _xmlRegCounter xmlRegCounter;
typedef xmlRegCounter *xmlRegCounterPtr;

/*
 * A counter definition: a pair of integer bounds.
 */
struct _xmlRegCounter {
    int min;
    int max;
};
216
typedef struct _xmlRegTrans xmlRegTrans;
typedef xmlRegTrans *xmlRegTransPtr;

/*
 * One transition stored in the trans array of a state.
 */
struct _xmlRegTrans {
    xmlRegAtomPtr atom;	/* transitions with a NULL atom are skipped when
			   building the compact form */
    int to;		/* index of the target state; -1 entries are
			   skipped when building the compact form */
    int counter;
    int count;
    int nd;
};
227
/*
 * A state of the automata (also known as xmlRegState).
 */
struct _xmlAutomataState {
    xmlRegStateType type;
    xmlRegMarkedType mark;
    xmlRegMarkedType markd;
    xmlRegMarkedType reached;
    int no;
    int maxTrans;
    int nbTrans;		/* number of valid entries in trans */
    xmlRegTrans *trans;		/* outgoing transitions, released by
				   xmlRegFreeState() */
    /*  knowing states pointing to us can speed things up */
    int maxTransTo;
    int nbTransTo;
    int *transTo;		/* released by xmlRegFreeState() */
};
242
typedef struct _xmlAutomata xmlRegParserCtxt;
typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;

#define AM_AUTOMATA_RNG 1

/*
 * The regexp parser context; the same structure is the automata type.
 */
struct _xmlAutomata {
    xmlChar *string;	/* private copy of the expression being parsed */
    xmlChar *cur;	/* current parsing position inside string */

    int error;		/* error code, 0 when no error was raised */
    int neg;

    xmlRegStatePtr start;
    xmlRegStatePtr end;
    xmlRegStatePtr state;

    xmlRegAtomPtr atom;

    int maxAtoms;
    int nbAtoms;
    xmlRegAtomPtr *atoms;

    int maxStates;
    int nbStates;
    xmlRegStatePtr *states;

    int maxCounters;
    int nbCounters;
    xmlRegCounter *counters;

    int determinist;	/* initialized to -1 by xmlRegNewParserCtxt() */
    int negs;
    int flags;

    int depth;
};
279
/*
 * A compiled regular expression, built from a parser context by
 * xmlRegEpxFromParse().
 */
struct _xmlRegexp {
    xmlChar *string;
    int nbStates;
    xmlRegStatePtr *states;
    int nbAtoms;
    xmlRegAtomPtr *atoms;
    int nbCounters;
    xmlRegCounter *counters;
    int determinist;
    int flags;
    /*
     * That's the compact form for determinists automatas
     */
    int nbstates;	/* number of states kept in the compact form */
    int *compact;	/* transition table, rows of (nbstrings + 1) ints:
			   the state type first, then one cell per string
			   holding the target state + 1, or 0 for none */
    void **transdata;	/* optional nbstates x nbstrings table of atom data,
			   NULL when no atom carried data */
    int nbstrings;	/* number of entries in stringMap */
    xmlChar **stringMap;	/* the distinct atom strings */
};
299
typedef struct _xmlRegExecRollback xmlRegExecRollback;
typedef xmlRegExecRollback *xmlRegExecRollbackPtr;

/*
 * One entry of the stack of rollback states kept by an execution context.
 */
struct _xmlRegExecRollback {
    xmlRegStatePtr state;/* the current state */
    int index;		/* the index in the input stack */
    int nextbranch;	/* the next transition to explore in that state */
    int *counts;	/* save the automata state if it has some */
};
309
typedef struct _xmlRegInputToken xmlRegInputToken;
typedef xmlRegInputToken *xmlRegInputTokenPtr;

/*
 * One entry of the input stack used when operating on strings
 * (see the inputStack field of the execution context).
 */
struct _xmlRegInputToken {
    xmlChar *value;
    void *data;
};
317
/*
 * Execution context used to run a compiled regexp against some input.
 */
struct _xmlRegExecCtxt {
    int status;		/* execution status != 0 indicate an error */
    int determinist;	/* did we find an indeterministic behaviour */
    xmlRegexpPtr comp;	/* the compiled regexp */
    xmlRegExecCallbacks callback;
    void *data;

    xmlRegStatePtr state;/* the current state */
    int transno;	/* the current transition on that state */
    int transcount;	/* the number of chars in char counted transitions */

    /*
     * A stack of rollback states
     */
    int maxRollbacks;
    int nbRollbacks;
    xmlRegExecRollback *rollbacks;

    /*
     * The state of the automata if any
     */
    int *counts;

    /*
     * The input stack
     */
    int inputStackMax;
    int inputStackNr;
    int index;
    int *charStack;
    const xmlChar *inputString; /* when operating on characters */
    xmlRegInputTokenPtr inputStack;/* when operating on strings */

    /*
     * error handling
     */
    int errStateNo;		/* the error state number */
    xmlRegStatePtr errState;    /* the error state */
    xmlChar *errString;		/* the string raising the error */
    int *errCounts;		/* counters at the error state */
    int nbPush;
};
360
Daniel Veillard441bc322002-04-20 17:38:48 +0000361#define REGEXP_ALL_COUNTER 0x123456
362#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000363
Daniel Veillard4255d502002-04-16 15:50:10 +0000364static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000365static void xmlRegFreeState(xmlRegStatePtr state);
366static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard9efc4762005-07-19 14:33:55 +0000367static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
Daniel Veillard567a45b2005-10-18 19:11:55 +0000368static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
369static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
370 int neg, int start, int end, const xmlChar *blockName);
Daniel Veillard4255d502002-04-16 15:50:10 +0000371
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200372void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
373
Daniel Veillard4255d502002-04-16 15:50:10 +0000374/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000375 * *
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800376 * Regexp memory error handler *
Daniel Veillardff46a042003-10-08 08:53:17 +0000377 * *
378 ************************************************************************/
379/**
380 * xmlRegexpErrMemory:
William M. Brackddf71d62004-05-06 04:17:26 +0000381 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000382 *
383 * Handle an out of memory condition
384 */
385static void
386xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
387{
388 const char *regexp = NULL;
389 if (ctxt != NULL) {
390 regexp = (const char *) ctxt->string;
391 ctxt->error = XML_ERR_NO_MEMORY;
392 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000393 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000394 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
395 regexp, NULL, 0, 0,
396 "Memory allocation failed : %s\n", extra);
397}
398
399/**
400 * xmlRegexpErrCompile:
William M. Brackddf71d62004-05-06 04:17:26 +0000401 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000402 *
William M. Brackddf71d62004-05-06 04:17:26 +0000403 * Handle a compilation failure
Daniel Veillardff46a042003-10-08 08:53:17 +0000404 */
405static void
406xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
407{
408 const char *regexp = NULL;
409 int idx = 0;
410
411 if (ctxt != NULL) {
412 regexp = (const char *) ctxt->string;
413 idx = ctxt->cur - ctxt->string;
414 ctxt->error = XML_REGEXP_COMPILE_ERROR;
415 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000416 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000417 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
418 regexp, NULL, idx, 0,
419 "failed to compile: %s\n", extra);
420}
421
422/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800423 * *
424 * Allocation/Deallocation *
425 * *
Daniel Veillard4255d502002-04-16 15:50:10 +0000426 ************************************************************************/
427
Daniel Veillard23e73572002-09-19 19:56:43 +0000428static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700429
/**
 * xmlRegCalloc2:
 * @dim1:  size of first dimension
 * @dim2:  size of second dimension
 * @elemSize:  size of element
 *
 * Allocate a two-dimensional array and set all elements to zero.
 *
 * Returns the new array or NULL in case of error. A zero @dim2 or
 * @elemSize, or a total size that does not fit in a size_t, is treated
 * as an error.
 */
static void*
xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
    size_t totalSize;
    void *ret;

    /* Both values are used as divisors in the overflow check below */
    if ((dim2 == 0) || (elemSize == 0))
        return (NULL);
    /* Check for overflow of dim1 * dim2 * elemSize */
    if (dim1 > SIZE_MAX / dim2 / elemSize)
        return (NULL);
    totalSize = dim1 * dim2 * elemSize;
    ret = xmlMalloc(totalSize);
    if (ret != NULL)
        memset(ret, 0, totalSize);
    return (ret);
}
454
Daniel Veillard4255d502002-04-16 15:50:10 +0000455/**
456 * xmlRegEpxFromParse:
457 * @ctxt: the parser context used to build it
458 *
William M. Brackddf71d62004-05-06 04:17:26 +0000459 * Allocate a new regexp and fill it with the result from the parser
Daniel Veillard4255d502002-04-16 15:50:10 +0000460 *
461 * Returns the new regexp or NULL in case of error
462 */
463static xmlRegexpPtr
464xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
465 xmlRegexpPtr ret;
466
467 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000468 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000469 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000470 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000471 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000472 memset(ret, 0, sizeof(xmlRegexp));
473 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000474 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000475 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000476 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000477 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000478 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000479 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000480 ret->determinist = ctxt->determinist;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +0200481 ret->flags = ctxt->flags;
Daniel Veillard567a45b2005-10-18 19:11:55 +0000482 if (ret->determinist == -1) {
483 xmlRegexpIsDeterminist(ret);
484 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000485
486 if ((ret->determinist != 0) &&
487 (ret->nbCounters == 0) &&
Daniel Veillard6e65e152005-08-09 11:09:52 +0000488 (ctxt->negs == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000489 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000490 (ret->atoms[0] != NULL) &&
491 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
492 int i, j, nbstates = 0, nbatoms = 0;
493 int *stateRemap;
494 int *stringRemap;
495 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000496 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000497 xmlChar **stringMap;
498 xmlChar *value;
499
500 /*
501 * Switch to a compact representation
502 * 1/ counting the effective number of states left
William M. Brackddf71d62004-05-06 04:17:26 +0000503 * 2/ counting the unique number of atoms, and check that
Daniel Veillard23e73572002-09-19 19:56:43 +0000504 * they are all of the string type
505 * 3/ build a table state x atom for the transitions
506 */
507
508 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000509 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000510 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000511 xmlFree(ret);
512 return(NULL);
513 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000514 for (i = 0;i < ret->nbStates;i++) {
515 if (ret->states[i] != NULL) {
516 stateRemap[i] = nbstates;
517 nbstates++;
518 } else {
519 stateRemap[i] = -1;
520 }
521 }
522#ifdef DEBUG_COMPACTION
523 printf("Final: %d states\n", nbstates);
524#endif
525 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000526 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000527 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000528 xmlFree(stateRemap);
529 xmlFree(ret);
530 return(NULL);
531 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000532 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000533 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000534 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000535 xmlFree(stringMap);
536 xmlFree(stateRemap);
537 xmlFree(ret);
538 return(NULL);
539 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000540 for (i = 0;i < ret->nbAtoms;i++) {
541 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
542 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
543 value = ret->atoms[i]->valuep;
544 for (j = 0;j < nbatoms;j++) {
545 if (xmlStrEqual(stringMap[j], value)) {
546 stringRemap[i] = j;
547 break;
548 }
549 }
550 if (j >= nbatoms) {
551 stringRemap[i] = nbatoms;
552 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000553 if (stringMap[nbatoms] == NULL) {
554 for (i = 0;i < nbatoms;i++)
555 xmlFree(stringMap[i]);
556 xmlFree(stringRemap);
557 xmlFree(stringMap);
558 xmlFree(stateRemap);
559 xmlFree(ret);
560 return(NULL);
561 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000562 nbatoms++;
563 }
564 } else {
565 xmlFree(stateRemap);
566 xmlFree(stringRemap);
567 for (i = 0;i < nbatoms;i++)
568 xmlFree(stringMap[i]);
569 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000570 xmlFree(ret);
571 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000572 }
573 }
574#ifdef DEBUG_COMPACTION
575 printf("Final: %d atoms\n", nbatoms);
576#endif
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700577 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
578 sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000579 if (transitions == NULL) {
580 xmlFree(stateRemap);
581 xmlFree(stringRemap);
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700582 for (i = 0;i < nbatoms;i++)
583 xmlFree(stringMap[i]);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000584 xmlFree(stringMap);
585 xmlFree(ret);
586 return(NULL);
587 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000588
589 /*
590 * Allocate the transition table. The first entry for each
William M. Brackddf71d62004-05-06 04:17:26 +0000591 * state corresponds to the state type.
Daniel Veillard23e73572002-09-19 19:56:43 +0000592 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000593 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000594
595 for (i = 0;i < ret->nbStates;i++) {
596 int stateno, atomno, targetno, prev;
597 xmlRegStatePtr state;
598 xmlRegTransPtr trans;
599
600 stateno = stateRemap[i];
601 if (stateno == -1)
602 continue;
603 state = ret->states[i];
604
605 transitions[stateno * (nbatoms + 1)] = state->type;
606
607 for (j = 0;j < state->nbTrans;j++) {
608 trans = &(state->trans[j]);
609 if ((trans->to == -1) || (trans->atom == NULL))
610 continue;
611 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000612 if ((trans->atom->data != NULL) && (transdata == NULL)) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700613 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
614 sizeof(void *));
615 if (transdata == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000616 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000617 break;
618 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000619 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000620 targetno = stateRemap[trans->to];
621 /*
William M. Brackddf71d62004-05-06 04:17:26 +0000622 * if the same atom can generate transitions to 2 different
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700623 * states then it means the automata is not deterministic and
Daniel Veillard23e73572002-09-19 19:56:43 +0000624 * the compact form can't be used !
625 */
626 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
627 if (prev != 0) {
628 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000629 ret->determinist = 0;
630#ifdef DEBUG_COMPACTION
631 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
632 i, j, trans->atom->no, trans->to, atomno, targetno);
633 printf(" previous to is %d\n", prev);
634#endif
Daniel Veillard118aed72002-09-24 14:13:13 +0000635 if (transdata != NULL)
636 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000637 xmlFree(transitions);
638 xmlFree(stateRemap);
639 xmlFree(stringRemap);
640 for (i = 0;i < nbatoms;i++)
641 xmlFree(stringMap[i]);
642 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000643 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000644 }
645 } else {
646#if 0
647 printf("State %d trans %d: atom %d to %d : %d to %d\n",
648 i, j, trans->atom->no, trans->to, atomno, targetno);
649#endif
650 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000651 targetno + 1; /* to avoid 0 */
652 if (transdata != NULL)
653 transdata[stateno * nbatoms + atomno] =
654 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000655 }
656 }
657 }
658 ret->determinist = 1;
659#ifdef DEBUG_COMPACTION
660 /*
661 * Debug
662 */
663 for (i = 0;i < nbstates;i++) {
664 for (j = 0;j < nbatoms + 1;j++) {
665 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
666 }
667 printf("\n");
668 }
669 printf("\n");
670#endif
671 /*
672 * Cleanup of the old data
673 */
674 if (ret->states != NULL) {
675 for (i = 0;i < ret->nbStates;i++)
676 xmlRegFreeState(ret->states[i]);
677 xmlFree(ret->states);
678 }
679 ret->states = NULL;
680 ret->nbStates = 0;
681 if (ret->atoms != NULL) {
682 for (i = 0;i < ret->nbAtoms;i++)
683 xmlRegFreeAtom(ret->atoms[i]);
684 xmlFree(ret->atoms);
685 }
686 ret->atoms = NULL;
687 ret->nbAtoms = 0;
688
689 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000690 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000691 ret->stringMap = stringMap;
692 ret->nbstrings = nbatoms;
693 ret->nbstates = nbstates;
694 xmlFree(stateRemap);
695 xmlFree(stringRemap);
696 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000697not_determ:
698 ctxt->string = NULL;
699 ctxt->nbStates = 0;
700 ctxt->states = NULL;
701 ctxt->nbAtoms = 0;
702 ctxt->atoms = NULL;
703 ctxt->nbCounters = 0;
704 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000705 return(ret);
706}
707
708/**
709 * xmlRegNewParserCtxt:
710 * @string: the string to parse
711 *
712 * Allocate a new regexp parser context
713 *
714 * Returns the new context or NULL in case of error
715 */
716static xmlRegParserCtxtPtr
717xmlRegNewParserCtxt(const xmlChar *string) {
718 xmlRegParserCtxtPtr ret;
719
720 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
721 if (ret == NULL)
722 return(NULL);
723 memset(ret, 0, sizeof(xmlRegParserCtxt));
724 if (string != NULL)
725 ret->string = xmlStrdup(string);
726 ret->cur = ret->string;
727 ret->neg = 0;
Daniel Veillard6e65e152005-08-09 11:09:52 +0000728 ret->negs = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +0000729 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000730 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000731 return(ret);
732}
733
734/**
735 * xmlRegNewRange:
736 * @ctxt: the regexp parser context
737 * @neg: is that negative
738 * @type: the type of range
739 * @start: the start codepoint
740 * @end: the end codepoint
741 *
742 * Allocate a new regexp range
743 *
744 * Returns the new range or NULL in case of error
745 */
746static xmlRegRangePtr
747xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
748 int neg, xmlRegAtomType type, int start, int end) {
749 xmlRegRangePtr ret;
750
751 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
752 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000753 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000754 return(NULL);
755 }
756 ret->neg = neg;
757 ret->type = type;
758 ret->start = start;
759 ret->end = end;
760 return(ret);
761}
762
763/**
764 * xmlRegFreeRange:
765 * @range: the regexp range
766 *
767 * Free a regexp range
768 */
769static void
770xmlRegFreeRange(xmlRegRangePtr range) {
771 if (range == NULL)
772 return;
773
774 if (range->blockName != NULL)
775 xmlFree(range->blockName);
776 xmlFree(range);
777}
778
779/**
Daniel Veillard76d59b62007-08-22 16:29:21 +0000780 * xmlRegCopyRange:
781 * @range: the regexp range
782 *
783 * Copy a regexp range
784 *
785 * Returns the new copy or NULL in case of error.
786 */
787static xmlRegRangePtr
788xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
789 xmlRegRangePtr ret;
790
791 if (range == NULL)
792 return(NULL);
793
794 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
795 range->end);
796 if (ret == NULL)
797 return(NULL);
798 if (range->blockName != NULL) {
799 ret->blockName = xmlStrdup(range->blockName);
800 if (ret->blockName == NULL) {
801 xmlRegexpErrMemory(ctxt, "allocating range");
802 xmlRegFreeRange(ret);
803 return(NULL);
804 }
805 }
806 return(ret);
807}
808
809/**
Daniel Veillard4255d502002-04-16 15:50:10 +0000810 * xmlRegNewAtom:
811 * @ctxt: the regexp parser context
812 * @type: the type of atom
813 *
Daniel Veillard76d59b62007-08-22 16:29:21 +0000814 * Allocate a new atom
Daniel Veillard4255d502002-04-16 15:50:10 +0000815 *
816 * Returns the new atom or NULL in case of error
817 */
818static xmlRegAtomPtr
819xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
820 xmlRegAtomPtr ret;
821
822 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
823 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000824 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000825 return(NULL);
826 }
827 memset(ret, 0, sizeof(xmlRegAtom));
828 ret->type = type;
829 ret->quant = XML_REGEXP_QUANT_ONCE;
830 ret->min = 0;
831 ret->max = 0;
832 return(ret);
833}
834
835/**
836 * xmlRegFreeAtom:
837 * @atom: the regexp atom
838 *
839 * Free a regexp atom
840 */
841static void
842xmlRegFreeAtom(xmlRegAtomPtr atom) {
843 int i;
844
845 if (atom == NULL)
846 return;
847
848 for (i = 0;i < atom->nbRanges;i++)
849 xmlRegFreeRange(atom->ranges[i]);
850 if (atom->ranges != NULL)
851 xmlFree(atom->ranges);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000852 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
853 xmlFree(atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +0000854 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
855 xmlFree(atom->valuep2);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000856 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +0000857 xmlFree(atom->valuep);
858 xmlFree(atom);
859}
860
Daniel Veillard76d59b62007-08-22 16:29:21 +0000861/**
862 * xmlRegCopyAtom:
863 * @ctxt: the regexp parser context
Haibo Huangcfd91dc2020-07-30 23:01:33 -0700864 * @atom: the original atom
Daniel Veillard76d59b62007-08-22 16:29:21 +0000865 *
866 * Allocate a new regexp range
867 *
868 * Returns the new atom or NULL in case of error
869 */
870static xmlRegAtomPtr
871xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
872 xmlRegAtomPtr ret;
873
874 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
875 if (ret == NULL) {
876 xmlRegexpErrMemory(ctxt, "copying atom");
877 return(NULL);
878 }
879 memset(ret, 0, sizeof(xmlRegAtom));
880 ret->type = atom->type;
881 ret->quant = atom->quant;
882 ret->min = atom->min;
883 ret->max = atom->max;
884 if (atom->nbRanges > 0) {
885 int i;
886
887 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
888 atom->nbRanges);
889 if (ret->ranges == NULL) {
890 xmlRegexpErrMemory(ctxt, "copying atom");
891 goto error;
892 }
893 for (i = 0;i < atom->nbRanges;i++) {
894 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
895 if (ret->ranges[i] == NULL)
896 goto error;
897 ret->nbRanges = i + 1;
898 }
899 }
900 return(ret);
901
902error:
903 xmlRegFreeAtom(ret);
904 return(NULL);
905}
906
Daniel Veillard4255d502002-04-16 15:50:10 +0000907static xmlRegStatePtr
908xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
909 xmlRegStatePtr ret;
910
911 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
912 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000913 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000914 return(NULL);
915 }
916 memset(ret, 0, sizeof(xmlRegState));
917 ret->type = XML_REGEXP_TRANS_STATE;
918 ret->mark = XML_REGEXP_MARK_NORMAL;
919 return(ret);
920}
921
922/**
923 * xmlRegFreeState:
924 * @state: the regexp state
925 *
926 * Free a regexp state
927 */
928static void
929xmlRegFreeState(xmlRegStatePtr state) {
930 if (state == NULL)
931 return;
932
933 if (state->trans != NULL)
934 xmlFree(state->trans);
Daniel Veillarddb68b742005-07-30 13:18:24 +0000935 if (state->transTo != NULL)
936 xmlFree(state->transTo);
Daniel Veillard4255d502002-04-16 15:50:10 +0000937 xmlFree(state);
938}
939
940/**
941 * xmlRegFreeParserCtxt:
942 * @ctxt: the regexp parser context
943 *
944 * Free a regexp parser context
945 */
946static void
947xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
948 int i;
949 if (ctxt == NULL)
950 return;
951
952 if (ctxt->string != NULL)
953 xmlFree(ctxt->string);
954 if (ctxt->states != NULL) {
955 for (i = 0;i < ctxt->nbStates;i++)
956 xmlRegFreeState(ctxt->states[i]);
957 xmlFree(ctxt->states);
958 }
959 if (ctxt->atoms != NULL) {
960 for (i = 0;i < ctxt->nbAtoms;i++)
961 xmlRegFreeAtom(ctxt->atoms[i]);
962 xmlFree(ctxt->atoms);
963 }
964 if (ctxt->counters != NULL)
965 xmlFree(ctxt->counters);
966 xmlFree(ctxt);
967}
968
969/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800970 * *
971 * Display of Data structures *
972 * *
Daniel Veillard4255d502002-04-16 15:50:10 +0000973 ************************************************************************/
974
975static void
976xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
977 switch (type) {
978 case XML_REGEXP_EPSILON:
979 fprintf(output, "epsilon "); break;
980 case XML_REGEXP_CHARVAL:
981 fprintf(output, "charval "); break;
982 case XML_REGEXP_RANGES:
983 fprintf(output, "ranges "); break;
984 case XML_REGEXP_SUBREG:
985 fprintf(output, "subexpr "); break;
986 case XML_REGEXP_STRING:
987 fprintf(output, "string "); break;
988 case XML_REGEXP_ANYCHAR:
989 fprintf(output, "anychar "); break;
990 case XML_REGEXP_ANYSPACE:
991 fprintf(output, "anyspace "); break;
992 case XML_REGEXP_NOTSPACE:
993 fprintf(output, "notspace "); break;
994 case XML_REGEXP_INITNAME:
995 fprintf(output, "initname "); break;
996 case XML_REGEXP_NOTINITNAME:
997 fprintf(output, "notinitname "); break;
998 case XML_REGEXP_NAMECHAR:
999 fprintf(output, "namechar "); break;
1000 case XML_REGEXP_NOTNAMECHAR:
1001 fprintf(output, "notnamechar "); break;
1002 case XML_REGEXP_DECIMAL:
1003 fprintf(output, "decimal "); break;
1004 case XML_REGEXP_NOTDECIMAL:
1005 fprintf(output, "notdecimal "); break;
1006 case XML_REGEXP_REALCHAR:
1007 fprintf(output, "realchar "); break;
1008 case XML_REGEXP_NOTREALCHAR:
1009 fprintf(output, "notrealchar "); break;
1010 case XML_REGEXP_LETTER:
1011 fprintf(output, "LETTER "); break;
1012 case XML_REGEXP_LETTER_UPPERCASE:
1013 fprintf(output, "LETTER_UPPERCASE "); break;
1014 case XML_REGEXP_LETTER_LOWERCASE:
1015 fprintf(output, "LETTER_LOWERCASE "); break;
1016 case XML_REGEXP_LETTER_TITLECASE:
1017 fprintf(output, "LETTER_TITLECASE "); break;
1018 case XML_REGEXP_LETTER_MODIFIER:
1019 fprintf(output, "LETTER_MODIFIER "); break;
1020 case XML_REGEXP_LETTER_OTHERS:
1021 fprintf(output, "LETTER_OTHERS "); break;
1022 case XML_REGEXP_MARK:
1023 fprintf(output, "MARK "); break;
1024 case XML_REGEXP_MARK_NONSPACING:
1025 fprintf(output, "MARK_NONSPACING "); break;
1026 case XML_REGEXP_MARK_SPACECOMBINING:
1027 fprintf(output, "MARK_SPACECOMBINING "); break;
1028 case XML_REGEXP_MARK_ENCLOSING:
1029 fprintf(output, "MARK_ENCLOSING "); break;
1030 case XML_REGEXP_NUMBER:
1031 fprintf(output, "NUMBER "); break;
1032 case XML_REGEXP_NUMBER_DECIMAL:
1033 fprintf(output, "NUMBER_DECIMAL "); break;
1034 case XML_REGEXP_NUMBER_LETTER:
1035 fprintf(output, "NUMBER_LETTER "); break;
1036 case XML_REGEXP_NUMBER_OTHERS:
1037 fprintf(output, "NUMBER_OTHERS "); break;
1038 case XML_REGEXP_PUNCT:
1039 fprintf(output, "PUNCT "); break;
1040 case XML_REGEXP_PUNCT_CONNECTOR:
1041 fprintf(output, "PUNCT_CONNECTOR "); break;
1042 case XML_REGEXP_PUNCT_DASH:
1043 fprintf(output, "PUNCT_DASH "); break;
1044 case XML_REGEXP_PUNCT_OPEN:
1045 fprintf(output, "PUNCT_OPEN "); break;
1046 case XML_REGEXP_PUNCT_CLOSE:
1047 fprintf(output, "PUNCT_CLOSE "); break;
1048 case XML_REGEXP_PUNCT_INITQUOTE:
1049 fprintf(output, "PUNCT_INITQUOTE "); break;
1050 case XML_REGEXP_PUNCT_FINQUOTE:
1051 fprintf(output, "PUNCT_FINQUOTE "); break;
1052 case XML_REGEXP_PUNCT_OTHERS:
1053 fprintf(output, "PUNCT_OTHERS "); break;
1054 case XML_REGEXP_SEPAR:
1055 fprintf(output, "SEPAR "); break;
1056 case XML_REGEXP_SEPAR_SPACE:
1057 fprintf(output, "SEPAR_SPACE "); break;
1058 case XML_REGEXP_SEPAR_LINE:
1059 fprintf(output, "SEPAR_LINE "); break;
1060 case XML_REGEXP_SEPAR_PARA:
1061 fprintf(output, "SEPAR_PARA "); break;
1062 case XML_REGEXP_SYMBOL:
1063 fprintf(output, "SYMBOL "); break;
1064 case XML_REGEXP_SYMBOL_MATH:
1065 fprintf(output, "SYMBOL_MATH "); break;
1066 case XML_REGEXP_SYMBOL_CURRENCY:
1067 fprintf(output, "SYMBOL_CURRENCY "); break;
1068 case XML_REGEXP_SYMBOL_MODIFIER:
1069 fprintf(output, "SYMBOL_MODIFIER "); break;
1070 case XML_REGEXP_SYMBOL_OTHERS:
1071 fprintf(output, "SYMBOL_OTHERS "); break;
1072 case XML_REGEXP_OTHER:
1073 fprintf(output, "OTHER "); break;
1074 case XML_REGEXP_OTHER_CONTROL:
1075 fprintf(output, "OTHER_CONTROL "); break;
1076 case XML_REGEXP_OTHER_FORMAT:
1077 fprintf(output, "OTHER_FORMAT "); break;
1078 case XML_REGEXP_OTHER_PRIVATE:
1079 fprintf(output, "OTHER_PRIVATE "); break;
1080 case XML_REGEXP_OTHER_NA:
1081 fprintf(output, "OTHER_NA "); break;
1082 case XML_REGEXP_BLOCK_NAME:
1083 fprintf(output, "BLOCK "); break;
1084 }
1085}
1086
1087static void
1088xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1089 switch (type) {
1090 case XML_REGEXP_QUANT_EPSILON:
1091 fprintf(output, "epsilon "); break;
1092 case XML_REGEXP_QUANT_ONCE:
1093 fprintf(output, "once "); break;
1094 case XML_REGEXP_QUANT_OPT:
1095 fprintf(output, "? "); break;
1096 case XML_REGEXP_QUANT_MULT:
1097 fprintf(output, "* "); break;
1098 case XML_REGEXP_QUANT_PLUS:
1099 fprintf(output, "+ "); break;
1100 case XML_REGEXP_QUANT_RANGE:
1101 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +00001102 case XML_REGEXP_QUANT_ONCEONLY:
1103 fprintf(output, "onceonly "); break;
1104 case XML_REGEXP_QUANT_ALL:
1105 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +00001106 }
1107}
1108static void
1109xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1110 fprintf(output, " range: ");
1111 if (range->neg)
1112 fprintf(output, "negative ");
1113 xmlRegPrintAtomType(output, range->type);
1114 fprintf(output, "%c - %c\n", range->start, range->end);
1115}
1116
1117static void
1118xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1119 fprintf(output, " atom: ");
1120 if (atom == NULL) {
1121 fprintf(output, "NULL\n");
1122 return;
1123 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00001124 if (atom->neg)
1125 fprintf(output, "not ");
Daniel Veillard4255d502002-04-16 15:50:10 +00001126 xmlRegPrintAtomType(output, atom->type);
1127 xmlRegPrintQuantType(output, atom->quant);
1128 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1129 fprintf(output, "%d-%d ", atom->min, atom->max);
1130 if (atom->type == XML_REGEXP_STRING)
1131 fprintf(output, "'%s' ", (char *) atom->valuep);
1132 if (atom->type == XML_REGEXP_CHARVAL)
1133 fprintf(output, "char %c\n", atom->codepoint);
1134 else if (atom->type == XML_REGEXP_RANGES) {
1135 int i;
1136 fprintf(output, "%d entries\n", atom->nbRanges);
1137 for (i = 0; i < atom->nbRanges;i++)
1138 xmlRegPrintRange(output, atom->ranges[i]);
1139 } else if (atom->type == XML_REGEXP_SUBREG) {
1140 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1141 } else {
1142 fprintf(output, "\n");
1143 }
1144}
1145
1146static void
1147xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1148 fprintf(output, " trans: ");
1149 if (trans == NULL) {
1150 fprintf(output, "NULL\n");
1151 return;
1152 }
1153 if (trans->to < 0) {
1154 fprintf(output, "removed\n");
1155 return;
1156 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001157 if (trans->nd != 0) {
1158 if (trans->nd == 2)
1159 fprintf(output, "last not determinist, ");
1160 else
1161 fprintf(output, "not determinist, ");
1162 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001163 if (trans->counter >= 0) {
1164 fprintf(output, "counted %d, ", trans->counter);
1165 }
Daniel Veillard8a001f62002-04-20 07:24:11 +00001166 if (trans->count == REGEXP_ALL_COUNTER) {
1167 fprintf(output, "all transition, ");
1168 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001169 fprintf(output, "count based %d, ", trans->count);
1170 }
1171 if (trans->atom == NULL) {
1172 fprintf(output, "epsilon to %d\n", trans->to);
1173 return;
1174 }
1175 if (trans->atom->type == XML_REGEXP_CHARVAL)
1176 fprintf(output, "char %c ", trans->atom->codepoint);
1177 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1178}
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001179
Daniel Veillard4255d502002-04-16 15:50:10 +00001180static void
1181xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1182 int i;
1183
1184 fprintf(output, " state: ");
1185 if (state == NULL) {
1186 fprintf(output, "NULL\n");
1187 return;
1188 }
1189 if (state->type == XML_REGEXP_START_STATE)
1190 fprintf(output, "START ");
1191 if (state->type == XML_REGEXP_FINAL_STATE)
1192 fprintf(output, "FINAL ");
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001193
Daniel Veillard4255d502002-04-16 15:50:10 +00001194 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1195 for (i = 0;i < state->nbTrans; i++) {
1196 xmlRegPrintTrans(output, &(state->trans[i]));
1197 }
1198}
1199
#ifdef DEBUG_REGEXP_GRAPH
/*
 * xmlRegPrintCtxt:
 * @output:  the stream to write to
 * @ctxt:  the parser context to dump, may be NULL
 *
 * Dump a whole parser context: the regexp string and flags, every
 * registered atom, the atom currently being built, the states with the
 * start/end markers, and the counters. Only compiled when
 * DEBUG_REGEXP_GRAPH is defined.
 */
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
    int idx;

    fputs(" ctxt: ", output);
    if (ctxt == NULL) {
        fputs("NULL\n", output);
        return;
    }

    fprintf(output, "'%s' ", ctxt->string);
    if (ctxt->error)
        fputs("error ", output);
    if (ctxt->neg)
        fputs("neg ", output);
    fputs("\n", output);

    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
    for (idx = 0; idx < ctxt->nbAtoms; idx++) {
        fprintf(output, " %02d ", idx);
        xmlRegPrintAtom(output, ctxt->atoms[idx]);
    }
    if (ctxt->atom != NULL) {
        fputs("current atom:\n", output);
        xmlRegPrintAtom(output, ctxt->atom);
    }

    fprintf(output, "%d states:", ctxt->nbStates);
    if (ctxt->start != NULL)
        fprintf(output, " start: %d", ctxt->start->no);
    if (ctxt->end != NULL)
        fprintf(output, " end: %d", ctxt->end->no);
    fputs("\n", output);
    for (idx = 0; idx < ctxt->nbStates; idx++)
        xmlRegPrintState(output, ctxt->states[idx]);

    fprintf(output, "%d counters:\n", ctxt->nbCounters);
    for (idx = 0; idx < ctxt->nbCounters; idx++) {
        fprintf(output, " %d: min %d max %d\n", idx,
                ctxt->counters[idx].min, ctxt->counters[idx].max);
    }
}
#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001241
1242/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001243 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00001244 * Finite Automata structures manipulations *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001245 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00001246 ************************************************************************/
1247
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001248static void
Daniel Veillard4255d502002-04-16 15:50:10 +00001249xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1250 int neg, xmlRegAtomType type, int start, int end,
1251 xmlChar *blockName) {
1252 xmlRegRangePtr range;
1253
1254 if (atom == NULL) {
1255 ERROR("add range: atom is NULL");
1256 return;
1257 }
1258 if (atom->type != XML_REGEXP_RANGES) {
1259 ERROR("add range: atom is not ranges");
1260 return;
1261 }
1262 if (atom->maxRanges == 0) {
1263 atom->maxRanges = 4;
1264 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1265 sizeof(xmlRegRangePtr));
1266 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001267 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001268 atom->maxRanges = 0;
1269 return;
1270 }
1271 } else if (atom->nbRanges >= atom->maxRanges) {
1272 xmlRegRangePtr *tmp;
1273 atom->maxRanges *= 2;
1274 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1275 sizeof(xmlRegRangePtr));
1276 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001277 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001278 atom->maxRanges /= 2;
1279 return;
1280 }
1281 atom->ranges = tmp;
1282 }
1283 range = xmlRegNewRange(ctxt, neg, type, start, end);
1284 if (range == NULL)
1285 return;
1286 range->blockName = blockName;
1287 atom->ranges[atom->nbRanges++] = range;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001288
Daniel Veillard4255d502002-04-16 15:50:10 +00001289}
1290
1291static int
1292xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1293 if (ctxt->maxCounters == 0) {
1294 ctxt->maxCounters = 4;
1295 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1296 sizeof(xmlRegCounter));
1297 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001298 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001299 ctxt->maxCounters = 0;
1300 return(-1);
1301 }
1302 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1303 xmlRegCounter *tmp;
1304 ctxt->maxCounters *= 2;
1305 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1306 sizeof(xmlRegCounter));
1307 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001308 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001309 ctxt->maxCounters /= 2;
1310 return(-1);
1311 }
1312 ctxt->counters = tmp;
1313 }
1314 ctxt->counters[ctxt->nbCounters].min = -1;
1315 ctxt->counters[ctxt->nbCounters].max = -1;
1316 return(ctxt->nbCounters++);
1317}
1318
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001319static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001320xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1321 if (atom == NULL) {
1322 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001323 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001324 }
1325 if (ctxt->maxAtoms == 0) {
1326 ctxt->maxAtoms = 4;
1327 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1328 sizeof(xmlRegAtomPtr));
1329 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001330 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001331 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001332 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001333 }
1334 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1335 xmlRegAtomPtr *tmp;
1336 ctxt->maxAtoms *= 2;
1337 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1338 sizeof(xmlRegAtomPtr));
1339 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001340 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001341 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001342 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001343 }
1344 ctxt->atoms = tmp;
1345 }
1346 atom->no = ctxt->nbAtoms;
1347 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001348 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001349}
1350
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001351static void
Daniel Veillarddb68b742005-07-30 13:18:24 +00001352xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1353 int from) {
1354 if (target->maxTransTo == 0) {
1355 target->maxTransTo = 8;
1356 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1357 sizeof(int));
1358 if (target->transTo == NULL) {
1359 xmlRegexpErrMemory(ctxt, "adding transition");
1360 target->maxTransTo = 0;
1361 return;
1362 }
1363 } else if (target->nbTransTo >= target->maxTransTo) {
1364 int *tmp;
1365 target->maxTransTo *= 2;
1366 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1367 sizeof(int));
1368 if (tmp == NULL) {
1369 xmlRegexpErrMemory(ctxt, "adding transition");
1370 target->maxTransTo /= 2;
1371 return;
1372 }
1373 target->transTo = tmp;
1374 }
1375 target->transTo[target->nbTransTo] = from;
1376 target->nbTransTo++;
1377}
1378
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001379static void
Daniel Veillard4255d502002-04-16 15:50:10 +00001380xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1381 xmlRegAtomPtr atom, xmlRegStatePtr target,
Daniel Veillard5de09382005-09-26 17:18:17 +00001382 int counter, int count) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001383
1384 int nrtrans;
1385
Daniel Veillard4255d502002-04-16 15:50:10 +00001386 if (state == NULL) {
1387 ERROR("add state: state is NULL");
1388 return;
1389 }
1390 if (target == NULL) {
1391 ERROR("add state: target is NULL");
1392 return;
1393 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001394 /*
1395 * Other routines follow the philosophy 'When in doubt, add a transition'
1396 * so we check here whether such a transition is already present and, if
1397 * so, silently ignore this request.
1398 */
1399
Daniel Veillard5de09382005-09-26 17:18:17 +00001400 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1401 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1402 if ((trans->atom == atom) &&
1403 (trans->to == target->no) &&
1404 (trans->counter == counter) &&
1405 (trans->count == count)) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001406#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard5de09382005-09-26 17:18:17 +00001407 printf("Ignoring duplicate transition from %d to %d\n",
1408 state->no, target->no);
William M. Brackf9b5fa22004-05-10 07:52:15 +00001409#endif
Daniel Veillard5de09382005-09-26 17:18:17 +00001410 return;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001411 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001412 }
1413
Daniel Veillard4255d502002-04-16 15:50:10 +00001414 if (state->maxTrans == 0) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001415 state->maxTrans = 8;
Daniel Veillard4255d502002-04-16 15:50:10 +00001416 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1417 sizeof(xmlRegTrans));
1418 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001419 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001420 state->maxTrans = 0;
1421 return;
1422 }
1423 } else if (state->nbTrans >= state->maxTrans) {
1424 xmlRegTrans *tmp;
1425 state->maxTrans *= 2;
1426 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1427 sizeof(xmlRegTrans));
1428 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001429 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001430 state->maxTrans /= 2;
1431 return;
1432 }
1433 state->trans = tmp;
1434 }
1435#ifdef DEBUG_REGEXP_GRAPH
1436 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001437 if (count == REGEXP_ALL_COUNTER)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001438 printf("all transition\n");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001439 else if (count >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001440 printf("count based %d\n", count);
Daniel Veillard4255d502002-04-16 15:50:10 +00001441 else if (counter >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001442 printf("counted %d\n", counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001443 else if (atom == NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001444 printf("epsilon transition\n");
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001445 else if (atom != NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001446 xmlRegPrintAtom(stdout, atom);
Daniel Veillard4255d502002-04-16 15:50:10 +00001447#endif
1448
1449 state->trans[state->nbTrans].atom = atom;
1450 state->trans[state->nbTrans].to = target->no;
1451 state->trans[state->nbTrans].counter = counter;
1452 state->trans[state->nbTrans].count = count;
Daniel Veillard567a45b2005-10-18 19:11:55 +00001453 state->trans[state->nbTrans].nd = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00001454 state->nbTrans++;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001455 xmlRegStateAddTransTo(ctxt, target, state->no);
Daniel Veillard4255d502002-04-16 15:50:10 +00001456}
1457
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001458static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001459xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001460 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001461 if (ctxt->maxStates == 0) {
1462 ctxt->maxStates = 4;
1463 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1464 sizeof(xmlRegStatePtr));
1465 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001466 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001467 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001468 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001469 }
1470 } else if (ctxt->nbStates >= ctxt->maxStates) {
1471 xmlRegStatePtr *tmp;
1472 ctxt->maxStates *= 2;
1473 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1474 sizeof(xmlRegStatePtr));
1475 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001476 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001477 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001478 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001479 }
1480 ctxt->states = tmp;
1481 }
1482 state->no = ctxt->nbStates;
1483 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001484 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001485}
1486
1487/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001488 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001489 * @ctxt: a regexp parser context
1490 * @from: the from state
1491 * @to: the target state or NULL for building a new one
1492 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001493 *
1494 */
1495static void
1496xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001497 xmlRegStatePtr from, xmlRegStatePtr to,
1498 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001499 if (to == NULL) {
1500 to = xmlRegNewState(ctxt);
1501 xmlRegStatePush(ctxt, to);
1502 ctxt->state = to;
1503 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001504 if (lax)
Daniel Veillard5de09382005-09-26 17:18:17 +00001505 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
Daniel Veillard441bc322002-04-20 17:38:48 +00001506 else
Daniel Veillard5de09382005-09-26 17:18:17 +00001507 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001508}
1509
1510/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001511 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001512 * @ctxt: a regexp parser context
1513 * @from: the from state
1514 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001515 *
1516 */
1517static void
1518xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1519 xmlRegStatePtr from, xmlRegStatePtr to) {
1520 if (to == NULL) {
1521 to = xmlRegNewState(ctxt);
1522 xmlRegStatePush(ctxt, to);
1523 ctxt->state = to;
1524 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001525 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001526}
1527
1528/**
1529 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001530 * @ctxt: a regexp parser context
1531 * @from: the from state
1532 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001533 * counter: the counter for that transition
1534 *
1535 */
1536static void
1537xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1538 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1539 if (to == NULL) {
1540 to = xmlRegNewState(ctxt);
1541 xmlRegStatePush(ctxt, to);
1542 ctxt->state = to;
1543 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001544 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001545}
1546
1547/**
1548 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001549 * @ctxt: a regexp parser context
1550 * @from: the from state
1551 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001552 * counter: the counter for that transition
1553 *
1554 */
1555static void
1556xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1557 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1558 if (to == NULL) {
1559 to = xmlRegNewState(ctxt);
1560 xmlRegStatePush(ctxt, to);
1561 ctxt->state = to;
1562 }
Daniel Veillard5de09382005-09-26 17:18:17 +00001563 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001564}
1565
1566/**
1567 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001568 * @ctxt: a regexp parser context
1569 * @from: the from state
1570 * @to: the target state or NULL for building a new one
1571 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001572 *
William M. Brackddf71d62004-05-06 04:17:26 +00001573 * Returns 0 if success and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001574 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001575static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001576xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1577 xmlRegStatePtr to, xmlRegAtomPtr atom) {
Daniel Veillard10bda622008-03-13 07:27:24 +00001578 xmlRegStatePtr end;
Daniel Veillard34b35002016-05-09 09:28:38 +08001579 int nullable = 0;
Daniel Veillard10bda622008-03-13 07:27:24 +00001580
Daniel Veillard4255d502002-04-16 15:50:10 +00001581 if (atom == NULL) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001582 ERROR("generate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001583 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001584 }
1585 if (atom->type == XML_REGEXP_SUBREG) {
1586 /*
1587 * this is a subexpression handling one should not need to
William M. Brackddf71d62004-05-06 04:17:26 +00001588 * create a new node except for XML_REGEXP_QUANT_RANGE.
Daniel Veillard4255d502002-04-16 15:50:10 +00001589 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001590 if (xmlRegAtomPush(ctxt, atom) < 0) {
1591 return(-1);
1592 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001593 if ((to != NULL) && (atom->stop != to) &&
1594 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1595 /*
1596 * Generate an epsilon transition to link to the target
1597 */
1598 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
Daniel Veillardaa622012005-10-20 15:55:25 +00001599#ifdef DV
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001600 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
Daniel Veillardaa622012005-10-20 15:55:25 +00001601 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1602 to = xmlRegNewState(ctxt);
1603 xmlRegStatePush(ctxt, to);
1604 ctxt->state = to;
1605 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1606#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001607 }
1608 switch (atom->quant) {
1609 case XML_REGEXP_QUANT_OPT:
1610 atom->quant = XML_REGEXP_QUANT_ONCE;
Daniel Veillard54eb0242006-03-21 23:17:57 +00001611 /*
1612 * transition done to the state after end of atom.
1613 * 1. set transition from atom start to new state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001614 * 2. set transition from atom end to this state.
Daniel Veillard54eb0242006-03-21 23:17:57 +00001615 */
Daniel Veillardd80d0722009-08-22 18:56:01 +02001616 if (to == NULL) {
1617 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1618 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1619 ctxt->state);
1620 } else {
1621 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1622 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001623 break;
1624 case XML_REGEXP_QUANT_MULT:
1625 atom->quant = XML_REGEXP_QUANT_ONCE;
1626 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1627 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1628 break;
1629 case XML_REGEXP_QUANT_PLUS:
1630 atom->quant = XML_REGEXP_QUANT_ONCE;
1631 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1632 break;
1633 case XML_REGEXP_QUANT_RANGE: {
1634 int counter;
Daniel Veillard76d59b62007-08-22 16:29:21 +00001635 xmlRegStatePtr inter, newstate;
Daniel Veillard4255d502002-04-16 15:50:10 +00001636
1637 /*
Daniel Veillard76d59b62007-08-22 16:29:21 +00001638 * create the final state now if needed
Daniel Veillard4255d502002-04-16 15:50:10 +00001639 */
Daniel Veillard4255d502002-04-16 15:50:10 +00001640 if (to != NULL) {
1641 newstate = to;
1642 } else {
1643 newstate = xmlRegNewState(ctxt);
1644 xmlRegStatePush(ctxt, newstate);
Daniel Veillard4255d502002-04-16 15:50:10 +00001645 }
Daniel Veillard76d59b62007-08-22 16:29:21 +00001646
1647 /*
1648 * The principle here is to use counted transition
1649 * to avoid explosion in the number of states in the
1650 * graph. This is clearly more complex but should not
1651 * be exploitable at runtime.
Daniel Veillard54eb0242006-03-21 23:17:57 +00001652 */
Daniel Veillard76d59b62007-08-22 16:29:21 +00001653 if ((atom->min == 0) && (atom->start0 == NULL)) {
1654 xmlRegAtomPtr copy;
1655 /*
1656 * duplicate a transition based on atom to count next
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001657 * occurrences after 1. We cannot loop to atom->start
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001658 * directly because we need an epsilon transition to
Daniel Veillard76d59b62007-08-22 16:29:21 +00001659 * newstate.
1660 */
1661 /* ???? For some reason it seems we never reach that
1662 case, I suppose this got optimized out before when
1663 building the automata */
Daniel Veillardc821e032007-08-28 17:33:45 +00001664 copy = xmlRegCopyAtom(ctxt, atom);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001665 if (copy == NULL)
1666 return(-1);
Daniel Veillard76d59b62007-08-22 16:29:21 +00001667 copy->quant = XML_REGEXP_QUANT_ONCE;
1668 copy->min = 0;
1669 copy->max = 0;
1670
1671 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1672 < 0)
1673 return(-1);
1674 inter = ctxt->state;
1675 counter = xmlRegGetCounter(ctxt);
1676 ctxt->counters[counter].min = atom->min - 1;
1677 ctxt->counters[counter].max = atom->max - 1;
1678 /* count the number of times we see it again */
1679 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1680 atom->stop, counter);
1681 /* allow a way out based on the count */
1682 xmlFAGenerateCountedTransition(ctxt, inter,
1683 newstate, counter);
1684 /* and also allow a direct exit for 0 */
1685 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1686 newstate);
1687 } else {
1688 /*
1689 * either we need the atom at least once or there
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001690 * is an atom->start0 allowing to easily plug the
Daniel Veillard76d59b62007-08-22 16:29:21 +00001691 * epsilon transition.
1692 */
1693 counter = xmlRegGetCounter(ctxt);
1694 ctxt->counters[counter].min = atom->min - 1;
1695 ctxt->counters[counter].max = atom->max - 1;
1696 /* count the number of times we see it again */
1697 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1698 atom->start, counter);
1699 /* allow a way out based on the count */
1700 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1701 newstate, counter);
1702 /* and if needed allow a direct exit for 0 */
1703 if (atom->min == 0)
1704 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1705 newstate);
1706
1707 }
1708 atom->min = 0;
1709 atom->max = 0;
1710 atom->quant = XML_REGEXP_QUANT_ONCE;
1711 ctxt->state = newstate;
Daniel Veillard4255d502002-04-16 15:50:10 +00001712 }
1713 default:
1714 break;
1715 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001716 return(0);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001717 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001718 if ((atom->min == 0) && (atom->max == 0) &&
Daniel Veillard99c394d2005-07-14 12:58:49 +00001719 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1720 /*
1721 * we can discard the atom and generate an epsilon transition instead
1722 */
1723 if (to == NULL) {
1724 to = xmlRegNewState(ctxt);
1725 if (to != NULL)
1726 xmlRegStatePush(ctxt, to);
1727 else {
1728 return(-1);
1729 }
1730 }
1731 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1732 ctxt->state = to;
1733 xmlRegFreeAtom(atom);
1734 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00001735 }
1736 if (to == NULL) {
1737 to = xmlRegNewState(ctxt);
1738 if (to != NULL)
1739 xmlRegStatePush(ctxt, to);
1740 else {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001741 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001742 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001743 }
Daniel Veillard10bda622008-03-13 07:27:24 +00001744 end = to;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001745 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
Daniel Veillard10bda622008-03-13 07:27:24 +00001746 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1747 /*
1748 * Do not pollute the target state by adding transitions from
1749 * it as it is likely to be the shared target of multiple branches.
1750 * So isolate with an epsilon transition.
1751 */
1752 xmlRegStatePtr tmp;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001753
Daniel Veillard10bda622008-03-13 07:27:24 +00001754 tmp = xmlRegNewState(ctxt);
1755 if (tmp != NULL)
1756 xmlRegStatePush(ctxt, tmp);
1757 else {
1758 return(-1);
1759 }
1760 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1761 to = tmp;
Daniel Veillard4255d502002-04-16 15:50:10 +00001762 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001763 if (xmlRegAtomPush(ctxt, atom) < 0) {
1764 return(-1);
1765 }
Daniel Veillard34b35002016-05-09 09:28:38 +08001766 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1767 (atom->min == 0) && (atom->max > 0)) {
1768 nullable = 1;
1769 atom->min = 1;
1770 if (atom->max == 1)
1771 atom->quant = XML_REGEXP_QUANT_OPT;
1772 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00001773 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
Daniel Veillard10bda622008-03-13 07:27:24 +00001774 ctxt->state = end;
Daniel Veillard4255d502002-04-16 15:50:10 +00001775 switch (atom->quant) {
1776 case XML_REGEXP_QUANT_OPT:
1777 atom->quant = XML_REGEXP_QUANT_ONCE;
1778 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1779 break;
1780 case XML_REGEXP_QUANT_MULT:
1781 atom->quant = XML_REGEXP_QUANT_ONCE;
1782 xmlFAGenerateEpsilonTransition(ctxt, from, to);
Daniel Veillard5de09382005-09-26 17:18:17 +00001783 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001784 break;
1785 case XML_REGEXP_QUANT_PLUS:
1786 atom->quant = XML_REGEXP_QUANT_ONCE;
Daniel Veillard5de09382005-09-26 17:18:17 +00001787 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001788 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001789 case XML_REGEXP_QUANT_RANGE:
Daniel Veillard34b35002016-05-09 09:28:38 +08001790 if (nullable)
William M. Brack56578372007-04-11 14:33:46 +00001791 xmlFAGenerateEpsilonTransition(ctxt, from, to);
William M. Brack56578372007-04-11 14:33:46 +00001792 break;
Daniel Veillard4255d502002-04-16 15:50:10 +00001793 default:
1794 break;
1795 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001796 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001797}
1798
1799/**
1800 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001801 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001802 * @fromnr: the from state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001803 * @tonr: the to state
William M. Brackddf71d62004-05-06 04:17:26 +00001804 * @counter: should that transition be associated to a counted
Daniel Veillard4255d502002-04-16 15:50:10 +00001805 *
1806 */
1807static void
1808xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1809 int tonr, int counter) {
1810 int transnr;
1811 xmlRegStatePtr from;
1812 xmlRegStatePtr to;
1813
1814#ifdef DEBUG_REGEXP_GRAPH
1815 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1816#endif
1817 from = ctxt->states[fromnr];
1818 if (from == NULL)
1819 return;
1820 to = ctxt->states[tonr];
1821 if (to == NULL)
1822 return;
1823 if ((to->mark == XML_REGEXP_MARK_START) ||
1824 (to->mark == XML_REGEXP_MARK_VISITED))
1825 return;
1826
1827 to->mark = XML_REGEXP_MARK_VISITED;
1828 if (to->type == XML_REGEXP_FINAL_STATE) {
1829#ifdef DEBUG_REGEXP_GRAPH
1830 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1831#endif
1832 from->type = XML_REGEXP_FINAL_STATE;
1833 }
1834 for (transnr = 0;transnr < to->nbTrans;transnr++) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001835 if (to->trans[transnr].to < 0)
1836 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00001837 if (to->trans[transnr].atom == NULL) {
1838 /*
1839 * Don't remove counted transitions
1840 * Don't loop either
1841 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001842 if (to->trans[transnr].to != fromnr) {
1843 if (to->trans[transnr].count >= 0) {
1844 int newto = to->trans[transnr].to;
1845
1846 xmlRegStateAddTrans(ctxt, from, NULL,
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001847 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001848 -1, to->trans[transnr].count);
Daniel Veillardb509f152002-04-17 16:28:10 +00001849 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001850#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001851 printf("Found epsilon trans %d from %d to %d\n",
1852 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001853#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001854 if (to->trans[transnr].counter >= 0) {
1855 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1856 to->trans[transnr].to,
1857 to->trans[transnr].counter);
1858 } else {
1859 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1860 to->trans[transnr].to,
1861 counter);
1862 }
1863 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001864 }
1865 } else {
1866 int newto = to->trans[transnr].to;
1867
Daniel Veillardb509f152002-04-17 16:28:10 +00001868 if (to->trans[transnr].counter >= 0) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001869 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1870 ctxt->states[newto],
Daniel Veillard5de09382005-09-26 17:18:17 +00001871 to->trans[transnr].counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001872 } else {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001873 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
Daniel Veillard5de09382005-09-26 17:18:17 +00001874 ctxt->states[newto], counter, -1);
Daniel Veillardb509f152002-04-17 16:28:10 +00001875 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001876 }
1877 }
1878 to->mark = XML_REGEXP_MARK_NORMAL;
1879}
1880
1881/**
Daniel Veillarddb68b742005-07-30 13:18:24 +00001882 * xmlFAEliminateSimpleEpsilonTransitions:
1883 * @ctxt: a regexp parser context
1884 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001885 * Eliminating general epsilon transitions can get costly in the general
Daniel Veillarddb68b742005-07-30 13:18:24 +00001886 * algorithm due to the large amount of generated new transitions and
1887 * associated comparisons. However for simple epsilon transition used just
1888 * to separate building blocks when generating the automata this can be
1889 * reduced to state elimination:
1890 * - if there exists an epsilon from X to Y
1891 * - if there is no other transition from X
1892 * then X and Y are semantically equivalent and X can be eliminated
1893 * If X is the start state then make Y the start state, else replace the
1894 * target of all transitions to X by transitions to Y.
1895 */
1896static void
1897xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1898 int statenr, i, j, newto;
1899 xmlRegStatePtr state, tmp;
1900
1901 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1902 state = ctxt->states[statenr];
1903 if (state == NULL)
1904 continue;
1905 if (state->nbTrans != 1)
1906 continue;
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001907 if (state->type == XML_REGEXP_UNREACH_STATE)
1908 continue;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001909 /* is the only transition out a basic transition */
1910 if ((state->trans[0].atom == NULL) &&
1911 (state->trans[0].to >= 0) &&
1912 (state->trans[0].to != statenr) &&
1913 (state->trans[0].counter < 0) &&
1914 (state->trans[0].count < 0)) {
1915 newto = state->trans[0].to;
1916
1917 if (state->type == XML_REGEXP_START_STATE) {
1918#ifdef DEBUG_REGEXP_GRAPH
1919 printf("Found simple epsilon trans from start %d to %d\n",
1920 statenr, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001921#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00001922 } else {
1923#ifdef DEBUG_REGEXP_GRAPH
1924 printf("Found simple epsilon trans from %d to %d\n",
1925 statenr, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001926#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00001927 for (i = 0;i < state->nbTransTo;i++) {
1928 tmp = ctxt->states[state->transTo[i]];
1929 for (j = 0;j < tmp->nbTrans;j++) {
1930 if (tmp->trans[j].to == statenr) {
Daniel Veillarddb68b742005-07-30 13:18:24 +00001931#ifdef DEBUG_REGEXP_GRAPH
1932 printf("Changed transition %d on %d to go to %d\n",
1933 j, tmp->no, newto);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001934#endif
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001935 tmp->trans[j].to = -1;
1936 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001937 ctxt->states[newto],
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001938 tmp->trans[j].counter,
1939 tmp->trans[j].count);
Daniel Veillarddb68b742005-07-30 13:18:24 +00001940 }
1941 }
1942 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00001943 if (state->type == XML_REGEXP_FINAL_STATE)
1944 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1945 /* eliminate the transition completely */
1946 state->nbTrans = 0;
1947
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001948 state->type = XML_REGEXP_UNREACH_STATE;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001949
1950 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08001951
Daniel Veillarddb68b742005-07-30 13:18:24 +00001952 }
1953 }
1954}
1955/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001956 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001957 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001958 *
1959 */
1960static void
1961xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1962 int statenr, transnr;
1963 xmlRegStatePtr state;
Daniel Veillarddb68b742005-07-30 13:18:24 +00001964 int has_epsilon;
Daniel Veillard4255d502002-04-16 15:50:10 +00001965
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001966 if (ctxt->states == NULL) return;
1967
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001968 /*
1969 * Eliminate simple epsilon transition and the associated unreachable
1970 * states.
1971 */
Daniel Veillarddb68b742005-07-30 13:18:24 +00001972 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00001973 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1974 state = ctxt->states[statenr];
1975 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1976#ifdef DEBUG_REGEXP_GRAPH
1977 printf("Removed unreachable state %d\n", statenr);
1978#endif
1979 xmlRegFreeState(state);
1980 ctxt->states[statenr] = NULL;
1981 }
1982 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00001983
1984 has_epsilon = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001985
Daniel Veillard4255d502002-04-16 15:50:10 +00001986 /*
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00001987 * Build the completed transitions bypassing the epsilons
Daniel Veillard4255d502002-04-16 15:50:10 +00001988 * Use a marking algorithm to avoid loops
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00001989 * Mark sink states too.
Haibo Huangcfd91dc2020-07-30 23:01:33 -07001990 * Process from the latest states backward to the start when
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00001991 * there is long cascading epsilon chains this minimize the
1992 * recursions and transition compares when adding the new ones
Daniel Veillard4255d502002-04-16 15:50:10 +00001993 */
Daniel Veillardfcd18ff2006-11-02 10:28:04 +00001994 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001995 state = ctxt->states[statenr];
1996 if (state == NULL)
1997 continue;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001998 if ((state->nbTrans == 0) &&
1999 (state->type != XML_REGEXP_FINAL_STATE)) {
2000 state->type = XML_REGEXP_SINK_STATE;
2001 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002002 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2003 if ((state->trans[transnr].atom == NULL) &&
2004 (state->trans[transnr].to >= 0)) {
2005 if (state->trans[transnr].to == statenr) {
2006 state->trans[transnr].to = -1;
2007#ifdef DEBUG_REGEXP_GRAPH
2008 printf("Removed loopback epsilon trans %d on %d\n",
2009 transnr, statenr);
2010#endif
2011 } else if (state->trans[transnr].count < 0) {
2012 int newto = state->trans[transnr].to;
2013
2014#ifdef DEBUG_REGEXP_GRAPH
2015 printf("Found epsilon trans %d from %d to %d\n",
2016 transnr, statenr, newto);
2017#endif
Daniel Veillarddb68b742005-07-30 13:18:24 +00002018 has_epsilon = 1;
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00002019 state->trans[transnr].to = -2;
2020 state->mark = XML_REGEXP_MARK_START;
Daniel Veillard4255d502002-04-16 15:50:10 +00002021 xmlFAReduceEpsilonTransitions(ctxt, statenr,
2022 newto, state->trans[transnr].counter);
2023 state->mark = XML_REGEXP_MARK_NORMAL;
2024#ifdef DEBUG_REGEXP_GRAPH
2025 } else {
2026 printf("Found counted transition %d on %d\n",
2027 transnr, statenr);
2028#endif
2029 }
2030 }
2031 }
2032 }
2033 /*
2034 * Eliminate the epsilon transitions
2035 */
Daniel Veillarddb68b742005-07-30 13:18:24 +00002036 if (has_epsilon) {
2037 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2038 state = ctxt->states[statenr];
2039 if (state == NULL)
2040 continue;
2041 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2042 xmlRegTransPtr trans = &(state->trans[transnr]);
2043 if ((trans->atom == NULL) &&
2044 (trans->count < 0) &&
2045 (trans->to >= 0)) {
2046 trans->to = -1;
2047 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002048 }
2049 }
2050 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002051
2052 /*
2053 * Use this pass to detect unreachable states too
2054 */
2055 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2056 state = ctxt->states[statenr];
2057 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00002058 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00002059 }
2060 state = ctxt->states[0];
2061 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00002062 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00002063 while (state != NULL) {
2064 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00002065 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00002066 /*
William M. Brackddf71d62004-05-06 04:17:26 +00002067 * Mark all states reachable from the current reachable state
Daniel Veillard23e73572002-09-19 19:56:43 +00002068 */
2069 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2070 if ((state->trans[transnr].to >= 0) &&
2071 ((state->trans[transnr].atom != NULL) ||
2072 (state->trans[transnr].count >= 0))) {
2073 int newto = state->trans[transnr].to;
2074
2075 if (ctxt->states[newto] == NULL)
2076 continue;
William M. Brack779af002003-08-01 15:55:39 +00002077 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2078 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00002079 target = ctxt->states[newto];
2080 }
2081 }
2082 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00002083
Daniel Veillard23e73572002-09-19 19:56:43 +00002084 /*
2085 * find the next accessible state not explored
2086 */
2087 if (target == NULL) {
2088 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2089 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00002090 if ((state != NULL) && (state->reached ==
2091 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00002092 target = state;
2093 break;
2094 }
2095 }
2096 }
2097 state = target;
2098 }
2099 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2100 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00002101 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00002102#ifdef DEBUG_REGEXP_GRAPH
2103 printf("Removed unreachable state %d\n", statenr);
2104#endif
2105 xmlRegFreeState(state);
2106 ctxt->states[statenr] = NULL;
2107 }
2108 }
2109
Daniel Veillard4255d502002-04-16 15:50:10 +00002110}
2111
Daniel Veillard567a45b2005-10-18 19:11:55 +00002112static int
2113xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2114 int ret = 0;
2115
2116 if ((range1->type == XML_REGEXP_RANGES) ||
2117 (range2->type == XML_REGEXP_RANGES) ||
2118 (range2->type == XML_REGEXP_SUBREG) ||
2119 (range1->type == XML_REGEXP_SUBREG) ||
2120 (range1->type == XML_REGEXP_STRING) ||
2121 (range2->type == XML_REGEXP_STRING))
2122 return(-1);
2123
2124 /* put them in order */
2125 if (range1->type > range2->type) {
2126 xmlRegRangePtr tmp;
2127
2128 tmp = range1;
2129 range1 = range2;
2130 range2 = tmp;
2131 }
2132 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2133 (range2->type == XML_REGEXP_ANYCHAR)) {
2134 ret = 1;
2135 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2136 (range2->type == XML_REGEXP_EPSILON)) {
2137 return(0);
2138 } else if (range1->type == range2->type) {
Daniel Veillard9332b482009-09-23 18:28:43 +02002139 if (range1->type != XML_REGEXP_CHARVAL)
2140 ret = 1;
2141 else if ((range1->end < range2->start) ||
2142 (range2->end < range1->start))
Daniel Veillard567a45b2005-10-18 19:11:55 +00002143 ret = 0;
Daniel Veillard9332b482009-09-23 18:28:43 +02002144 else
2145 ret = 1;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002146 } else if (range1->type == XML_REGEXP_CHARVAL) {
2147 int codepoint;
2148 int neg = 0;
2149
2150 /*
2151 * just check all codepoints in the range for acceptance,
2152 * this is usually way cheaper since done only once at
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002153 * compilation than testing over and over at runtime or
Daniel Veillard567a45b2005-10-18 19:11:55 +00002154 * pushing too many states when evaluating.
2155 */
2156 if (((range1->neg == 0) && (range2->neg != 0)) ||
2157 ((range1->neg != 0) && (range2->neg == 0)))
2158 neg = 1;
2159
2160 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2161 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2162 0, range2->start, range2->end,
2163 range2->blockName);
2164 if (ret < 0)
2165 return(-1);
2166 if (((neg == 1) && (ret == 0)) ||
2167 ((neg == 0) && (ret == 1)))
2168 return(1);
2169 }
2170 return(0);
2171 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2172 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2173 if (range1->type == range2->type) {
2174 ret = xmlStrEqual(range1->blockName, range2->blockName);
2175 } else {
2176 /*
2177 * comparing a block range with anything else is way
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002178 * too costly, and maintaining the table is like too much
Daniel Veillard567a45b2005-10-18 19:11:55 +00002179 * memory too, so let's force the automata to save state
2180 * here.
2181 */
2182 return(1);
2183 }
2184 } else if ((range1->type < XML_REGEXP_LETTER) ||
2185 (range2->type < XML_REGEXP_LETTER)) {
2186 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2187 (range2->type == XML_REGEXP_NOTSPACE))
2188 ret = 0;
2189 else if ((range1->type == XML_REGEXP_INITNAME) &&
2190 (range2->type == XML_REGEXP_NOTINITNAME))
2191 ret = 0;
2192 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2193 (range2->type == XML_REGEXP_NOTNAMECHAR))
2194 ret = 0;
2195 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2196 (range2->type == XML_REGEXP_NOTDECIMAL))
2197 ret = 0;
2198 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2199 (range2->type == XML_REGEXP_NOTREALCHAR))
2200 ret = 0;
2201 else {
2202 /* same thing to limit complexity */
2203 return(1);
2204 }
2205 } else {
2206 ret = 0;
2207 /* range1->type < range2->type here */
2208 switch (range1->type) {
2209 case XML_REGEXP_LETTER:
2210 /* all disjoint except in the subgroups */
2211 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2212 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2213 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2214 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2215 (range2->type == XML_REGEXP_LETTER_OTHERS))
2216 ret = 1;
2217 break;
2218 case XML_REGEXP_MARK:
2219 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2220 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2221 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2222 ret = 1;
2223 break;
2224 case XML_REGEXP_NUMBER:
2225 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2226 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2227 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2228 ret = 1;
2229 break;
2230 case XML_REGEXP_PUNCT:
2231 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2232 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2233 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2234 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2235 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2236 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2237 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2238 ret = 1;
2239 break;
2240 case XML_REGEXP_SEPAR:
2241 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2242 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2243 (range2->type == XML_REGEXP_SEPAR_PARA))
2244 ret = 1;
2245 break;
2246 case XML_REGEXP_SYMBOL:
2247 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2248 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2249 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2250 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2251 ret = 1;
2252 break;
2253 case XML_REGEXP_OTHER:
2254 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2255 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2256 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2257 ret = 1;
2258 break;
2259 default:
2260 if ((range2->type >= XML_REGEXP_LETTER) &&
2261 (range2->type < XML_REGEXP_BLOCK_NAME))
2262 ret = 0;
2263 else {
2264 /* safety net ! */
2265 return(1);
2266 }
2267 }
2268 }
2269 if (((range1->neg == 0) && (range2->neg != 0)) ||
2270 ((range1->neg != 0) && (range2->neg == 0)))
2271 ret = !ret;
Daniel Veillard594e5df2009-09-07 14:58:47 +02002272 return(ret);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002273}
2274
Daniel Veillarde19fc232002-04-22 16:01:24 +00002275/**
Daniel Veillardfc011b72006-02-12 19:14:15 +00002276 * xmlFACompareAtomTypes:
2277 * @type1: an atom type
2278 * @type2: an atom type
2279 *
2280 * Compares two atoms type to check whether they intersect in some ways,
2281 * this is used by xmlFACompareAtoms only
2282 *
2283 * Returns 1 if they may intersect and 0 otherwise
2284 */
2285static int
2286xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2287 if ((type1 == XML_REGEXP_EPSILON) ||
2288 (type1 == XML_REGEXP_CHARVAL) ||
2289 (type1 == XML_REGEXP_RANGES) ||
2290 (type1 == XML_REGEXP_SUBREG) ||
2291 (type1 == XML_REGEXP_STRING) ||
2292 (type1 == XML_REGEXP_ANYCHAR))
2293 return(1);
2294 if ((type2 == XML_REGEXP_EPSILON) ||
2295 (type2 == XML_REGEXP_CHARVAL) ||
2296 (type2 == XML_REGEXP_RANGES) ||
2297 (type2 == XML_REGEXP_SUBREG) ||
2298 (type2 == XML_REGEXP_STRING) ||
2299 (type2 == XML_REGEXP_ANYCHAR))
2300 return(1);
2301
2302 if (type1 == type2) return(1);
2303
2304 /* simplify subsequent compares by making sure type1 < type2 */
2305 if (type1 > type2) {
2306 xmlRegAtomType tmp = type1;
2307 type1 = type2;
2308 type2 = tmp;
2309 }
2310 switch (type1) {
2311 case XML_REGEXP_ANYSPACE: /* \s */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002312 /* can't be a letter, number, mark, punctuation, symbol */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002313 if ((type2 == XML_REGEXP_NOTSPACE) ||
2314 ((type2 >= XML_REGEXP_LETTER) &&
2315 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2316 ((type2 >= XML_REGEXP_NUMBER) &&
2317 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2318 ((type2 >= XML_REGEXP_MARK) &&
2319 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2320 ((type2 >= XML_REGEXP_PUNCT) &&
2321 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2322 ((type2 >= XML_REGEXP_SYMBOL) &&
2323 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2324 ) return(0);
2325 break;
2326 case XML_REGEXP_NOTSPACE: /* \S */
2327 break;
2328 case XML_REGEXP_INITNAME: /* \l */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002329 /* can't be a number, mark, separator, punctuation, symbol or other */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002330 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2331 ((type2 >= XML_REGEXP_NUMBER) &&
2332 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2333 ((type2 >= XML_REGEXP_MARK) &&
2334 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2335 ((type2 >= XML_REGEXP_SEPAR) &&
2336 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2337 ((type2 >= XML_REGEXP_PUNCT) &&
2338 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2339 ((type2 >= XML_REGEXP_SYMBOL) &&
2340 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2341 ((type2 >= XML_REGEXP_OTHER) &&
2342 (type2 <= XML_REGEXP_OTHER_NA))
2343 ) return(0);
2344 break;
2345 case XML_REGEXP_NOTINITNAME: /* \L */
2346 break;
2347 case XML_REGEXP_NAMECHAR: /* \c */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002348 /* can't be a mark, separator, punctuation, symbol or other */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002349 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2350 ((type2 >= XML_REGEXP_MARK) &&
2351 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2352 ((type2 >= XML_REGEXP_PUNCT) &&
2353 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2354 ((type2 >= XML_REGEXP_SEPAR) &&
2355 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2356 ((type2 >= XML_REGEXP_SYMBOL) &&
2357 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2358 ((type2 >= XML_REGEXP_OTHER) &&
2359 (type2 <= XML_REGEXP_OTHER_NA))
2360 ) return(0);
2361 break;
2362 case XML_REGEXP_NOTNAMECHAR: /* \C */
2363 break;
2364 case XML_REGEXP_DECIMAL: /* \d */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002365 /* can't be a letter, mark, separator, punctuation, symbol or other */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002366 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2367 (type2 == XML_REGEXP_REALCHAR) ||
2368 ((type2 >= XML_REGEXP_LETTER) &&
2369 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2370 ((type2 >= XML_REGEXP_MARK) &&
2371 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2372 ((type2 >= XML_REGEXP_PUNCT) &&
2373 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2374 ((type2 >= XML_REGEXP_SEPAR) &&
2375 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2376 ((type2 >= XML_REGEXP_SYMBOL) &&
2377 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2378 ((type2 >= XML_REGEXP_OTHER) &&
2379 (type2 <= XML_REGEXP_OTHER_NA))
2380 )return(0);
2381 break;
2382 case XML_REGEXP_NOTDECIMAL: /* \D */
2383 break;
2384 case XML_REGEXP_REALCHAR: /* \w */
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002385 /* can't be a mark, separator, punctuation, symbol or other */
Daniel Veillardfc011b72006-02-12 19:14:15 +00002386 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2387 ((type2 >= XML_REGEXP_MARK) &&
2388 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2389 ((type2 >= XML_REGEXP_PUNCT) &&
2390 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2391 ((type2 >= XML_REGEXP_SEPAR) &&
2392 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2393 ((type2 >= XML_REGEXP_SYMBOL) &&
2394 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2395 ((type2 >= XML_REGEXP_OTHER) &&
2396 (type2 <= XML_REGEXP_OTHER_NA))
2397 )return(0);
2398 break;
2399 case XML_REGEXP_NOTREALCHAR: /* \W */
2400 break;
2401 /*
2402 * at that point we know both type 1 and type2 are from
2403 * character categories are ordered and are different,
2404 * it becomes simple because this is a partition
2405 */
2406 case XML_REGEXP_LETTER:
2407 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2408 return(1);
2409 return(0);
2410 case XML_REGEXP_LETTER_UPPERCASE:
2411 case XML_REGEXP_LETTER_LOWERCASE:
2412 case XML_REGEXP_LETTER_TITLECASE:
2413 case XML_REGEXP_LETTER_MODIFIER:
2414 case XML_REGEXP_LETTER_OTHERS:
2415 return(0);
2416 case XML_REGEXP_MARK:
2417 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2418 return(1);
2419 return(0);
2420 case XML_REGEXP_MARK_NONSPACING:
2421 case XML_REGEXP_MARK_SPACECOMBINING:
2422 case XML_REGEXP_MARK_ENCLOSING:
2423 return(0);
2424 case XML_REGEXP_NUMBER:
2425 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2426 return(1);
2427 return(0);
2428 case XML_REGEXP_NUMBER_DECIMAL:
2429 case XML_REGEXP_NUMBER_LETTER:
2430 case XML_REGEXP_NUMBER_OTHERS:
2431 return(0);
2432 case XML_REGEXP_PUNCT:
2433 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2434 return(1);
2435 return(0);
2436 case XML_REGEXP_PUNCT_CONNECTOR:
2437 case XML_REGEXP_PUNCT_DASH:
2438 case XML_REGEXP_PUNCT_OPEN:
2439 case XML_REGEXP_PUNCT_CLOSE:
2440 case XML_REGEXP_PUNCT_INITQUOTE:
2441 case XML_REGEXP_PUNCT_FINQUOTE:
2442 case XML_REGEXP_PUNCT_OTHERS:
2443 return(0);
2444 case XML_REGEXP_SEPAR:
2445 if (type2 <= XML_REGEXP_SEPAR_PARA)
2446 return(1);
2447 return(0);
2448 case XML_REGEXP_SEPAR_SPACE:
2449 case XML_REGEXP_SEPAR_LINE:
2450 case XML_REGEXP_SEPAR_PARA:
2451 return(0);
2452 case XML_REGEXP_SYMBOL:
2453 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2454 return(1);
2455 return(0);
2456 case XML_REGEXP_SYMBOL_MATH:
2457 case XML_REGEXP_SYMBOL_CURRENCY:
2458 case XML_REGEXP_SYMBOL_MODIFIER:
2459 case XML_REGEXP_SYMBOL_OTHERS:
2460 return(0);
2461 case XML_REGEXP_OTHER:
2462 if (type2 <= XML_REGEXP_OTHER_NA)
2463 return(1);
2464 return(0);
2465 case XML_REGEXP_OTHER_CONTROL:
2466 case XML_REGEXP_OTHER_FORMAT:
2467 case XML_REGEXP_OTHER_PRIVATE:
2468 case XML_REGEXP_OTHER_NA:
2469 return(0);
2470 default:
2471 break;
2472 }
2473 return(1);
2474}
2475
2476/**
2477 * xmlFAEqualAtoms:
Daniel Veillarde19fc232002-04-22 16:01:24 +00002478 * @atom1: an atom
2479 * @atom2: an atom
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002480 * @deep: if not set only compare string pointers
Daniel Veillarde19fc232002-04-22 16:01:24 +00002481 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002482 * Compares two atoms to check whether they are the same exactly
2483 * this is used to remove equivalent transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002484 *
Daniel Veillardfc011b72006-02-12 19:14:15 +00002485 * Returns 1 if same and 0 otherwise
Daniel Veillarde19fc232002-04-22 16:01:24 +00002486 */
2487static int
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002488xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
Daniel Veillardfc011b72006-02-12 19:14:15 +00002489 int ret = 0;
Daniel Veillard9efc4762005-07-19 14:33:55 +00002490
Daniel Veillarde19fc232002-04-22 16:01:24 +00002491 if (atom1 == atom2)
2492 return(1);
2493 if ((atom1 == NULL) || (atom2 == NULL))
2494 return(0);
2495
Daniel Veillardfc011b72006-02-12 19:14:15 +00002496 if (atom1->type != atom2->type)
2497 return(0);
2498 switch (atom1->type) {
2499 case XML_REGEXP_EPSILON:
2500 ret = 0;
2501 break;
2502 case XML_REGEXP_STRING:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002503 if (!deep)
2504 ret = (atom1->valuep == atom2->valuep);
2505 else
2506 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2507 (xmlChar *)atom2->valuep);
Daniel Veillardfc011b72006-02-12 19:14:15 +00002508 break;
2509 case XML_REGEXP_CHARVAL:
2510 ret = (atom1->codepoint == atom2->codepoint);
2511 break;
2512 case XML_REGEXP_RANGES:
2513 /* too hard to do in the general case */
2514 ret = 0;
2515 default:
2516 break;
2517 }
2518 return(ret);
2519}
2520
2521/**
2522 * xmlFACompareAtoms:
2523 * @atom1: an atom
2524 * @atom2: an atom
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002525 * @deep: if not set only compare string pointers
Daniel Veillardfc011b72006-02-12 19:14:15 +00002526 *
2527 * Compares two atoms to check whether they intersect in some ways,
2528 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2529 *
2530 * Returns 1 if yes and 0 otherwise
2531 */
2532static int
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002533xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
Daniel Veillardfc011b72006-02-12 19:14:15 +00002534 int ret = 1;
2535
2536 if (atom1 == atom2)
2537 return(1);
2538 if ((atom1 == NULL) || (atom2 == NULL))
2539 return(0);
2540
2541 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2542 (atom2->type == XML_REGEXP_ANYCHAR))
2543 return(1);
2544
2545 if (atom1->type > atom2->type) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002546 xmlRegAtomPtr tmp;
2547 tmp = atom1;
2548 atom1 = atom2;
2549 atom2 = tmp;
Daniel Veillardfc011b72006-02-12 19:14:15 +00002550 }
2551 if (atom1->type != atom2->type) {
2552 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2553 /* if they can't intersect at the type level break now */
2554 if (ret == 0)
2555 return(0);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002556 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002557 switch (atom1->type) {
2558 case XML_REGEXP_STRING:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002559 if (!deep)
2560 ret = (atom1->valuep != atom2->valuep);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07002561 else {
2562 xmlChar *val1 = (xmlChar *)atom1->valuep;
2563 xmlChar *val2 = (xmlChar *)atom2->valuep;
2564 int compound1 = (xmlStrchr(val1, '|') != NULL);
2565 int compound2 = (xmlStrchr(val2, '|') != NULL);
2566
2567 /* Ignore negative match flag for ##other namespaces */
2568 if (compound1 != compound2)
2569 return(0);
2570
2571 ret = xmlRegStrEqualWildcard(val1, val2);
2572 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00002573 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002574 case XML_REGEXP_EPSILON:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002575 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002576 case XML_REGEXP_CHARVAL:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002577 if (atom2->type == XML_REGEXP_CHARVAL) {
2578 ret = (atom1->codepoint == atom2->codepoint);
2579 } else {
2580 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2581 if (ret < 0)
2582 ret = 1;
2583 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00002584 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002585 case XML_REGEXP_RANGES:
Daniel Veillardfc011b72006-02-12 19:14:15 +00002586 if (atom2->type == XML_REGEXP_RANGES) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002587 int i, j, res;
2588 xmlRegRangePtr r1, r2;
2589
2590 /*
2591 * need to check that none of the ranges eventually matches
2592 */
2593 for (i = 0;i < atom1->nbRanges;i++) {
2594 for (j = 0;j < atom2->nbRanges;j++) {
2595 r1 = atom1->ranges[i];
2596 r2 = atom2->ranges[j];
2597 res = xmlFACompareRanges(r1, r2);
2598 if (res == 1) {
2599 ret = 1;
2600 goto done;
2601 }
2602 }
2603 }
2604 ret = 0;
2605 }
2606 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002607 default:
Daniel Veillard567a45b2005-10-18 19:11:55 +00002608 goto not_determinist;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002609 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002610done:
Daniel Veillard6e65e152005-08-09 11:09:52 +00002611 if (atom1->neg != atom2->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00002612 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00002613 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002614 if (ret == 0)
2615 return(0);
2616not_determinist:
2617 return(1);
Daniel Veillarde19fc232002-04-22 16:01:24 +00002618}
2619
2620/**
2621 * xmlFARecurseDeterminism:
2622 * @ctxt: a regexp parser context
2623 *
2624 * Check whether the associated regexp is determinist,
2625 * should be called after xmlFAEliminateEpsilonTransitions()
2626 *
2627 */
2628static int
2629xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2630 int to, xmlRegAtomPtr atom) {
2631 int ret = 1;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002632 int res;
Daniel Veillard5de09382005-09-26 17:18:17 +00002633 int transnr, nbTrans;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002634 xmlRegTransPtr t1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002635 int deep = 1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002636
2637 if (state == NULL)
2638 return(ret);
Daniel Veillard466fcda2012-08-27 12:03:40 +08002639 if (state->markd == XML_REGEXP_MARK_VISITED)
2640 return(ret);
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002641
2642 if (ctxt->flags & AM_AUTOMATA_RNG)
2643 deep = 0;
2644
Daniel Veillard5de09382005-09-26 17:18:17 +00002645 /*
2646 * don't recurse on transitions potentially added in the course of
2647 * the elimination.
2648 */
2649 nbTrans = state->nbTrans;
2650 for (transnr = 0;transnr < nbTrans;transnr++) {
Daniel Veillarde19fc232002-04-22 16:01:24 +00002651 t1 = &(state->trans[transnr]);
2652 /*
2653 * check transitions conflicting with the one looked at
2654 */
2655 if (t1->atom == NULL) {
Daniel Veillard0e05f4c2006-11-01 15:33:04 +00002656 if (t1->to < 0)
Daniel Veillarde19fc232002-04-22 16:01:24 +00002657 continue;
Daniel Veillard466fcda2012-08-27 12:03:40 +08002658 state->markd = XML_REGEXP_MARK_VISITED;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002659 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
Daniel Veillarde19fc232002-04-22 16:01:24 +00002660 to, atom);
Daniel Veillard466fcda2012-08-27 12:03:40 +08002661 state->markd = 0;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002662 if (res == 0) {
2663 ret = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00002664 /* t1->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002665 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002666 continue;
2667 }
2668 if (t1->to != to)
2669 continue;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002670 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002671 ret = 0;
2672 /* mark the transition as non-deterministic */
2673 t1->nd = 1;
2674 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002675 }
2676 return(ret);
2677}
2678
2679/**
2680 * xmlFAComputesDeterminism:
2681 * @ctxt: a regexp parser context
2682 *
2683 * Check whether the associated regexp is determinist,
2684 * should be called after xmlFAEliminateEpsilonTransitions()
2685 *
2686 */
2687static int
2688xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2689 int statenr, transnr;
2690 xmlRegStatePtr state;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002691 xmlRegTransPtr t1, t2, last;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002692 int i;
2693 int ret = 1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002694 int deep = 1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002695
Daniel Veillard4402ab42002-09-12 16:02:56 +00002696#ifdef DEBUG_REGEXP_GRAPH
2697 printf("xmlFAComputesDeterminism\n");
2698 xmlRegPrintCtxt(stdout, ctxt);
2699#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00002700 if (ctxt->determinist != -1)
2701 return(ctxt->determinist);
2702
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002703 if (ctxt->flags & AM_AUTOMATA_RNG)
2704 deep = 0;
2705
Daniel Veillarde19fc232002-04-22 16:01:24 +00002706 /*
Daniel Veillard567a45b2005-10-18 19:11:55 +00002707 * First cleanup the automata removing cancelled transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00002708 */
2709 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2710 state = ctxt->states[statenr];
2711 if (state == NULL)
2712 continue;
Daniel Veillard4f82c8a2005-08-09 21:40:08 +00002713 if (state->nbTrans < 2)
2714 continue;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002715 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2716 t1 = &(state->trans[transnr]);
2717 /*
2718 * Determinism checks in case of counted or all transitions
2719 * will have to be handled separately
2720 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002721 if (t1->atom == NULL) {
Daniel Veillardaa622012005-10-20 15:55:25 +00002722 /* t1->nd = 1; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002723 continue;
Daniel Veillard567a45b2005-10-18 19:11:55 +00002724 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002725 if (t1->to == -1) /* eliminated */
2726 continue;
2727 for (i = 0;i < transnr;i++) {
2728 t2 = &(state->trans[i]);
2729 if (t2->to == -1) /* eliminated */
2730 continue;
2731 if (t2->atom != NULL) {
2732 if (t1->to == t2->to) {
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002733 /*
2734 * Here we use deep because we want to keep the
2735 * transitions which indicate a conflict
2736 */
2737 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
Daniel Veillard11e28e42009-08-12 12:21:42 +02002738 (t1->counter == t2->counter) &&
2739 (t1->count == t2->count))
William M. Brackddf71d62004-05-06 04:17:26 +00002740 t2->to = -1; /* eliminated */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002741 }
2742 }
2743 }
2744 }
2745 }
2746
2747 /*
2748 * Check for all states that there aren't 2 transitions
2749 * with the same atom and a different target.
2750 */
2751 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2752 state = ctxt->states[statenr];
2753 if (state == NULL)
2754 continue;
2755 if (state->nbTrans < 2)
2756 continue;
2757 last = NULL;
2758 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2759 t1 = &(state->trans[transnr]);
2760 /*
2761 * Determinism checks in case of counted or all transitions
2762 * will have to be handled separately
2763 */
2764 if (t1->atom == NULL) {
2765 continue;
2766 }
2767 if (t1->to == -1) /* eliminated */
2768 continue;
2769 for (i = 0;i < transnr;i++) {
2770 t2 = &(state->trans[i]);
2771 if (t2->to == -1) /* eliminated */
2772 continue;
2773 if (t2->atom != NULL) {
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02002774 /*
2775 * But here we don't use deep because we want to
2776 * find transitions which indicate a conflict
2777 */
2778 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00002779 ret = 0;
2780 /* mark the transitions as non-deterministic ones */
2781 t1->nd = 1;
2782 t2->nd = 1;
2783 last = t1;
Daniel Veillarde19fc232002-04-22 16:01:24 +00002784 }
2785 } else if (t1->to != -1) {
2786 /*
2787 * do the closure in case of remaining specific
2788 * epsilon transitions like choices or all
2789 */
2790 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2791 t2->to, t2->atom);
Daniel Veillard567a45b2005-10-18 19:11:55 +00002792 /* don't shortcut the computation so all non deterministic
2793 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002794 if (ret == 0)
Daniel Veillardaa622012005-10-20 15:55:25 +00002795 return(0);
2796 */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002797 if (ret == 0) {
2798 t1->nd = 1;
Daniel Veillardaa622012005-10-20 15:55:25 +00002799 /* t2->nd = 1; */
Daniel Veillard567a45b2005-10-18 19:11:55 +00002800 last = t1;
2801 }
Daniel Veillarde19fc232002-04-22 16:01:24 +00002802 }
2803 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002804 /* don't shortcut the computation so all non deterministic
2805 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002806 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002807 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002808 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002809
2810 /*
2811 * mark specifically the last non-deterministic transition
2812 * from a state since there is no need to set-up rollback
2813 * from it
2814 */
2815 if (last != NULL) {
2816 last->nd = 2;
2817 }
2818
2819 /* don't shortcut the computation so all non deterministic
2820 transition get marked down
Daniel Veillarde19fc232002-04-22 16:01:24 +00002821 if (ret == 0)
Daniel Veillard567a45b2005-10-18 19:11:55 +00002822 break; */
Daniel Veillarde19fc232002-04-22 16:01:24 +00002823 }
Daniel Veillard567a45b2005-10-18 19:11:55 +00002824
Daniel Veillarde19fc232002-04-22 16:01:24 +00002825 ctxt->determinist = ret;
2826 return(ret);
2827}
2828
/************************************************************************
 *									*
 *	Routines to check input against transition atoms		*
 *									*
 ************************************************************************/
2834
2835static int
2836xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2837 int start, int end, const xmlChar *blockName) {
2838 int ret = 0;
2839
2840 switch (type) {
2841 case XML_REGEXP_STRING:
2842 case XML_REGEXP_SUBREG:
2843 case XML_REGEXP_RANGES:
2844 case XML_REGEXP_EPSILON:
2845 return(-1);
2846 case XML_REGEXP_ANYCHAR:
2847 ret = ((codepoint != '\n') && (codepoint != '\r'));
2848 break;
2849 case XML_REGEXP_CHARVAL:
2850 ret = ((codepoint >= start) && (codepoint <= end));
2851 break;
2852 case XML_REGEXP_NOTSPACE:
2853 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002854 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002855 case XML_REGEXP_ANYSPACE:
2856 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2857 (codepoint == '\t') || (codepoint == ' '));
2858 break;
2859 case XML_REGEXP_NOTINITNAME:
2860 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002861 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002862 case XML_REGEXP_INITNAME:
Daniel Veillardf8e3db02012-09-11 13:26:36 +08002863 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002864 (codepoint == '_') || (codepoint == ':'));
2865 break;
2866 case XML_REGEXP_NOTNAMECHAR:
2867 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002868 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002869 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00002870 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00002871 (codepoint == '.') || (codepoint == '-') ||
2872 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00002873 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00002874 break;
2875 case XML_REGEXP_NOTDECIMAL:
2876 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002877 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002878 case XML_REGEXP_DECIMAL:
2879 ret = xmlUCSIsCatNd(codepoint);
2880 break;
2881 case XML_REGEXP_REALCHAR:
2882 neg = !neg;
J. Peter Mugaasd2c329a2017-10-21 13:49:31 +02002883 /* Falls through. */
Daniel Veillard4255d502002-04-16 15:50:10 +00002884 case XML_REGEXP_NOTREALCHAR:
2885 ret = xmlUCSIsCatP(codepoint);
2886 if (ret == 0)
2887 ret = xmlUCSIsCatZ(codepoint);
2888 if (ret == 0)
2889 ret = xmlUCSIsCatC(codepoint);
2890 break;
2891 case XML_REGEXP_LETTER:
2892 ret = xmlUCSIsCatL(codepoint);
2893 break;
2894 case XML_REGEXP_LETTER_UPPERCASE:
2895 ret = xmlUCSIsCatLu(codepoint);
2896 break;
2897 case XML_REGEXP_LETTER_LOWERCASE:
2898 ret = xmlUCSIsCatLl(codepoint);
2899 break;
2900 case XML_REGEXP_LETTER_TITLECASE:
2901 ret = xmlUCSIsCatLt(codepoint);
2902 break;
2903 case XML_REGEXP_LETTER_MODIFIER:
2904 ret = xmlUCSIsCatLm(codepoint);
2905 break;
2906 case XML_REGEXP_LETTER_OTHERS:
2907 ret = xmlUCSIsCatLo(codepoint);
2908 break;
2909 case XML_REGEXP_MARK:
2910 ret = xmlUCSIsCatM(codepoint);
2911 break;
2912 case XML_REGEXP_MARK_NONSPACING:
2913 ret = xmlUCSIsCatMn(codepoint);
2914 break;
2915 case XML_REGEXP_MARK_SPACECOMBINING:
2916 ret = xmlUCSIsCatMc(codepoint);
2917 break;
2918 case XML_REGEXP_MARK_ENCLOSING:
2919 ret = xmlUCSIsCatMe(codepoint);
2920 break;
2921 case XML_REGEXP_NUMBER:
2922 ret = xmlUCSIsCatN(codepoint);
2923 break;
2924 case XML_REGEXP_NUMBER_DECIMAL:
2925 ret = xmlUCSIsCatNd(codepoint);
2926 break;
2927 case XML_REGEXP_NUMBER_LETTER:
2928 ret = xmlUCSIsCatNl(codepoint);
2929 break;
2930 case XML_REGEXP_NUMBER_OTHERS:
2931 ret = xmlUCSIsCatNo(codepoint);
2932 break;
2933 case XML_REGEXP_PUNCT:
2934 ret = xmlUCSIsCatP(codepoint);
2935 break;
2936 case XML_REGEXP_PUNCT_CONNECTOR:
2937 ret = xmlUCSIsCatPc(codepoint);
2938 break;
2939 case XML_REGEXP_PUNCT_DASH:
2940 ret = xmlUCSIsCatPd(codepoint);
2941 break;
2942 case XML_REGEXP_PUNCT_OPEN:
2943 ret = xmlUCSIsCatPs(codepoint);
2944 break;
2945 case XML_REGEXP_PUNCT_CLOSE:
2946 ret = xmlUCSIsCatPe(codepoint);
2947 break;
2948 case XML_REGEXP_PUNCT_INITQUOTE:
2949 ret = xmlUCSIsCatPi(codepoint);
2950 break;
2951 case XML_REGEXP_PUNCT_FINQUOTE:
2952 ret = xmlUCSIsCatPf(codepoint);
2953 break;
2954 case XML_REGEXP_PUNCT_OTHERS:
2955 ret = xmlUCSIsCatPo(codepoint);
2956 break;
2957 case XML_REGEXP_SEPAR:
2958 ret = xmlUCSIsCatZ(codepoint);
2959 break;
2960 case XML_REGEXP_SEPAR_SPACE:
2961 ret = xmlUCSIsCatZs(codepoint);
2962 break;
2963 case XML_REGEXP_SEPAR_LINE:
2964 ret = xmlUCSIsCatZl(codepoint);
2965 break;
2966 case XML_REGEXP_SEPAR_PARA:
2967 ret = xmlUCSIsCatZp(codepoint);
2968 break;
2969 case XML_REGEXP_SYMBOL:
2970 ret = xmlUCSIsCatS(codepoint);
2971 break;
2972 case XML_REGEXP_SYMBOL_MATH:
2973 ret = xmlUCSIsCatSm(codepoint);
2974 break;
2975 case XML_REGEXP_SYMBOL_CURRENCY:
2976 ret = xmlUCSIsCatSc(codepoint);
2977 break;
2978 case XML_REGEXP_SYMBOL_MODIFIER:
2979 ret = xmlUCSIsCatSk(codepoint);
2980 break;
2981 case XML_REGEXP_SYMBOL_OTHERS:
2982 ret = xmlUCSIsCatSo(codepoint);
2983 break;
2984 case XML_REGEXP_OTHER:
2985 ret = xmlUCSIsCatC(codepoint);
2986 break;
2987 case XML_REGEXP_OTHER_CONTROL:
2988 ret = xmlUCSIsCatCc(codepoint);
2989 break;
2990 case XML_REGEXP_OTHER_FORMAT:
2991 ret = xmlUCSIsCatCf(codepoint);
2992 break;
2993 case XML_REGEXP_OTHER_PRIVATE:
2994 ret = xmlUCSIsCatCo(codepoint);
2995 break;
2996 case XML_REGEXP_OTHER_NA:
2997 /* ret = xmlUCSIsCatCn(codepoint); */
2998 /* Seems it doesn't exist anymore in recent Unicode releases */
2999 ret = 0;
3000 break;
3001 case XML_REGEXP_BLOCK_NAME:
3002 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3003 break;
3004 }
3005 if (neg)
3006 return(!ret);
3007 return(ret);
3008}
3009
3010static int
3011xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3012 int i, ret = 0;
3013 xmlRegRangePtr range;
3014
William M. Brack871611b2003-10-18 04:53:14 +00003015 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00003016 return(-1);
3017
3018 switch (atom->type) {
3019 case XML_REGEXP_SUBREG:
3020 case XML_REGEXP_EPSILON:
3021 return(-1);
3022 case XML_REGEXP_CHARVAL:
3023 return(codepoint == atom->codepoint);
3024 case XML_REGEXP_RANGES: {
3025 int accept = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00003026
Daniel Veillard4255d502002-04-16 15:50:10 +00003027 for (i = 0;i < atom->nbRanges;i++) {
3028 range = atom->ranges[i];
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003029 if (range->neg == 2) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003030 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3031 0, range->start, range->end,
3032 range->blockName);
3033 if (ret != 0)
3034 return(0); /* excluded char */
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003035 } else if (range->neg) {
3036 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3037 0, range->start, range->end,
3038 range->blockName);
3039 if (ret == 0)
Daniel Veillardf2a12832003-11-24 13:04:35 +00003040 accept = 1;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003041 else
3042 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00003043 } else {
3044 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3045 0, range->start, range->end,
3046 range->blockName);
3047 if (ret != 0)
3048 accept = 1; /* might still be excluded */
3049 }
3050 }
3051 return(accept);
3052 }
3053 case XML_REGEXP_STRING:
3054 printf("TODO: XML_REGEXP_STRING\n");
3055 return(-1);
3056 case XML_REGEXP_ANYCHAR:
3057 case XML_REGEXP_ANYSPACE:
3058 case XML_REGEXP_NOTSPACE:
3059 case XML_REGEXP_INITNAME:
3060 case XML_REGEXP_NOTINITNAME:
3061 case XML_REGEXP_NAMECHAR:
3062 case XML_REGEXP_NOTNAMECHAR:
3063 case XML_REGEXP_DECIMAL:
3064 case XML_REGEXP_NOTDECIMAL:
3065 case XML_REGEXP_REALCHAR:
3066 case XML_REGEXP_NOTREALCHAR:
3067 case XML_REGEXP_LETTER:
3068 case XML_REGEXP_LETTER_UPPERCASE:
3069 case XML_REGEXP_LETTER_LOWERCASE:
3070 case XML_REGEXP_LETTER_TITLECASE:
3071 case XML_REGEXP_LETTER_MODIFIER:
3072 case XML_REGEXP_LETTER_OTHERS:
3073 case XML_REGEXP_MARK:
3074 case XML_REGEXP_MARK_NONSPACING:
3075 case XML_REGEXP_MARK_SPACECOMBINING:
3076 case XML_REGEXP_MARK_ENCLOSING:
3077 case XML_REGEXP_NUMBER:
3078 case XML_REGEXP_NUMBER_DECIMAL:
3079 case XML_REGEXP_NUMBER_LETTER:
3080 case XML_REGEXP_NUMBER_OTHERS:
3081 case XML_REGEXP_PUNCT:
3082 case XML_REGEXP_PUNCT_CONNECTOR:
3083 case XML_REGEXP_PUNCT_DASH:
3084 case XML_REGEXP_PUNCT_OPEN:
3085 case XML_REGEXP_PUNCT_CLOSE:
3086 case XML_REGEXP_PUNCT_INITQUOTE:
3087 case XML_REGEXP_PUNCT_FINQUOTE:
3088 case XML_REGEXP_PUNCT_OTHERS:
3089 case XML_REGEXP_SEPAR:
3090 case XML_REGEXP_SEPAR_SPACE:
3091 case XML_REGEXP_SEPAR_LINE:
3092 case XML_REGEXP_SEPAR_PARA:
3093 case XML_REGEXP_SYMBOL:
3094 case XML_REGEXP_SYMBOL_MATH:
3095 case XML_REGEXP_SYMBOL_CURRENCY:
3096 case XML_REGEXP_SYMBOL_MODIFIER:
3097 case XML_REGEXP_SYMBOL_OTHERS:
3098 case XML_REGEXP_OTHER:
3099 case XML_REGEXP_OTHER_CONTROL:
3100 case XML_REGEXP_OTHER_FORMAT:
3101 case XML_REGEXP_OTHER_PRIVATE:
3102 case XML_REGEXP_OTHER_NA:
3103 case XML_REGEXP_BLOCK_NAME:
3104 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3105 (const xmlChar *)atom->valuep);
3106 if (atom->neg)
3107 ret = !ret;
3108 break;
3109 }
3110 return(ret);
3111}
3112
/************************************************************************
 *									*
 *	Saving and restoring state of an execution context		*
 *									*
 ************************************************************************/
3118
#ifdef DEBUG_REGEXP_EXEC
/*
 * xmlFARegDebugExec:
 * @exec:  the regexp execution context
 *
 * Debug helper printing on stdout the current state number, transition
 * number and input index, followed either by up to three values from the
 * top of the input stack (most recent first) or, when there is no input
 * stack, by the remaining input string.
 */
static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
    if (exec->inputStack == NULL) {
        printf(": %s", &(exec->inputString[exec->index]));
    } else {
        int depth;

        printf(": ");
        /* depth 1 is the top of the stack */
        for (depth = 1;
             (depth <= 3) && (depth <= exec->inputStackNr);
             depth++)
            printf("%s ", (const char *)
                   exec->inputStack[exec->inputStackNr - depth].value);
    }
    printf("\n");
}
#endif
3135
3136static void
3137xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3138#ifdef DEBUG_REGEXP_EXEC
3139 printf("saving ");
3140 exec->transno++;
3141 xmlFARegDebugExec(exec);
3142 exec->transno--;
3143#endif
Daniel Veillard94cc1032005-09-15 13:09:00 +00003144#ifdef MAX_PUSH
3145 if (exec->nbPush > MAX_PUSH) {
3146 return;
3147 }
3148 exec->nbPush++;
3149#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003150
3151 if (exec->maxRollbacks == 0) {
3152 exec->maxRollbacks = 4;
3153 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3154 sizeof(xmlRegExecRollback));
3155 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003156 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003157 exec->maxRollbacks = 0;
3158 return;
3159 }
3160 memset(exec->rollbacks, 0,
3161 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3162 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3163 xmlRegExecRollback *tmp;
3164 int len = exec->maxRollbacks;
3165
3166 exec->maxRollbacks *= 2;
3167 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3168 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3169 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003170 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003171 exec->maxRollbacks /= 2;
3172 return;
3173 }
3174 exec->rollbacks = tmp;
3175 tmp = &exec->rollbacks[len];
3176 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3177 }
3178 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3179 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3180 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3181 if (exec->comp->nbCounters > 0) {
3182 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3183 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3184 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3185 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003186 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003187 exec->status = -5;
3188 return;
3189 }
3190 }
3191 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3192 exec->comp->nbCounters * sizeof(int));
3193 }
3194 exec->nbRollbacks++;
3195}
3196
3197static void
3198xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3199 if (exec->nbRollbacks <= 0) {
3200 exec->status = -1;
3201#ifdef DEBUG_REGEXP_EXEC
3202 printf("rollback failed on empty stack\n");
3203#endif
3204 return;
3205 }
3206 exec->nbRollbacks--;
3207 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3208 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3209 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3210 if (exec->comp->nbCounters > 0) {
3211 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3212 fprintf(stderr, "exec save: allocation failed");
3213 exec->status = -6;
3214 return;
3215 }
Gaurav2671b012013-09-11 14:59:06 +08003216 if (exec->counts) {
3217 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
Daniel Veillard4255d502002-04-16 15:50:10 +00003218 exec->comp->nbCounters * sizeof(int));
Gaurav2671b012013-09-11 14:59:06 +08003219 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003220 }
3221
3222#ifdef DEBUG_REGEXP_EXEC
3223 printf("restored ");
3224 xmlFARegDebugExec(exec);
3225#endif
3226}
3227
/************************************************************************
 *									*
 *	Verifier, running an input against a compiled regexp		*
 *									*
 ************************************************************************/
3233
3234static int
3235xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3236 xmlRegExecCtxt execval;
3237 xmlRegExecCtxtPtr exec = &execval;
Daniel Veillard567a45b2005-10-18 19:11:55 +00003238 int ret, codepoint = 0, len, deter;
Daniel Veillard4255d502002-04-16 15:50:10 +00003239
3240 exec->inputString = content;
3241 exec->index = 0;
Daniel Veillard94cc1032005-09-15 13:09:00 +00003242 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003243 exec->determinist = 1;
3244 exec->maxRollbacks = 0;
3245 exec->nbRollbacks = 0;
3246 exec->rollbacks = NULL;
3247 exec->status = 0;
3248 exec->comp = comp;
3249 exec->state = comp->states[0];
3250 exec->transno = 0;
3251 exec->transcount = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00003252 exec->inputStack = NULL;
3253 exec->inputStackMax = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003254 if (comp->nbCounters > 0) {
3255 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00003256 if (exec->counts == NULL) {
3257 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00003258 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00003259 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003260 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3261 } else
3262 exec->counts = NULL;
Daniel Veillard40851d02012-08-17 20:34:05 +08003263 while ((exec->status == 0) && (exec->state != NULL) &&
Daniel Veillard4255d502002-04-16 15:50:10 +00003264 ((exec->inputString[exec->index] != 0) ||
Daniel Veillardad559982008-05-12 13:15:35 +00003265 ((exec->state != NULL) &&
3266 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003267 xmlRegTransPtr trans;
3268 xmlRegAtomPtr atom;
3269
3270 /*
William M. Brack0e00b282004-04-26 15:40:47 +00003271 * If end of input on non-terminal state, rollback, however we may
Daniel Veillard4255d502002-04-16 15:50:10 +00003272 * still have epsilon like transition for counted transitions
William M. Brack0e00b282004-04-26 15:40:47 +00003273 * on counters, in that case don't break too early. Additionally,
3274 * if we are working on a range like "AB{0,2}", where B is not present,
3275 * we don't want to break.
Daniel Veillard4255d502002-04-16 15:50:10 +00003276 */
Daniel Veillard11ce4002006-03-10 00:36:23 +00003277 len = 1;
William M. Brack0e00b282004-04-26 15:40:47 +00003278 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
William M. Brackddf71d62004-05-06 04:17:26 +00003279 /*
3280 * if there is a transition, we must check if
3281 * atom allows minOccurs of 0
3282 */
3283 if (exec->transno < exec->state->nbTrans) {
William M. Brack0e00b282004-04-26 15:40:47 +00003284 trans = &exec->state->trans[exec->transno];
3285 if (trans->to >=0) {
3286 atom = trans->atom;
3287 if (!((atom->min == 0) && (atom->max > 0)))
3288 goto rollback;
3289 }
3290 } else
3291 goto rollback;
3292 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003293
3294 exec->transcount = 0;
3295 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3296 trans = &exec->state->trans[exec->transno];
3297 if (trans->to < 0)
3298 continue;
3299 atom = trans->atom;
3300 ret = 0;
Daniel Veillard567a45b2005-10-18 19:11:55 +00003301 deter = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003302 if (trans->count >= 0) {
3303 int count;
3304 xmlRegCounterPtr counter;
3305
Daniel Veillard11ce4002006-03-10 00:36:23 +00003306 if (exec->counts == NULL) {
3307 exec->status = -1;
3308 goto error;
3309 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003310 /*
3311 * A counted transition.
3312 */
3313
3314 count = exec->counts[trans->count];
3315 counter = &exec->comp->counters[trans->count];
3316#ifdef DEBUG_REGEXP_EXEC
3317 printf("testing count %d: val %d, min %d, max %d\n",
3318 trans->count, count, counter->min, counter->max);
3319#endif
3320 ret = ((count >= counter->min) && (count <= counter->max));
Daniel Veillard567a45b2005-10-18 19:11:55 +00003321 if ((ret) && (counter->min != counter->max))
3322 deter = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003323 } else if (atom == NULL) {
3324 fprintf(stderr, "epsilon transition left at runtime\n");
3325 exec->status = -2;
3326 break;
3327 } else if (exec->inputString[exec->index] != 0) {
3328 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3329 ret = xmlRegCheckCharacter(atom, codepoint);
William M. Brack0e00b282004-04-26 15:40:47 +00003330 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003331 xmlRegStatePtr to = comp->states[trans->to];
3332
3333 /*
3334 * this is a multiple input sequence
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003335 * If there is a counter associated increment it now.
3336 * before potentially saving and rollback
Daniel Veillardc821e032007-08-28 17:33:45 +00003337 * do not increment if the counter is already over the
3338 * maximum limit in which case get to next transition
Daniel Veillard4255d502002-04-16 15:50:10 +00003339 */
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003340 if (trans->counter >= 0) {
Daniel Veillardc821e032007-08-28 17:33:45 +00003341 xmlRegCounterPtr counter;
3342
3343 if ((exec->counts == NULL) ||
3344 (exec->comp == NULL) ||
3345 (exec->comp->counters == NULL)) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003346 exec->status = -1;
3347 goto error;
3348 }
Daniel Veillardc821e032007-08-28 17:33:45 +00003349 counter = &exec->comp->counters[trans->counter];
3350 if (exec->counts[trans->counter] >= counter->max)
3351 continue; /* for loop on transitions */
3352
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003353#ifdef DEBUG_REGEXP_EXEC
3354 printf("Increasing count %d\n", trans->counter);
3355#endif
3356 exec->counts[trans->counter]++;
3357 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003358 if (exec->state->nbTrans > exec->transno + 1) {
3359 xmlFARegExecSave(exec);
3360 }
3361 exec->transcount = 1;
3362 do {
3363 /*
3364 * Try to progress as much as possible on the input
3365 */
3366 if (exec->transcount == atom->max) {
3367 break;
3368 }
3369 exec->index += len;
3370 /*
3371 * End of input: stop here
3372 */
3373 if (exec->inputString[exec->index] == 0) {
3374 exec->index -= len;
3375 break;
3376 }
3377 if (exec->transcount >= atom->min) {
3378 int transno = exec->transno;
3379 xmlRegStatePtr state = exec->state;
3380
3381 /*
3382 * The transition is acceptable save it
3383 */
3384 exec->transno = -1; /* trick */
3385 exec->state = to;
3386 xmlFARegExecSave(exec);
3387 exec->transno = transno;
3388 exec->state = state;
3389 }
3390 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3391 len);
3392 ret = xmlRegCheckCharacter(atom, codepoint);
3393 exec->transcount++;
3394 } while (ret == 1);
3395 if (exec->transcount < atom->min)
3396 ret = 0;
3397
3398 /*
3399 * If the last check failed but one transition was found
3400 * possible, rollback
3401 */
3402 if (ret < 0)
3403 ret = 0;
3404 if (ret == 0) {
3405 goto rollback;
3406 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003407 if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003408 if (exec->counts == NULL) {
3409 exec->status = -1;
3410 goto error;
3411 }
Daniel Veillardfc6eca02005-11-01 15:24:02 +00003412#ifdef DEBUG_REGEXP_EXEC
3413 printf("Decreasing count %d\n", trans->counter);
3414#endif
3415 exec->counts[trans->counter]--;
3416 }
William M. Brack0e00b282004-04-26 15:40:47 +00003417 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3418 /*
3419 * we don't match on the codepoint, but minOccurs of 0
3420 * says that's ok. Setting len to 0 inhibits stepping
3421 * over the codepoint.
3422 */
3423 exec->transcount = 1;
3424 len = 0;
3425 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003426 }
William M. Brack0e00b282004-04-26 15:40:47 +00003427 } else if ((atom->min == 0) && (atom->max > 0)) {
3428 /* another spot to match when minOccurs is 0 */
3429 exec->transcount = 1;
3430 len = 0;
3431 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003432 }
3433 if (ret == 1) {
Daniel Veillard567a45b2005-10-18 19:11:55 +00003434 if ((trans->nd == 1) ||
3435 ((trans->count >= 0) && (deter == 0) &&
3436 (exec->state->nbTrans > exec->transno + 1))) {
Daniel Veillardaa622012005-10-20 15:55:25 +00003437#ifdef DEBUG_REGEXP_EXEC
3438 if (trans->nd == 1)
3439 printf("Saving on nd transition atom %d for %c at %d\n",
3440 trans->atom->no, codepoint, exec->index);
3441 else
3442 printf("Saving on counted transition count %d for %c at %d\n",
3443 trans->count, codepoint, exec->index);
3444#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003445 xmlFARegExecSave(exec);
3446 }
3447 if (trans->counter >= 0) {
Daniel Veillardc821e032007-08-28 17:33:45 +00003448 xmlRegCounterPtr counter;
3449
3450 /* make sure we don't go over the counter maximum value */
3451 if ((exec->counts == NULL) ||
3452 (exec->comp == NULL) ||
3453 (exec->comp->counters == NULL)) {
3454 exec->status = -1;
Daniel Veillard11ce4002006-03-10 00:36:23 +00003455 goto error;
3456 }
Daniel Veillardc821e032007-08-28 17:33:45 +00003457 counter = &exec->comp->counters[trans->counter];
3458 if (exec->counts[trans->counter] >= counter->max)
3459 continue; /* for loop on transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +00003460#ifdef DEBUG_REGEXP_EXEC
3461 printf("Increasing count %d\n", trans->counter);
3462#endif
3463 exec->counts[trans->counter]++;
3464 }
Daniel Veillard10752282005-08-08 13:05:13 +00003465 if ((trans->count >= 0) &&
3466 (trans->count < REGEXP_ALL_COUNTER)) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00003467 if (exec->counts == NULL) {
3468 exec->status = -1;
3469 goto error;
3470 }
Daniel Veillard10752282005-08-08 13:05:13 +00003471#ifdef DEBUG_REGEXP_EXEC
3472 printf("resetting count %d on transition\n",
3473 trans->count);
3474#endif
3475 exec->counts[trans->count] = 0;
3476 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003477#ifdef DEBUG_REGEXP_EXEC
3478 printf("entering state %d\n", trans->to);
3479#endif
3480 exec->state = comp->states[trans->to];
3481 exec->transno = 0;
3482 if (trans->atom != NULL) {
3483 exec->index += len;
3484 }
3485 goto progress;
3486 } else if (ret < 0) {
3487 exec->status = -4;
3488 break;
3489 }
3490 }
3491 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3492rollback:
3493 /*
3494 * Failed to find a way out
3495 */
3496 exec->determinist = 0;
Daniel Veillardaa622012005-10-20 15:55:25 +00003497#ifdef DEBUG_REGEXP_EXEC
3498 printf("rollback from state %d on %d:%c\n", exec->state->no,
3499 codepoint,codepoint);
3500#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003501 xmlFARegExecRollBack(exec);
3502 }
3503progress:
3504 continue;
3505 }
Daniel Veillard11ce4002006-03-10 00:36:23 +00003506error:
Daniel Veillard4255d502002-04-16 15:50:10 +00003507 if (exec->rollbacks != NULL) {
3508 if (exec->counts != NULL) {
3509 int i;
3510
3511 for (i = 0;i < exec->maxRollbacks;i++)
3512 if (exec->rollbacks[i].counts != NULL)
3513 xmlFree(exec->rollbacks[i].counts);
3514 }
3515 xmlFree(exec->rollbacks);
3516 }
Daniel Veillard40851d02012-08-17 20:34:05 +08003517 if (exec->state == NULL)
3518 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003519 if (exec->counts != NULL)
3520 xmlFree(exec->counts);
3521 if (exec->status == 0)
3522 return(1);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003523 if (exec->status == -1) {
3524 if (exec->nbPush > MAX_PUSH)
3525 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003526 return(0);
Daniel Veillard94cc1032005-09-15 13:09:00 +00003527 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003528 return(exec->status);
3529}
3530
3531/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003532 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003533 * Progressive interface to the verifier one atom at a time *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003534 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00003535 ************************************************************************/
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003536#ifdef DEBUG_ERR
3537static void testerr(xmlRegExecCtxtPtr exec);
3538#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003539
3540/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00003541 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00003542 * @comp: a precompiled regular expression
3543 * @callback: a callback function used for handling progresses in the
3544 * automata matching phase
3545 * @data: the context data associated to the callback in this context
3546 *
3547 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00003548 *
3549 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00003550 */
3551xmlRegExecCtxtPtr
3552xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3553 xmlRegExecCtxtPtr exec;
3554
3555 if (comp == NULL)
3556 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003557 if ((comp->compact == NULL) && (comp->states == NULL))
3558 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00003559 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3560 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003561 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003562 return(NULL);
3563 }
3564 memset(exec, 0, sizeof(xmlRegExecCtxt));
3565 exec->inputString = NULL;
3566 exec->index = 0;
3567 exec->determinist = 1;
3568 exec->maxRollbacks = 0;
3569 exec->nbRollbacks = 0;
3570 exec->rollbacks = NULL;
3571 exec->status = 0;
3572 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00003573 if (comp->compact == NULL)
3574 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00003575 exec->transno = 0;
3576 exec->transcount = 0;
3577 exec->callback = callback;
3578 exec->data = data;
3579 if (comp->nbCounters > 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003580 /*
3581 * For error handling, exec->counts is allocated twice the size
3582 * the second half is used to store the data in case of rollback
3583 */
3584 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3585 * 2);
Daniel Veillard4255d502002-04-16 15:50:10 +00003586 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003587 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00003588 xmlFree(exec);
3589 return(NULL);
3590 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003591 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3592 exec->errCounts = &exec->counts[comp->nbCounters];
3593 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00003594 exec->counts = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003595 exec->errCounts = NULL;
3596 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003597 exec->inputStackMax = 0;
3598 exec->inputStackNr = 0;
3599 exec->inputStack = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003600 exec->errStateNo = -1;
3601 exec->errString = NULL;
Daniel Veillard94cc1032005-09-15 13:09:00 +00003602 exec->nbPush = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003603 return(exec);
3604}
3605
3606/**
3607 * xmlRegFreeExecCtxt:
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003608 * @exec: a regular expression evaluation context
Daniel Veillard4255d502002-04-16 15:50:10 +00003609 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07003610 * Free the structures associated to a regular expression evaluation context.
Daniel Veillard4255d502002-04-16 15:50:10 +00003611 */
3612void
3613xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3614 if (exec == NULL)
3615 return;
3616
3617 if (exec->rollbacks != NULL) {
3618 if (exec->counts != NULL) {
3619 int i;
3620
3621 for (i = 0;i < exec->maxRollbacks;i++)
3622 if (exec->rollbacks[i].counts != NULL)
3623 xmlFree(exec->rollbacks[i].counts);
3624 }
3625 xmlFree(exec->rollbacks);
3626 }
3627 if (exec->counts != NULL)
3628 xmlFree(exec->counts);
3629 if (exec->inputStack != NULL) {
3630 int i;
3631
Daniel Veillard32370232002-10-16 14:08:14 +00003632 for (i = 0;i < exec->inputStackNr;i++) {
3633 if (exec->inputStack[i].value != NULL)
3634 xmlFree(exec->inputStack[i].value);
3635 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003636 xmlFree(exec->inputStack);
3637 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003638 if (exec->errString != NULL)
3639 xmlFree(exec->errString);
Daniel Veillard4255d502002-04-16 15:50:10 +00003640 xmlFree(exec);
3641}
3642
3643static void
3644xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3645 void *data) {
3646#ifdef DEBUG_PUSH
3647 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3648#endif
3649 if (exec->inputStackMax == 0) {
3650 exec->inputStackMax = 4;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003651 exec->inputStack = (xmlRegInputTokenPtr)
Daniel Veillard4255d502002-04-16 15:50:10 +00003652 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3653 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003654 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003655 exec->inputStackMax = 0;
3656 return;
3657 }
3658 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3659 xmlRegInputTokenPtr tmp;
3660
3661 exec->inputStackMax *= 2;
3662 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3663 exec->inputStackMax * sizeof(xmlRegInputToken));
3664 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00003665 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00003666 exec->inputStackMax /= 2;
3667 return;
3668 }
3669 exec->inputStack = tmp;
3670 }
3671 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3672 exec->inputStack[exec->inputStackNr].data = data;
3673 exec->inputStackNr++;
3674 exec->inputStack[exec->inputStackNr].value = NULL;
3675 exec->inputStack[exec->inputStackNr].data = NULL;
3676}
3677
/**
 * xmlRegStrEqualWildcard:
 * @expStr:  the string to be evaluated
 * @valStr:  the validation string
 *
 * Checks if both strings are equal or have the same content. "*"
 * can be used as a wildcard standing for a whole substring; when the
 * strings differ at a position where one of them holds "*", that one
 * is used as the pattern, so the wildcard is honoured in either
 * argument. "|" is used as a separator of substrings in both @expStr
 * and @valStr, and a wildcard never matches across it.
 *
 * Two NULL pointers (or the same pointer twice) compare equal, a NULL
 * pointer never equals a non-NULL one, and two empty strings are equal.
 *
 * Returns 1 if the comparison is satisfied and the number of substrings
 * is equal, 0 otherwise.
 */

static int
xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
    if (expStr == valStr) return(1);
    if (expStr == NULL) return(0);
    if (valStr == NULL) return(0);
    /*
     * The terminator of valStr is tested before anything is consumed:
     * a test-at-the-end loop would step over the terminators of two
     * empty strings and go on reading past the end of both buffers.
     */
    while (*valStr != 0) {
        /*
         * Eval if we have a wildcard for the current item.
         */
        if (*expStr != *valStr) {
            /* if one of them starts with a wildcard make valStr be it */
            if (*valStr == '*') {
                const xmlChar *tmp;

                tmp = valStr;
                valStr = expStr;
                expStr = tmp;
            }
            if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
                /* the wildcard swallows everything up to the separator */
                do {
                    if (*valStr == XML_REG_STRING_SEPARATOR)
                        break;
                    valStr++;
                } while (*valStr != 0);
                continue;
            } else
                return(0);
        }
        expStr++;
        valStr++;
    }
    /* valStr is exhausted, it is a match only if expStr is too */
    if (*expStr != 0)
        return (0);
    else
        return (1);
}
Daniel Veillard4255d502002-04-16 15:50:10 +00003727
3728/**
Daniel Veillard23e73572002-09-19 19:56:43 +00003729 * xmlRegCompactPushString:
3730 * @exec: a regexp execution context
3731 * @comp: the precompiled exec with a compact table
3732 * @value: a string token input
3733 * @data: data associated to the token to reuse in callbacks
3734 *
3735 * Push one input token in the execution context
3736 *
3737 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3738 * a negative value in case of error.
3739 */
3740static int
3741xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3742 xmlRegexpPtr comp,
3743 const xmlChar *value,
3744 void *data) {
3745 int state = exec->index;
3746 int i, target;
3747
3748 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3749 return(-1);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003750
Daniel Veillard23e73572002-09-19 19:56:43 +00003751 if (value == NULL) {
3752 /*
3753 * are we at a final state ?
3754 */
3755 if (comp->compact[state * (comp->nbstrings + 1)] ==
3756 XML_REGEXP_FINAL_STATE)
3757 return(1);
3758 return(0);
3759 }
3760
3761#ifdef DEBUG_PUSH
3762 printf("value pushed: %s\n", value);
3763#endif
3764
3765 /*
William M. Brackddf71d62004-05-06 04:17:26 +00003766 * Examine all outside transitions from current state
Daniel Veillard23e73572002-09-19 19:56:43 +00003767 */
3768 for (i = 0;i < comp->nbstrings;i++) {
3769 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3770 if ((target > 0) && (target <= comp->nbstates)) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003771 target--; /* to avoid 0 */
Daniel Veillardc0826a72004-08-10 14:17:33 +00003772 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003773 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00003774 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3775 exec->callback(exec->data, value,
3776 comp->transdata[state * comp->nbstrings + i], data);
3777 }
Daniel Veillard23e73572002-09-19 19:56:43 +00003778#ifdef DEBUG_PUSH
3779 printf("entering state %d\n", target);
3780#endif
3781 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003782 XML_REGEXP_SINK_STATE)
3783 goto error;
3784
3785 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillard23e73572002-09-19 19:56:43 +00003786 XML_REGEXP_FINAL_STATE)
3787 return(1);
3788 return(0);
3789 }
3790 }
3791 }
3792 /*
3793 * Failed to find an exit transition out from current state for the
3794 * current token
3795 */
3796#ifdef DEBUG_PUSH
3797 printf("failed to find a transition for %s on state %d\n", value, state);
3798#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003799error:
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003800 if (exec->errString != NULL)
3801 xmlFree(exec->errString);
3802 exec->errString = xmlStrdup(value);
3803 exec->errStateNo = state;
Daniel Veillard23e73572002-09-19 19:56:43 +00003804 exec->status = -1;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003805#ifdef DEBUG_ERR
3806 testerr(exec);
3807#endif
Daniel Veillard23e73572002-09-19 19:56:43 +00003808 return(-1);
3809}
3810
3811/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00003812 * xmlRegExecPushStringInternal:
Daniel Veillardea7751d2002-12-20 00:16:24 +00003813 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00003814 * @value: a string token input
3815 * @data: data associated to the token to reuse in callbacks
Daniel Veillard6e65e152005-08-09 11:09:52 +00003816 * @compound: value was assembled from 2 strings
Daniel Veillard4255d502002-04-16 15:50:10 +00003817 *
3818 * Push one input token in the execution context
3819 *
3820 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3821 * a negative value in case of error.
3822 */
Daniel Veillard6e65e152005-08-09 11:09:52 +00003823static int
3824xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3825 void *data, int compound) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003826 xmlRegTransPtr trans;
3827 xmlRegAtomPtr atom;
3828 int ret;
3829 int final = 0;
Daniel Veillard90700152005-01-08 22:05:09 +00003830 int progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003831
3832 if (exec == NULL)
3833 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00003834 if (exec->comp == NULL)
3835 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003836 if (exec->status != 0)
3837 return(exec->status);
3838
Daniel Veillard23e73572002-09-19 19:56:43 +00003839 if (exec->comp->compact != NULL)
3840 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3841
Daniel Veillard4255d502002-04-16 15:50:10 +00003842 if (value == NULL) {
3843 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3844 return(1);
3845 final = 1;
3846 }
3847
3848#ifdef DEBUG_PUSH
3849 printf("value pushed: %s\n", value);
3850#endif
3851 /*
3852 * If we have an active rollback stack push the new value there
3853 * and get back to where we were left
3854 */
3855 if ((value != NULL) && (exec->inputStackNr > 0)) {
3856 xmlFARegExecSaveInputString(exec, value, data);
3857 value = exec->inputStack[exec->index].value;
3858 data = exec->inputStack[exec->index].data;
3859#ifdef DEBUG_PUSH
3860 printf("value loaded: %s\n", value);
3861#endif
3862 }
3863
3864 while ((exec->status == 0) &&
3865 ((value != NULL) ||
3866 ((final == 1) &&
3867 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3868
3869 /*
3870 * End of input on non-terminal state, rollback, however we may
3871 * still have epsilon like transition for counted transitions
3872 * on counters, in that case don't break too early.
3873 */
Daniel Veillardb509f152002-04-17 16:28:10 +00003874 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00003875 goto rollback;
3876
3877 exec->transcount = 0;
3878 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3879 trans = &exec->state->trans[exec->transno];
3880 if (trans->to < 0)
3881 continue;
3882 atom = trans->atom;
3883 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00003884 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3885 int i;
3886 int count;
3887 xmlRegTransPtr t;
3888 xmlRegCounterPtr counter;
3889
3890 ret = 0;
3891
3892#ifdef DEBUG_PUSH
3893 printf("testing all lax %d\n", trans->count);
3894#endif
3895 /*
3896 * Check all counted transitions from the current state
3897 */
3898 if ((value == NULL) && (final)) {
3899 ret = 1;
3900 } else if (value != NULL) {
3901 for (i = 0;i < exec->state->nbTrans;i++) {
3902 t = &exec->state->trans[i];
3903 if ((t->counter < 0) || (t == trans))
3904 continue;
3905 counter = &exec->comp->counters[t->counter];
3906 count = exec->counts[t->counter];
Daniel Veillardf8e3db02012-09-11 13:26:36 +08003907 if ((count < counter->max) &&
Daniel Veillard441bc322002-04-20 17:38:48 +00003908 (t->atom != NULL) &&
3909 (xmlStrEqual(value, t->atom->valuep))) {
3910 ret = 0;
3911 break;
3912 }
3913 if ((count >= counter->min) &&
3914 (count < counter->max) &&
Daniel Veillard11ce4002006-03-10 00:36:23 +00003915 (t->atom != NULL) &&
Daniel Veillard441bc322002-04-20 17:38:48 +00003916 (xmlStrEqual(value, t->atom->valuep))) {
3917 ret = 1;
3918 break;
3919 }
3920 }
3921 }
3922 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00003923 int i;
3924 int count;
3925 xmlRegTransPtr t;
3926 xmlRegCounterPtr counter;
3927
3928 ret = 1;
3929
3930#ifdef DEBUG_PUSH
3931 printf("testing all %d\n", trans->count);
3932#endif
3933 /*
3934 * Check all counted transitions from the current state
3935 */
3936 for (i = 0;i < exec->state->nbTrans;i++) {
3937 t = &exec->state->trans[i];
3938 if ((t->counter < 0) || (t == trans))
3939 continue;
3940 counter = &exec->comp->counters[t->counter];
3941 count = exec->counts[t->counter];
3942 if ((count < counter->min) || (count > counter->max)) {
3943 ret = 0;
3944 break;
3945 }
3946 }
3947 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003948 int count;
3949 xmlRegCounterPtr counter;
3950
3951 /*
3952 * A counted transition.
3953 */
3954
3955 count = exec->counts[trans->count];
3956 counter = &exec->comp->counters[trans->count];
3957#ifdef DEBUG_PUSH
3958 printf("testing count %d: val %d, min %d, max %d\n",
3959 trans->count, count, counter->min, counter->max);
3960#endif
3961 ret = ((count >= counter->min) && (count <= counter->max));
3962 } else if (atom == NULL) {
3963 fprintf(stderr, "epsilon transition left at runtime\n");
3964 exec->status = -2;
3965 break;
3966 } else if (value != NULL) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00003967 ret = xmlRegStrEqualWildcard(atom->valuep, value);
Daniel Veillard6e65e152005-08-09 11:09:52 +00003968 if (atom->neg) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00003969 ret = !ret;
Daniel Veillard6e65e152005-08-09 11:09:52 +00003970 if (!compound)
3971 ret = 0;
3972 }
Daniel Veillard441bc322002-04-20 17:38:48 +00003973 if ((ret == 1) && (trans->counter >= 0)) {
3974 xmlRegCounterPtr counter;
3975 int count;
3976
3977 count = exec->counts[trans->counter];
3978 counter = &exec->comp->counters[trans->counter];
3979 if (count >= counter->max)
3980 ret = 0;
3981 }
3982
Daniel Veillard4255d502002-04-16 15:50:10 +00003983 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3984 xmlRegStatePtr to = exec->comp->states[trans->to];
3985
3986 /*
3987 * this is a multiple input sequence
3988 */
3989 if (exec->state->nbTrans > exec->transno + 1) {
3990 if (exec->inputStackNr <= 0) {
3991 xmlFARegExecSaveInputString(exec, value, data);
3992 }
3993 xmlFARegExecSave(exec);
3994 }
3995 exec->transcount = 1;
3996 do {
3997 /*
3998 * Try to progress as much as possible on the input
3999 */
4000 if (exec->transcount == atom->max) {
4001 break;
4002 }
4003 exec->index++;
4004 value = exec->inputStack[exec->index].value;
4005 data = exec->inputStack[exec->index].data;
4006#ifdef DEBUG_PUSH
4007 printf("value loaded: %s\n", value);
4008#endif
4009
4010 /*
4011 * End of input: stop here
4012 */
4013 if (value == NULL) {
4014 exec->index --;
4015 break;
4016 }
4017 if (exec->transcount >= atom->min) {
4018 int transno = exec->transno;
4019 xmlRegStatePtr state = exec->state;
4020
4021 /*
4022 * The transition is acceptable save it
4023 */
4024 exec->transno = -1; /* trick */
4025 exec->state = to;
4026 if (exec->inputStackNr <= 0) {
4027 xmlFARegExecSaveInputString(exec, value, data);
4028 }
4029 xmlFARegExecSave(exec);
4030 exec->transno = transno;
4031 exec->state = state;
4032 }
4033 ret = xmlStrEqual(value, atom->valuep);
4034 exec->transcount++;
4035 } while (ret == 1);
4036 if (exec->transcount < atom->min)
4037 ret = 0;
4038
4039 /*
4040 * If the last check failed but one transition was found
4041 * possible, rollback
4042 */
4043 if (ret < 0)
4044 ret = 0;
4045 if (ret == 0) {
4046 goto rollback;
4047 }
4048 }
4049 }
4050 if (ret == 1) {
William M. Brack98873952003-12-26 06:03:14 +00004051 if ((exec->callback != NULL) && (atom != NULL) &&
4052 (data != NULL)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004053 exec->callback(exec->data, atom->valuep,
4054 atom->data, data);
4055 }
4056 if (exec->state->nbTrans > exec->transno + 1) {
4057 if (exec->inputStackNr <= 0) {
4058 xmlFARegExecSaveInputString(exec, value, data);
4059 }
4060 xmlFARegExecSave(exec);
4061 }
4062 if (trans->counter >= 0) {
4063#ifdef DEBUG_PUSH
4064 printf("Increasing count %d\n", trans->counter);
4065#endif
4066 exec->counts[trans->counter]++;
4067 }
Daniel Veillard10752282005-08-08 13:05:13 +00004068 if ((trans->count >= 0) &&
4069 (trans->count < REGEXP_ALL_COUNTER)) {
4070#ifdef DEBUG_REGEXP_EXEC
4071 printf("resetting count %d on transition\n",
4072 trans->count);
4073#endif
4074 exec->counts[trans->count] = 0;
4075 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004076#ifdef DEBUG_PUSH
4077 printf("entering state %d\n", trans->to);
4078#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004079 if ((exec->comp->states[trans->to] != NULL) &&
4080 (exec->comp->states[trans->to]->type ==
4081 XML_REGEXP_SINK_STATE)) {
4082 /*
4083 * entering a sink state, save the current state as error
4084 * state.
4085 */
4086 if (exec->errString != NULL)
4087 xmlFree(exec->errString);
4088 exec->errString = xmlStrdup(value);
4089 exec->errState = exec->state;
4090 memcpy(exec->errCounts, exec->counts,
4091 exec->comp->nbCounters * sizeof(int));
4092 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004093 exec->state = exec->comp->states[trans->to];
4094 exec->transno = 0;
4095 if (trans->atom != NULL) {
4096 if (exec->inputStack != NULL) {
4097 exec->index++;
4098 if (exec->index < exec->inputStackNr) {
4099 value = exec->inputStack[exec->index].value;
4100 data = exec->inputStack[exec->index].data;
4101#ifdef DEBUG_PUSH
4102 printf("value loaded: %s\n", value);
4103#endif
4104 } else {
4105 value = NULL;
4106 data = NULL;
4107#ifdef DEBUG_PUSH
4108 printf("end of input\n");
4109#endif
4110 }
4111 } else {
4112 value = NULL;
4113 data = NULL;
4114#ifdef DEBUG_PUSH
4115 printf("end of input\n");
4116#endif
4117 }
4118 }
4119 goto progress;
4120 } else if (ret < 0) {
4121 exec->status = -4;
4122 break;
4123 }
4124 }
4125 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4126rollback:
Daniel Veillard90700152005-01-08 22:05:09 +00004127 /*
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004128 * if we didn't yet rollback on the current input
4129 * store the current state as the error state.
Daniel Veillard90700152005-01-08 22:05:09 +00004130 */
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004131 if ((progress) && (exec->state != NULL) &&
4132 (exec->state->type != XML_REGEXP_SINK_STATE)) {
Daniel Veillard90700152005-01-08 22:05:09 +00004133 progress = 0;
4134 if (exec->errString != NULL)
4135 xmlFree(exec->errString);
4136 exec->errString = xmlStrdup(value);
4137 exec->errState = exec->state;
Nick Wellnhofer34e44562017-05-31 16:48:27 +02004138 if (exec->comp->nbCounters)
4139 memcpy(exec->errCounts, exec->counts,
4140 exec->comp->nbCounters * sizeof(int));
Daniel Veillard90700152005-01-08 22:05:09 +00004141 }
4142
Daniel Veillard4255d502002-04-16 15:50:10 +00004143 /*
4144 * Failed to find a way out
4145 */
4146 exec->determinist = 0;
4147 xmlFARegExecRollBack(exec);
Gaurav2671b012013-09-11 14:59:06 +08004148 if ((exec->inputStack != NULL ) && (exec->status == 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004149 value = exec->inputStack[exec->index].value;
4150 data = exec->inputStack[exec->index].data;
4151#ifdef DEBUG_PUSH
4152 printf("value loaded: %s\n", value);
4153#endif
4154 }
4155 }
Daniel Veillard90700152005-01-08 22:05:09 +00004156 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00004157progress:
Daniel Veillard90700152005-01-08 22:05:09 +00004158 progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004159 continue;
4160 }
4161 if (exec->status == 0) {
4162 return(exec->state->type == XML_REGEXP_FINAL_STATE);
4163 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004164#ifdef DEBUG_ERR
Daniel Veillard90700152005-01-08 22:05:09 +00004165 if (exec->status < 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004166 testerr(exec);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004167 }
Daniel Veillard90700152005-01-08 22:05:09 +00004168#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00004169 return(exec->status);
4170}
4171
Daniel Veillard52b48c72003-04-13 19:53:42 +00004172/**
Daniel Veillard6e65e152005-08-09 11:09:52 +00004173 * xmlRegExecPushString:
4174 * @exec: a regexp execution context or NULL to indicate the end
4175 * @value: a string token input
4176 * @data: data associated to the token to reuse in callbacks
4177 *
4178 * Push one input token in the execution context
4179 *
4180 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4181 * a negative value in case of error.
4182 */
4183int
4184xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4185 void *data) {
4186 return(xmlRegExecPushStringInternal(exec, value, data, 0));
4187}
4188
4189/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00004190 * xmlRegExecPushString2:
4191 * @exec: a regexp execution context or NULL to indicate the end
4192 * @value: the first string token input
4193 * @value2: the second string token input
4194 * @data: data associated to the token to reuse in callbacks
4195 *
4196 * Push one input token in the execution context
4197 *
4198 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4199 * a negative value in case of error.
4200 */
4201int
4202xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4203 const xmlChar *value2, void *data) {
4204 xmlChar buf[150];
4205 int lenn, lenp, ret;
4206 xmlChar *str;
4207
4208 if (exec == NULL)
4209 return(-1);
4210 if (exec->comp == NULL)
4211 return(-1);
4212 if (exec->status != 0)
4213 return(exec->status);
4214
4215 if (value2 == NULL)
4216 return(xmlRegExecPushString(exec, value, data));
4217
4218 lenn = strlen((char *) value2);
4219 lenp = strlen((char *) value);
4220
4221 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00004222 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004223 if (str == NULL) {
4224 exec->status = -1;
4225 return(-1);
4226 }
4227 } else {
4228 str = buf;
4229 }
4230 memcpy(&str[0], value, lenp);
Daniel Veillardc0826a72004-08-10 14:17:33 +00004231 str[lenp] = XML_REG_STRING_SEPARATOR;
Daniel Veillard52b48c72003-04-13 19:53:42 +00004232 memcpy(&str[lenp + 1], value2, lenn);
4233 str[lenn + lenp + 1] = 0;
4234
4235 if (exec->comp->compact != NULL)
4236 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4237 else
Daniel Veillard6e65e152005-08-09 11:09:52 +00004238 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004239
4240 if (str != buf)
Daniel Veillard0b1ff142005-12-28 21:13:33 +00004241 xmlFree(str);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004242 return(ret);
4243}
4244
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004245/**
Daniel Veillard77005e62005-07-19 16:26:18 +00004246 * xmlRegExecGetValues:
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004247 * @exec: a regexp execution context
4248 * @err: error extraction or normal one
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004249 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004250 * @nbneg: return number of negative transitions
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004251 * @values: pointer to the array of acceptable values
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004252 * @terminal: return value if this was a terminal state
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004253 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004254 * Extract information from the regexp execution, internal routine to
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004255 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004256 *
4257 * Returns: 0 in case of success or -1 in case of error.
4258 */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004259static int
4260xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004261 int *nbval, int *nbneg,
4262 xmlChar **values, int *terminal) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004263 int maxval;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004264 int nb = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004265
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004266 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004267 (values == NULL) || (*nbval <= 0))
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004268 return(-1);
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004269
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004270 maxval = *nbval;
4271 *nbval = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004272 *nbneg = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004273 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4274 xmlRegexpPtr comp;
4275 int target, i, state;
4276
4277 comp = exec->comp;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004278
4279 if (err) {
4280 if (exec->errStateNo == -1) return(-1);
4281 state = exec->errStateNo;
4282 } else {
4283 state = exec->index;
4284 }
4285 if (terminal != NULL) {
4286 if (comp->compact[state * (comp->nbstrings + 1)] ==
4287 XML_REGEXP_FINAL_STATE)
4288 *terminal = 1;
4289 else
4290 *terminal = 0;
4291 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004292 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004293 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004294 if ((target > 0) && (target <= comp->nbstates) &&
4295 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4296 XML_REGEXP_SINK_STATE)) {
4297 values[nb++] = comp->stringMap[i];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004298 (*nbval)++;
4299 }
4300 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004301 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4302 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4303 if ((target > 0) && (target <= comp->nbstates) &&
4304 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4305 XML_REGEXP_SINK_STATE)) {
4306 values[nb++] = comp->stringMap[i];
4307 (*nbneg)++;
4308 }
4309 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004310 } else {
4311 int transno;
4312 xmlRegTransPtr trans;
4313 xmlRegAtomPtr atom;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004314 xmlRegStatePtr state;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004315
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004316 if (terminal != NULL) {
4317 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4318 *terminal = 1;
4319 else
4320 *terminal = 0;
4321 }
4322
4323 if (err) {
4324 if (exec->errState == NULL) return(-1);
4325 state = exec->errState;
4326 } else {
4327 if (exec->state == NULL) return(-1);
4328 state = exec->state;
4329 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004330 for (transno = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004331 (transno < state->nbTrans) && (nb < maxval);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004332 transno++) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004333 trans = &state->trans[transno];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004334 if (trans->to < 0)
4335 continue;
4336 atom = trans->atom;
4337 if ((atom == NULL) || (atom->valuep == NULL))
4338 continue;
4339 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004340 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004341 TODO;
4342 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004343 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004344 TODO;
4345 } else if (trans->counter >= 0) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00004346 xmlRegCounterPtr counter = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004347 int count;
4348
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004349 if (err)
4350 count = exec->errCounts[trans->counter];
4351 else
4352 count = exec->counts[trans->counter];
Daniel Veillard11ce4002006-03-10 00:36:23 +00004353 if (exec->comp != NULL)
4354 counter = &exec->comp->counters[trans->counter];
4355 if ((counter == NULL) || (count < counter->max)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004356 if (atom->neg)
4357 values[nb++] = (xmlChar *) atom->valuep2;
4358 else
4359 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004360 (*nbval)++;
4361 }
4362 } else {
Gaurav2671b012013-09-11 14:59:06 +08004363 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004364 (exec->comp->states[trans->to]->type !=
4365 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004366 if (atom->neg)
4367 values[nb++] = (xmlChar *) atom->valuep2;
4368 else
4369 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004370 (*nbval)++;
4371 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004372 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004373 }
4374 for (transno = 0;
4375 (transno < state->nbTrans) && (nb < maxval);
4376 transno++) {
4377 trans = &state->trans[transno];
4378 if (trans->to < 0)
4379 continue;
4380 atom = trans->atom;
4381 if ((atom == NULL) || (atom->valuep == NULL))
4382 continue;
4383 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4384 continue;
4385 } else if (trans->count == REGEXP_ALL_COUNTER) {
4386 continue;
4387 } else if (trans->counter >= 0) {
4388 continue;
4389 } else {
4390 if ((exec->comp->states[trans->to] != NULL) &&
4391 (exec->comp->states[trans->to]->type ==
4392 XML_REGEXP_SINK_STATE)) {
Daniel Veillard77005e62005-07-19 16:26:18 +00004393 if (atom->neg)
4394 values[nb++] = (xmlChar *) atom->valuep2;
4395 else
4396 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004397 (*nbneg)++;
4398 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004399 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00004400 }
4401 }
4402 return(0);
4403}
4404
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004405/**
4406 * xmlRegExecNextValues:
4407 * @exec: a regexp execution context
4408 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004409 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004410 * @values: pointer to the array of acceptable values
4411 * @terminal: return value if this was a terminal state
4412 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004413 * Extract information from the regexp execution,
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004414 * the parameter @values must point to an array of @nbval string pointers
4415 * on return nbval will contain the number of possible strings in that
4416 * state and the @values array will be updated with them. The string values
4417 * returned will be freed with the @exec context and don't need to be
4418 * deallocated.
4419 *
4420 * Returns: 0 in case of success or -1 in case of error.
4421 */
4422int
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004423xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4424 xmlChar **values, int *terminal) {
4425 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004426}
4427
4428/**
4429 * xmlRegExecErrInfo:
4430 * @exec: a regexp execution context generating an error
4431 * @string: return value for the error string
4432 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004433 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004434 * @values: pointer to the array of acceptable values
4435 * @terminal: return value if this was a terminal state
4436 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07004437 * Extract error information from the regexp execution, the parameter
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004438 * @string will be updated with the value pushed and not accepted,
4439 * the parameter @values must point to an array of @nbval string pointers
4440 * on return nbval will contain the number of possible strings in that
4441 * state and the @values array will be updated with them. The string values
4442 * returned will be freed with the @exec context and don't need to be
4443 * deallocated.
4444 *
4445 * Returns: 0 in case of success or -1 in case of error.
4446 */
4447int
4448xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004449 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004450 if (exec == NULL)
4451 return(-1);
4452 if (string != NULL) {
4453 if (exec->status != 0)
4454 *string = exec->errString;
4455 else
4456 *string = NULL;
4457 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00004458 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00004459}
4460
#ifdef DEBUG_ERR
/*
 * testerr:
 * @exec: a regexp execution context
 *
 * Debug-only helper (compiled when DEBUG_ERR is defined): calls
 * xmlRegExecErrInfo() with room for 5 values and discards the result.
 */
static void testerr(xmlRegExecCtxtPtr exec) {
    const xmlChar *errToken;
    xmlChar *expected[5];
    int nbExpected = 5;
    int nbNegative;
    int isTerminal;

    xmlRegExecErrInfo(exec, &errToken, &nbExpected, &nbNegative,
                      expected, &isTerminal);
}
#endif
4471
#if 0
/*
 * xmlRegExecPushChar:
 * @exec: a regexp execution context
 * @UCS: unused by the current body
 *
 * Disabled code (#if 0). Runs the automaton over exec->inputString
 * starting at exec->index, saving alternatives with
 * xmlFARegExecSave() and backtracking with xmlFARegExecRollBack().
 *
 * Returns: -1 if @exec is NULL, the context status if it is (or
 *     becomes) non-zero, 1 otherwise. The loop only exits with a zero
 *     status once the input is consumed on a final state.
 */
static int
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
    xmlRegTransPtr trans;
    xmlRegAtomPtr atom;
    int ret;
    int codepoint;
    /* initialized: read below even if no character was decoded yet */
    int len = 0;

    if (exec == NULL)
        return(-1);
    if (exec->status != 0)
        return(exec->status);

    while ((exec->status == 0) &&
           ((exec->inputString[exec->index] != 0) ||
            (exec->state->type != XML_REGEXP_FINAL_STATE))) {

        /*
         * End of input on non-terminal state, rollback, however we may
         * still have epsilon like transition for counted transitions
         * on counters, in that case don't break too early.
         */
        if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
            goto rollback;

        exec->transcount = 0;
        for (;exec->transno < exec->state->nbTrans;exec->transno++) {
            trans = &exec->state->trans[exec->transno];
            if (trans->to < 0)
                continue;
            atom = trans->atom;
            ret = 0;
            if (trans->count >= 0) {
                int count;
                xmlRegCounterPtr counter;

                /*
                 * A counted transition.
                 */

                count = exec->counts[trans->count];
                counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
                printf("testing count %d: val %d, min %d, max %d\n",
                       trans->count, count, counter->min, counter->max);
#endif
                ret = ((count >= counter->min) && (count <= counter->max));
            } else if (atom == NULL) {
                fprintf(stderr, "epsilon transition left at runtime\n");
                exec->status = -2;
                break;
            } else if (exec->inputString[exec->index] != 0) {
                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
                ret = xmlRegCheckCharacter(atom, codepoint);
                if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
                    xmlRegStatePtr to = exec->comp->states[trans->to];

                    /*
                     * this is a multiple input sequence
                     */
                    if (exec->state->nbTrans > exec->transno + 1) {
                        xmlFARegExecSave(exec);
                    }
                    exec->transcount = 1;
                    do {
                        /*
                         * Try to progress as much as possible on the input
                         */
                        if (exec->transcount == atom->max) {
                            break;
                        }
                        exec->index += len;
                        /*
                         * End of input: stop here
                         */
                        if (exec->inputString[exec->index] == 0) {
                            exec->index -= len;
                            break;
                        }
                        if (exec->transcount >= atom->min) {
                            int transno = exec->transno;
                            xmlRegStatePtr state = exec->state;

                            /*
                             * The transition is acceptable save it
                             */
                            exec->transno = -1; /* trick */
                            exec->state = to;
                            xmlFARegExecSave(exec);
                            exec->transno = transno;
                            exec->state = state;
                        }
                        codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
                                              len);
                        ret = xmlRegCheckCharacter(atom, codepoint);
                        exec->transcount++;
                    } while (ret == 1);
                    if (exec->transcount < atom->min)
                        ret = 0;

                    /*
                     * If the last check failed but one transition was found
                     * possible, rollback
                     */
                    if (ret < 0)
                        ret = 0;
                    if (ret == 0) {
                        goto rollback;
                    }
                }
            }
            if (ret == 1) {
                if (exec->state->nbTrans > exec->transno + 1) {
                    xmlFARegExecSave(exec);
                }
                /*
                 * restart count for expressions like this ((abc){2})*
                 */
                if (trans->count >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Reset count %d\n", trans->count);
#endif
                    exec->counts[trans->count] = 0;
                }
                if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Increasing count %d\n", trans->counter);
#endif
                    exec->counts[trans->counter]++;
                }
#ifdef DEBUG_REGEXP_EXEC
                printf("entering state %d\n", trans->to);
#endif
                exec->state = exec->comp->states[trans->to];
                exec->transno = 0;
                if (trans->atom != NULL) {
                    exec->index += len;
                }
                goto progress;
            } else if (ret < 0) {
                exec->status = -4;
                break;
            }
        }
        if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
            /*
             * Failed to find a way out
             */
            exec->determinist = 0;
            xmlFARegExecRollBack(exec);
        }
progress:
        continue;
    }
    /*
     * The loop exits either on a non-zero status or with the input
     * fully consumed on a final state: the function used to fall off
     * its end here without returning a value.
     */
    if (exec->status == 0)
        return(1);
    return(exec->status);
}
#endif
4629/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004630 * *
William M. Brackddf71d62004-05-06 04:17:26 +00004631 * Parser for the Schemas Datatype Regular Expressions *
Daniel Veillard4255d502002-04-16 15:50:10 +00004632 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004633 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00004634 ************************************************************************/
4635
4636/**
4637 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00004638 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004639 *
4640 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4641 */
4642static int
4643xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4644 int cur;
4645 int len;
4646
4647 cur = CUR_SCHAR(ctxt->cur, len);
4648 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4649 (cur == '*') || (cur == '+') || (cur == '(') ||
4650 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4651 (cur == 0x5D) || (cur == 0))
4652 return(-1);
4653 return(cur);
4654}
4655
4656/**
4657 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00004658 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004659 *
4660 * [27] charProp ::= IsCategory | IsBlock
4661 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004662 * Separators | Symbols | Others
Daniel Veillard4255d502002-04-16 15:50:10 +00004663 * [29] Letters ::= 'L' [ultmo]?
4664 * [30] Marks ::= 'M' [nce]?
4665 * [31] Numbers ::= 'N' [dlo]?
4666 * [32] Punctuation ::= 'P' [cdseifo]?
4667 * [33] Separators ::= 'Z' [slp]?
4668 * [34] Symbols ::= 'S' [mcko]?
4669 * [35] Others ::= 'C' [cfon]?
4670 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4671 */
4672static void
4673xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4674 int cur;
William M. Brack779af002003-08-01 15:55:39 +00004675 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00004676 xmlChar *blockName = NULL;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004677
Daniel Veillard4255d502002-04-16 15:50:10 +00004678 cur = CUR;
4679 if (cur == 'L') {
4680 NEXT;
4681 cur = CUR;
4682 if (cur == 'u') {
4683 NEXT;
4684 type = XML_REGEXP_LETTER_UPPERCASE;
4685 } else if (cur == 'l') {
4686 NEXT;
4687 type = XML_REGEXP_LETTER_LOWERCASE;
4688 } else if (cur == 't') {
4689 NEXT;
4690 type = XML_REGEXP_LETTER_TITLECASE;
4691 } else if (cur == 'm') {
4692 NEXT;
4693 type = XML_REGEXP_LETTER_MODIFIER;
4694 } else if (cur == 'o') {
4695 NEXT;
4696 type = XML_REGEXP_LETTER_OTHERS;
4697 } else {
4698 type = XML_REGEXP_LETTER;
4699 }
4700 } else if (cur == 'M') {
4701 NEXT;
4702 cur = CUR;
4703 if (cur == 'n') {
4704 NEXT;
4705 /* nonspacing */
4706 type = XML_REGEXP_MARK_NONSPACING;
4707 } else if (cur == 'c') {
4708 NEXT;
4709 /* spacing combining */
4710 type = XML_REGEXP_MARK_SPACECOMBINING;
4711 } else if (cur == 'e') {
4712 NEXT;
4713 /* enclosing */
4714 type = XML_REGEXP_MARK_ENCLOSING;
4715 } else {
4716 /* all marks */
4717 type = XML_REGEXP_MARK;
4718 }
4719 } else if (cur == 'N') {
4720 NEXT;
4721 cur = CUR;
4722 if (cur == 'd') {
4723 NEXT;
4724 /* digital */
4725 type = XML_REGEXP_NUMBER_DECIMAL;
4726 } else if (cur == 'l') {
4727 NEXT;
4728 /* letter */
4729 type = XML_REGEXP_NUMBER_LETTER;
4730 } else if (cur == 'o') {
4731 NEXT;
4732 /* other */
4733 type = XML_REGEXP_NUMBER_OTHERS;
4734 } else {
4735 /* all numbers */
4736 type = XML_REGEXP_NUMBER;
4737 }
4738 } else if (cur == 'P') {
4739 NEXT;
4740 cur = CUR;
4741 if (cur == 'c') {
4742 NEXT;
4743 /* connector */
4744 type = XML_REGEXP_PUNCT_CONNECTOR;
4745 } else if (cur == 'd') {
4746 NEXT;
4747 /* dash */
4748 type = XML_REGEXP_PUNCT_DASH;
4749 } else if (cur == 's') {
4750 NEXT;
4751 /* open */
4752 type = XML_REGEXP_PUNCT_OPEN;
4753 } else if (cur == 'e') {
4754 NEXT;
4755 /* close */
4756 type = XML_REGEXP_PUNCT_CLOSE;
4757 } else if (cur == 'i') {
4758 NEXT;
4759 /* initial quote */
4760 type = XML_REGEXP_PUNCT_INITQUOTE;
4761 } else if (cur == 'f') {
4762 NEXT;
4763 /* final quote */
4764 type = XML_REGEXP_PUNCT_FINQUOTE;
4765 } else if (cur == 'o') {
4766 NEXT;
4767 /* other */
4768 type = XML_REGEXP_PUNCT_OTHERS;
4769 } else {
4770 /* all punctuation */
4771 type = XML_REGEXP_PUNCT;
4772 }
4773 } else if (cur == 'Z') {
4774 NEXT;
4775 cur = CUR;
4776 if (cur == 's') {
4777 NEXT;
4778 /* space */
4779 type = XML_REGEXP_SEPAR_SPACE;
4780 } else if (cur == 'l') {
4781 NEXT;
4782 /* line */
4783 type = XML_REGEXP_SEPAR_LINE;
4784 } else if (cur == 'p') {
4785 NEXT;
4786 /* paragraph */
4787 type = XML_REGEXP_SEPAR_PARA;
4788 } else {
4789 /* all separators */
4790 type = XML_REGEXP_SEPAR;
4791 }
4792 } else if (cur == 'S') {
4793 NEXT;
4794 cur = CUR;
4795 if (cur == 'm') {
4796 NEXT;
4797 type = XML_REGEXP_SYMBOL_MATH;
4798 /* math */
4799 } else if (cur == 'c') {
4800 NEXT;
4801 type = XML_REGEXP_SYMBOL_CURRENCY;
4802 /* currency */
4803 } else if (cur == 'k') {
4804 NEXT;
4805 type = XML_REGEXP_SYMBOL_MODIFIER;
4806 /* modifiers */
4807 } else if (cur == 'o') {
4808 NEXT;
4809 type = XML_REGEXP_SYMBOL_OTHERS;
4810 /* other */
4811 } else {
4812 /* all symbols */
4813 type = XML_REGEXP_SYMBOL;
4814 }
4815 } else if (cur == 'C') {
4816 NEXT;
4817 cur = CUR;
4818 if (cur == 'c') {
4819 NEXT;
4820 /* control */
4821 type = XML_REGEXP_OTHER_CONTROL;
4822 } else if (cur == 'f') {
4823 NEXT;
4824 /* format */
4825 type = XML_REGEXP_OTHER_FORMAT;
4826 } else if (cur == 'o') {
4827 NEXT;
4828 /* private use */
4829 type = XML_REGEXP_OTHER_PRIVATE;
4830 } else if (cur == 'n') {
4831 NEXT;
4832 /* not assigned */
4833 type = XML_REGEXP_OTHER_NA;
4834 } else {
4835 /* all others */
4836 type = XML_REGEXP_OTHER;
4837 }
4838 } else if (cur == 'I') {
4839 const xmlChar *start;
4840 NEXT;
4841 cur = CUR;
4842 if (cur != 's') {
4843 ERROR("IsXXXX expected");
4844 return;
4845 }
4846 NEXT;
4847 start = ctxt->cur;
4848 cur = CUR;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004849 if (((cur >= 'a') && (cur <= 'z')) ||
4850 ((cur >= 'A') && (cur <= 'Z')) ||
4851 ((cur >= '0') && (cur <= '9')) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00004852 (cur == 0x2D)) {
4853 NEXT;
4854 cur = CUR;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004855 while (((cur >= 'a') && (cur <= 'z')) ||
4856 ((cur >= 'A') && (cur <= 'Z')) ||
4857 ((cur >= '0') && (cur <= '9')) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00004858 (cur == 0x2D)) {
4859 NEXT;
4860 cur = CUR;
4861 }
4862 }
4863 type = XML_REGEXP_BLOCK_NAME;
4864 blockName = xmlStrndup(start, ctxt->cur - start);
4865 } else {
4866 ERROR("Unknown char property");
4867 return;
4868 }
4869 if (ctxt->atom == NULL) {
4870 ctxt->atom = xmlRegNewAtom(ctxt, type);
4871 if (ctxt->atom != NULL)
4872 ctxt->atom->valuep = blockName;
4873 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4874 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4875 type, 0, 0, blockName);
4876 }
4877}
4878
4879/**
4880 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00004881 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004882 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004883 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
Daniel Veillard4255d502002-04-16 15:50:10 +00004884 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4885 * [25] catEsc ::= '\p{' charProp '}'
4886 * [26] complEsc ::= '\P{' charProp '}'
4887 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4888 */
4889static void
4890xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4891 int cur;
4892
4893 if (CUR == '.') {
4894 if (ctxt->atom == NULL) {
4895 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4896 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4897 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4898 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4899 }
4900 NEXT;
4901 return;
4902 }
4903 if (CUR != '\\') {
4904 ERROR("Escaped sequence: expecting \\");
4905 return;
4906 }
4907 NEXT;
4908 cur = CUR;
4909 if (cur == 'p') {
4910 NEXT;
4911 if (CUR != '{') {
4912 ERROR("Expecting '{'");
4913 return;
4914 }
4915 NEXT;
4916 xmlFAParseCharProp(ctxt);
4917 if (CUR != '}') {
4918 ERROR("Expecting '}'");
4919 return;
4920 }
4921 NEXT;
4922 } else if (cur == 'P') {
4923 NEXT;
4924 if (CUR != '{') {
4925 ERROR("Expecting '{'");
4926 return;
4927 }
4928 NEXT;
4929 xmlFAParseCharProp(ctxt);
Nick Wellnhofer8a0c6692017-07-04 17:13:06 +02004930 if (ctxt->atom != NULL)
4931 ctxt->atom->neg = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004932 if (CUR != '}') {
4933 ERROR("Expecting '}'");
4934 return;
4935 }
4936 NEXT;
4937 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4938 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4939 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4940 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4941 (cur == 0x5E)) {
4942 if (ctxt->atom == NULL) {
4943 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
Daniel Veillard99c394d2005-07-14 12:58:49 +00004944 if (ctxt->atom != NULL) {
4945 switch (cur) {
4946 case 'n':
4947 ctxt->atom->codepoint = '\n';
4948 break;
4949 case 'r':
4950 ctxt->atom->codepoint = '\r';
4951 break;
4952 case 't':
4953 ctxt->atom->codepoint = '\t';
4954 break;
4955 default:
4956 ctxt->atom->codepoint = cur;
4957 }
4958 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004959 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
Daniel Veillard9543aee2010-03-15 11:13:39 +01004960 switch (cur) {
4961 case 'n':
4962 cur = '\n';
4963 break;
4964 case 'r':
4965 cur = '\r';
4966 break;
4967 case 't':
4968 cur = '\t';
4969 break;
4970 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004971 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4972 XML_REGEXP_CHARVAL, cur, cur, NULL);
4973 }
4974 NEXT;
4975 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4976 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4977 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00004978 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00004979
4980 switch (cur) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004981 case 's':
Daniel Veillard4255d502002-04-16 15:50:10 +00004982 type = XML_REGEXP_ANYSPACE;
4983 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004984 case 'S':
Daniel Veillard4255d502002-04-16 15:50:10 +00004985 type = XML_REGEXP_NOTSPACE;
4986 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004987 case 'i':
Daniel Veillard4255d502002-04-16 15:50:10 +00004988 type = XML_REGEXP_INITNAME;
4989 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004990 case 'I':
Daniel Veillard4255d502002-04-16 15:50:10 +00004991 type = XML_REGEXP_NOTINITNAME;
4992 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004993 case 'c':
Daniel Veillard4255d502002-04-16 15:50:10 +00004994 type = XML_REGEXP_NAMECHAR;
4995 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004996 case 'C':
Daniel Veillard4255d502002-04-16 15:50:10 +00004997 type = XML_REGEXP_NOTNAMECHAR;
4998 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08004999 case 'd':
Daniel Veillard4255d502002-04-16 15:50:10 +00005000 type = XML_REGEXP_DECIMAL;
5001 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005002 case 'D':
Daniel Veillard4255d502002-04-16 15:50:10 +00005003 type = XML_REGEXP_NOTDECIMAL;
5004 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005005 case 'w':
Daniel Veillard4255d502002-04-16 15:50:10 +00005006 type = XML_REGEXP_REALCHAR;
5007 break;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005008 case 'W':
Daniel Veillard4255d502002-04-16 15:50:10 +00005009 type = XML_REGEXP_NOTREALCHAR;
5010 break;
5011 }
5012 NEXT;
5013 if (ctxt->atom == NULL) {
5014 ctxt->atom = xmlRegNewAtom(ctxt, type);
5015 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5016 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5017 type, 0, 0, NULL);
5018 }
Daniel Veillardcb4284e2007-04-25 13:55:20 +00005019 } else {
5020 ERROR("Wrong escape sequence, misuse of character '\\'");
Daniel Veillard4255d502002-04-16 15:50:10 +00005021 }
5022}
5023
5024/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005025 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00005026 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005027 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005028 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
Daniel Veillard4255d502002-04-16 15:50:10 +00005029 * [18] seRange ::= charOrEsc '-' charOrEsc
5030 * [20] charOrEsc ::= XmlChar | SingleCharEsc
5031 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
5032 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
5033 */
5034static void
5035xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
William M. Brackdc99df92003-12-27 01:54:25 +00005036 int cur, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00005037 int start = -1;
5038 int end = -1;
5039
Daniel Veillard777737e2006-10-17 21:23:17 +00005040 if (CUR == '\0') {
5041 ERROR("Expecting ']'");
5042 return;
5043 }
5044
Daniel Veillard4255d502002-04-16 15:50:10 +00005045 cur = CUR;
5046 if (cur == '\\') {
5047 NEXT;
5048 cur = CUR;
5049 switch (cur) {
5050 case 'n': start = 0xA; break;
5051 case 'r': start = 0xD; break;
5052 case 't': start = 0x9; break;
5053 case '\\': case '|': case '.': case '-': case '^': case '?':
5054 case '*': case '+': case '{': case '}': case '(': case ')':
5055 case '[': case ']':
5056 start = cur; break;
5057 default:
5058 ERROR("Invalid escape value");
5059 return;
5060 }
5061 end = start;
William M. Brackdc99df92003-12-27 01:54:25 +00005062 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00005063 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00005064 end = start = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005065 } else {
5066 ERROR("Expecting a char range");
5067 return;
5068 }
William M. Bracka9cbf282007-03-21 13:16:33 +00005069 /*
5070 * Since we are "inside" a range, we can assume ctxt->cur is past
5071 * the start of ctxt->string, and PREV should be safe
5072 */
5073 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5074 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005075 return;
5076 }
William M. Bracka9cbf282007-03-21 13:16:33 +00005077 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005078 cur = CUR;
William M. Brack10f1ef42004-03-20 14:51:25 +00005079 if ((cur != '-') || (NXT(1) == ']')) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005080 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5081 XML_REGEXP_CHARVAL, start, end, NULL);
5082 return;
5083 }
5084 NEXT;
5085 cur = CUR;
5086 if (cur == '\\') {
5087 NEXT;
5088 cur = CUR;
5089 switch (cur) {
5090 case 'n': end = 0xA; break;
5091 case 'r': end = 0xD; break;
5092 case 't': end = 0x9; break;
5093 case '\\': case '|': case '.': case '-': case '^': case '?':
5094 case '*': case '+': case '{': case '}': case '(': case ')':
5095 case '[': case ']':
5096 end = cur; break;
5097 default:
5098 ERROR("Invalid escape value");
5099 return;
5100 }
William M. Brackdc99df92003-12-27 01:54:25 +00005101 len = 1;
David Kilzerfb56f802017-07-04 18:38:03 +02005102 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00005103 end = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005104 } else {
5105 ERROR("Expecting the end of a char range");
5106 return;
5107 }
Pranjal Jumdecbb27162016-03-07 06:34:26 -08005108
Daniel Veillard4255d502002-04-16 15:50:10 +00005109 /* TODO check that the values are acceptable character ranges for XML */
5110 if (end < start) {
5111 ERROR("End of range is before start of range");
5112 } else {
Pranjal Jumdecbb27162016-03-07 06:34:26 -08005113 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00005114 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5115 XML_REGEXP_CHARVAL, start, end, NULL);
5116 }
5117 return;
5118}
5119
5120/**
5121 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00005122 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005123 *
5124 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5125 */
5126static void
5127xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5128 do {
Daniel Veillard041b6872008-02-08 10:37:18 +00005129 if (CUR == '\\') {
Daniel Veillard4255d502002-04-16 15:50:10 +00005130 xmlFAParseCharClassEsc(ctxt);
5131 } else {
5132 xmlFAParseCharRange(ctxt);
5133 }
5134 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
Daniel Veillard777737e2006-10-17 21:23:17 +00005135 (CUR != 0) && (ctxt->error == 0));
Daniel Veillard4255d502002-04-16 15:50:10 +00005136}
5137
5138/**
5139 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00005140 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005141 *
5142 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5143 * [15] negCharGroup ::= '^' posCharGroup
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005144 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
Daniel Veillard4255d502002-04-16 15:50:10 +00005145 * [12] charClassExpr ::= '[' charGroup ']'
5146 */
5147static void
5148xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5149 int n = ctxt->neg;
5150 while ((CUR != ']') && (ctxt->error == 0)) {
5151 if (CUR == '^') {
5152 int neg = ctxt->neg;
5153
5154 NEXT;
5155 ctxt->neg = !ctxt->neg;
5156 xmlFAParsePosCharGroup(ctxt);
5157 ctxt->neg = neg;
William M. Brack10f1ef42004-03-20 14:51:25 +00005158 } else if ((CUR == '-') && (NXT(1) == '[')) {
Daniel Veillardf8b9de32003-11-24 14:27:26 +00005159 int neg = ctxt->neg;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00005160 ctxt->neg = 2;
William M. Brack10f1ef42004-03-20 14:51:25 +00005161 NEXT; /* eat the '-' */
5162 NEXT; /* eat the '[' */
Daniel Veillard4255d502002-04-16 15:50:10 +00005163 xmlFAParseCharGroup(ctxt);
5164 if (CUR == ']') {
5165 NEXT;
5166 } else {
5167 ERROR("charClassExpr: ']' expected");
5168 break;
5169 }
Daniel Veillardf8b9de32003-11-24 14:27:26 +00005170 ctxt->neg = neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00005171 break;
5172 } else if (CUR != ']') {
5173 xmlFAParsePosCharGroup(ctxt);
5174 }
5175 }
5176 ctxt->neg = n;
5177}
5178
5179/**
5180 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00005181 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005182 *
5183 * [11] charClass ::= charClassEsc | charClassExpr
5184 * [12] charClassExpr ::= '[' charGroup ']'
5185 */
5186static void
5187xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5188 if (CUR == '[') {
5189 NEXT;
5190 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5191 if (ctxt->atom == NULL)
5192 return;
5193 xmlFAParseCharGroup(ctxt);
5194 if (CUR == ']') {
5195 NEXT;
5196 } else {
5197 ERROR("xmlFAParseCharClass: ']' expected");
5198 }
5199 } else {
5200 xmlFAParseCharClassEsc(ctxt);
5201 }
5202}
5203
5204/**
5205 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00005206 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005207 *
5208 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005209 *
5210 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00005211 */
5212static int
5213xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5214 int ret = 0;
5215 int ok = 0;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005216 int overflow = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005217
5218 while ((CUR >= '0') && (CUR <= '9')) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005219 if (ret > INT_MAX / 10) {
5220 overflow = 1;
5221 } else {
5222 int digit = CUR - '0';
5223
5224 ret *= 10;
5225 if (ret > INT_MAX - digit)
5226 overflow = 1;
5227 else
5228 ret += digit;
5229 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005230 ok = 1;
5231 NEXT;
5232 }
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005233 if ((ok != 1) || (overflow == 1)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005234 return(-1);
5235 }
5236 return(ret);
5237}
5238
5239/**
5240 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00005241 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005242 *
5243 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5244 * [5] quantity ::= quantRange | quantMin | QuantExact
5245 * [6] quantRange ::= QuantExact ',' QuantExact
5246 * [7] quantMin ::= QuantExact ','
5247 * [8] QuantExact ::= [0-9]+
5248 */
5249static int
5250xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5251 int cur;
5252
5253 cur = CUR;
5254 if ((cur == '?') || (cur == '*') || (cur == '+')) {
5255 if (ctxt->atom != NULL) {
5256 if (cur == '?')
5257 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5258 else if (cur == '*')
5259 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5260 else if (cur == '+')
5261 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5262 }
5263 NEXT;
5264 return(1);
5265 }
5266 if (cur == '{') {
5267 int min = 0, max = 0;
5268
5269 NEXT;
5270 cur = xmlFAParseQuantExact(ctxt);
5271 if (cur >= 0)
5272 min = cur;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005273 else {
5274 ERROR("Improper quantifier");
5275 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005276 if (CUR == ',') {
5277 NEXT;
Daniel Veillardebe48c62003-12-03 12:12:27 +00005278 if (CUR == '}')
5279 max = INT_MAX;
5280 else {
5281 cur = xmlFAParseQuantExact(ctxt);
5282 if (cur >= 0)
5283 max = cur;
5284 else {
5285 ERROR("Improper quantifier");
5286 }
5287 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005288 }
5289 if (CUR == '}') {
5290 NEXT;
5291 } else {
5292 ERROR("Unterminated quantifier");
5293 }
5294 if (max == 0)
5295 max = min;
5296 if (ctxt->atom != NULL) {
5297 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5298 ctxt->atom->min = min;
5299 ctxt->atom->max = max;
5300 }
5301 return(1);
5302 }
5303 return(0);
5304}
5305
5306/**
5307 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00005308 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005309 *
5310 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5311 */
5312static int
5313xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5314 int codepoint, len;
5315
5316 codepoint = xmlFAIsChar(ctxt);
5317 if (codepoint > 0) {
5318 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5319 if (ctxt->atom == NULL)
5320 return(-1);
5321 codepoint = CUR_SCHAR(ctxt->cur, len);
5322 ctxt->atom->codepoint = codepoint;
5323 NEXTL(len);
5324 return(1);
5325 } else if (CUR == '|') {
5326 return(0);
5327 } else if (CUR == 0) {
5328 return(0);
5329 } else if (CUR == ')') {
5330 return(0);
5331 } else if (CUR == '(') {
Daniel Veillard76d59b62007-08-22 16:29:21 +00005332 xmlRegStatePtr start, oldend, start0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005333
5334 NEXT;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005335 if (ctxt->depth >= 50) {
5336 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5337 return(-1);
5338 }
Daniel Veillard76d59b62007-08-22 16:29:21 +00005339 /*
5340 * this extra Epsilon transition is needed if we count with 0 allowed
5341 * unfortunately this can't be known at that point
5342 */
5343 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5344 start0 = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00005345 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5346 start = ctxt->state;
5347 oldend = ctxt->end;
5348 ctxt->end = NULL;
5349 ctxt->atom = NULL;
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005350 ctxt->depth++;
Daniel Veillard4255d502002-04-16 15:50:10 +00005351 xmlFAParseRegExp(ctxt, 0);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005352 ctxt->depth--;
Daniel Veillard4255d502002-04-16 15:50:10 +00005353 if (CUR == ')') {
5354 NEXT;
5355 } else {
5356 ERROR("xmlFAParseAtom: expecting ')'");
5357 }
5358 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5359 if (ctxt->atom == NULL)
5360 return(-1);
5361 ctxt->atom->start = start;
Daniel Veillard76d59b62007-08-22 16:29:21 +00005362 ctxt->atom->start0 = start0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005363 ctxt->atom->stop = ctxt->state;
5364 ctxt->end = oldend;
5365 return(1);
5366 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5367 xmlFAParseCharClass(ctxt);
5368 return(1);
5369 }
5370 return(0);
5371}
5372
5373/**
5374 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00005375 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00005376 *
5377 * [3] piece ::= atom quantifier?
5378 */
5379static int
5380xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5381 int ret;
5382
5383 ctxt->atom = NULL;
5384 ret = xmlFAParseAtom(ctxt);
5385 if (ret == 0)
5386 return(0);
5387 if (ctxt->atom == NULL) {
5388 ERROR("internal: no atom generated");
5389 }
5390 xmlFAParseQuantifier(ctxt);
5391 return(1);
5392}
5393
5394/**
5395 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00005396 * @ctxt: a regexp parser context
Daniel Veillard54eb0242006-03-21 23:17:57 +00005397 * @to: optional target to the end of the branch
5398 *
5399 * @to is used to optimize by removing duplicate path in automata
5400 * in expressions like (a|b)(c|d)
Daniel Veillard4255d502002-04-16 15:50:10 +00005401 *
5402 * [2] branch ::= piece*
5403 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005404static int
Daniel Veillard54eb0242006-03-21 23:17:57 +00005405xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
Daniel Veillard4255d502002-04-16 15:50:10 +00005406 xmlRegStatePtr previous;
Daniel Veillard4255d502002-04-16 15:50:10 +00005407 int ret;
5408
5409 previous = ctxt->state;
5410 ret = xmlFAParsePiece(ctxt);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005411 if (ret == 0) {
5412 /* Empty branch */
5413 xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5414 } else {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005415 if (xmlFAGenerateTransitions(ctxt, previous,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005416 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005417 return(-1);
5418 previous = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00005419 ctxt->atom = NULL;
5420 }
5421 while ((ret != 0) && (ctxt->error == 0)) {
5422 ret = xmlFAParsePiece(ctxt);
5423 if (ret != 0) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005424 if (xmlFAGenerateTransitions(ctxt, previous,
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005425 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5426 ctxt->atom) < 0)
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005427 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00005428 previous = ctxt->state;
5429 ctxt->atom = NULL;
5430 }
5431 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005432 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00005433}
5434
5435/**
5436 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00005437 * @ctxt: a regexp parser context
William M. Brackddf71d62004-05-06 04:17:26 +00005438 * @top: is this the top-level expression ?
Daniel Veillard4255d502002-04-16 15:50:10 +00005439 *
5440 * [1] regExp ::= branch ( '|' branch )*
5441 */
5442static void
5443xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
Daniel Veillardc7e3cc42004-09-28 12:33:52 +00005444 xmlRegStatePtr start, end;
Daniel Veillard4255d502002-04-16 15:50:10 +00005445
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005446 /* if not top start should have been generated by an epsilon trans */
Daniel Veillard4255d502002-04-16 15:50:10 +00005447 start = ctxt->state;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005448 ctxt->end = NULL;
Daniel Veillard54eb0242006-03-21 23:17:57 +00005449 xmlFAParseBranch(ctxt, NULL);
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005450 if (top) {
5451#ifdef DEBUG_REGEXP_GRAPH
5452 printf("State %d is final\n", ctxt->state->no);
5453#endif
5454 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5455 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005456 if (CUR != '|') {
5457 ctxt->end = ctxt->state;
5458 return;
5459 }
5460 end = ctxt->state;
5461 while ((CUR == '|') && (ctxt->error == 0)) {
5462 NEXT;
5463 ctxt->state = start;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005464 ctxt->end = NULL;
Daniel Veillard54eb0242006-03-21 23:17:57 +00005465 xmlFAParseBranch(ctxt, end);
Daniel Veillard4255d502002-04-16 15:50:10 +00005466 }
Daniel Veillard2cbf5962004-03-31 15:50:43 +00005467 if (!top) {
5468 ctxt->state = end;
5469 ctxt->end = end;
5470 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005471}
5472
5473/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005474 * *
5475 * The basic API *
5476 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00005477 ************************************************************************/
5478
5479/**
5480 * xmlRegexpPrint:
5481 * @output: the file for the output debug
5482 * @regexp: the compiled regexp
5483 *
5484 * Print the content of the compiled regular expression
5485 */
5486void
5487xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5488 int i;
5489
Daniel Veillarda82b1822004-11-08 16:24:57 +00005490 if (output == NULL)
5491 return;
Daniel Veillard4255d502002-04-16 15:50:10 +00005492 fprintf(output, " regexp: ");
5493 if (regexp == NULL) {
5494 fprintf(output, "NULL\n");
5495 return;
5496 }
5497 fprintf(output, "'%s' ", regexp->string);
5498 fprintf(output, "\n");
5499 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5500 for (i = 0;i < regexp->nbAtoms; i++) {
5501 fprintf(output, " %02d ", i);
5502 xmlRegPrintAtom(output, regexp->atoms[i]);
5503 }
5504 fprintf(output, "%d states:", regexp->nbStates);
5505 fprintf(output, "\n");
5506 for (i = 0;i < regexp->nbStates; i++) {
5507 xmlRegPrintState(output, regexp->states[i]);
5508 }
5509 fprintf(output, "%d counters:\n", regexp->nbCounters);
5510 for (i = 0;i < regexp->nbCounters; i++) {
5511 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5512 regexp->counters[i].max);
5513 }
5514}
5515
5516/**
5517 * xmlRegexpCompile:
5518 * @regexp: a regular expression string
5519 *
5520 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
William M. Brackddf71d62004-05-06 04:17:26 +00005521 * Appendix F and builds an automata suitable for testing strings against
Daniel Veillard4255d502002-04-16 15:50:10 +00005522 * that regular expression
5523 *
5524 * Returns the compiled expression or NULL in case of error
5525 */
5526xmlRegexpPtr
5527xmlRegexpCompile(const xmlChar *regexp) {
5528 xmlRegexpPtr ret;
5529 xmlRegParserCtxtPtr ctxt;
5530
5531 ctxt = xmlRegNewParserCtxt(regexp);
5532 if (ctxt == NULL)
5533 return(NULL);
5534
5535 /* initialize the parser */
5536 ctxt->end = NULL;
5537 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5538 xmlRegStatePush(ctxt, ctxt->start);
5539
5540 /* parse the expression building an automata */
5541 xmlFAParseRegExp(ctxt, 1);
5542 if (CUR != 0) {
5543 ERROR("xmlFAParseRegExp: extra characters");
5544 }
Daniel Veillardcb4284e2007-04-25 13:55:20 +00005545 if (ctxt->error != 0) {
5546 xmlRegFreeParserCtxt(ctxt);
5547 return(NULL);
5548 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005549 ctxt->end = ctxt->state;
5550 ctxt->start->type = XML_REGEXP_START_STATE;
5551 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5552
5553 /* remove the Epsilon except for counted transitions */
5554 xmlFAEliminateEpsilonTransitions(ctxt);
5555
5556
5557 if (ctxt->error != 0) {
5558 xmlRegFreeParserCtxt(ctxt);
5559 return(NULL);
5560 }
5561 ret = xmlRegEpxFromParse(ctxt);
5562 xmlRegFreeParserCtxt(ctxt);
5563 return(ret);
5564}
5565
5566/**
5567 * xmlRegexpExec:
5568 * @comp: the compiled regular expression
5569 * @content: the value to check against the regular expression
5570 *
William M. Brackddf71d62004-05-06 04:17:26 +00005571 * Check if the regular expression generates the value
Daniel Veillard4255d502002-04-16 15:50:10 +00005572 *
William M. Brackddf71d62004-05-06 04:17:26 +00005573 * Returns 1 if it matches, 0 if not and a negative value in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00005574 */
5575int
5576xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5577 if ((comp == NULL) || (content == NULL))
5578 return(-1);
5579 return(xmlFARegExec(comp, content));
5580}
5581
5582/**
Daniel Veillard23e73572002-09-19 19:56:43 +00005583 * xmlRegexpIsDeterminist:
5584 * @comp: the compiled regular expression
5585 *
5586 * Check if the regular expression is determinist
5587 *
William M. Brackddf71d62004-05-06 04:17:26 +00005588 * Returns 1 if it yes, 0 if not and a negative value in case of error
Daniel Veillard23e73572002-09-19 19:56:43 +00005589 */
5590int
5591xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5592 xmlAutomataPtr am;
5593 int ret;
5594
5595 if (comp == NULL)
5596 return(-1);
5597 if (comp->determinist != -1)
5598 return(comp->determinist);
5599
5600 am = xmlNewAutomata();
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005601 if (am == NULL)
5602 return(-1);
Daniel Veillardbd9afb52002-09-25 22:25:35 +00005603 if (am->states != NULL) {
5604 int i;
5605
5606 for (i = 0;i < am->nbStates;i++)
5607 xmlRegFreeState(am->states[i]);
5608 xmlFree(am->states);
5609 }
Daniel Veillard23e73572002-09-19 19:56:43 +00005610 am->nbAtoms = comp->nbAtoms;
5611 am->atoms = comp->atoms;
5612 am->nbStates = comp->nbStates;
5613 am->states = comp->states;
5614 am->determinist = -1;
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005615 am->flags = comp->flags;
Daniel Veillard23e73572002-09-19 19:56:43 +00005616 ret = xmlFAComputesDeterminism(am);
5617 am->atoms = NULL;
5618 am->states = NULL;
5619 xmlFreeAutomata(am);
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005620 comp->determinist = ret;
Daniel Veillard23e73572002-09-19 19:56:43 +00005621 return(ret);
5622}
5623
5624/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005625 * xmlRegFreeRegexp:
5626 * @regexp: the regexp
5627 *
5628 * Free a regexp
5629 */
5630void
5631xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5632 int i;
5633 if (regexp == NULL)
5634 return;
5635
5636 if (regexp->string != NULL)
5637 xmlFree(regexp->string);
5638 if (regexp->states != NULL) {
5639 for (i = 0;i < regexp->nbStates;i++)
5640 xmlRegFreeState(regexp->states[i]);
5641 xmlFree(regexp->states);
5642 }
5643 if (regexp->atoms != NULL) {
5644 for (i = 0;i < regexp->nbAtoms;i++)
5645 xmlRegFreeAtom(regexp->atoms[i]);
5646 xmlFree(regexp->atoms);
5647 }
5648 if (regexp->counters != NULL)
5649 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00005650 if (regexp->compact != NULL)
5651 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00005652 if (regexp->transdata != NULL)
5653 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00005654 if (regexp->stringMap != NULL) {
5655 for (i = 0; i < regexp->nbstrings;i++)
5656 xmlFree(regexp->stringMap[i]);
5657 xmlFree(regexp->stringMap);
5658 }
5659
Daniel Veillard4255d502002-04-16 15:50:10 +00005660 xmlFree(regexp);
5661}
5662
5663#ifdef LIBXML_AUTOMATA_ENABLED
5664/************************************************************************
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005665 * *
5666 * The Automata interface *
5667 * *
Daniel Veillard4255d502002-04-16 15:50:10 +00005668 ************************************************************************/
5669
5670/**
5671 * xmlNewAutomata:
5672 *
5673 * Create a new automata
5674 *
5675 * Returns the new object or NULL in case of failure
5676 */
5677xmlAutomataPtr
5678xmlNewAutomata(void) {
5679 xmlAutomataPtr ctxt;
5680
5681 ctxt = xmlRegNewParserCtxt(NULL);
5682 if (ctxt == NULL)
5683 return(NULL);
5684
5685 /* initialize the parser */
5686 ctxt->end = NULL;
5687 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005688 if (ctxt->start == NULL) {
5689 xmlFreeAutomata(ctxt);
5690 return(NULL);
5691 }
Daniel Veillardd0271472006-01-02 10:22:02 +00005692 ctxt->start->type = XML_REGEXP_START_STATE;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005693 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5694 xmlRegFreeState(ctxt->start);
5695 xmlFreeAutomata(ctxt);
5696 return(NULL);
5697 }
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005698 ctxt->flags = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00005699
5700 return(ctxt);
5701}
5702
5703/**
5704 * xmlFreeAutomata:
5705 * @am: an automata
5706 *
5707 * Free an automata
5708 */
5709void
5710xmlFreeAutomata(xmlAutomataPtr am) {
5711 if (am == NULL)
5712 return;
5713 xmlRegFreeParserCtxt(am);
5714}
5715
5716/**
Daniel Veillard29341682009-09-10 18:23:39 +02005717 * xmlAutomataSetFlags:
Daniel Veillard1ba2aca2009-08-31 16:47:39 +02005718 * @am: an automata
5719 * @flags: a set of internal flags
5720 *
5721 * Set some flags on the automata
5722 */
5723void
5724xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5725 if (am == NULL)
5726 return;
5727 am->flags |= flags;
5728}
5729
5730/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005731 * xmlAutomataGetInitState:
5732 * @am: an automata
5733 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005734 * Initial state lookup
5735 *
Daniel Veillard4255d502002-04-16 15:50:10 +00005736 * Returns the initial state of the automata
5737 */
5738xmlAutomataStatePtr
5739xmlAutomataGetInitState(xmlAutomataPtr am) {
5740 if (am == NULL)
5741 return(NULL);
5742 return(am->start);
5743}
5744
5745/**
5746 * xmlAutomataSetFinalState:
5747 * @am: an automata
5748 * @state: a state in this automata
5749 *
5750 * Makes that state a final state
5751 *
5752 * Returns 0 or -1 in case of error
5753 */
5754int
5755xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5756 if ((am == NULL) || (state == NULL))
5757 return(-1);
5758 state->type = XML_REGEXP_FINAL_STATE;
5759 return(0);
5760}
5761
5762/**
5763 * xmlAutomataNewTransition:
5764 * @am: an automata
5765 * @from: the starting point of the transition
5766 * @to: the target point of the transition or NULL
5767 * @token: the input string associated to that transition
5768 * @data: data passed to the callback function if the transition is activated
5769 *
William M. Brackddf71d62004-05-06 04:17:26 +00005770 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00005771 * and then adds a transition from the @from state to the target state
5772 * activated by the value of @token
5773 *
5774 * Returns the target state or NULL in case of error
5775 */
5776xmlAutomataStatePtr
5777xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5778 xmlAutomataStatePtr to, const xmlChar *token,
5779 void *data) {
5780 xmlRegAtomPtr atom;
5781
5782 if ((am == NULL) || (from == NULL) || (token == NULL))
5783 return(NULL);
5784 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005785 if (atom == NULL)
5786 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00005787 atom->data = data;
Daniel Veillard4255d502002-04-16 15:50:10 +00005788 atom->valuep = xmlStrdup(token);
5789
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005790 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5791 xmlRegFreeAtom(atom);
5792 return(NULL);
5793 }
Daniel Veillard4255d502002-04-16 15:50:10 +00005794 if (to == NULL)
5795 return(am->state);
5796 return(to);
5797}
5798
5799/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00005800 * xmlAutomataNewTransition2:
5801 * @am: an automata
5802 * @from: the starting point of the transition
5803 * @to: the target point of the transition or NULL
5804 * @token: the first input string associated to that transition
5805 * @token2: the second input string associated to that transition
5806 * @data: data passed to the callback function if the transition is activated
5807 *
William M. Brackddf71d62004-05-06 04:17:26 +00005808 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard52b48c72003-04-13 19:53:42 +00005809 * and then adds a transition from the @from state to the target state
5810 * activated by the value of @token
5811 *
5812 * Returns the target state or NULL in case of error
5813 */
5814xmlAutomataStatePtr
5815xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5816 xmlAutomataStatePtr to, const xmlChar *token,
5817 const xmlChar *token2, void *data) {
5818 xmlRegAtomPtr atom;
5819
5820 if ((am == NULL) || (from == NULL) || (token == NULL))
5821 return(NULL);
5822 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005823 if (atom == NULL)
5824 return(NULL);
Daniel Veillard11ce4002006-03-10 00:36:23 +00005825 atom->data = data;
Daniel Veillard52b48c72003-04-13 19:53:42 +00005826 if ((token2 == NULL) || (*token2 == 0)) {
5827 atom->valuep = xmlStrdup(token);
5828 } else {
5829 int lenn, lenp;
5830 xmlChar *str;
5831
5832 lenn = strlen((char *) token2);
5833 lenp = strlen((char *) token);
5834
Daniel Veillard3c908dc2003-04-19 00:07:51 +00005835 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00005836 if (str == NULL) {
5837 xmlRegFreeAtom(atom);
5838 return(NULL);
5839 }
5840 memcpy(&str[0], token, lenp);
5841 str[lenp] = '|';
5842 memcpy(&str[lenp + 1], token2, lenn);
5843 str[lenn + lenp + 1] = 0;
5844
5845 atom->valuep = str;
5846 }
5847
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005848 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5849 xmlRegFreeAtom(atom);
5850 return(NULL);
5851 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00005852 if (to == NULL)
5853 return(am->state);
5854 return(to);
5855}
5856
5857/**
Daniel Veillard9efc4762005-07-19 14:33:55 +00005858 * xmlAutomataNewNegTrans:
5859 * @am: an automata
5860 * @from: the starting point of the transition
5861 * @to: the target point of the transition or NULL
5862 * @token: the first input string associated to that transition
5863 * @token2: the second input string associated to that transition
5864 * @data: data passed to the callback function if the transition is activated
5865 *
5866 * If @to is NULL, this creates first a new target state in the automata
5867 * and then adds a transition from the @from state to the target state
5868 * activated by any value except (@token,@token2)
Daniel Veillard6e65e152005-08-09 11:09:52 +00005869 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5870 # the semantic of XSD ##other
Daniel Veillard9efc4762005-07-19 14:33:55 +00005871 *
5872 * Returns the target state or NULL in case of error
5873 */
5874xmlAutomataStatePtr
5875xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5876 xmlAutomataStatePtr to, const xmlChar *token,
5877 const xmlChar *token2, void *data) {
5878 xmlRegAtomPtr atom;
Daniel Veillard77005e62005-07-19 16:26:18 +00005879 xmlChar err_msg[200];
Daniel Veillard9efc4762005-07-19 14:33:55 +00005880
5881 if ((am == NULL) || (from == NULL) || (token == NULL))
5882 return(NULL);
5883 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5884 if (atom == NULL)
5885 return(NULL);
5886 atom->data = data;
5887 atom->neg = 1;
5888 if ((token2 == NULL) || (*token2 == 0)) {
5889 atom->valuep = xmlStrdup(token);
5890 } else {
5891 int lenn, lenp;
5892 xmlChar *str;
5893
5894 lenn = strlen((char *) token2);
5895 lenp = strlen((char *) token);
5896
5897 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5898 if (str == NULL) {
5899 xmlRegFreeAtom(atom);
5900 return(NULL);
5901 }
5902 memcpy(&str[0], token, lenp);
5903 str[lenp] = '|';
5904 memcpy(&str[lenp + 1], token2, lenn);
5905 str[lenn + lenp + 1] = 0;
5906
5907 atom->valuep = str;
5908 }
Daniel Veillarddb68b742005-07-30 13:18:24 +00005909 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
Daniel Veillard77005e62005-07-19 16:26:18 +00005910 err_msg[199] = 0;
5911 atom->valuep2 = xmlStrdup(err_msg);
Daniel Veillard9efc4762005-07-19 14:33:55 +00005912
5913 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5914 xmlRegFreeAtom(atom);
5915 return(NULL);
5916 }
Daniel Veillard6e65e152005-08-09 11:09:52 +00005917 am->negs++;
Daniel Veillard9efc4762005-07-19 14:33:55 +00005918 if (to == NULL)
5919 return(am->state);
5920 return(to);
5921}
5922
5923/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005924 * xmlAutomataNewCountTrans2:
5925 * @am: an automata
5926 * @from: the starting point of the transition
5927 * @to: the target point of the transition or NULL
5928 * @token: the input string associated to that transition
5929 * @token2: the second input string associated to that transition
Haibo Huangcfd91dc2020-07-30 23:01:33 -07005930 * @min: the minimum successive occurrences of token
5931 * @max: the maximum successive occurrences of token
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005932 * @data: data associated to the transition
5933 *
5934 * If @to is NULL, this creates first a new target state in the automata
5935 * and then adds a transition from the @from state to the target state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08005936 * activated by a succession of input of value @token and @token2 and
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005937 * whose number is between @min and @max
5938 *
5939 * Returns the target state or NULL in case of error
5940 */
5941xmlAutomataStatePtr
5942xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5943 xmlAutomataStatePtr to, const xmlChar *token,
5944 const xmlChar *token2,
5945 int min, int max, void *data) {
5946 xmlRegAtomPtr atom;
5947 int counter;
5948
5949 if ((am == NULL) || (from == NULL) || (token == NULL))
5950 return(NULL);
5951 if (min < 0)
5952 return(NULL);
5953 if ((max < min) || (max < 1))
5954 return(NULL);
5955 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5956 if (atom == NULL)
5957 return(NULL);
5958 if ((token2 == NULL) || (*token2 == 0)) {
5959 atom->valuep = xmlStrdup(token);
5960 } else {
5961 int lenn, lenp;
5962 xmlChar *str;
5963
5964 lenn = strlen((char *) token2);
5965 lenp = strlen((char *) token);
5966
5967 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5968 if (str == NULL) {
5969 xmlRegFreeAtom(atom);
5970 return(NULL);
5971 }
5972 memcpy(&str[0], token, lenp);
5973 str[lenp] = '|';
5974 memcpy(&str[lenp + 1], token2, lenn);
5975 str[lenn + lenp + 1] = 0;
5976
5977 atom->valuep = str;
5978 }
5979 atom->data = data;
5980 if (min == 0)
5981 atom->min = 1;
5982 else
5983 atom->min = min;
5984 atom->max = max;
5985
5986 /*
5987 * associate a counter to the transition.
5988 */
5989 counter = xmlRegGetCounter(am);
5990 am->counters[counter].min = min;
5991 am->counters[counter].max = max;
5992
5993 /* xmlFAGenerateTransitions(am, from, to, atom); */
5994 if (to == NULL) {
5995 to = xmlRegNewState(am);
5996 xmlRegStatePush(am, to);
5997 }
Daniel Veillard5de09382005-09-26 17:18:17 +00005998 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00005999 xmlRegAtomPush(am, atom);
6000 am->state = to;
6001
6002 if (to == NULL)
6003 to = am->state;
6004 if (to == NULL)
6005 return(NULL);
6006 if (min == 0)
6007 xmlFAGenerateEpsilonTransition(am, from, to);
6008 return(to);
6009}
6010
6011/**
Daniel Veillard4255d502002-04-16 15:50:10 +00006012 * xmlAutomataNewCountTrans:
6013 * @am: an automata
6014 * @from: the starting point of the transition
6015 * @to: the target point of the transition or NULL
6016 * @token: the input string associated to that transition
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006017 * @min: the minimum successive occurrences of token
6018 * @max: the maximum successive occurrences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006019 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00006020 *
William M. Brackddf71d62004-05-06 04:17:26 +00006021 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00006022 * and then adds a transition from the @from state to the target state
6023 * activated by a succession of input of value @token and whose number
6024 * is between @min and @max
6025 *
6026 * Returns the target state or NULL in case of error
6027 */
6028xmlAutomataStatePtr
6029xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6030 xmlAutomataStatePtr to, const xmlChar *token,
6031 int min, int max, void *data) {
6032 xmlRegAtomPtr atom;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006033 int counter;
Daniel Veillard4255d502002-04-16 15:50:10 +00006034
6035 if ((am == NULL) || (from == NULL) || (token == NULL))
6036 return(NULL);
6037 if (min < 0)
6038 return(NULL);
6039 if ((max < min) || (max < 1))
6040 return(NULL);
6041 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6042 if (atom == NULL)
6043 return(NULL);
6044 atom->valuep = xmlStrdup(token);
6045 atom->data = data;
6046 if (min == 0)
6047 atom->min = 1;
6048 else
6049 atom->min = min;
6050 atom->max = max;
6051
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006052 /*
6053 * associate a counter to the transition.
6054 */
6055 counter = xmlRegGetCounter(am);
6056 am->counters[counter].min = min;
6057 am->counters[counter].max = max;
6058
6059 /* xmlFAGenerateTransitions(am, from, to, atom); */
6060 if (to == NULL) {
6061 to = xmlRegNewState(am);
6062 xmlRegStatePush(am, to);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00006063 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006064 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00006065 xmlRegAtomPush(am, atom);
6066 am->state = to;
6067
Daniel Veillard4255d502002-04-16 15:50:10 +00006068 if (to == NULL)
6069 to = am->state;
6070 if (to == NULL)
6071 return(NULL);
6072 if (min == 0)
6073 xmlFAGenerateEpsilonTransition(am, from, to);
6074 return(to);
6075}
6076
6077/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006078 * xmlAutomataNewOnceTrans2:
6079 * @am: an automata
6080 * @from: the starting point of the transition
6081 * @to: the target point of the transition or NULL
6082 * @token: the input string associated to that transition
6083 * @token2: the second input string associated to that transition
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006084 * @min: the minimum successive occurrences of token
6085 * @max: the maximum successive occurrences of token
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006086 * @data: data associated to the transition
6087 *
6088 * If @to is NULL, this creates first a new target state in the automata
6089 * and then adds a transition from the @from state to the target state
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006090 * activated by a succession of input of value @token and @token2 and whose
6091 * number is between @min and @max, moreover that transition can only be
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006092 * crossed once.
6093 *
6094 * Returns the target state or NULL in case of error
6095 */
6096xmlAutomataStatePtr
6097xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6098 xmlAutomataStatePtr to, const xmlChar *token,
6099 const xmlChar *token2,
6100 int min, int max, void *data) {
6101 xmlRegAtomPtr atom;
6102 int counter;
6103
6104 if ((am == NULL) || (from == NULL) || (token == NULL))
6105 return(NULL);
6106 if (min < 1)
6107 return(NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006108 if (max < min)
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006109 return(NULL);
6110 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6111 if (atom == NULL)
6112 return(NULL);
6113 if ((token2 == NULL) || (*token2 == 0)) {
6114 atom->valuep = xmlStrdup(token);
6115 } else {
6116 int lenn, lenp;
6117 xmlChar *str;
6118
6119 lenn = strlen((char *) token2);
6120 lenp = strlen((char *) token);
6121
6122 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6123 if (str == NULL) {
6124 xmlRegFreeAtom(atom);
6125 return(NULL);
6126 }
6127 memcpy(&str[0], token, lenp);
6128 str[lenp] = '|';
6129 memcpy(&str[lenp + 1], token2, lenn);
6130 str[lenn + lenp + 1] = 0;
6131
6132 atom->valuep = str;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006133 }
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006134 atom->data = data;
6135 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
Daniel Veillard11ce4002006-03-10 00:36:23 +00006136 atom->min = min;
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006137 atom->max = max;
6138 /*
6139 * associate a counter to the transition.
6140 */
6141 counter = xmlRegGetCounter(am);
6142 am->counters[counter].min = 1;
6143 am->counters[counter].max = 1;
6144
6145 /* xmlFAGenerateTransitions(am, from, to, atom); */
6146 if (to == NULL) {
6147 to = xmlRegNewState(am);
6148 xmlRegStatePush(am, to);
6149 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006150 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006151 xmlRegAtomPush(am, atom);
6152 am->state = to;
6153 return(to);
6154}
6155
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006156
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00006157
6158/**
Daniel Veillard7646b182002-04-20 06:41:40 +00006159 * xmlAutomataNewOnceTrans:
6160 * @am: an automata
6161 * @from: the starting point of the transition
6162 * @to: the target point of the transition or NULL
6163 * @token: the input string associated to that transition
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006164 * @min: the minimum successive occurrences of token
6165 * @max: the maximum successive occurrences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006166 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00006167 *
William M. Brackddf71d62004-05-06 04:17:26 +00006168 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00006169 * and then adds a transition from the @from state to the target state
6170 * activated by a succession of input of value @token and whose number
William M. Brackddf71d62004-05-06 04:17:26 +00006171 * is between @min and @max, moreover that transition can only be crossed
Daniel Veillard7646b182002-04-20 06:41:40 +00006172 * once.
6173 *
6174 * Returns the target state or NULL in case of error
6175 */
6176xmlAutomataStatePtr
6177xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6178 xmlAutomataStatePtr to, const xmlChar *token,
6179 int min, int max, void *data) {
6180 xmlRegAtomPtr atom;
6181 int counter;
6182
6183 if ((am == NULL) || (from == NULL) || (token == NULL))
6184 return(NULL);
6185 if (min < 1)
6186 return(NULL);
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006187 if (max < min)
Daniel Veillard7646b182002-04-20 06:41:40 +00006188 return(NULL);
6189 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6190 if (atom == NULL)
6191 return(NULL);
6192 atom->valuep = xmlStrdup(token);
6193 atom->data = data;
6194 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
Daniel Veillard11ce4002006-03-10 00:36:23 +00006195 atom->min = min;
Daniel Veillard7646b182002-04-20 06:41:40 +00006196 atom->max = max;
6197 /*
6198 * associate a counter to the transition.
6199 */
6200 counter = xmlRegGetCounter(am);
6201 am->counters[counter].min = 1;
6202 am->counters[counter].max = 1;
6203
6204 /* xmlFAGenerateTransitions(am, from, to, atom); */
6205 if (to == NULL) {
6206 to = xmlRegNewState(am);
6207 xmlRegStatePush(am, to);
6208 }
Daniel Veillard5de09382005-09-26 17:18:17 +00006209 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
Daniel Veillard7646b182002-04-20 06:41:40 +00006210 xmlRegAtomPush(am, atom);
6211 am->state = to;
Daniel Veillard7646b182002-04-20 06:41:40 +00006212 return(to);
6213}
6214
6215/**
Daniel Veillard4255d502002-04-16 15:50:10 +00006216 * xmlAutomataNewState:
6217 * @am: an automata
6218 *
6219 * Create a new disconnected state in the automata
6220 *
6221 * Returns the new state or NULL in case of error
6222 */
6223xmlAutomataStatePtr
6224xmlAutomataNewState(xmlAutomataPtr am) {
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006225 xmlAutomataStatePtr to;
Daniel Veillard4255d502002-04-16 15:50:10 +00006226
6227 if (am == NULL)
6228 return(NULL);
6229 to = xmlRegNewState(am);
6230 xmlRegStatePush(am, to);
6231 return(to);
6232}
6233
6234/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006235 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00006236 * @am: an automata
6237 * @from: the starting point of the transition
6238 * @to: the target point of the transition or NULL
6239 *
William M. Brackddf71d62004-05-06 04:17:26 +00006240 * If @to is NULL, this creates first a new target state in the automata
6241 * and then adds an epsilon transition from the @from state to the
Daniel Veillard4255d502002-04-16 15:50:10 +00006242 * target state
6243 *
6244 * Returns the target state or NULL in case of error
6245 */
6246xmlAutomataStatePtr
6247xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6248 xmlAutomataStatePtr to) {
6249 if ((am == NULL) || (from == NULL))
6250 return(NULL);
6251 xmlFAGenerateEpsilonTransition(am, from, to);
6252 if (to == NULL)
6253 return(am->state);
6254 return(to);
6255}
6256
Daniel Veillardb509f152002-04-17 16:28:10 +00006257/**
Daniel Veillard7646b182002-04-20 06:41:40 +00006258 * xmlAutomataNewAllTrans:
6259 * @am: an automata
6260 * @from: the starting point of the transition
6261 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00006262 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00006263 *
William M. Brackddf71d62004-05-06 04:17:26 +00006264 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00006265 * and then adds a an ALL transition from the @from state to the
6266 * target state. That transition is an epsilon transition allowed only when
6267 * all transitions from the @from node have been activated.
6268 *
6269 * Returns the target state or NULL in case of error
6270 */
6271xmlAutomataStatePtr
6272xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00006273 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00006274 if ((am == NULL) || (from == NULL))
6275 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00006276 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00006277 if (to == NULL)
6278 return(am->state);
6279 return(to);
6280}
6281
6282/**
Daniel Veillardb509f152002-04-17 16:28:10 +00006283 * xmlAutomataNewCounter:
6284 * @am: an automata
6285 * @min: the minimal value on the counter
6286 * @max: the maximal value on the counter
6287 *
6288 * Create a new counter
6289 *
6290 * Returns the counter number or -1 in case of error
6291 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006292int
Daniel Veillardb509f152002-04-17 16:28:10 +00006293xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6294 int ret;
6295
6296 if (am == NULL)
6297 return(-1);
6298
6299 ret = xmlRegGetCounter(am);
6300 if (ret < 0)
6301 return(-1);
6302 am->counters[ret].min = min;
6303 am->counters[ret].max = max;
6304 return(ret);
6305}
6306
6307/**
6308 * xmlAutomataNewCountedTrans:
6309 * @am: an automata
6310 * @from: the starting point of the transition
6311 * @to: the target point of the transition or NULL
6312 * @counter: the counter associated to that transition
6313 *
William M. Brackddf71d62004-05-06 04:17:26 +00006314 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006315 * and then adds an epsilon transition from the @from state to the target state
6316 * which will increment the counter provided
6317 *
6318 * Returns the target state or NULL in case of error
6319 */
6320xmlAutomataStatePtr
6321xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6322 xmlAutomataStatePtr to, int counter) {
6323 if ((am == NULL) || (from == NULL) || (counter < 0))
6324 return(NULL);
6325 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6326 if (to == NULL)
6327 return(am->state);
6328 return(to);
6329}
6330
6331/**
6332 * xmlAutomataNewCounterTrans:
6333 * @am: an automata
6334 * @from: the starting point of the transition
6335 * @to: the target point of the transition or NULL
6336 * @counter: the counter associated to that transition
6337 *
William M. Brackddf71d62004-05-06 04:17:26 +00006338 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00006339 * and then adds an epsilon transition from the @from state to the target state
6340 * which will be allowed only if the counter is within the right range.
6341 *
6342 * Returns the target state or NULL in case of error
6343 */
6344xmlAutomataStatePtr
6345xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6346 xmlAutomataStatePtr to, int counter) {
6347 if ((am == NULL) || (from == NULL) || (counter < 0))
6348 return(NULL);
6349 xmlFAGenerateCountedTransition(am, from, to, counter);
6350 if (to == NULL)
6351 return(am->state);
6352 return(to);
6353}
Daniel Veillard4255d502002-04-16 15:50:10 +00006354
6355/**
6356 * xmlAutomataCompile:
6357 * @am: an automata
6358 *
6359 * Compile the automata into a Reg Exp ready for being executed.
6360 * The automata should be free after this point.
6361 *
6362 * Returns the compiled regexp or NULL in case of error
6363 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006364xmlRegexpPtr
Daniel Veillard4255d502002-04-16 15:50:10 +00006365xmlAutomataCompile(xmlAutomataPtr am) {
6366 xmlRegexpPtr ret;
6367
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00006368 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00006369 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00006370 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00006371 ret = xmlRegEpxFromParse(am);
6372
6373 return(ret);
6374}
Daniel Veillarde19fc232002-04-22 16:01:24 +00006375
6376/**
6377 * xmlAutomataIsDeterminist:
6378 * @am: an automata
6379 *
6380 * Checks if an automata is determinist.
6381 *
6382 * Returns 1 if true, 0 if not, and -1 in case of error
6383 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006384int
Daniel Veillarde19fc232002-04-22 16:01:24 +00006385xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6386 int ret;
6387
6388 if (am == NULL)
6389 return(-1);
6390
6391 ret = xmlFAComputesDeterminism(am);
6392 return(ret);
6393}
Daniel Veillard4255d502002-04-16 15:50:10 +00006394#endif /* LIBXML_AUTOMATA_ENABLED */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006395
6396#ifdef LIBXML_EXPR_ENABLED
6397/************************************************************************
6398 * *
6399 * Formal Expression handling code *
6400 * *
6401 ************************************************************************/
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006402/************************************************************************
6403 * *
6404 * Expression handling context *
6405 * *
6406 ************************************************************************/
6407
/*
 * Private state of an expression context: the dictionary used to intern
 * atom names and the hash table holding the unique expression nodes.
 */
struct _xmlExpCtxt {
    xmlDictPtr dict;        /* dictionary the atom names are looked up in */
    xmlExpNodePtr *table;   /* hash table buckets, chained through node->next */
    int size;               /* number of buckets in table */
    int nbElems;            /* incremented each time a node is inserted in table */
    int nb_nodes;           /* number of nodes currently allocated */
    int maxNodes;           /* node limit requested at context creation */
    const char *expr;       /* NOTE(review): presumably the expression being parsed - confirm */
    const char *cur;        /* NOTE(review): presumably the current parsing position - confirm */
    int nb_cons;            /* number of nodes constructed so far, never decremented */
    int tabSize;
};
6420
6421/**
6422 * xmlExpNewCtxt:
6423 * @maxNodes: the maximum number of nodes
Jan Pokornýbb654fe2016-04-13 16:56:07 +02006424 * @dict: optional dictionary to use internally
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006425 *
6426 * Creates a new context for manipulating expressions
6427 *
6428 * Returns the context or NULL in case of error
6429 */
6430xmlExpCtxtPtr
6431xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6432 xmlExpCtxtPtr ret;
6433 int size = 256;
6434
6435 if (maxNodes <= 4096)
6436 maxNodes = 4096;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006437
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006438 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6439 if (ret == NULL)
6440 return(NULL);
6441 memset(ret, 0, sizeof(xmlExpCtxt));
6442 ret->size = size;
6443 ret->nbElems = 0;
Daniel Veillard594e5df2009-09-07 14:58:47 +02006444 ret->maxNodes = maxNodes;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006445 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6446 if (ret->table == NULL) {
6447 xmlFree(ret);
6448 return(NULL);
6449 }
6450 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6451 if (dict == NULL) {
6452 ret->dict = xmlDictCreate();
6453 if (ret->dict == NULL) {
6454 xmlFree(ret->table);
6455 xmlFree(ret);
6456 return(NULL);
6457 }
6458 } else {
6459 ret->dict = dict;
6460 xmlDictReference(ret->dict);
6461 }
6462 return(ret);
6463}
6464
6465/**
6466 * xmlExpFreeCtxt:
6467 * @ctxt: an expression context
6468 *
6469 * Free an expression context
6470 */
6471void
6472xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6473 if (ctxt == NULL)
6474 return;
6475 xmlDictFree(ctxt->dict);
6476 if (ctxt->table != NULL)
6477 xmlFree(ctxt->table);
6478 xmlFree(ctxt);
6479}
6480
6481/************************************************************************
6482 * *
6483 * Structure associated to an expression node *
6484 * *
6485 ************************************************************************/
/* upper bound on live nodes per context, checked in xmlExpNewNode() */
#define MAX_NODES 10000
6487
6488/* #define DEBUG_DERIV */
6489
6490/*
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006491 * TODO:
Daniel Veillard465a0002005-08-22 12:07:04 +00006492 * - Wildcards
6493 * - public API for creation
6494 *
6495 * Started
6496 * - regression testing
6497 *
6498 * Done
6499 * - split into module and test tool
6500 * - memleaks
6501 */
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006502
/* Flag bits stored in the info field of an expression node */
typedef enum {
    XML_EXP_NILABLE = (1 << 0)	/* set on nodes that can match nothing, e.g. a count with min 0 */
} xmlExpNodeInfo;

/* non-zero when @node carries the XML_EXP_NILABLE flag */
#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6508
/*
 * An expression node. Which member of the field union is meaningful
 * depends on type: count for XML_EXP_COUNT, children for XML_EXP_SEQ
 * and XML_EXP_OR, f_str for XML_EXP_ATOM.
 */
struct _xmlExpNode {
    unsigned char type;/* xmlExpNodeType */
    unsigned char info;/* OR of xmlExpNodeInfo */
    unsigned short key;	/* the hash key */
    unsigned int ref;	/* The number of references */
    int c_max;		/* the maximum length it can consume, -1 if unbounded */
    xmlExpNodePtr exp_left; /* left operand, or the repeated expression of a count */
    xmlExpNodePtr next;/* the next node in the hash table or free list */
    union {
	struct {
	    int f_min;	/* minimum number of repetitions */
	    int f_max;	/* maximum number of repetitions, negative if unbounded */
	} count;
	struct {
	    xmlExpNodePtr f_right; /* right operand of a sequence or choice */
	} children;
        const xmlChar *f_str; /* name of an atom, as stored in the context dictionary */
    } field;
};
6528
/* shorthand accessors for the members of the field union of a node */
#define exp_min field.count.f_min
#define exp_max field.count.f_max
/* #define exp_left field.children.f_left */
#define exp_right field.children.f_right
#define exp_str field.f_str

static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
/*
 * Statically allocated XML_EXP_FORBID node; xmlExpFree() ignores it so
 * it is never reference counted nor released.
 */
static xmlExpNode forbiddenExpNode = {
    XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
};
xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
/*
 * Statically allocated XML_EXP_EMPTY node, flagged XML_EXP_NILABLE
 * (info is 1); xmlExpFree() ignores it as well.
 */
static xmlExpNode emptyExpNode = {
    XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
};
xmlExpNodePtr emptyExp = &emptyExpNode;
6544
6545/************************************************************************
6546 * *
6547 * The custom hash table for unicity and canonicalization *
6548 * of sub-expressions pointers *
6549 * *
6550 ************************************************************************/
6551/*
6552 * xmlExpHashNameComputeKey:
6553 * Calculate the hash key for a token
6554 */
6555static unsigned short
6556xmlExpHashNameComputeKey(const xmlChar *name) {
6557 unsigned short value = 0L;
6558 char ch;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006559
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006560 if (name != NULL) {
6561 value += 30 * (*name);
6562 while ((ch = *name++) != 0) {
6563 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6564 }
6565 }
6566 return (value);
6567}
6568
6569/*
6570 * xmlExpHashComputeKey:
6571 * Calculate the hash key for a compound expression
6572 */
6573static unsigned short
6574xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6575 xmlExpNodePtr right) {
6576 unsigned long value;
6577 unsigned short ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006578
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006579 switch (type) {
6580 case XML_EXP_SEQ:
6581 value = left->key;
6582 value += right->key;
6583 value *= 3;
6584 ret = (unsigned short) value;
6585 break;
6586 case XML_EXP_OR:
6587 value = left->key;
6588 value += right->key;
6589 value *= 7;
6590 ret = (unsigned short) value;
6591 break;
6592 case XML_EXP_COUNT:
6593 value = left->key;
6594 value += right->key;
6595 ret = (unsigned short) value;
6596 break;
6597 default:
6598 ret = 0;
6599 }
6600 return(ret);
6601}
6602
6603
6604static xmlExpNodePtr
6605xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6606 xmlExpNodePtr ret;
6607
6608 if (ctxt->nb_nodes >= MAX_NODES)
6609 return(NULL);
6610 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6611 if (ret == NULL)
6612 return(NULL);
6613 memset(ret, 0, sizeof(xmlExpNode));
6614 ret->type = type;
6615 ret->next = NULL;
6616 ctxt->nb_nodes++;
6617 ctxt->nb_cons++;
6618 return(ret);
6619}
6620
6621/**
6622 * xmlExpHashGetEntry:
6623 * @table: the hash table
6624 *
6625 * Get the unique entry from the hash table. The entry is created if
6626 * needed. @left and @right are consumed, i.e. their ref count will
6627 * be decremented by the operation.
6628 *
6629 * Returns the pointer or NULL in case of error
6630 */
6631static xmlExpNodePtr
6632xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6633 xmlExpNodePtr left, xmlExpNodePtr right,
6634 const xmlChar *name, int min, int max) {
6635 unsigned short kbase, key;
6636 xmlExpNodePtr entry;
6637 xmlExpNodePtr insert;
6638
6639 if (ctxt == NULL)
6640 return(NULL);
6641
6642 /*
6643 * Check for duplicate and insertion location.
6644 */
6645 if (type == XML_EXP_ATOM) {
6646 kbase = xmlExpHashNameComputeKey(name);
6647 } else if (type == XML_EXP_COUNT) {
6648 /* COUNT reduction rule 1 */
6649 /* a{1} -> a */
6650 if (min == max) {
6651 if (min == 1) {
6652 return(left);
6653 }
6654 if (min == 0) {
6655 xmlExpFree(ctxt, left);
6656 return(emptyExp);
6657 }
6658 }
6659 if (min < 0) {
6660 xmlExpFree(ctxt, left);
6661 return(forbiddenExp);
6662 }
6663 if (max == -1)
6664 kbase = min + 79;
6665 else
6666 kbase = max - min;
6667 kbase += left->key;
6668 } else if (type == XML_EXP_OR) {
6669 /* Forbid reduction rules */
6670 if (left->type == XML_EXP_FORBID) {
6671 xmlExpFree(ctxt, left);
6672 return(right);
6673 }
6674 if (right->type == XML_EXP_FORBID) {
6675 xmlExpFree(ctxt, right);
6676 return(left);
6677 }
6678
6679 /* OR reduction rule 1 */
6680 /* a | a reduced to a */
6681 if (left == right) {
Haibo Huangcfd91dc2020-07-30 23:01:33 -07006682 xmlExpFree(ctxt, right);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006683 return(left);
6684 }
6685 /* OR canonicalization rule 1 */
6686 /* linearize (a | b) | c into a | (b | c) */
6687 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6688 xmlExpNodePtr tmp = left;
6689 left = right;
6690 right = tmp;
6691 }
6692 /* OR reduction rule 2 */
6693 /* a | (a | b) and b | (a | b) are reduced to a | b */
6694 if (right->type == XML_EXP_OR) {
6695 if ((left == right->exp_left) ||
6696 (left == right->exp_right)) {
6697 xmlExpFree(ctxt, left);
6698 return(right);
6699 }
6700 }
6701 /* OR canonicalization rule 2 */
6702 /* linearize (a | b) | c into a | (b | c) */
6703 if (left->type == XML_EXP_OR) {
6704 xmlExpNodePtr tmp;
6705
6706 /* OR canonicalization rule 2 */
6707 if ((left->exp_right->type != XML_EXP_OR) &&
6708 (left->exp_right->key < left->exp_left->key)) {
6709 tmp = left->exp_right;
6710 left->exp_right = left->exp_left;
6711 left->exp_left = tmp;
6712 }
6713 left->exp_right->ref++;
6714 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6715 NULL, 0, 0);
6716 left->exp_left->ref++;
6717 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6718 NULL, 0, 0);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006719
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006720 xmlExpFree(ctxt, left);
6721 return(tmp);
6722 }
6723 if (right->type == XML_EXP_OR) {
6724 /* Ordering in the tree */
6725 /* C | (A | B) -> A | (B | C) */
6726 if (left->key > right->exp_right->key) {
6727 xmlExpNodePtr tmp;
6728 right->exp_right->ref++;
6729 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6730 left, NULL, 0, 0);
6731 right->exp_left->ref++;
6732 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6733 tmp, NULL, 0, 0);
6734 xmlExpFree(ctxt, right);
6735 return(tmp);
6736 }
6737 /* Ordering in the tree */
6738 /* B | (A | C) -> A | (B | C) */
6739 if (left->key > right->exp_left->key) {
6740 xmlExpNodePtr tmp;
6741 right->exp_right->ref++;
6742 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6743 right->exp_right, NULL, 0, 0);
6744 right->exp_left->ref++;
6745 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6746 tmp, NULL, 0, 0);
6747 xmlExpFree(ctxt, right);
6748 return(tmp);
6749 }
6750 }
6751 /* we know both types are != XML_EXP_OR here */
6752 else if (left->key > right->key) {
6753 xmlExpNodePtr tmp = left;
6754 left = right;
6755 right = tmp;
6756 }
6757 kbase = xmlExpHashComputeKey(type, left, right);
6758 } else if (type == XML_EXP_SEQ) {
6759 /* Forbid reduction rules */
6760 if (left->type == XML_EXP_FORBID) {
6761 xmlExpFree(ctxt, right);
6762 return(left);
6763 }
6764 if (right->type == XML_EXP_FORBID) {
6765 xmlExpFree(ctxt, left);
6766 return(right);
6767 }
6768 /* Empty reduction rules */
6769 if (right->type == XML_EXP_EMPTY) {
6770 return(left);
6771 }
6772 if (left->type == XML_EXP_EMPTY) {
6773 return(right);
6774 }
6775 kbase = xmlExpHashComputeKey(type, left, right);
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006776 } else
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006777 return(NULL);
6778
6779 key = kbase % ctxt->size;
6780 if (ctxt->table[key] != NULL) {
6781 for (insert = ctxt->table[key]; insert != NULL;
6782 insert = insert->next) {
6783 if ((insert->key == kbase) &&
6784 (insert->type == type)) {
6785 if (type == XML_EXP_ATOM) {
6786 if (name == insert->exp_str) {
6787 insert->ref++;
6788 return(insert);
6789 }
6790 } else if (type == XML_EXP_COUNT) {
6791 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6792 (insert->exp_left == left)) {
6793 insert->ref++;
6794 left->ref--;
6795 return(insert);
6796 }
6797 } else if ((insert->exp_left == left) &&
6798 (insert->exp_right == right)) {
6799 insert->ref++;
6800 left->ref--;
6801 right->ref--;
6802 return(insert);
6803 }
6804 }
6805 }
6806 }
6807
6808 entry = xmlExpNewNode(ctxt, type);
6809 if (entry == NULL)
6810 return(NULL);
6811 entry->key = kbase;
6812 if (type == XML_EXP_ATOM) {
6813 entry->exp_str = name;
6814 entry->c_max = 1;
6815 } else if (type == XML_EXP_COUNT) {
6816 entry->exp_min = min;
6817 entry->exp_max = max;
6818 entry->exp_left = left;
6819 if ((min == 0) || (IS_NILLABLE(left)))
6820 entry->info |= XML_EXP_NILABLE;
6821 if (max < 0)
6822 entry->c_max = -1;
6823 else
6824 entry->c_max = max * entry->exp_left->c_max;
6825 } else {
6826 entry->exp_left = left;
6827 entry->exp_right = right;
6828 if (type == XML_EXP_OR) {
6829 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6830 entry->info |= XML_EXP_NILABLE;
6831 if ((entry->exp_left->c_max == -1) ||
6832 (entry->exp_right->c_max == -1))
6833 entry->c_max = -1;
6834 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6835 entry->c_max = entry->exp_left->c_max;
6836 else
6837 entry->c_max = entry->exp_right->c_max;
6838 } else {
6839 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6840 entry->info |= XML_EXP_NILABLE;
6841 if ((entry->exp_left->c_max == -1) ||
6842 (entry->exp_right->c_max == -1))
6843 entry->c_max = -1;
6844 else
6845 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6846 }
6847 }
6848 entry->ref = 1;
6849 if (ctxt->table[key] != NULL)
6850 entry->next = ctxt->table[key];
6851
6852 ctxt->table[key] = entry;
6853 ctxt->nbElems++;
6854
6855 return(entry);
6856}
6857
6858/**
6859 * xmlExpFree:
6860 * @ctxt: the expression context
6861 * @exp: the expression
6862 *
6863 * Dereference the expression
6864 */
6865void
6866xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6867 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6868 return;
6869 exp->ref--;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006870 if (exp->ref == 0) {
6871 unsigned short key;
6872
6873 /* Unlink it first from the hash table */
6874 key = exp->key % ctxt->size;
6875 if (ctxt->table[key] == exp) {
6876 ctxt->table[key] = exp->next;
6877 } else {
6878 xmlExpNodePtr tmp;
6879
6880 tmp = ctxt->table[key];
6881 while (tmp != NULL) {
6882 if (tmp->next == exp) {
6883 tmp->next = exp->next;
6884 break;
6885 }
6886 tmp = tmp->next;
6887 }
6888 }
6889
6890 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6891 xmlExpFree(ctxt, exp->exp_left);
6892 xmlExpFree(ctxt, exp->exp_right);
6893 } else if (exp->type == XML_EXP_COUNT) {
6894 xmlExpFree(ctxt, exp->exp_left);
6895 }
6896 xmlFree(exp);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00006897 ctxt->nb_nodes--;
6898 }
6899}
6900
6901/**
6902 * xmlExpRef:
6903 * @exp: the expression
6904 *
6905 * Increase the reference count of the expression
6906 */
6907void
6908xmlExpRef(xmlExpNodePtr exp) {
6909 if (exp != NULL)
6910 exp->ref++;
6911}
6912
Daniel Veillardccb4d412005-08-23 13:41:17 +00006913/**
6914 * xmlExpNewAtom:
6915 * @ctxt: the expression context
6916 * @name: the atom name
Michael Woodfb27e2c2012-09-28 08:59:33 +02006917 * @len: the atom name length in byte (or -1);
Daniel Veillardccb4d412005-08-23 13:41:17 +00006918 *
6919 * Get the atom associated to this name from that context
6920 *
6921 * Returns the node or NULL in case of error
6922 */
6923xmlExpNodePtr
6924xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6925 if ((ctxt == NULL) || (name == NULL))
6926 return(NULL);
6927 name = xmlDictLookup(ctxt->dict, name, len);
6928 if (name == NULL)
6929 return(NULL);
6930 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6931}
6932
6933/**
6934 * xmlExpNewOr:
6935 * @ctxt: the expression context
6936 * @left: left expression
6937 * @right: right expression
6938 *
6939 * Get the atom associated to the choice @left | @right
6940 * Note that @left and @right are consumed in the operation, to keep
6941 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6942 * this is true even in case of failure (unless ctxt == NULL).
6943 *
6944 * Returns the node or NULL in case of error
6945 */
6946xmlExpNodePtr
6947xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006948 if (ctxt == NULL)
6949 return(NULL);
6950 if ((left == NULL) || (right == NULL)) {
Daniel Veillardccb4d412005-08-23 13:41:17 +00006951 xmlExpFree(ctxt, left);
6952 xmlExpFree(ctxt, right);
6953 return(NULL);
6954 }
6955 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6956}
6957
6958/**
6959 * xmlExpNewSeq:
6960 * @ctxt: the expression context
6961 * @left: left expression
6962 * @right: right expression
6963 *
6964 * Get the atom associated to the sequence @left , @right
6965 * Note that @left and @right are consumed in the operation, to keep
6966 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6967 * this is true even in case of failure (unless ctxt == NULL).
6968 *
6969 * Returns the node or NULL in case of error
6970 */
6971xmlExpNodePtr
6972xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006973 if (ctxt == NULL)
6974 return(NULL);
6975 if ((left == NULL) || (right == NULL)) {
Daniel Veillardccb4d412005-08-23 13:41:17 +00006976 xmlExpFree(ctxt, left);
6977 xmlExpFree(ctxt, right);
6978 return(NULL);
6979 }
6980 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6981}
6982
6983/**
6984 * xmlExpNewRange:
6985 * @ctxt: the expression context
6986 * @subset: the expression to be repeated
6987 * @min: the lower bound for the repetition
6988 * @max: the upper bound for the repetition, -1 means infinite
6989 *
6990 * Get the atom associated to the range (@subset){@min, @max}
6991 * Note that @subset is consumed in the operation, to keep
6992 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6993 * this is true even in case of failure (unless ctxt == NULL).
6994 *
6995 * Returns the node or NULL in case of error
6996 */
6997xmlExpNodePtr
6998xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
Daniel Veillard11ce4002006-03-10 00:36:23 +00006999 if (ctxt == NULL)
7000 return(NULL);
7001 if ((subset == NULL) || (min < 0) || (max < -1) ||
Daniel Veillardccb4d412005-08-23 13:41:17 +00007002 ((max >= 0) && (min > max))) {
7003 xmlExpFree(ctxt, subset);
7004 return(NULL);
7005 }
7006 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
7007 NULL, NULL, min, max));
7008}
7009
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007010/************************************************************************
7011 * *
7012 * Public API for operations on expressions *
7013 * *
7014 ************************************************************************/
7015
7016static int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007017xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007018 const xmlChar**list, int len, int nb) {
7019 int tmp, tmp2;
7020tail:
7021 switch (exp->type) {
7022 case XML_EXP_EMPTY:
7023 return(0);
7024 case XML_EXP_ATOM:
7025 for (tmp = 0;tmp < nb;tmp++)
7026 if (list[tmp] == exp->exp_str)
7027 return(0);
7028 if (nb >= len)
7029 return(-2);
Daniel Veillard13cee4e2009-09-05 14:52:55 +02007030 list[nb] = exp->exp_str;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007031 return(1);
7032 case XML_EXP_COUNT:
7033 exp = exp->exp_left;
7034 goto tail;
7035 case XML_EXP_SEQ:
7036 case XML_EXP_OR:
7037 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7038 if (tmp < 0)
7039 return(tmp);
7040 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7041 nb + tmp);
7042 if (tmp2 < 0)
7043 return(tmp2);
7044 return(tmp + tmp2);
7045 }
7046 return(-1);
7047}
7048
7049/**
7050 * xmlExpGetLanguage:
7051 * @ctxt: the expression context
7052 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00007053 * @langList: where to store the tokens
Michael Woodfb27e2c2012-09-28 08:59:33 +02007054 * @len: the allocated length of @list
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007055 *
7056 * Find all the strings used in @exp and store them in @list
7057 *
7058 * Returns the number of unique strings found, -1 in case of errors and
7059 * -2 if there is more than @len strings
7060 */
7061int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007062xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00007063 const xmlChar**langList, int len) {
7064 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007065 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00007066 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007067}
7068
7069static int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007070xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007071 const xmlChar**list, int len, int nb) {
7072 int tmp, tmp2;
7073tail:
7074 switch (exp->type) {
7075 case XML_EXP_FORBID:
7076 return(0);
7077 case XML_EXP_EMPTY:
7078 return(0);
7079 case XML_EXP_ATOM:
7080 for (tmp = 0;tmp < nb;tmp++)
7081 if (list[tmp] == exp->exp_str)
7082 return(0);
7083 if (nb >= len)
7084 return(-2);
Daniel Veillard13cee4e2009-09-05 14:52:55 +02007085 list[nb] = exp->exp_str;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007086 return(1);
7087 case XML_EXP_COUNT:
7088 exp = exp->exp_left;
7089 goto tail;
7090 case XML_EXP_SEQ:
7091 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7092 if (tmp < 0)
7093 return(tmp);
7094 if (IS_NILLABLE(exp->exp_left)) {
7095 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7096 nb + tmp);
7097 if (tmp2 < 0)
7098 return(tmp2);
7099 tmp += tmp2;
7100 }
7101 return(tmp);
7102 case XML_EXP_OR:
7103 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7104 if (tmp < 0)
7105 return(tmp);
7106 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7107 nb + tmp);
7108 if (tmp2 < 0)
7109 return(tmp2);
7110 return(tmp + tmp2);
7111 }
7112 return(-1);
7113}
7114
7115/**
7116 * xmlExpGetStart:
7117 * @ctxt: the expression context
7118 * @exp: the expression
Daniel Veillard7802ba52005-10-27 11:56:20 +00007119 * @tokList: where to store the tokens
Michael Woodfb27e2c2012-09-28 08:59:33 +02007120 * @len: the allocated length of @list
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007121 *
7122 * Find all the strings that appears at the start of the languages
7123 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7124 * it will return the list [a, c]
7125 *
7126 * Returns the number of unique strings found, -1 in case of errors and
7127 * -2 if there is more than @len strings
7128 */
7129int
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007130xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
Daniel Veillard7802ba52005-10-27 11:56:20 +00007131 const xmlChar**tokList, int len) {
7132 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007133 return(-1);
Daniel Veillard7802ba52005-10-27 11:56:20 +00007134 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007135}
7136
7137/**
7138 * xmlExpIsNillable:
7139 * @exp: the expression
7140 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007141 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007142 *
7143 * Returns 1 if nillable, 0 if not and -1 in case of error
7144 */
7145int
7146xmlExpIsNillable(xmlExpNodePtr exp) {
7147 if (exp == NULL)
7148 return(-1);
7149 return(IS_NILLABLE(exp) != 0);
7150}
7151
7152static xmlExpNodePtr
7153xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7154{
7155 xmlExpNodePtr ret;
7156
7157 switch (exp->type) {
7158 case XML_EXP_EMPTY:
7159 return(forbiddenExp);
7160 case XML_EXP_FORBID:
7161 return(forbiddenExp);
7162 case XML_EXP_ATOM:
7163 if (exp->exp_str == str) {
7164#ifdef DEBUG_DERIV
7165 printf("deriv atom: equal => Empty\n");
7166#endif
7167 ret = emptyExp;
7168 } else {
7169#ifdef DEBUG_DERIV
7170 printf("deriv atom: mismatch => forbid\n");
7171#endif
7172 /* TODO wildcards here */
7173 ret = forbiddenExp;
7174 }
7175 return(ret);
7176 case XML_EXP_OR: {
7177 xmlExpNodePtr tmp;
7178
7179#ifdef DEBUG_DERIV
7180 printf("deriv or: => or(derivs)\n");
7181#endif
7182 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7183 if (tmp == NULL) {
7184 return(NULL);
7185 }
7186 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7187 if (ret == NULL) {
7188 xmlExpFree(ctxt, tmp);
7189 return(NULL);
7190 }
7191 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7192 NULL, 0, 0);
7193 return(ret);
7194 }
7195 case XML_EXP_SEQ:
7196#ifdef DEBUG_DERIV
7197 printf("deriv seq: starting with left\n");
7198#endif
7199 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7200 if (ret == NULL) {
7201 return(NULL);
7202 } else if (ret == forbiddenExp) {
7203 if (IS_NILLABLE(exp->exp_left)) {
7204#ifdef DEBUG_DERIV
7205 printf("deriv seq: left failed but nillable\n");
7206#endif
7207 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7208 }
7209 } else {
7210#ifdef DEBUG_DERIV
7211 printf("deriv seq: left match => sequence\n");
7212#endif
7213 exp->exp_right->ref++;
7214 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7215 NULL, 0, 0);
7216 }
7217 return(ret);
7218 case XML_EXP_COUNT: {
7219 int min, max;
7220 xmlExpNodePtr tmp;
7221
7222 if (exp->exp_max == 0)
7223 return(forbiddenExp);
7224 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7225 if (ret == NULL)
7226 return(NULL);
7227 if (ret == forbiddenExp) {
7228#ifdef DEBUG_DERIV
7229 printf("deriv count: pattern mismatch => forbid\n");
7230#endif
7231 return(ret);
7232 }
7233 if (exp->exp_max == 1)
7234 return(ret);
7235 if (exp->exp_max < 0) /* unbounded */
7236 max = -1;
7237 else
7238 max = exp->exp_max - 1;
7239 if (exp->exp_min > 0)
7240 min = exp->exp_min - 1;
7241 else
7242 min = 0;
7243 exp->exp_left->ref++;
7244 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7245 NULL, min, max);
7246 if (ret == emptyExp) {
7247#ifdef DEBUG_DERIV
7248 printf("deriv count: match to empty => new count\n");
7249#endif
7250 return(tmp);
7251 }
7252#ifdef DEBUG_DERIV
7253 printf("deriv count: match => sequence with new count\n");
7254#endif
7255 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7256 NULL, 0, 0));
7257 }
7258 }
7259 return(NULL);
7260}
7261
7262/**
7263 * xmlExpStringDerive:
7264 * @ctxt: the expression context
7265 * @exp: the expression
7266 * @str: the string
7267 * @len: the string len in bytes if available
7268 *
7269 * Do one step of Brzozowski derivation of the expression @exp with
7270 * respect to the input string
7271 *
7272 * Returns the resulting expression or NULL in case of internal error
7273 */
7274xmlExpNodePtr
7275xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7276 const xmlChar *str, int len) {
7277 const xmlChar *input;
7278
7279 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7280 return(NULL);
7281 }
7282 /*
Jan Pokornýbb654fe2016-04-13 16:56:07 +02007283 * check the string is in the dictionary, if yes use an interned
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007284 * copy, otherwise we know it's not an acceptable input
7285 */
7286 input = xmlDictExists(ctxt->dict, str, len);
7287 if (input == NULL) {
7288 return(forbiddenExp);
7289 }
7290 return(xmlExpStringDeriveInt(ctxt, exp, input));
7291}
7292
7293static int
7294xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7295 int ret = 1;
7296
7297 if (sub->c_max == -1) {
7298 if (exp->c_max != -1)
7299 ret = 0;
7300 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7301 ret = 0;
7302 }
7303#if 0
7304 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7305 ret = 0;
7306#endif
7307 return(ret);
7308}
7309
7310static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7311 xmlExpNodePtr sub);
7312/**
7313 * xmlExpDivide:
7314 * @ctxt: the expressions context
7315 * @exp: the englobing expression
7316 * @sub: the subexpression
7317 * @mult: the multiple expression
7318 * @remain: the remain from the derivation of the multiple
7319 *
7320 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7321 * so that sub{n} subsume exp
7322 *
7323 * Returns the multiple value if successful, 0 if it is not a multiple
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007324 * and -1 in case of internal error.
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007325 */
7326
7327static int
7328xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7329 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7330 int i;
7331 xmlExpNodePtr tmp, tmp2;
7332
7333 if (mult != NULL) *mult = NULL;
7334 if (remain != NULL) *remain = NULL;
7335 if (exp->c_max == -1) return(0);
7336 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7337
7338 for (i = 1;i <= exp->c_max;i++) {
7339 sub->ref++;
7340 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7341 sub, NULL, NULL, i, i);
7342 if (tmp == NULL) {
7343 return(-1);
7344 }
7345 if (!xmlExpCheckCard(tmp, exp)) {
7346 xmlExpFree(ctxt, tmp);
7347 continue;
7348 }
7349 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7350 if (tmp2 == NULL) {
7351 xmlExpFree(ctxt, tmp);
7352 return(-1);
7353 }
7354 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7355 if (remain != NULL)
7356 *remain = tmp2;
7357 else
7358 xmlExpFree(ctxt, tmp2);
7359 if (mult != NULL)
7360 *mult = tmp;
7361 else
7362 xmlExpFree(ctxt, tmp);
7363#ifdef DEBUG_DERIV
7364 printf("Divide succeeded %d\n", i);
7365#endif
7366 return(i);
7367 }
7368 xmlExpFree(ctxt, tmp);
7369 xmlExpFree(ctxt, tmp2);
7370 }
7371#ifdef DEBUG_DERIV
7372 printf("Divide failed\n");
7373#endif
7374 return(0);
7375}
7376
7377/**
7378 * xmlExpExpDeriveInt:
7379 * @ctxt: the expressions context
7380 * @exp: the englobing expression
7381 * @sub: the subexpression
7382 *
7383 * Try to do a step of Brzozowski derivation but at a higher level
7384 * the input being a subexpression.
7385 *
7386 * Returns the resulting expression or NULL in case of internal error
7387 */
7388static xmlExpNodePtr
7389xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7390 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7391 const xmlChar **tab;
7392 int len, i;
7393
7394 /*
7395 * In case of equality and if the expression can only consume a finite
7396 * amount, then the derivation is empty
7397 */
7398 if ((exp == sub) && (exp->c_max >= 0)) {
7399#ifdef DEBUG_DERIV
7400 printf("Equal(exp, sub) and finite -> Empty\n");
7401#endif
7402 return(emptyExp);
7403 }
7404 /*
7405 * decompose sub sequence first
7406 */
7407 if (sub->type == XML_EXP_EMPTY) {
7408#ifdef DEBUG_DERIV
7409 printf("Empty(sub) -> Empty\n");
7410#endif
7411 exp->ref++;
7412 return(exp);
7413 }
7414 if (sub->type == XML_EXP_SEQ) {
7415#ifdef DEBUG_DERIV
7416 printf("Seq(sub) -> decompose\n");
7417#endif
7418 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7419 if (tmp == NULL)
7420 return(NULL);
7421 if (tmp == forbiddenExp)
7422 return(tmp);
7423 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7424 xmlExpFree(ctxt, tmp);
7425 return(ret);
7426 }
7427 if (sub->type == XML_EXP_OR) {
7428#ifdef DEBUG_DERIV
7429 printf("Or(sub) -> decompose\n");
7430#endif
7431 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7432 if (tmp == forbiddenExp)
7433 return(tmp);
7434 if (tmp == NULL)
7435 return(NULL);
7436 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7437 if ((ret == NULL) || (ret == forbiddenExp)) {
7438 xmlExpFree(ctxt, tmp);
7439 return(ret);
7440 }
7441 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7442 }
7443 if (!xmlExpCheckCard(exp, sub)) {
7444#ifdef DEBUG_DERIV
7445 printf("CheckCard(exp, sub) failed -> Forbid\n");
7446#endif
7447 return(forbiddenExp);
7448 }
7449 switch (exp->type) {
7450 case XML_EXP_EMPTY:
7451 if (sub == emptyExp)
7452 return(emptyExp);
7453#ifdef DEBUG_DERIV
7454 printf("Empty(exp) -> Forbid\n");
7455#endif
7456 return(forbiddenExp);
7457 case XML_EXP_FORBID:
7458#ifdef DEBUG_DERIV
7459 printf("Forbid(exp) -> Forbid\n");
7460#endif
7461 return(forbiddenExp);
7462 case XML_EXP_ATOM:
7463 if (sub->type == XML_EXP_ATOM) {
7464 /* TODO: handle wildcards */
7465 if (exp->exp_str == sub->exp_str) {
7466#ifdef DEBUG_DERIV
7467 printf("Atom match -> Empty\n");
7468#endif
7469 return(emptyExp);
7470 }
7471#ifdef DEBUG_DERIV
7472 printf("Atom mismatch -> Forbid\n");
7473#endif
7474 return(forbiddenExp);
7475 }
7476 if ((sub->type == XML_EXP_COUNT) &&
7477 (sub->exp_max == 1) &&
7478 (sub->exp_left->type == XML_EXP_ATOM)) {
7479 /* TODO: handle wildcards */
7480 if (exp->exp_str == sub->exp_left->exp_str) {
7481#ifdef DEBUG_DERIV
7482 printf("Atom match -> Empty\n");
7483#endif
7484 return(emptyExp);
7485 }
7486#ifdef DEBUG_DERIV
7487 printf("Atom mismatch -> Forbid\n");
7488#endif
7489 return(forbiddenExp);
7490 }
7491#ifdef DEBUG_DERIV
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007492 printf("Complex exp vs Atom -> Forbid\n");
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007493#endif
7494 return(forbiddenExp);
7495 case XML_EXP_SEQ:
7496 /* try to get the sequence consumed only if possible */
7497 if (xmlExpCheckCard(exp->exp_left, sub)) {
7498 /* See if the sequence can be consumed directly */
7499#ifdef DEBUG_DERIV
7500 printf("Seq trying left only\n");
7501#endif
7502 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7503 if ((ret != forbiddenExp) && (ret != NULL)) {
7504#ifdef DEBUG_DERIV
7505 printf("Seq trying left only worked\n");
7506#endif
7507 /*
7508 * TODO: assumption here that we are determinist
7509 * i.e. we won't get to a nillable exp left
7510 * subset which could be matched by the right
7511 * part too.
7512 * e.g.: (a | b)+,(a | c) and 'a+,a'
7513 */
7514 exp->exp_right->ref++;
7515 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7516 exp->exp_right, NULL, 0, 0));
7517 }
7518#ifdef DEBUG_DERIV
7519 } else {
7520 printf("Seq: left too short\n");
7521#endif
7522 }
7523 /* Try instead to decompose */
7524 if (sub->type == XML_EXP_COUNT) {
7525 int min, max;
7526
7527#ifdef DEBUG_DERIV
7528 printf("Seq: sub is a count\n");
7529#endif
7530 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7531 if (ret == NULL)
7532 return(NULL);
7533 if (ret != forbiddenExp) {
7534#ifdef DEBUG_DERIV
7535 printf("Seq , Count match on left\n");
7536#endif
7537 if (sub->exp_max < 0)
7538 max = -1;
7539 else
7540 max = sub->exp_max -1;
7541 if (sub->exp_min > 0)
7542 min = sub->exp_min -1;
7543 else
7544 min = 0;
7545 exp->exp_right->ref++;
7546 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7547 exp->exp_right, NULL, 0, 0);
7548 if (tmp == NULL)
7549 return(NULL);
7550
7551 sub->exp_left->ref++;
7552 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7553 sub->exp_left, NULL, NULL, min, max);
7554 if (tmp2 == NULL) {
7555 xmlExpFree(ctxt, tmp);
7556 return(NULL);
7557 }
7558 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7559 xmlExpFree(ctxt, tmp);
7560 xmlExpFree(ctxt, tmp2);
7561 return(ret);
7562 }
7563 }
7564 /* we made no progress on structured operations */
7565 break;
7566 case XML_EXP_OR:
7567#ifdef DEBUG_DERIV
7568 printf("Or , trying both side\n");
7569#endif
7570 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7571 if (ret == NULL)
7572 return(NULL);
7573 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7574 if (tmp == NULL) {
7575 xmlExpFree(ctxt, ret);
7576 return(NULL);
7577 }
7578 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7579 case XML_EXP_COUNT: {
7580 int min, max;
7581
7582 if (sub->type == XML_EXP_COUNT) {
7583 /*
7584 * Try to see if the loop is completely subsumed
7585 */
7586 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7587 if (tmp == NULL)
7588 return(NULL);
7589 if (tmp == forbiddenExp) {
7590 int mult;
7591
7592#ifdef DEBUG_DERIV
7593 printf("Count, Count inner don't subsume\n");
7594#endif
7595 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7596 NULL, &tmp);
7597 if (mult <= 0) {
7598#ifdef DEBUG_DERIV
7599 printf("Count, Count not multiple => forbidden\n");
7600#endif
7601 return(forbiddenExp);
7602 }
7603 if (sub->exp_max == -1) {
7604 max = -1;
7605 if (exp->exp_max == -1) {
7606 if (exp->exp_min <= sub->exp_min * mult)
7607 min = 0;
7608 else
7609 min = exp->exp_min - sub->exp_min * mult;
7610 } else {
7611#ifdef DEBUG_DERIV
7612 printf("Count, Count finite can't subsume infinite\n");
7613#endif
7614 xmlExpFree(ctxt, tmp);
7615 return(forbiddenExp);
7616 }
7617 } else {
7618 if (exp->exp_max == -1) {
7619#ifdef DEBUG_DERIV
7620 printf("Infinite loop consume mult finite loop\n");
7621#endif
7622 if (exp->exp_min > sub->exp_min * mult) {
7623 max = -1;
7624 min = exp->exp_min - sub->exp_min * mult;
7625 } else {
7626 max = -1;
7627 min = 0;
7628 }
7629 } else {
7630 if (exp->exp_max < sub->exp_max * mult) {
7631#ifdef DEBUG_DERIV
7632 printf("loops max mult mismatch => forbidden\n");
7633#endif
7634 xmlExpFree(ctxt, tmp);
7635 return(forbiddenExp);
7636 }
7637 if (sub->exp_max * mult > exp->exp_min)
7638 min = 0;
7639 else
7640 min = exp->exp_min - sub->exp_max * mult;
7641 max = exp->exp_max - sub->exp_max * mult;
7642 }
7643 }
7644 } else if (!IS_NILLABLE(tmp)) {
7645 /*
7646 * TODO: loop here to try to grow if working on finite
7647 * blocks.
7648 */
7649#ifdef DEBUG_DERIV
7650 printf("Count, Count remain not nillable => forbidden\n");
7651#endif
7652 xmlExpFree(ctxt, tmp);
7653 return(forbiddenExp);
7654 } else if (sub->exp_max == -1) {
7655 if (exp->exp_max == -1) {
7656 if (exp->exp_min <= sub->exp_min) {
7657#ifdef DEBUG_DERIV
7658 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7659#endif
7660 max = -1;
7661 min = 0;
7662 } else {
7663#ifdef DEBUG_DERIV
7664 printf("Infinite loops min => Count(X,Inf)\n");
7665#endif
7666 max = -1;
7667 min = exp->exp_min - sub->exp_min;
7668 }
7669 } else if (exp->exp_min > sub->exp_min) {
7670#ifdef DEBUG_DERIV
7671 printf("loops min mismatch 1 => forbidden ???\n");
7672#endif
7673 xmlExpFree(ctxt, tmp);
7674 return(forbiddenExp);
7675 } else {
7676 max = -1;
7677 min = 0;
7678 }
7679 } else {
7680 if (exp->exp_max == -1) {
7681#ifdef DEBUG_DERIV
7682 printf("Infinite loop consume finite loop\n");
7683#endif
7684 if (exp->exp_min > sub->exp_min) {
7685 max = -1;
7686 min = exp->exp_min - sub->exp_min;
7687 } else {
7688 max = -1;
7689 min = 0;
7690 }
7691 } else {
7692 if (exp->exp_max < sub->exp_max) {
7693#ifdef DEBUG_DERIV
7694 printf("loops max mismatch => forbidden\n");
7695#endif
7696 xmlExpFree(ctxt, tmp);
7697 return(forbiddenExp);
7698 }
7699 if (sub->exp_max > exp->exp_min)
7700 min = 0;
7701 else
7702 min = exp->exp_min - sub->exp_max;
7703 max = exp->exp_max - sub->exp_max;
7704 }
7705 }
7706#ifdef DEBUG_DERIV
7707 printf("loops match => SEQ(COUNT())\n");
7708#endif
7709 exp->exp_left->ref++;
7710 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7711 NULL, NULL, min, max);
7712 if (tmp2 == NULL) {
7713 return(NULL);
7714 }
7715 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7716 NULL, 0, 0);
7717 return(ret);
7718 }
7719 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7720 if (tmp == NULL)
7721 return(NULL);
7722 if (tmp == forbiddenExp) {
7723#ifdef DEBUG_DERIV
7724 printf("loop mismatch => forbidden\n");
7725#endif
7726 return(forbiddenExp);
7727 }
7728 if (exp->exp_min > 0)
7729 min = exp->exp_min - 1;
7730 else
7731 min = 0;
7732 if (exp->exp_max < 0)
7733 max = -1;
7734 else
7735 max = exp->exp_max - 1;
7736
7737#ifdef DEBUG_DERIV
7738 printf("loop match => SEQ(COUNT())\n");
7739#endif
7740 exp->exp_left->ref++;
7741 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7742 NULL, NULL, min, max);
7743 if (tmp2 == NULL)
7744 return(NULL);
7745 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7746 NULL, 0, 0);
7747 return(ret);
7748 }
7749 }
7750
Daniel Veillardccb4d412005-08-23 13:41:17 +00007751#ifdef DEBUG_DERIV
7752 printf("Fallback to derivative\n");
7753#endif
7754 if (IS_NILLABLE(sub)) {
7755 if (!(IS_NILLABLE(exp)))
7756 return(forbiddenExp);
7757 else
7758 ret = emptyExp;
7759 } else
7760 ret = NULL;
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007761 /*
7762 * here the structured derivation made no progress so
7763 * we use the default token based derivation to force one more step
7764 */
7765 if (ctxt->tabSize == 0)
7766 ctxt->tabSize = 40;
7767
7768 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7769 sizeof(const xmlChar *));
7770 if (tab == NULL) {
7771 return(NULL);
7772 }
7773
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007774 /*
7775 * collect all the strings accepted by the subexpression on input
7776 */
7777 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7778 while (len < 0) {
7779 const xmlChar **temp;
Rob Richards54a8f672005-10-07 02:33:00 +00007780 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007781 sizeof(const xmlChar *));
7782 if (temp == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007783 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007784 return(NULL);
7785 }
7786 tab = temp;
7787 ctxt->tabSize *= 2;
7788 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7789 }
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007790 for (i = 0;i < len;i++) {
7791 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7792 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7793 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007794 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007795 return(tmp);
7796 }
7797 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7798 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7799 xmlExpFree(ctxt, tmp);
7800 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007801 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007802 return(tmp);
7803 }
7804 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7805 xmlExpFree(ctxt, tmp);
7806 xmlExpFree(ctxt, tmp2);
7807
7808 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7809 xmlExpFree(ctxt, ret);
Rob Richards54a8f672005-10-07 02:33:00 +00007810 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007811 return(tmp3);
7812 }
7813
7814 if (ret == NULL)
7815 ret = tmp3;
7816 else {
7817 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7818 if (ret == NULL) {
Rob Richards54a8f672005-10-07 02:33:00 +00007819 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007820 return(NULL);
7821 }
7822 }
7823 }
Rob Richards54a8f672005-10-07 02:33:00 +00007824 xmlFree((xmlChar **) tab);
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007825 return(ret);
7826}
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007827
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007828/**
Daniel Veillard0090bd52005-08-22 14:43:43 +00007829 * xmlExpExpDerive:
7830 * @ctxt: the expressions context
7831 * @exp: the englobing expression
7832 * @sub: the subexpression
7833 *
7834 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7835 * Based on algebraic derivation and sometimes direct Brzozowski derivation
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007836 * it usually takes less than linear time and can handle expressions generating
Daniel Veillard0090bd52005-08-22 14:43:43 +00007837 * infinite languages.
7838 *
7839 * Returns the resulting expression or NULL in case of internal error, the
7840 * result must be freed
7841 */
7842xmlExpNodePtr
7843xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7844 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7845 return(NULL);
7846
7847 /*
7848 * O(1) speedups
7849 */
7850 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7851#ifdef DEBUG_DERIV
7852 printf("Sub nillable and not exp : can't subsume\n");
7853#endif
7854 return(forbiddenExp);
7855 }
7856 if (xmlExpCheckCard(exp, sub) == 0) {
7857#ifdef DEBUG_DERIV
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007858 printf("sub generate longer sequences than exp : can't subsume\n");
Daniel Veillard0090bd52005-08-22 14:43:43 +00007859#endif
7860 return(forbiddenExp);
7861 }
7862 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7863}
7864
7865/**
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007866 * xmlExpSubsume:
7867 * @ctxt: the expressions context
7868 * @exp: the englobing expression
7869 * @sub: the subexpression
7870 *
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007871 * Check whether @exp accepts all the languages accepted by @sub
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007872 * the input being a subexpression.
7873 *
7874 * Returns 1 if true 0 if false and -1 in case of failure.
7875 */
7876int
7877xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7878 xmlExpNodePtr tmp;
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007879
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007880 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7881 return(-1);
7882
7883 /*
7884 * TODO: speedup by checking the language of sub is a subset of the
7885 * language of exp
7886 */
7887 /*
7888 * O(1) speedups
7889 */
7890 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7891#ifdef DEBUG_DERIV
7892 printf("Sub nillable and not exp : can't subsume\n");
7893#endif
7894 return(0);
7895 }
7896 if (xmlExpCheckCard(exp, sub) == 0) {
7897#ifdef DEBUG_DERIV
Haibo Huangcfd91dc2020-07-30 23:01:33 -07007898 printf("sub generate longer sequences than exp : can't subsume\n");
Daniel Veillard81a8ec62005-08-22 00:20:58 +00007899#endif
7900 return(0);
7901 }
7902 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7903#ifdef DEBUG_DERIV
7904 printf("Result derivation :\n");
7905 PRINT_EXP(tmp);
7906#endif
7907 if (tmp == NULL)
7908 return(-1);
7909 if (tmp == forbiddenExp)
7910 return(0);
7911 if (tmp == emptyExp)
7912 return(1);
7913 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7914 xmlExpFree(ctxt, tmp);
7915 return(1);
7916 }
7917 xmlExpFree(ctxt, tmp);
7918 return(0);
7919}
Daniel Veillard465a0002005-08-22 12:07:04 +00007920
7921/************************************************************************
7922 * *
Daniel Veillardf8e3db02012-09-11 13:26:36 +08007923 * Parsing expression *
Daniel Veillard465a0002005-08-22 12:07:04 +00007924 * *
7925 ************************************************************************/
7926
7927static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7928
/*
 * Cursor helpers for the expression parser. They all expect a variable
 * named ctxt in scope whose cur field points into the text being parsed.
 * NEXT and SKIP_BLANKS expand to complete statements (trailing ';'
 * included) because the parser uses them without one.
 */
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
/* the argument is parenthesized so that compound expressions work */
#define IS_BLANK(c) (((c) == ' ') || ((c) == '\n') || ((c) == '\r') || ((c) == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7936
7937static int
7938xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7939 int ret = 0;
7940
7941 SKIP_BLANKS
7942 if (CUR == '*') {
7943 NEXT
7944 return(-1);
7945 }
7946 if ((CUR < '0') || (CUR > '9'))
7947 return(-1);
7948 while ((CUR >= '0') && (CUR <= '9')) {
7949 ret = ret * 10 + (CUR - '0');
7950 NEXT
7951 }
7952 return(ret);
7953}
7954
7955static xmlExpNodePtr
7956xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7957 const char *base;
7958 xmlExpNodePtr ret;
7959 const xmlChar *val;
7960
7961 SKIP_BLANKS
7962 base = ctxt->cur;
7963 if (*ctxt->cur == '(') {
7964 NEXT
7965 ret = xmlExpParseExpr(ctxt);
7966 SKIP_BLANKS
7967 if (*ctxt->cur != ')') {
7968 fprintf(stderr, "unbalanced '(' : %s\n", base);
7969 xmlExpFree(ctxt, ret);
7970 return(NULL);
7971 }
7972 NEXT;
7973 SKIP_BLANKS
7974 goto parse_quantifier;
7975 }
7976 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7977 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7978 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7979 NEXT;
7980 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7981 if (val == NULL)
7982 return(NULL);
7983 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7984 if (ret == NULL)
7985 return(NULL);
7986 SKIP_BLANKS
7987parse_quantifier:
7988 if (CUR == '{') {
7989 int min, max;
7990
7991 NEXT
7992 min = xmlExpParseNumber(ctxt);
7993 if (min < 0) {
7994 xmlExpFree(ctxt, ret);
7995 return(NULL);
7996 }
7997 SKIP_BLANKS
7998 if (CUR == ',') {
7999 NEXT
8000 max = xmlExpParseNumber(ctxt);
8001 SKIP_BLANKS
8002 } else
8003 max = min;
8004 if (CUR != '}') {
8005 xmlExpFree(ctxt, ret);
8006 return(NULL);
8007 }
8008 NEXT
8009 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8010 min, max);
8011 SKIP_BLANKS
8012 } else if (CUR == '?') {
8013 NEXT
8014 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8015 0, 1);
8016 SKIP_BLANKS
8017 } else if (CUR == '+') {
8018 NEXT
8019 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8020 1, -1);
8021 SKIP_BLANKS
8022 } else if (CUR == '*') {
8023 NEXT
8024 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8025 0, -1);
8026 SKIP_BLANKS
Daniel Veillardf8e3db02012-09-11 13:26:36 +08008027 }
Daniel Veillard465a0002005-08-22 12:07:04 +00008028 return(ret);
8029}
8030
8031
8032static xmlExpNodePtr
8033xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
8034 xmlExpNodePtr ret, right;
8035
8036 ret = xmlExpParseOr(ctxt);
8037 SKIP_BLANKS
8038 while (CUR == '|') {
8039 NEXT
8040 right = xmlExpParseOr(ctxt);
8041 if (right == NULL) {
8042 xmlExpFree(ctxt, ret);
8043 return(NULL);
8044 }
8045 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8046 if (ret == NULL)
8047 return(NULL);
8048 }
8049 return(ret);
8050}
8051
8052static xmlExpNodePtr
8053xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8054 xmlExpNodePtr ret, right;
8055
8056 ret = xmlExpParseSeq(ctxt);
8057 SKIP_BLANKS
8058 while (CUR == ',') {
8059 NEXT
8060 right = xmlExpParseSeq(ctxt);
8061 if (right == NULL) {
8062 xmlExpFree(ctxt, ret);
8063 return(NULL);
8064 }
8065 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8066 if (ret == NULL)
8067 return(NULL);
8068 }
8069 return(ret);
8070}
8071
8072/**
8073 * xmlExpParse:
8074 * @ctxt: the expressions context
8075 * @expr: the 0 terminated string
8076 *
8077 * Minimal parser for regexps, it understand the following constructs
8078 * - string terminals
8079 * - choice operator |
8080 * - sequence operator ,
8081 * - subexpressions (...)
8082 * - usual cardinality operators + * and ?
8083 * - finite sequences { min, max }
8084 * - infinite sequences { min, * }
8085 * There is minimal checkings made especially no checking on strings values
8086 *
8087 * Returns a new expression or NULL in case of failure
8088 */
8089xmlExpNodePtr
8090xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8091 xmlExpNodePtr ret;
8092
8093 ctxt->expr = expr;
8094 ctxt->cur = expr;
8095
8096 ret = xmlExpParseExpr(ctxt);
8097 SKIP_BLANKS
8098 if (*ctxt->cur != 0) {
8099 xmlExpFree(ctxt, ret);
8100 return(NULL);
8101 }
8102 return(ret);
8103}
8104
8105static void
8106xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8107 xmlExpNodePtr c;
8108
8109 if (expr == NULL) return;
8110 if (glob) xmlBufferWriteChar(buf, "(");
8111 switch (expr->type) {
8112 case XML_EXP_EMPTY:
8113 xmlBufferWriteChar(buf, "empty");
8114 break;
8115 case XML_EXP_FORBID:
8116 xmlBufferWriteChar(buf, "forbidden");
8117 break;
8118 case XML_EXP_ATOM:
8119 xmlBufferWriteCHAR(buf, expr->exp_str);
8120 break;
8121 case XML_EXP_SEQ:
8122 c = expr->exp_left;
8123 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8124 xmlExpDumpInt(buf, c, 1);
8125 else
8126 xmlExpDumpInt(buf, c, 0);
8127 xmlBufferWriteChar(buf, " , ");
8128 c = expr->exp_right;
8129 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8130 xmlExpDumpInt(buf, c, 1);
8131 else
8132 xmlExpDumpInt(buf, c, 0);
8133 break;
8134 case XML_EXP_OR:
8135 c = expr->exp_left;
8136 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8137 xmlExpDumpInt(buf, c, 1);
8138 else
8139 xmlExpDumpInt(buf, c, 0);
8140 xmlBufferWriteChar(buf, " | ");
8141 c = expr->exp_right;
8142 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8143 xmlExpDumpInt(buf, c, 1);
8144 else
8145 xmlExpDumpInt(buf, c, 0);
8146 break;
8147 case XML_EXP_COUNT: {
8148 char rep[40];
Daniel Veillardf8e3db02012-09-11 13:26:36 +08008149
Daniel Veillard465a0002005-08-22 12:07:04 +00008150 c = expr->exp_left;
8151 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8152 xmlExpDumpInt(buf, c, 1);
8153 else
8154 xmlExpDumpInt(buf, c, 0);
8155 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8156 rep[0] = '?';
8157 rep[1] = 0;
8158 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8159 rep[0] = '*';
8160 rep[1] = 0;
8161 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8162 rep[0] = '+';
8163 rep[1] = 0;
8164 } else if (expr->exp_max == expr->exp_min) {
8165 snprintf(rep, 39, "{%d}", expr->exp_min);
8166 } else if (expr->exp_max < 0) {
8167 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8168 } else {
8169 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8170 }
8171 rep[39] = 0;
8172 xmlBufferWriteChar(buf, rep);
8173 break;
8174 }
8175 default:
8176 fprintf(stderr, "Error in tree\n");
8177 }
8178 if (glob)
8179 xmlBufferWriteChar(buf, ")");
8180}
8181/**
8182 * xmlExpDump:
8183 * @buf: a buffer to receive the output
8184 * @expr: the compiled expression
8185 *
8186 * Serialize the expression as compiled to the buffer
8187 */
8188void
Daniel Veillard5eee7672005-08-22 21:22:27 +00008189xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8190 if ((buf == NULL) || (expr == NULL))
Daniel Veillard465a0002005-08-22 12:07:04 +00008191 return;
Daniel Veillard5eee7672005-08-22 21:22:27 +00008192 xmlExpDumpInt(buf, expr, 0);
Daniel Veillard465a0002005-08-22 12:07:04 +00008193}
8194
8195/**
8196 * xmlExpMaxToken:
8197 * @expr: a compiled expression
8198 *
8199 * Indicate the maximum number of input a expression can accept
8200 *
8201 * Returns the maximum length or -1 in case of error
8202 */
8203int
8204xmlExpMaxToken(xmlExpNodePtr expr) {
8205 if (expr == NULL)
8206 return(-1);
8207 return(expr->c_max);
8208}
8209
8210/**
8211 * xmlExpCtxtNbNodes:
8212 * @ctxt: an expression context
8213 *
8214 * Debugging facility provides the number of allocated nodes at a that point
8215 *
8216 * Returns the number of nodes in use or -1 in case of error
8217 */
8218int
8219xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8220 if (ctxt == NULL)
8221 return(-1);
8222 return(ctxt->nb_nodes);
8223}
8224
8225/**
8226 * xmlExpCtxtNbCons:
8227 * @ctxt: an expression context
8228 *
8229 * Debugging facility provides the number of allocated nodes over lifetime
8230 *
8231 * Returns the number of nodes ever allocated or -1 in case of error
8232 */
8233int
8234xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8235 if (ctxt == NULL)
8236 return(-1);
8237 return(ctxt->nb_cons);
8238}
8239
Daniel Veillard81a8ec62005-08-22 00:20:58 +00008240#endif /* LIBXML_EXPR_ENABLED */
Daniel Veillard5d4644e2005-04-01 13:11:58 +00008241#define bottom_xmlregexp
8242#include "elfgcchack.h"
Daniel Veillard4255d502002-04-16 15:50:10 +00008243#endif /* LIBXML_REGEXP_ENABLED */