blob: 536e9f8b7091b2b4860c4144e66abe637e9d289c [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
William M. Brackddf71d62004-05-06 04:17:26 +00006 * XML related specifications these include:
Daniel Veillard4255d502002-04-16 15:50:10 +00007 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
Daniel Veillardcee2b3a2005-01-25 00:22:52 +000022/* #define DEBUG_ERR */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +000023
Daniel Veillard4255d502002-04-16 15:50:10 +000024#include <stdio.h>
25#include <string.h>
Daniel Veillardebe48c62003-12-03 12:12:27 +000026#ifdef HAVE_LIMITS_H
27#include <limits.h>
28#endif
29
Daniel Veillard4255d502002-04-16 15:50:10 +000030#include <libxml/tree.h>
31#include <libxml/parserInternals.h>
32#include <libxml/xmlregexp.h>
33#include <libxml/xmlautomata.h>
34#include <libxml/xmlunicode.h>
35
Daniel Veillardebe48c62003-12-03 12:12:27 +000036#ifndef INT_MAX
37#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
38#endif
39
Daniel Veillardc0826a72004-08-10 14:17:33 +000040/* #define DEBUG_REGEXP_GRAPH */
41/* #define DEBUG_REGEXP_EXEC */
Daniel Veillard4255d502002-04-16 15:50:10 +000042/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000043/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000044
Daniel Veillardff46a042003-10-08 08:53:17 +000045#define ERROR(str) \
46 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
47 xmlRegexpErrCompile(ctxt, str);
Daniel Veillard4255d502002-04-16 15:50:10 +000048#define NEXT ctxt->cur++
49#define CUR (*(ctxt->cur))
50#define NXT(index) (ctxt->cur[index])
51
52#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
53#define NEXTL(l) ctxt->cur += l;
Daniel Veillardc0826a72004-08-10 14:17:33 +000054#define XML_REG_STRING_SEPARATOR '|'
Daniel Veillard4255d502002-04-16 15:50:10 +000055
Daniel Veillarde19fc232002-04-22 16:01:24 +000056/**
57 * TODO:
58 *
59 * macro to flag unimplemented blocks
60 */
61#define TODO \
62 xmlGenericError(xmlGenericErrorContext, \
63 "Unimplemented block at %s:%d\n", \
64 __FILE__, __LINE__);
65
Daniel Veillard4255d502002-04-16 15:50:10 +000066/************************************************************************
67 * *
68 * Datatypes and structures *
69 * *
70 ************************************************************************/
71
72typedef enum {
73 XML_REGEXP_EPSILON = 1,
74 XML_REGEXP_CHARVAL,
75 XML_REGEXP_RANGES,
76 XML_REGEXP_SUBREG,
77 XML_REGEXP_STRING,
78 XML_REGEXP_ANYCHAR, /* . */
79 XML_REGEXP_ANYSPACE, /* \s */
80 XML_REGEXP_NOTSPACE, /* \S */
81 XML_REGEXP_INITNAME, /* \l */
82 XML_REGEXP_NOTINITNAME, /* \l */
83 XML_REGEXP_NAMECHAR, /* \c */
84 XML_REGEXP_NOTNAMECHAR, /* \C */
85 XML_REGEXP_DECIMAL, /* \d */
86 XML_REGEXP_NOTDECIMAL, /* \d */
87 XML_REGEXP_REALCHAR, /* \w */
88 XML_REGEXP_NOTREALCHAR, /* \w */
89 XML_REGEXP_LETTER,
90 XML_REGEXP_LETTER_UPPERCASE,
91 XML_REGEXP_LETTER_LOWERCASE,
92 XML_REGEXP_LETTER_TITLECASE,
93 XML_REGEXP_LETTER_MODIFIER,
94 XML_REGEXP_LETTER_OTHERS,
95 XML_REGEXP_MARK,
96 XML_REGEXP_MARK_NONSPACING,
97 XML_REGEXP_MARK_SPACECOMBINING,
98 XML_REGEXP_MARK_ENCLOSING,
99 XML_REGEXP_NUMBER,
100 XML_REGEXP_NUMBER_DECIMAL,
101 XML_REGEXP_NUMBER_LETTER,
102 XML_REGEXP_NUMBER_OTHERS,
103 XML_REGEXP_PUNCT,
104 XML_REGEXP_PUNCT_CONNECTOR,
105 XML_REGEXP_PUNCT_DASH,
106 XML_REGEXP_PUNCT_OPEN,
107 XML_REGEXP_PUNCT_CLOSE,
108 XML_REGEXP_PUNCT_INITQUOTE,
109 XML_REGEXP_PUNCT_FINQUOTE,
110 XML_REGEXP_PUNCT_OTHERS,
111 XML_REGEXP_SEPAR,
112 XML_REGEXP_SEPAR_SPACE,
113 XML_REGEXP_SEPAR_LINE,
114 XML_REGEXP_SEPAR_PARA,
115 XML_REGEXP_SYMBOL,
116 XML_REGEXP_SYMBOL_MATH,
117 XML_REGEXP_SYMBOL_CURRENCY,
118 XML_REGEXP_SYMBOL_MODIFIER,
119 XML_REGEXP_SYMBOL_OTHERS,
120 XML_REGEXP_OTHER,
121 XML_REGEXP_OTHER_CONTROL,
122 XML_REGEXP_OTHER_FORMAT,
123 XML_REGEXP_OTHER_PRIVATE,
124 XML_REGEXP_OTHER_NA,
125 XML_REGEXP_BLOCK_NAME
126} xmlRegAtomType;
127
128typedef enum {
129 XML_REGEXP_QUANT_EPSILON = 1,
130 XML_REGEXP_QUANT_ONCE,
131 XML_REGEXP_QUANT_OPT,
132 XML_REGEXP_QUANT_MULT,
133 XML_REGEXP_QUANT_PLUS,
Daniel Veillard7646b182002-04-20 06:41:40 +0000134 XML_REGEXP_QUANT_ONCEONLY,
135 XML_REGEXP_QUANT_ALL,
Daniel Veillard4255d502002-04-16 15:50:10 +0000136 XML_REGEXP_QUANT_RANGE
137} xmlRegQuantType;
138
139typedef enum {
140 XML_REGEXP_START_STATE = 1,
141 XML_REGEXP_FINAL_STATE,
Daniel Veillardcc026dc2005-01-12 13:21:17 +0000142 XML_REGEXP_TRANS_STATE,
143 XML_REGEXP_SINK_STATE
Daniel Veillard4255d502002-04-16 15:50:10 +0000144} xmlRegStateType;
145
146typedef enum {
147 XML_REGEXP_MARK_NORMAL = 0,
148 XML_REGEXP_MARK_START,
149 XML_REGEXP_MARK_VISITED
150} xmlRegMarkedType;
151
152typedef struct _xmlRegRange xmlRegRange;
153typedef xmlRegRange *xmlRegRangePtr;
154
155struct _xmlRegRange {
Daniel Veillardf8b9de32003-11-24 14:27:26 +0000156 int neg; /* 0 normal, 1 not, 2 exclude */
Daniel Veillard4255d502002-04-16 15:50:10 +0000157 xmlRegAtomType type;
158 int start;
159 int end;
160 xmlChar *blockName;
161};
162
163typedef struct _xmlRegAtom xmlRegAtom;
164typedef xmlRegAtom *xmlRegAtomPtr;
165
166typedef struct _xmlAutomataState xmlRegState;
167typedef xmlRegState *xmlRegStatePtr;
168
169struct _xmlRegAtom {
170 int no;
171 xmlRegAtomType type;
172 xmlRegQuantType quant;
173 int min;
174 int max;
175
176 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000177 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000178 int neg;
179 int codepoint;
180 xmlRegStatePtr start;
181 xmlRegStatePtr stop;
182 int maxRanges;
183 int nbRanges;
184 xmlRegRangePtr *ranges;
185 void *data;
186};
187
188typedef struct _xmlRegCounter xmlRegCounter;
189typedef xmlRegCounter *xmlRegCounterPtr;
190
191struct _xmlRegCounter {
192 int min;
193 int max;
194};
195
196typedef struct _xmlRegTrans xmlRegTrans;
197typedef xmlRegTrans *xmlRegTransPtr;
198
199struct _xmlRegTrans {
200 xmlRegAtomPtr atom;
201 int to;
202 int counter;
203 int count;
204};
205
206struct _xmlAutomataState {
207 xmlRegStateType type;
208 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000209 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000210 int no;
Daniel Veillard4255d502002-04-16 15:50:10 +0000211 int maxTrans;
212 int nbTrans;
213 xmlRegTrans *trans;
214};
215
216typedef struct _xmlAutomata xmlRegParserCtxt;
217typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
218
219struct _xmlAutomata {
220 xmlChar *string;
221 xmlChar *cur;
222
223 int error;
224 int neg;
225
226 xmlRegStatePtr start;
227 xmlRegStatePtr end;
228 xmlRegStatePtr state;
229
230 xmlRegAtomPtr atom;
231
232 int maxAtoms;
233 int nbAtoms;
234 xmlRegAtomPtr *atoms;
235
236 int maxStates;
237 int nbStates;
238 xmlRegStatePtr *states;
239
240 int maxCounters;
241 int nbCounters;
242 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000243
244 int determinist;
Daniel Veillard4255d502002-04-16 15:50:10 +0000245};
246
247struct _xmlRegexp {
248 xmlChar *string;
249 int nbStates;
250 xmlRegStatePtr *states;
251 int nbAtoms;
252 xmlRegAtomPtr *atoms;
253 int nbCounters;
254 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000255 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000256 /*
257 * That's the compact form for determinists automatas
258 */
259 int nbstates;
260 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000261 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000262 int nbstrings;
263 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000264};
265
266typedef struct _xmlRegExecRollback xmlRegExecRollback;
267typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
268
269struct _xmlRegExecRollback {
270 xmlRegStatePtr state;/* the current state */
271 int index; /* the index in the input stack */
272 int nextbranch; /* the next transition to explore in that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000273 int *counts; /* save the automata state if it has some */
Daniel Veillard4255d502002-04-16 15:50:10 +0000274};
275
276typedef struct _xmlRegInputToken xmlRegInputToken;
277typedef xmlRegInputToken *xmlRegInputTokenPtr;
278
279struct _xmlRegInputToken {
280 xmlChar *value;
281 void *data;
282};
283
284struct _xmlRegExecCtxt {
285 int status; /* execution status != 0 indicate an error */
William M. Brackddf71d62004-05-06 04:17:26 +0000286 int determinist; /* did we find an indeterministic behaviour */
Daniel Veillard4255d502002-04-16 15:50:10 +0000287 xmlRegexpPtr comp; /* the compiled regexp */
288 xmlRegExecCallbacks callback;
289 void *data;
290
291 xmlRegStatePtr state;/* the current state */
292 int transno; /* the current transition on that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000293 int transcount; /* the number of chars in char counted transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +0000294
295 /*
296 * A stack of rollback states
297 */
298 int maxRollbacks;
299 int nbRollbacks;
300 xmlRegExecRollback *rollbacks;
301
302 /*
303 * The state of the automata if any
304 */
305 int *counts;
306
307 /*
308 * The input stack
309 */
310 int inputStackMax;
311 int inputStackNr;
312 int index;
313 int *charStack;
314 const xmlChar *inputString; /* when operating on characters */
315 xmlRegInputTokenPtr inputStack;/* when operating on strings */
316
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +0000317 /*
318 * error handling
319 */
320 int errStateNo; /* the error state number */
321 xmlRegStatePtr errState; /* the error state */
322 xmlChar *errString; /* the string raising the error */
323 int *errCounts; /* counters at the error state */
Daniel Veillard4255d502002-04-16 15:50:10 +0000324};
325
Daniel Veillard441bc322002-04-20 17:38:48 +0000326#define REGEXP_ALL_COUNTER 0x123456
327#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000328
Daniel Veillard4255d502002-04-16 15:50:10 +0000329static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000330static void xmlRegFreeState(xmlRegStatePtr state);
331static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard9efc4762005-07-19 14:33:55 +0000332static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
Daniel Veillard4255d502002-04-16 15:50:10 +0000333
334/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000335 * *
336 * Regexp memory error handler *
337 * *
338 ************************************************************************/
339/**
340 * xmlRegexpErrMemory:
William M. Brackddf71d62004-05-06 04:17:26 +0000341 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000342 *
343 * Handle an out of memory condition
344 */
345static void
346xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
347{
348 const char *regexp = NULL;
349 if (ctxt != NULL) {
350 regexp = (const char *) ctxt->string;
351 ctxt->error = XML_ERR_NO_MEMORY;
352 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000353 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000354 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
355 regexp, NULL, 0, 0,
356 "Memory allocation failed : %s\n", extra);
357}
358
359/**
360 * xmlRegexpErrCompile:
William M. Brackddf71d62004-05-06 04:17:26 +0000361 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000362 *
William M. Brackddf71d62004-05-06 04:17:26 +0000363 * Handle a compilation failure
Daniel Veillardff46a042003-10-08 08:53:17 +0000364 */
365static void
366xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
367{
368 const char *regexp = NULL;
369 int idx = 0;
370
371 if (ctxt != NULL) {
372 regexp = (const char *) ctxt->string;
373 idx = ctxt->cur - ctxt->string;
374 ctxt->error = XML_REGEXP_COMPILE_ERROR;
375 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000376 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000377 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
378 regexp, NULL, idx, 0,
379 "failed to compile: %s\n", extra);
380}
381
382/************************************************************************
Daniel Veillard4255d502002-04-16 15:50:10 +0000383 * *
384 * Allocation/Deallocation *
385 * *
386 ************************************************************************/
387
Daniel Veillard23e73572002-09-19 19:56:43 +0000388static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000389/**
390 * xmlRegEpxFromParse:
391 * @ctxt: the parser context used to build it
392 *
William M. Brackddf71d62004-05-06 04:17:26 +0000393 * Allocate a new regexp and fill it with the result from the parser
Daniel Veillard4255d502002-04-16 15:50:10 +0000394 *
395 * Returns the new regexp or NULL in case of error
396 */
397static xmlRegexpPtr
398xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
399 xmlRegexpPtr ret;
400
401 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000402 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000403 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000404 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000405 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000406 memset(ret, 0, sizeof(xmlRegexp));
407 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000408 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000409 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000410 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000411 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000412 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000413 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000414 ret->determinist = ctxt->determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000415
416 if ((ret->determinist != 0) &&
417 (ret->nbCounters == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000418 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000419 (ret->atoms[0] != NULL) &&
420 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
421 int i, j, nbstates = 0, nbatoms = 0;
422 int *stateRemap;
423 int *stringRemap;
424 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000425 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000426 xmlChar **stringMap;
427 xmlChar *value;
428
429 /*
430 * Switch to a compact representation
431 * 1/ counting the effective number of states left
William M. Brackddf71d62004-05-06 04:17:26 +0000432 * 2/ counting the unique number of atoms, and check that
Daniel Veillard23e73572002-09-19 19:56:43 +0000433 * they are all of the string type
434 * 3/ build a table state x atom for the transitions
435 */
436
437 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000438 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000439 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000440 xmlFree(ret);
441 return(NULL);
442 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000443 for (i = 0;i < ret->nbStates;i++) {
444 if (ret->states[i] != NULL) {
445 stateRemap[i] = nbstates;
446 nbstates++;
447 } else {
448 stateRemap[i] = -1;
449 }
450 }
451#ifdef DEBUG_COMPACTION
452 printf("Final: %d states\n", nbstates);
453#endif
454 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000455 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000456 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000457 xmlFree(stateRemap);
458 xmlFree(ret);
459 return(NULL);
460 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000461 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000462 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000463 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000464 xmlFree(stringMap);
465 xmlFree(stateRemap);
466 xmlFree(ret);
467 return(NULL);
468 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000469 for (i = 0;i < ret->nbAtoms;i++) {
470 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
471 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
472 value = ret->atoms[i]->valuep;
473 for (j = 0;j < nbatoms;j++) {
474 if (xmlStrEqual(stringMap[j], value)) {
475 stringRemap[i] = j;
476 break;
477 }
478 }
479 if (j >= nbatoms) {
480 stringRemap[i] = nbatoms;
481 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000482 if (stringMap[nbatoms] == NULL) {
483 for (i = 0;i < nbatoms;i++)
484 xmlFree(stringMap[i]);
485 xmlFree(stringRemap);
486 xmlFree(stringMap);
487 xmlFree(stateRemap);
488 xmlFree(ret);
489 return(NULL);
490 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000491 nbatoms++;
492 }
493 } else {
494 xmlFree(stateRemap);
495 xmlFree(stringRemap);
496 for (i = 0;i < nbatoms;i++)
497 xmlFree(stringMap[i]);
498 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000499 xmlFree(ret);
500 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000501 }
502 }
503#ifdef DEBUG_COMPACTION
504 printf("Final: %d atoms\n", nbatoms);
505#endif
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000506 transitions = (int *) xmlMalloc((nbstates + 1) *
507 (nbatoms + 1) * sizeof(int));
508 if (transitions == NULL) {
509 xmlFree(stateRemap);
510 xmlFree(stringRemap);
511 xmlFree(stringMap);
512 xmlFree(ret);
513 return(NULL);
514 }
515 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
Daniel Veillard23e73572002-09-19 19:56:43 +0000516
517 /*
518 * Allocate the transition table. The first entry for each
William M. Brackddf71d62004-05-06 04:17:26 +0000519 * state corresponds to the state type.
Daniel Veillard23e73572002-09-19 19:56:43 +0000520 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000521 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000522
523 for (i = 0;i < ret->nbStates;i++) {
524 int stateno, atomno, targetno, prev;
525 xmlRegStatePtr state;
526 xmlRegTransPtr trans;
527
528 stateno = stateRemap[i];
529 if (stateno == -1)
530 continue;
531 state = ret->states[i];
532
533 transitions[stateno * (nbatoms + 1)] = state->type;
534
535 for (j = 0;j < state->nbTrans;j++) {
536 trans = &(state->trans[j]);
537 if ((trans->to == -1) || (trans->atom == NULL))
538 continue;
539 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000540 if ((trans->atom->data != NULL) && (transdata == NULL)) {
541 transdata = (void **) xmlMalloc(nbstates * nbatoms *
542 sizeof(void *));
543 if (transdata != NULL)
544 memset(transdata, 0,
545 nbstates * nbatoms * sizeof(void *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000546 else {
Daniel Veillardff46a042003-10-08 08:53:17 +0000547 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000548 break;
549 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000550 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000551 targetno = stateRemap[trans->to];
552 /*
William M. Brackddf71d62004-05-06 04:17:26 +0000553 * if the same atom can generate transitions to 2 different
Daniel Veillard23e73572002-09-19 19:56:43 +0000554 * states then it means the automata is not determinist and
555 * the compact form can't be used !
556 */
557 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
558 if (prev != 0) {
559 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000560 ret->determinist = 0;
561#ifdef DEBUG_COMPACTION
562 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
563 i, j, trans->atom->no, trans->to, atomno, targetno);
564 printf(" previous to is %d\n", prev);
565#endif
566 ret->determinist = 0;
Daniel Veillard118aed72002-09-24 14:13:13 +0000567 if (transdata != NULL)
568 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000569 xmlFree(transitions);
570 xmlFree(stateRemap);
571 xmlFree(stringRemap);
572 for (i = 0;i < nbatoms;i++)
573 xmlFree(stringMap[i]);
574 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000575 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000576 }
577 } else {
578#if 0
579 printf("State %d trans %d: atom %d to %d : %d to %d\n",
580 i, j, trans->atom->no, trans->to, atomno, targetno);
581#endif
582 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000583 targetno + 1; /* to avoid 0 */
584 if (transdata != NULL)
585 transdata[stateno * nbatoms + atomno] =
586 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000587 }
588 }
589 }
590 ret->determinist = 1;
591#ifdef DEBUG_COMPACTION
592 /*
593 * Debug
594 */
595 for (i = 0;i < nbstates;i++) {
596 for (j = 0;j < nbatoms + 1;j++) {
597 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
598 }
599 printf("\n");
600 }
601 printf("\n");
602#endif
603 /*
604 * Cleanup of the old data
605 */
606 if (ret->states != NULL) {
607 for (i = 0;i < ret->nbStates;i++)
608 xmlRegFreeState(ret->states[i]);
609 xmlFree(ret->states);
610 }
611 ret->states = NULL;
612 ret->nbStates = 0;
613 if (ret->atoms != NULL) {
614 for (i = 0;i < ret->nbAtoms;i++)
615 xmlRegFreeAtom(ret->atoms[i]);
616 xmlFree(ret->atoms);
617 }
618 ret->atoms = NULL;
619 ret->nbAtoms = 0;
620
621 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000622 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000623 ret->stringMap = stringMap;
624 ret->nbstrings = nbatoms;
625 ret->nbstates = nbstates;
626 xmlFree(stateRemap);
627 xmlFree(stringRemap);
628 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000629not_determ:
630 ctxt->string = NULL;
631 ctxt->nbStates = 0;
632 ctxt->states = NULL;
633 ctxt->nbAtoms = 0;
634 ctxt->atoms = NULL;
635 ctxt->nbCounters = 0;
636 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000637 return(ret);
638}
639
640/**
641 * xmlRegNewParserCtxt:
642 * @string: the string to parse
643 *
644 * Allocate a new regexp parser context
645 *
646 * Returns the new context or NULL in case of error
647 */
648static xmlRegParserCtxtPtr
649xmlRegNewParserCtxt(const xmlChar *string) {
650 xmlRegParserCtxtPtr ret;
651
652 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
653 if (ret == NULL)
654 return(NULL);
655 memset(ret, 0, sizeof(xmlRegParserCtxt));
656 if (string != NULL)
657 ret->string = xmlStrdup(string);
658 ret->cur = ret->string;
659 ret->neg = 0;
660 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000661 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000662 return(ret);
663}
664
665/**
666 * xmlRegNewRange:
667 * @ctxt: the regexp parser context
668 * @neg: is that negative
669 * @type: the type of range
670 * @start: the start codepoint
671 * @end: the end codepoint
672 *
673 * Allocate a new regexp range
674 *
675 * Returns the new range or NULL in case of error
676 */
677static xmlRegRangePtr
678xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
679 int neg, xmlRegAtomType type, int start, int end) {
680 xmlRegRangePtr ret;
681
682 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
683 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000684 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000685 return(NULL);
686 }
687 ret->neg = neg;
688 ret->type = type;
689 ret->start = start;
690 ret->end = end;
691 return(ret);
692}
693
694/**
695 * xmlRegFreeRange:
696 * @range: the regexp range
697 *
698 * Free a regexp range
699 */
700static void
701xmlRegFreeRange(xmlRegRangePtr range) {
702 if (range == NULL)
703 return;
704
705 if (range->blockName != NULL)
706 xmlFree(range->blockName);
707 xmlFree(range);
708}
709
710/**
711 * xmlRegNewAtom:
712 * @ctxt: the regexp parser context
713 * @type: the type of atom
714 *
715 * Allocate a new regexp range
716 *
717 * Returns the new atom or NULL in case of error
718 */
719static xmlRegAtomPtr
720xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
721 xmlRegAtomPtr ret;
722
723 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
724 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000725 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000726 return(NULL);
727 }
728 memset(ret, 0, sizeof(xmlRegAtom));
729 ret->type = type;
730 ret->quant = XML_REGEXP_QUANT_ONCE;
731 ret->min = 0;
732 ret->max = 0;
733 return(ret);
734}
735
736/**
737 * xmlRegFreeAtom:
738 * @atom: the regexp atom
739 *
740 * Free a regexp atom
741 */
742static void
743xmlRegFreeAtom(xmlRegAtomPtr atom) {
744 int i;
745
746 if (atom == NULL)
747 return;
748
749 for (i = 0;i < atom->nbRanges;i++)
750 xmlRegFreeRange(atom->ranges[i]);
751 if (atom->ranges != NULL)
752 xmlFree(atom->ranges);
Daniel Veillardde0e4982005-07-03 14:35:44 +0000753 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
754 xmlFree(atom->valuep);
755 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +0000756 xmlFree(atom->valuep);
757 xmlFree(atom);
758}
759
760static xmlRegStatePtr
761xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
762 xmlRegStatePtr ret;
763
764 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
765 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000766 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000767 return(NULL);
768 }
769 memset(ret, 0, sizeof(xmlRegState));
770 ret->type = XML_REGEXP_TRANS_STATE;
771 ret->mark = XML_REGEXP_MARK_NORMAL;
772 return(ret);
773}
774
775/**
776 * xmlRegFreeState:
777 * @state: the regexp state
778 *
779 * Free a regexp state
780 */
781static void
782xmlRegFreeState(xmlRegStatePtr state) {
783 if (state == NULL)
784 return;
785
786 if (state->trans != NULL)
787 xmlFree(state->trans);
788 xmlFree(state);
789}
790
791/**
792 * xmlRegFreeParserCtxt:
793 * @ctxt: the regexp parser context
794 *
795 * Free a regexp parser context
796 */
797static void
798xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
799 int i;
800 if (ctxt == NULL)
801 return;
802
803 if (ctxt->string != NULL)
804 xmlFree(ctxt->string);
805 if (ctxt->states != NULL) {
806 for (i = 0;i < ctxt->nbStates;i++)
807 xmlRegFreeState(ctxt->states[i]);
808 xmlFree(ctxt->states);
809 }
810 if (ctxt->atoms != NULL) {
811 for (i = 0;i < ctxt->nbAtoms;i++)
812 xmlRegFreeAtom(ctxt->atoms[i]);
813 xmlFree(ctxt->atoms);
814 }
815 if (ctxt->counters != NULL)
816 xmlFree(ctxt->counters);
817 xmlFree(ctxt);
818}
819
820/************************************************************************
821 * *
822 * Display of Data structures *
823 * *
824 ************************************************************************/
825
826static void
827xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
828 switch (type) {
829 case XML_REGEXP_EPSILON:
830 fprintf(output, "epsilon "); break;
831 case XML_REGEXP_CHARVAL:
832 fprintf(output, "charval "); break;
833 case XML_REGEXP_RANGES:
834 fprintf(output, "ranges "); break;
835 case XML_REGEXP_SUBREG:
836 fprintf(output, "subexpr "); break;
837 case XML_REGEXP_STRING:
838 fprintf(output, "string "); break;
839 case XML_REGEXP_ANYCHAR:
840 fprintf(output, "anychar "); break;
841 case XML_REGEXP_ANYSPACE:
842 fprintf(output, "anyspace "); break;
843 case XML_REGEXP_NOTSPACE:
844 fprintf(output, "notspace "); break;
845 case XML_REGEXP_INITNAME:
846 fprintf(output, "initname "); break;
847 case XML_REGEXP_NOTINITNAME:
848 fprintf(output, "notinitname "); break;
849 case XML_REGEXP_NAMECHAR:
850 fprintf(output, "namechar "); break;
851 case XML_REGEXP_NOTNAMECHAR:
852 fprintf(output, "notnamechar "); break;
853 case XML_REGEXP_DECIMAL:
854 fprintf(output, "decimal "); break;
855 case XML_REGEXP_NOTDECIMAL:
856 fprintf(output, "notdecimal "); break;
857 case XML_REGEXP_REALCHAR:
858 fprintf(output, "realchar "); break;
859 case XML_REGEXP_NOTREALCHAR:
860 fprintf(output, "notrealchar "); break;
861 case XML_REGEXP_LETTER:
862 fprintf(output, "LETTER "); break;
863 case XML_REGEXP_LETTER_UPPERCASE:
864 fprintf(output, "LETTER_UPPERCASE "); break;
865 case XML_REGEXP_LETTER_LOWERCASE:
866 fprintf(output, "LETTER_LOWERCASE "); break;
867 case XML_REGEXP_LETTER_TITLECASE:
868 fprintf(output, "LETTER_TITLECASE "); break;
869 case XML_REGEXP_LETTER_MODIFIER:
870 fprintf(output, "LETTER_MODIFIER "); break;
871 case XML_REGEXP_LETTER_OTHERS:
872 fprintf(output, "LETTER_OTHERS "); break;
873 case XML_REGEXP_MARK:
874 fprintf(output, "MARK "); break;
875 case XML_REGEXP_MARK_NONSPACING:
876 fprintf(output, "MARK_NONSPACING "); break;
877 case XML_REGEXP_MARK_SPACECOMBINING:
878 fprintf(output, "MARK_SPACECOMBINING "); break;
879 case XML_REGEXP_MARK_ENCLOSING:
880 fprintf(output, "MARK_ENCLOSING "); break;
881 case XML_REGEXP_NUMBER:
882 fprintf(output, "NUMBER "); break;
883 case XML_REGEXP_NUMBER_DECIMAL:
884 fprintf(output, "NUMBER_DECIMAL "); break;
885 case XML_REGEXP_NUMBER_LETTER:
886 fprintf(output, "NUMBER_LETTER "); break;
887 case XML_REGEXP_NUMBER_OTHERS:
888 fprintf(output, "NUMBER_OTHERS "); break;
889 case XML_REGEXP_PUNCT:
890 fprintf(output, "PUNCT "); break;
891 case XML_REGEXP_PUNCT_CONNECTOR:
892 fprintf(output, "PUNCT_CONNECTOR "); break;
893 case XML_REGEXP_PUNCT_DASH:
894 fprintf(output, "PUNCT_DASH "); break;
895 case XML_REGEXP_PUNCT_OPEN:
896 fprintf(output, "PUNCT_OPEN "); break;
897 case XML_REGEXP_PUNCT_CLOSE:
898 fprintf(output, "PUNCT_CLOSE "); break;
899 case XML_REGEXP_PUNCT_INITQUOTE:
900 fprintf(output, "PUNCT_INITQUOTE "); break;
901 case XML_REGEXP_PUNCT_FINQUOTE:
902 fprintf(output, "PUNCT_FINQUOTE "); break;
903 case XML_REGEXP_PUNCT_OTHERS:
904 fprintf(output, "PUNCT_OTHERS "); break;
905 case XML_REGEXP_SEPAR:
906 fprintf(output, "SEPAR "); break;
907 case XML_REGEXP_SEPAR_SPACE:
908 fprintf(output, "SEPAR_SPACE "); break;
909 case XML_REGEXP_SEPAR_LINE:
910 fprintf(output, "SEPAR_LINE "); break;
911 case XML_REGEXP_SEPAR_PARA:
912 fprintf(output, "SEPAR_PARA "); break;
913 case XML_REGEXP_SYMBOL:
914 fprintf(output, "SYMBOL "); break;
915 case XML_REGEXP_SYMBOL_MATH:
916 fprintf(output, "SYMBOL_MATH "); break;
917 case XML_REGEXP_SYMBOL_CURRENCY:
918 fprintf(output, "SYMBOL_CURRENCY "); break;
919 case XML_REGEXP_SYMBOL_MODIFIER:
920 fprintf(output, "SYMBOL_MODIFIER "); break;
921 case XML_REGEXP_SYMBOL_OTHERS:
922 fprintf(output, "SYMBOL_OTHERS "); break;
923 case XML_REGEXP_OTHER:
924 fprintf(output, "OTHER "); break;
925 case XML_REGEXP_OTHER_CONTROL:
926 fprintf(output, "OTHER_CONTROL "); break;
927 case XML_REGEXP_OTHER_FORMAT:
928 fprintf(output, "OTHER_FORMAT "); break;
929 case XML_REGEXP_OTHER_PRIVATE:
930 fprintf(output, "OTHER_PRIVATE "); break;
931 case XML_REGEXP_OTHER_NA:
932 fprintf(output, "OTHER_NA "); break;
933 case XML_REGEXP_BLOCK_NAME:
934 fprintf(output, "BLOCK "); break;
935 }
936}
937
938static void
939xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
940 switch (type) {
941 case XML_REGEXP_QUANT_EPSILON:
942 fprintf(output, "epsilon "); break;
943 case XML_REGEXP_QUANT_ONCE:
944 fprintf(output, "once "); break;
945 case XML_REGEXP_QUANT_OPT:
946 fprintf(output, "? "); break;
947 case XML_REGEXP_QUANT_MULT:
948 fprintf(output, "* "); break;
949 case XML_REGEXP_QUANT_PLUS:
950 fprintf(output, "+ "); break;
951 case XML_REGEXP_QUANT_RANGE:
952 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000953 case XML_REGEXP_QUANT_ONCEONLY:
954 fprintf(output, "onceonly "); break;
955 case XML_REGEXP_QUANT_ALL:
956 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000957 }
958}
959static void
960xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
961 fprintf(output, " range: ");
962 if (range->neg)
963 fprintf(output, "negative ");
964 xmlRegPrintAtomType(output, range->type);
965 fprintf(output, "%c - %c\n", range->start, range->end);
966}
967
968static void
969xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
970 fprintf(output, " atom: ");
971 if (atom == NULL) {
972 fprintf(output, "NULL\n");
973 return;
974 }
Daniel Veillard9efc4762005-07-19 14:33:55 +0000975 if (atom->neg)
976 fprintf(output, "not ");
Daniel Veillard4255d502002-04-16 15:50:10 +0000977 xmlRegPrintAtomType(output, atom->type);
978 xmlRegPrintQuantType(output, atom->quant);
979 if (atom->quant == XML_REGEXP_QUANT_RANGE)
980 fprintf(output, "%d-%d ", atom->min, atom->max);
981 if (atom->type == XML_REGEXP_STRING)
982 fprintf(output, "'%s' ", (char *) atom->valuep);
983 if (atom->type == XML_REGEXP_CHARVAL)
984 fprintf(output, "char %c\n", atom->codepoint);
985 else if (atom->type == XML_REGEXP_RANGES) {
986 int i;
987 fprintf(output, "%d entries\n", atom->nbRanges);
988 for (i = 0; i < atom->nbRanges;i++)
989 xmlRegPrintRange(output, atom->ranges[i]);
990 } else if (atom->type == XML_REGEXP_SUBREG) {
991 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
992 } else {
993 fprintf(output, "\n");
994 }
995}
996
997static void
998xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
999 fprintf(output, " trans: ");
1000 if (trans == NULL) {
1001 fprintf(output, "NULL\n");
1002 return;
1003 }
1004 if (trans->to < 0) {
1005 fprintf(output, "removed\n");
1006 return;
1007 }
1008 if (trans->counter >= 0) {
1009 fprintf(output, "counted %d, ", trans->counter);
1010 }
Daniel Veillard8a001f62002-04-20 07:24:11 +00001011 if (trans->count == REGEXP_ALL_COUNTER) {
1012 fprintf(output, "all transition, ");
1013 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001014 fprintf(output, "count based %d, ", trans->count);
1015 }
1016 if (trans->atom == NULL) {
1017 fprintf(output, "epsilon to %d\n", trans->to);
1018 return;
1019 }
1020 if (trans->atom->type == XML_REGEXP_CHARVAL)
1021 fprintf(output, "char %c ", trans->atom->codepoint);
1022 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1023}
1024
1025static void
1026xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1027 int i;
1028
1029 fprintf(output, " state: ");
1030 if (state == NULL) {
1031 fprintf(output, "NULL\n");
1032 return;
1033 }
1034 if (state->type == XML_REGEXP_START_STATE)
1035 fprintf(output, "START ");
1036 if (state->type == XML_REGEXP_FINAL_STATE)
1037 fprintf(output, "FINAL ");
1038
1039 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1040 for (i = 0;i < state->nbTrans; i++) {
1041 xmlRegPrintTrans(output, &(state->trans[i]));
1042 }
1043}
1044
Daniel Veillard23e73572002-09-19 19:56:43 +00001045#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard4255d502002-04-16 15:50:10 +00001046static void
1047xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1048 int i;
1049
1050 fprintf(output, " ctxt: ");
1051 if (ctxt == NULL) {
1052 fprintf(output, "NULL\n");
1053 return;
1054 }
1055 fprintf(output, "'%s' ", ctxt->string);
1056 if (ctxt->error)
1057 fprintf(output, "error ");
1058 if (ctxt->neg)
1059 fprintf(output, "neg ");
1060 fprintf(output, "\n");
1061 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1062 for (i = 0;i < ctxt->nbAtoms; i++) {
1063 fprintf(output, " %02d ", i);
1064 xmlRegPrintAtom(output, ctxt->atoms[i]);
1065 }
1066 if (ctxt->atom != NULL) {
1067 fprintf(output, "current atom:\n");
1068 xmlRegPrintAtom(output, ctxt->atom);
1069 }
1070 fprintf(output, "%d states:", ctxt->nbStates);
1071 if (ctxt->start != NULL)
1072 fprintf(output, " start: %d", ctxt->start->no);
1073 if (ctxt->end != NULL)
1074 fprintf(output, " end: %d", ctxt->end->no);
1075 fprintf(output, "\n");
1076 for (i = 0;i < ctxt->nbStates; i++) {
1077 xmlRegPrintState(output, ctxt->states[i]);
1078 }
1079 fprintf(output, "%d counters:\n", ctxt->nbCounters);
1080 for (i = 0;i < ctxt->nbCounters; i++) {
1081 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1082 ctxt->counters[i].max);
1083 }
1084}
Daniel Veillard23e73572002-09-19 19:56:43 +00001085#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001086
1087/************************************************************************
1088 * *
1089 * Finite Automata structures manipulations *
1090 * *
1091 ************************************************************************/
1092
1093static void
1094xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1095 int neg, xmlRegAtomType type, int start, int end,
1096 xmlChar *blockName) {
1097 xmlRegRangePtr range;
1098
1099 if (atom == NULL) {
1100 ERROR("add range: atom is NULL");
1101 return;
1102 }
1103 if (atom->type != XML_REGEXP_RANGES) {
1104 ERROR("add range: atom is not ranges");
1105 return;
1106 }
1107 if (atom->maxRanges == 0) {
1108 atom->maxRanges = 4;
1109 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1110 sizeof(xmlRegRangePtr));
1111 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001112 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001113 atom->maxRanges = 0;
1114 return;
1115 }
1116 } else if (atom->nbRanges >= atom->maxRanges) {
1117 xmlRegRangePtr *tmp;
1118 atom->maxRanges *= 2;
1119 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1120 sizeof(xmlRegRangePtr));
1121 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001122 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001123 atom->maxRanges /= 2;
1124 return;
1125 }
1126 atom->ranges = tmp;
1127 }
1128 range = xmlRegNewRange(ctxt, neg, type, start, end);
1129 if (range == NULL)
1130 return;
1131 range->blockName = blockName;
1132 atom->ranges[atom->nbRanges++] = range;
1133
1134}
1135
1136static int
1137xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1138 if (ctxt->maxCounters == 0) {
1139 ctxt->maxCounters = 4;
1140 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1141 sizeof(xmlRegCounter));
1142 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001143 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001144 ctxt->maxCounters = 0;
1145 return(-1);
1146 }
1147 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1148 xmlRegCounter *tmp;
1149 ctxt->maxCounters *= 2;
1150 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1151 sizeof(xmlRegCounter));
1152 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001153 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001154 ctxt->maxCounters /= 2;
1155 return(-1);
1156 }
1157 ctxt->counters = tmp;
1158 }
1159 ctxt->counters[ctxt->nbCounters].min = -1;
1160 ctxt->counters[ctxt->nbCounters].max = -1;
1161 return(ctxt->nbCounters++);
1162}
1163
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001164static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001165xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1166 if (atom == NULL) {
1167 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001168 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001169 }
1170 if (ctxt->maxAtoms == 0) {
1171 ctxt->maxAtoms = 4;
1172 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1173 sizeof(xmlRegAtomPtr));
1174 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001175 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001176 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001177 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001178 }
1179 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1180 xmlRegAtomPtr *tmp;
1181 ctxt->maxAtoms *= 2;
1182 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1183 sizeof(xmlRegAtomPtr));
1184 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001185 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001186 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001187 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001188 }
1189 ctxt->atoms = tmp;
1190 }
1191 atom->no = ctxt->nbAtoms;
1192 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001193 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001194}
1195
1196static void
1197xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1198 xmlRegAtomPtr atom, xmlRegStatePtr target,
1199 int counter, int count) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001200
1201 int nrtrans;
1202
Daniel Veillard4255d502002-04-16 15:50:10 +00001203 if (state == NULL) {
1204 ERROR("add state: state is NULL");
1205 return;
1206 }
1207 if (target == NULL) {
1208 ERROR("add state: target is NULL");
1209 return;
1210 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001211 /*
1212 * Other routines follow the philosophy 'When in doubt, add a transition'
1213 * so we check here whether such a transition is already present and, if
1214 * so, silently ignore this request.
1215 */
1216
1217 for (nrtrans=0; nrtrans<state->nbTrans; nrtrans++) {
1218 if ((state->trans[nrtrans].atom == atom) &&
1219 (state->trans[nrtrans].to == target->no) &&
1220 (state->trans[nrtrans].counter == counter) &&
1221 (state->trans[nrtrans].count == count)) {
1222#ifdef DEBUG_REGEXP_GRAPH
1223 printf("Ignoring duplicate transition from %d to %d\n",
1224 state->no, target->no);
1225#endif
1226 return;
1227 }
1228 }
1229
Daniel Veillard4255d502002-04-16 15:50:10 +00001230 if (state->maxTrans == 0) {
1231 state->maxTrans = 4;
1232 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1233 sizeof(xmlRegTrans));
1234 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001235 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001236 state->maxTrans = 0;
1237 return;
1238 }
1239 } else if (state->nbTrans >= state->maxTrans) {
1240 xmlRegTrans *tmp;
1241 state->maxTrans *= 2;
1242 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1243 sizeof(xmlRegTrans));
1244 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001245 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001246 state->maxTrans /= 2;
1247 return;
1248 }
1249 state->trans = tmp;
1250 }
1251#ifdef DEBUG_REGEXP_GRAPH
1252 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001253 if (count == REGEXP_ALL_COUNTER)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001254 printf("all transition\n");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001255 else if (count >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001256 printf("count based %d\n", count);
Daniel Veillard4255d502002-04-16 15:50:10 +00001257 else if (counter >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001258 printf("counted %d\n", counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001259 else if (atom == NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001260 printf("epsilon transition\n");
1261 else if (atom != NULL)
1262 xmlRegPrintAtom(stdout, atom);
Daniel Veillard4255d502002-04-16 15:50:10 +00001263#endif
1264
1265 state->trans[state->nbTrans].atom = atom;
1266 state->trans[state->nbTrans].to = target->no;
1267 state->trans[state->nbTrans].counter = counter;
1268 state->trans[state->nbTrans].count = count;
1269 state->nbTrans++;
1270}
1271
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001272static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001273xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001274 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001275 if (ctxt->maxStates == 0) {
1276 ctxt->maxStates = 4;
1277 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1278 sizeof(xmlRegStatePtr));
1279 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001280 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001281 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001282 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001283 }
1284 } else if (ctxt->nbStates >= ctxt->maxStates) {
1285 xmlRegStatePtr *tmp;
1286 ctxt->maxStates *= 2;
1287 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1288 sizeof(xmlRegStatePtr));
1289 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001290 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001291 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001292 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001293 }
1294 ctxt->states = tmp;
1295 }
1296 state->no = ctxt->nbStates;
1297 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001298 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001299}
1300
1301/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001302 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001303 * @ctxt: a regexp parser context
1304 * @from: the from state
1305 * @to: the target state or NULL for building a new one
1306 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001307 *
1308 */
1309static void
1310xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001311 xmlRegStatePtr from, xmlRegStatePtr to,
1312 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001313 if (to == NULL) {
1314 to = xmlRegNewState(ctxt);
1315 xmlRegStatePush(ctxt, to);
1316 ctxt->state = to;
1317 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001318 if (lax)
1319 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1320 else
1321 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001322}
1323
1324/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001325 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001326 * @ctxt: a regexp parser context
1327 * @from: the from state
1328 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001329 *
1330 */
1331static void
1332xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1333 xmlRegStatePtr from, xmlRegStatePtr to) {
1334 if (to == NULL) {
1335 to = xmlRegNewState(ctxt);
1336 xmlRegStatePush(ctxt, to);
1337 ctxt->state = to;
1338 }
1339 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1340}
1341
1342/**
1343 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001344 * @ctxt: a regexp parser context
1345 * @from: the from state
1346 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001347 * counter: the counter for that transition
1348 *
1349 */
1350static void
1351xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1352 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1353 if (to == NULL) {
1354 to = xmlRegNewState(ctxt);
1355 xmlRegStatePush(ctxt, to);
1356 ctxt->state = to;
1357 }
1358 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1359}
1360
1361/**
1362 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001363 * @ctxt: a regexp parser context
1364 * @from: the from state
1365 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001366 * counter: the counter for that transition
1367 *
1368 */
1369static void
1370xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1371 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1372 if (to == NULL) {
1373 to = xmlRegNewState(ctxt);
1374 xmlRegStatePush(ctxt, to);
1375 ctxt->state = to;
1376 }
1377 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1378}
1379
1380/**
1381 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001382 * @ctxt: a regexp parser context
1383 * @from: the from state
1384 * @to: the target state or NULL for building a new one
1385 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001386 *
William M. Brackddf71d62004-05-06 04:17:26 +00001387 * Returns 0 if success and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001388 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001389static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001390xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1391 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1392 if (atom == NULL) {
1393 ERROR("genrate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001394 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001395 }
1396 if (atom->type == XML_REGEXP_SUBREG) {
1397 /*
1398 * this is a subexpression handling one should not need to
William M. Brackddf71d62004-05-06 04:17:26 +00001399 * create a new node except for XML_REGEXP_QUANT_RANGE.
Daniel Veillard4255d502002-04-16 15:50:10 +00001400 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001401 if (xmlRegAtomPush(ctxt, atom) < 0) {
1402 return(-1);
1403 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001404 if ((to != NULL) && (atom->stop != to) &&
1405 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1406 /*
1407 * Generate an epsilon transition to link to the target
1408 */
1409 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1410 }
1411 switch (atom->quant) {
1412 case XML_REGEXP_QUANT_OPT:
1413 atom->quant = XML_REGEXP_QUANT_ONCE;
1414 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1415 break;
1416 case XML_REGEXP_QUANT_MULT:
1417 atom->quant = XML_REGEXP_QUANT_ONCE;
1418 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1419 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1420 break;
1421 case XML_REGEXP_QUANT_PLUS:
1422 atom->quant = XML_REGEXP_QUANT_ONCE;
1423 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1424 break;
1425 case XML_REGEXP_QUANT_RANGE: {
1426 int counter;
1427 xmlRegStatePtr newstate;
1428
1429 /*
1430 * This one is nasty:
William M. Brackddf71d62004-05-06 04:17:26 +00001431 * 1/ if range has minOccurs == 0, create a new state
1432 * and create epsilon transitions from atom->start
1433 * to atom->stop, as well as atom->start to the new
1434 * state
1435 * 2/ register a new counter
1436 * 3/ register an epsilon transition associated to
Daniel Veillard4255d502002-04-16 15:50:10 +00001437 * this counter going from atom->stop to atom->start
William M. Brackddf71d62004-05-06 04:17:26 +00001438 * 4/ create a new state
1439 * 5/ generate a counted transition from atom->stop to
Daniel Veillard4255d502002-04-16 15:50:10 +00001440 * that state
1441 */
William M. Brackddf71d62004-05-06 04:17:26 +00001442 if (atom->min == 0) {
1443 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1444 atom->stop);
1445 newstate = xmlRegNewState(ctxt);
1446 xmlRegStatePush(ctxt, newstate);
1447 ctxt->state = newstate;
1448 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1449 newstate);
1450 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001451 counter = xmlRegGetCounter(ctxt);
1452 ctxt->counters[counter].min = atom->min - 1;
1453 ctxt->counters[counter].max = atom->max - 1;
1454 atom->min = 0;
1455 atom->max = 0;
1456 atom->quant = XML_REGEXP_QUANT_ONCE;
1457 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1458 atom->start, counter);
1459 if (to != NULL) {
1460 newstate = to;
1461 } else {
1462 newstate = xmlRegNewState(ctxt);
1463 xmlRegStatePush(ctxt, newstate);
1464 ctxt->state = newstate;
1465 }
1466 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1467 newstate, counter);
1468 }
1469 default:
1470 break;
1471 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001472 return(0);
Daniel Veillard99c394d2005-07-14 12:58:49 +00001473 } else if ((atom->min == 0) && (atom->max == 0) &&
1474 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1475 /*
1476 * we can discard the atom and generate an epsilon transition instead
1477 */
1478 if (to == NULL) {
1479 to = xmlRegNewState(ctxt);
1480 if (to != NULL)
1481 xmlRegStatePush(ctxt, to);
1482 else {
1483 return(-1);
1484 }
1485 }
1486 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1487 ctxt->state = to;
1488 xmlRegFreeAtom(atom);
1489 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001490 } else {
1491 if (to == NULL) {
1492 to = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001493 if (to != NULL)
1494 xmlRegStatePush(ctxt, to);
1495 else {
1496 return(-1);
1497 }
1498 }
1499 if (xmlRegAtomPush(ctxt, atom) < 0) {
1500 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001501 }
1502 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001503 ctxt->state = to;
1504 }
1505 switch (atom->quant) {
1506 case XML_REGEXP_QUANT_OPT:
1507 atom->quant = XML_REGEXP_QUANT_ONCE;
1508 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1509 break;
1510 case XML_REGEXP_QUANT_MULT:
1511 atom->quant = XML_REGEXP_QUANT_ONCE;
1512 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1513 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1514 break;
1515 case XML_REGEXP_QUANT_PLUS:
1516 atom->quant = XML_REGEXP_QUANT_ONCE;
1517 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1518 break;
1519 default:
1520 break;
1521 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001522 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001523}
1524
1525/**
1526 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001527 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001528 * @fromnr: the from state
1529 * @tonr: the to state
William M. Brackddf71d62004-05-06 04:17:26 +00001530 * @counter: should that transition be associated to a counted
Daniel Veillard4255d502002-04-16 15:50:10 +00001531 *
1532 */
1533static void
1534xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1535 int tonr, int counter) {
1536 int transnr;
1537 xmlRegStatePtr from;
1538 xmlRegStatePtr to;
1539
1540#ifdef DEBUG_REGEXP_GRAPH
1541 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1542#endif
1543 from = ctxt->states[fromnr];
1544 if (from == NULL)
1545 return;
1546 to = ctxt->states[tonr];
1547 if (to == NULL)
1548 return;
1549 if ((to->mark == XML_REGEXP_MARK_START) ||
1550 (to->mark == XML_REGEXP_MARK_VISITED))
1551 return;
1552
1553 to->mark = XML_REGEXP_MARK_VISITED;
1554 if (to->type == XML_REGEXP_FINAL_STATE) {
1555#ifdef DEBUG_REGEXP_GRAPH
1556 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1557#endif
1558 from->type = XML_REGEXP_FINAL_STATE;
1559 }
1560 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1561 if (to->trans[transnr].atom == NULL) {
1562 /*
1563 * Don't remove counted transitions
1564 * Don't loop either
1565 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001566 if (to->trans[transnr].to != fromnr) {
1567 if (to->trans[transnr].count >= 0) {
1568 int newto = to->trans[transnr].to;
1569
1570 xmlRegStateAddTrans(ctxt, from, NULL,
1571 ctxt->states[newto],
1572 -1, to->trans[transnr].count);
1573 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001574#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001575 printf("Found epsilon trans %d from %d to %d\n",
1576 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001577#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001578 if (to->trans[transnr].counter >= 0) {
1579 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1580 to->trans[transnr].to,
1581 to->trans[transnr].counter);
1582 } else {
1583 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1584 to->trans[transnr].to,
1585 counter);
1586 }
1587 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001588 }
1589 } else {
1590 int newto = to->trans[transnr].to;
1591
Daniel Veillardb509f152002-04-17 16:28:10 +00001592 if (to->trans[transnr].counter >= 0) {
1593 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1594 ctxt->states[newto],
1595 to->trans[transnr].counter, -1);
1596 } else {
1597 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1598 ctxt->states[newto], counter, -1);
1599 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001600 }
1601 }
1602 to->mark = XML_REGEXP_MARK_NORMAL;
1603}
1604
1605/**
1606 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001607 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001608 *
1609 */
1610static void
1611xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1612 int statenr, transnr;
1613 xmlRegStatePtr state;
1614
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001615 if (ctxt->states == NULL) return;
1616
1617
Daniel Veillard4255d502002-04-16 15:50:10 +00001618 /*
1619 * build the completed transitions bypassing the epsilons
1620 * Use a marking algorithm to avoid loops
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001621 * mark sink states too.
Daniel Veillard4255d502002-04-16 15:50:10 +00001622 */
1623 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1624 state = ctxt->states[statenr];
1625 if (state == NULL)
1626 continue;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001627 if ((state->nbTrans == 0) &&
1628 (state->type != XML_REGEXP_FINAL_STATE)) {
1629 state->type = XML_REGEXP_SINK_STATE;
1630 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001631 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1632 if ((state->trans[transnr].atom == NULL) &&
1633 (state->trans[transnr].to >= 0)) {
1634 if (state->trans[transnr].to == statenr) {
1635 state->trans[transnr].to = -1;
1636#ifdef DEBUG_REGEXP_GRAPH
1637 printf("Removed loopback epsilon trans %d on %d\n",
1638 transnr, statenr);
1639#endif
1640 } else if (state->trans[transnr].count < 0) {
1641 int newto = state->trans[transnr].to;
1642
1643#ifdef DEBUG_REGEXP_GRAPH
1644 printf("Found epsilon trans %d from %d to %d\n",
1645 transnr, statenr, newto);
1646#endif
1647 state->mark = XML_REGEXP_MARK_START;
1648 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1649 newto, state->trans[transnr].counter);
1650 state->mark = XML_REGEXP_MARK_NORMAL;
1651#ifdef DEBUG_REGEXP_GRAPH
1652 } else {
1653 printf("Found counted transition %d on %d\n",
1654 transnr, statenr);
1655#endif
1656 }
1657 }
1658 }
1659 }
1660 /*
1661 * Eliminate the epsilon transitions
1662 */
1663 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1664 state = ctxt->states[statenr];
1665 if (state == NULL)
1666 continue;
1667 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1668 if ((state->trans[transnr].atom == NULL) &&
1669 (state->trans[transnr].count < 0) &&
1670 (state->trans[transnr].to >= 0)) {
1671 state->trans[transnr].to = -1;
1672 }
1673 }
1674 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001675
1676 /*
1677 * Use this pass to detect unreachable states too
1678 */
1679 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1680 state = ctxt->states[statenr];
1681 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001682 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00001683 }
1684 state = ctxt->states[0];
1685 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001686 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001687 while (state != NULL) {
1688 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00001689 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00001690 /*
William M. Brackddf71d62004-05-06 04:17:26 +00001691 * Mark all states reachable from the current reachable state
Daniel Veillard23e73572002-09-19 19:56:43 +00001692 */
1693 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1694 if ((state->trans[transnr].to >= 0) &&
1695 ((state->trans[transnr].atom != NULL) ||
1696 (state->trans[transnr].count >= 0))) {
1697 int newto = state->trans[transnr].to;
1698
1699 if (ctxt->states[newto] == NULL)
1700 continue;
William M. Brack779af002003-08-01 15:55:39 +00001701 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1702 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001703 target = ctxt->states[newto];
1704 }
1705 }
1706 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00001707
Daniel Veillard23e73572002-09-19 19:56:43 +00001708 /*
1709 * find the next accessible state not explored
1710 */
1711 if (target == NULL) {
1712 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1713 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001714 if ((state != NULL) && (state->reached ==
1715 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001716 target = state;
1717 break;
1718 }
1719 }
1720 }
1721 state = target;
1722 }
1723 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1724 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001725 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001726#ifdef DEBUG_REGEXP_GRAPH
1727 printf("Removed unreachable state %d\n", statenr);
1728#endif
1729 xmlRegFreeState(state);
1730 ctxt->states[statenr] = NULL;
1731 }
1732 }
1733
Daniel Veillard4255d502002-04-16 15:50:10 +00001734}
1735
Daniel Veillarde19fc232002-04-22 16:01:24 +00001736/**
1737 * xmlFACompareAtoms:
1738 * @atom1: an atom
1739 * @atom2: an atom
1740 *
William M. Brackddf71d62004-05-06 04:17:26 +00001741 * Compares two atoms to check whether they are equivalents
Daniel Veillarde19fc232002-04-22 16:01:24 +00001742 *
1743 * Returns 1 if yes and 0 otherwise
1744 */
1745static int
1746xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
Daniel Veillard9efc4762005-07-19 14:33:55 +00001747 int ret;
1748
Daniel Veillarde19fc232002-04-22 16:01:24 +00001749 if (atom1 == atom2)
1750 return(1);
1751 if ((atom1 == NULL) || (atom2 == NULL))
1752 return(0);
1753
1754 if (atom1->type != atom2->type)
1755 return(0);
1756 switch (atom1->type) {
1757 case XML_REGEXP_STRING:
Daniel Veillard9efc4762005-07-19 14:33:55 +00001758 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep,
1759 (xmlChar *)atom2->valuep);
1760 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00001761 case XML_REGEXP_EPSILON:
1762 return(1);
1763 case XML_REGEXP_CHARVAL:
Daniel Veillard9efc4762005-07-19 14:33:55 +00001764 ret = atom1->codepoint == atom2->codepoint;
1765 break;
Daniel Veillarde19fc232002-04-22 16:01:24 +00001766 case XML_REGEXP_RANGES:
1767 TODO;
1768 return(0);
1769 default:
Daniel Veillard9efc4762005-07-19 14:33:55 +00001770 return(1);
Daniel Veillarde19fc232002-04-22 16:01:24 +00001771 }
Daniel Veillard9efc4762005-07-19 14:33:55 +00001772 if (atom1->neg != atom2->neg)
1773 ret = !ret;
1774 return(ret);
Daniel Veillarde19fc232002-04-22 16:01:24 +00001775}
1776
1777/**
1778 * xmlFARecurseDeterminism:
1779 * @ctxt: a regexp parser context
1780 *
1781 * Check whether the associated regexp is determinist,
1782 * should be called after xmlFAEliminateEpsilonTransitions()
1783 *
1784 */
1785static int
1786xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1787 int to, xmlRegAtomPtr atom) {
1788 int ret = 1;
1789 int transnr;
1790 xmlRegTransPtr t1;
1791
1792 if (state == NULL)
1793 return(ret);
1794 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1795 t1 = &(state->trans[transnr]);
1796 /*
1797 * check transitions conflicting with the one looked at
1798 */
1799 if (t1->atom == NULL) {
1800 if (t1->to == -1)
1801 continue;
1802 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1803 to, atom);
1804 if (ret == 0)
1805 return(0);
1806 continue;
1807 }
1808 if (t1->to != to)
1809 continue;
1810 if (xmlFACompareAtoms(t1->atom, atom))
1811 return(0);
1812 }
1813 return(ret);
1814}
1815
1816/**
1817 * xmlFAComputesDeterminism:
1818 * @ctxt: a regexp parser context
1819 *
1820 * Check whether the associated regexp is determinist,
1821 * should be called after xmlFAEliminateEpsilonTransitions()
1822 *
1823 */
1824static int
1825xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
1826 int statenr, transnr;
1827 xmlRegStatePtr state;
1828 xmlRegTransPtr t1, t2;
1829 int i;
1830 int ret = 1;
1831
Daniel Veillard4402ab42002-09-12 16:02:56 +00001832#ifdef DEBUG_REGEXP_GRAPH
1833 printf("xmlFAComputesDeterminism\n");
1834 xmlRegPrintCtxt(stdout, ctxt);
1835#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00001836 if (ctxt->determinist != -1)
1837 return(ctxt->determinist);
1838
1839 /*
William M. Brackddf71d62004-05-06 04:17:26 +00001840 * Check for all states that there aren't 2 transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00001841 * with the same atom and a different target.
1842 */
1843 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1844 state = ctxt->states[statenr];
1845 if (state == NULL)
1846 continue;
1847 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1848 t1 = &(state->trans[transnr]);
1849 /*
1850 * Determinism checks in case of counted or all transitions
1851 * will have to be handled separately
1852 */
1853 if (t1->atom == NULL)
1854 continue;
1855 if (t1->to == -1) /* eliminated */
1856 continue;
1857 for (i = 0;i < transnr;i++) {
1858 t2 = &(state->trans[i]);
1859 if (t2->to == -1) /* eliminated */
1860 continue;
1861 if (t2->atom != NULL) {
1862 if (t1->to == t2->to) {
1863 if (xmlFACompareAtoms(t1->atom, t2->atom))
William M. Brackddf71d62004-05-06 04:17:26 +00001864 t2->to = -1; /* eliminated */
Daniel Veillarde19fc232002-04-22 16:01:24 +00001865 } else {
1866 /* not determinist ! */
1867 if (xmlFACompareAtoms(t1->atom, t2->atom))
1868 ret = 0;
1869 }
1870 } else if (t1->to != -1) {
1871 /*
1872 * do the closure in case of remaining specific
1873 * epsilon transitions like choices or all
1874 */
1875 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1876 t2->to, t2->atom);
1877 if (ret == 0)
1878 return(0);
1879 }
1880 }
1881 if (ret == 0)
1882 break;
1883 }
1884 if (ret == 0)
1885 break;
1886 }
1887 ctxt->determinist = ret;
1888 return(ret);
1889}
1890
Daniel Veillard4255d502002-04-16 15:50:10 +00001891/************************************************************************
1892 * *
1893 * Routines to check input against transition atoms *
1894 * *
1895 ************************************************************************/
1896
1897static int
1898xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
1899 int start, int end, const xmlChar *blockName) {
1900 int ret = 0;
1901
1902 switch (type) {
1903 case XML_REGEXP_STRING:
1904 case XML_REGEXP_SUBREG:
1905 case XML_REGEXP_RANGES:
1906 case XML_REGEXP_EPSILON:
1907 return(-1);
1908 case XML_REGEXP_ANYCHAR:
1909 ret = ((codepoint != '\n') && (codepoint != '\r'));
1910 break;
1911 case XML_REGEXP_CHARVAL:
1912 ret = ((codepoint >= start) && (codepoint <= end));
1913 break;
1914 case XML_REGEXP_NOTSPACE:
1915 neg = !neg;
1916 case XML_REGEXP_ANYSPACE:
1917 ret = ((codepoint == '\n') || (codepoint == '\r') ||
1918 (codepoint == '\t') || (codepoint == ' '));
1919 break;
1920 case XML_REGEXP_NOTINITNAME:
1921 neg = !neg;
1922 case XML_REGEXP_INITNAME:
William M. Brack871611b2003-10-18 04:53:14 +00001923 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00001924 (codepoint == '_') || (codepoint == ':'));
1925 break;
1926 case XML_REGEXP_NOTNAMECHAR:
1927 neg = !neg;
1928 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00001929 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00001930 (codepoint == '.') || (codepoint == '-') ||
1931 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00001932 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00001933 break;
1934 case XML_REGEXP_NOTDECIMAL:
1935 neg = !neg;
1936 case XML_REGEXP_DECIMAL:
1937 ret = xmlUCSIsCatNd(codepoint);
1938 break;
1939 case XML_REGEXP_REALCHAR:
1940 neg = !neg;
1941 case XML_REGEXP_NOTREALCHAR:
1942 ret = xmlUCSIsCatP(codepoint);
1943 if (ret == 0)
1944 ret = xmlUCSIsCatZ(codepoint);
1945 if (ret == 0)
1946 ret = xmlUCSIsCatC(codepoint);
1947 break;
1948 case XML_REGEXP_LETTER:
1949 ret = xmlUCSIsCatL(codepoint);
1950 break;
1951 case XML_REGEXP_LETTER_UPPERCASE:
1952 ret = xmlUCSIsCatLu(codepoint);
1953 break;
1954 case XML_REGEXP_LETTER_LOWERCASE:
1955 ret = xmlUCSIsCatLl(codepoint);
1956 break;
1957 case XML_REGEXP_LETTER_TITLECASE:
1958 ret = xmlUCSIsCatLt(codepoint);
1959 break;
1960 case XML_REGEXP_LETTER_MODIFIER:
1961 ret = xmlUCSIsCatLm(codepoint);
1962 break;
1963 case XML_REGEXP_LETTER_OTHERS:
1964 ret = xmlUCSIsCatLo(codepoint);
1965 break;
1966 case XML_REGEXP_MARK:
1967 ret = xmlUCSIsCatM(codepoint);
1968 break;
1969 case XML_REGEXP_MARK_NONSPACING:
1970 ret = xmlUCSIsCatMn(codepoint);
1971 break;
1972 case XML_REGEXP_MARK_SPACECOMBINING:
1973 ret = xmlUCSIsCatMc(codepoint);
1974 break;
1975 case XML_REGEXP_MARK_ENCLOSING:
1976 ret = xmlUCSIsCatMe(codepoint);
1977 break;
1978 case XML_REGEXP_NUMBER:
1979 ret = xmlUCSIsCatN(codepoint);
1980 break;
1981 case XML_REGEXP_NUMBER_DECIMAL:
1982 ret = xmlUCSIsCatNd(codepoint);
1983 break;
1984 case XML_REGEXP_NUMBER_LETTER:
1985 ret = xmlUCSIsCatNl(codepoint);
1986 break;
1987 case XML_REGEXP_NUMBER_OTHERS:
1988 ret = xmlUCSIsCatNo(codepoint);
1989 break;
1990 case XML_REGEXP_PUNCT:
1991 ret = xmlUCSIsCatP(codepoint);
1992 break;
1993 case XML_REGEXP_PUNCT_CONNECTOR:
1994 ret = xmlUCSIsCatPc(codepoint);
1995 break;
1996 case XML_REGEXP_PUNCT_DASH:
1997 ret = xmlUCSIsCatPd(codepoint);
1998 break;
1999 case XML_REGEXP_PUNCT_OPEN:
2000 ret = xmlUCSIsCatPs(codepoint);
2001 break;
2002 case XML_REGEXP_PUNCT_CLOSE:
2003 ret = xmlUCSIsCatPe(codepoint);
2004 break;
2005 case XML_REGEXP_PUNCT_INITQUOTE:
2006 ret = xmlUCSIsCatPi(codepoint);
2007 break;
2008 case XML_REGEXP_PUNCT_FINQUOTE:
2009 ret = xmlUCSIsCatPf(codepoint);
2010 break;
2011 case XML_REGEXP_PUNCT_OTHERS:
2012 ret = xmlUCSIsCatPo(codepoint);
2013 break;
2014 case XML_REGEXP_SEPAR:
2015 ret = xmlUCSIsCatZ(codepoint);
2016 break;
2017 case XML_REGEXP_SEPAR_SPACE:
2018 ret = xmlUCSIsCatZs(codepoint);
2019 break;
2020 case XML_REGEXP_SEPAR_LINE:
2021 ret = xmlUCSIsCatZl(codepoint);
2022 break;
2023 case XML_REGEXP_SEPAR_PARA:
2024 ret = xmlUCSIsCatZp(codepoint);
2025 break;
2026 case XML_REGEXP_SYMBOL:
2027 ret = xmlUCSIsCatS(codepoint);
2028 break;
2029 case XML_REGEXP_SYMBOL_MATH:
2030 ret = xmlUCSIsCatSm(codepoint);
2031 break;
2032 case XML_REGEXP_SYMBOL_CURRENCY:
2033 ret = xmlUCSIsCatSc(codepoint);
2034 break;
2035 case XML_REGEXP_SYMBOL_MODIFIER:
2036 ret = xmlUCSIsCatSk(codepoint);
2037 break;
2038 case XML_REGEXP_SYMBOL_OTHERS:
2039 ret = xmlUCSIsCatSo(codepoint);
2040 break;
2041 case XML_REGEXP_OTHER:
2042 ret = xmlUCSIsCatC(codepoint);
2043 break;
2044 case XML_REGEXP_OTHER_CONTROL:
2045 ret = xmlUCSIsCatCc(codepoint);
2046 break;
2047 case XML_REGEXP_OTHER_FORMAT:
2048 ret = xmlUCSIsCatCf(codepoint);
2049 break;
2050 case XML_REGEXP_OTHER_PRIVATE:
2051 ret = xmlUCSIsCatCo(codepoint);
2052 break;
2053 case XML_REGEXP_OTHER_NA:
2054 /* ret = xmlUCSIsCatCn(codepoint); */
2055 /* Seems it doesn't exist anymore in recent Unicode releases */
2056 ret = 0;
2057 break;
2058 case XML_REGEXP_BLOCK_NAME:
2059 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2060 break;
2061 }
2062 if (neg)
2063 return(!ret);
2064 return(ret);
2065}
2066
2067static int
2068xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2069 int i, ret = 0;
2070 xmlRegRangePtr range;
2071
William M. Brack871611b2003-10-18 04:53:14 +00002072 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00002073 return(-1);
2074
2075 switch (atom->type) {
2076 case XML_REGEXP_SUBREG:
2077 case XML_REGEXP_EPSILON:
2078 return(-1);
2079 case XML_REGEXP_CHARVAL:
2080 return(codepoint == atom->codepoint);
2081 case XML_REGEXP_RANGES: {
2082 int accept = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002083
Daniel Veillard4255d502002-04-16 15:50:10 +00002084 for (i = 0;i < atom->nbRanges;i++) {
2085 range = atom->ranges[i];
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002086 if (range->neg == 2) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002087 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2088 0, range->start, range->end,
2089 range->blockName);
2090 if (ret != 0)
2091 return(0); /* excluded char */
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002092 } else if (range->neg) {
2093 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2094 0, range->start, range->end,
2095 range->blockName);
2096 if (ret == 0)
Daniel Veillardf2a12832003-11-24 13:04:35 +00002097 accept = 1;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002098 else
2099 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00002100 } else {
2101 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2102 0, range->start, range->end,
2103 range->blockName);
2104 if (ret != 0)
2105 accept = 1; /* might still be excluded */
2106 }
2107 }
2108 return(accept);
2109 }
2110 case XML_REGEXP_STRING:
2111 printf("TODO: XML_REGEXP_STRING\n");
2112 return(-1);
2113 case XML_REGEXP_ANYCHAR:
2114 case XML_REGEXP_ANYSPACE:
2115 case XML_REGEXP_NOTSPACE:
2116 case XML_REGEXP_INITNAME:
2117 case XML_REGEXP_NOTINITNAME:
2118 case XML_REGEXP_NAMECHAR:
2119 case XML_REGEXP_NOTNAMECHAR:
2120 case XML_REGEXP_DECIMAL:
2121 case XML_REGEXP_NOTDECIMAL:
2122 case XML_REGEXP_REALCHAR:
2123 case XML_REGEXP_NOTREALCHAR:
2124 case XML_REGEXP_LETTER:
2125 case XML_REGEXP_LETTER_UPPERCASE:
2126 case XML_REGEXP_LETTER_LOWERCASE:
2127 case XML_REGEXP_LETTER_TITLECASE:
2128 case XML_REGEXP_LETTER_MODIFIER:
2129 case XML_REGEXP_LETTER_OTHERS:
2130 case XML_REGEXP_MARK:
2131 case XML_REGEXP_MARK_NONSPACING:
2132 case XML_REGEXP_MARK_SPACECOMBINING:
2133 case XML_REGEXP_MARK_ENCLOSING:
2134 case XML_REGEXP_NUMBER:
2135 case XML_REGEXP_NUMBER_DECIMAL:
2136 case XML_REGEXP_NUMBER_LETTER:
2137 case XML_REGEXP_NUMBER_OTHERS:
2138 case XML_REGEXP_PUNCT:
2139 case XML_REGEXP_PUNCT_CONNECTOR:
2140 case XML_REGEXP_PUNCT_DASH:
2141 case XML_REGEXP_PUNCT_OPEN:
2142 case XML_REGEXP_PUNCT_CLOSE:
2143 case XML_REGEXP_PUNCT_INITQUOTE:
2144 case XML_REGEXP_PUNCT_FINQUOTE:
2145 case XML_REGEXP_PUNCT_OTHERS:
2146 case XML_REGEXP_SEPAR:
2147 case XML_REGEXP_SEPAR_SPACE:
2148 case XML_REGEXP_SEPAR_LINE:
2149 case XML_REGEXP_SEPAR_PARA:
2150 case XML_REGEXP_SYMBOL:
2151 case XML_REGEXP_SYMBOL_MATH:
2152 case XML_REGEXP_SYMBOL_CURRENCY:
2153 case XML_REGEXP_SYMBOL_MODIFIER:
2154 case XML_REGEXP_SYMBOL_OTHERS:
2155 case XML_REGEXP_OTHER:
2156 case XML_REGEXP_OTHER_CONTROL:
2157 case XML_REGEXP_OTHER_FORMAT:
2158 case XML_REGEXP_OTHER_PRIVATE:
2159 case XML_REGEXP_OTHER_NA:
2160 case XML_REGEXP_BLOCK_NAME:
2161 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
2162 (const xmlChar *)atom->valuep);
2163 if (atom->neg)
2164 ret = !ret;
2165 break;
2166 }
2167 return(ret);
2168}
2169
2170/************************************************************************
2171 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002172 * Saving and restoring state of an execution context *
Daniel Veillard4255d502002-04-16 15:50:10 +00002173 * *
2174 ************************************************************************/
2175
2176#ifdef DEBUG_REGEXP_EXEC
2177static void
2178xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
2179 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
2180 if (exec->inputStack != NULL) {
2181 int i;
2182 printf(": ");
2183 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
2184 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
2185 } else {
2186 printf(": %s", &(exec->inputString[exec->index]));
2187 }
2188 printf("\n");
2189}
2190#endif
2191
2192static void
2193xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
2194#ifdef DEBUG_REGEXP_EXEC
2195 printf("saving ");
2196 exec->transno++;
2197 xmlFARegDebugExec(exec);
2198 exec->transno--;
2199#endif
2200
2201 if (exec->maxRollbacks == 0) {
2202 exec->maxRollbacks = 4;
2203 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2204 sizeof(xmlRegExecRollback));
2205 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002206 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002207 exec->maxRollbacks = 0;
2208 return;
2209 }
2210 memset(exec->rollbacks, 0,
2211 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2212 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2213 xmlRegExecRollback *tmp;
2214 int len = exec->maxRollbacks;
2215
2216 exec->maxRollbacks *= 2;
2217 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2218 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2219 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002220 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002221 exec->maxRollbacks /= 2;
2222 return;
2223 }
2224 exec->rollbacks = tmp;
2225 tmp = &exec->rollbacks[len];
2226 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2227 }
2228 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2229 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2230 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2231 if (exec->comp->nbCounters > 0) {
2232 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2233 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2234 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2235 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002236 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002237 exec->status = -5;
2238 return;
2239 }
2240 }
2241 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2242 exec->comp->nbCounters * sizeof(int));
2243 }
2244 exec->nbRollbacks++;
2245}
2246
2247static void
2248xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2249 if (exec->nbRollbacks <= 0) {
2250 exec->status = -1;
2251#ifdef DEBUG_REGEXP_EXEC
2252 printf("rollback failed on empty stack\n");
2253#endif
2254 return;
2255 }
2256 exec->nbRollbacks--;
2257 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2258 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2259 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2260 if (exec->comp->nbCounters > 0) {
2261 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2262 fprintf(stderr, "exec save: allocation failed");
2263 exec->status = -6;
2264 return;
2265 }
2266 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2267 exec->comp->nbCounters * sizeof(int));
2268 }
2269
2270#ifdef DEBUG_REGEXP_EXEC
2271 printf("restored ");
2272 xmlFARegDebugExec(exec);
2273#endif
2274}
2275
2276/************************************************************************
2277 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002278 * Verifier, running an input against a compiled regexp *
Daniel Veillard4255d502002-04-16 15:50:10 +00002279 * *
2280 ************************************************************************/
2281
2282static int
2283xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2284 xmlRegExecCtxt execval;
2285 xmlRegExecCtxtPtr exec = &execval;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002286 int ret, codepoint = 0, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00002287
2288 exec->inputString = content;
2289 exec->index = 0;
2290 exec->determinist = 1;
2291 exec->maxRollbacks = 0;
2292 exec->nbRollbacks = 0;
2293 exec->rollbacks = NULL;
2294 exec->status = 0;
2295 exec->comp = comp;
2296 exec->state = comp->states[0];
2297 exec->transno = 0;
2298 exec->transcount = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002299 exec->inputStack = NULL;
2300 exec->inputStackMax = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00002301 if (comp->nbCounters > 0) {
2302 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00002303 if (exec->counts == NULL) {
2304 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002305 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00002306 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002307 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2308 } else
2309 exec->counts = NULL;
2310 while ((exec->status == 0) &&
2311 ((exec->inputString[exec->index] != 0) ||
2312 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2313 xmlRegTransPtr trans;
2314 xmlRegAtomPtr atom;
2315
2316 /*
William M. Brack0e00b282004-04-26 15:40:47 +00002317 * If end of input on non-terminal state, rollback, however we may
Daniel Veillard4255d502002-04-16 15:50:10 +00002318 * still have epsilon like transition for counted transitions
William M. Brack0e00b282004-04-26 15:40:47 +00002319 * on counters, in that case don't break too early. Additionally,
2320 * if we are working on a range like "AB{0,2}", where B is not present,
2321 * we don't want to break.
Daniel Veillard4255d502002-04-16 15:50:10 +00002322 */
William M. Brack0e00b282004-04-26 15:40:47 +00002323 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
William M. Brackddf71d62004-05-06 04:17:26 +00002324 /*
2325 * if there is a transition, we must check if
2326 * atom allows minOccurs of 0
2327 */
2328 if (exec->transno < exec->state->nbTrans) {
William M. Brack0e00b282004-04-26 15:40:47 +00002329 trans = &exec->state->trans[exec->transno];
2330 if (trans->to >=0) {
2331 atom = trans->atom;
2332 if (!((atom->min == 0) && (atom->max > 0)))
2333 goto rollback;
2334 }
2335 } else
2336 goto rollback;
2337 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002338
2339 exec->transcount = 0;
2340 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2341 trans = &exec->state->trans[exec->transno];
2342 if (trans->to < 0)
2343 continue;
2344 atom = trans->atom;
2345 ret = 0;
2346 if (trans->count >= 0) {
2347 int count;
2348 xmlRegCounterPtr counter;
2349
2350 /*
2351 * A counted transition.
2352 */
2353
2354 count = exec->counts[trans->count];
2355 counter = &exec->comp->counters[trans->count];
2356#ifdef DEBUG_REGEXP_EXEC
2357 printf("testing count %d: val %d, min %d, max %d\n",
2358 trans->count, count, counter->min, counter->max);
2359#endif
2360 ret = ((count >= counter->min) && (count <= counter->max));
2361 } else if (atom == NULL) {
2362 fprintf(stderr, "epsilon transition left at runtime\n");
2363 exec->status = -2;
2364 break;
2365 } else if (exec->inputString[exec->index] != 0) {
2366 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2367 ret = xmlRegCheckCharacter(atom, codepoint);
William M. Brack0e00b282004-04-26 15:40:47 +00002368 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002369 xmlRegStatePtr to = comp->states[trans->to];
2370
2371 /*
2372 * this is a multiple input sequence
2373 */
2374 if (exec->state->nbTrans > exec->transno + 1) {
2375 xmlFARegExecSave(exec);
2376 }
2377 exec->transcount = 1;
2378 do {
2379 /*
2380 * Try to progress as much as possible on the input
2381 */
2382 if (exec->transcount == atom->max) {
2383 break;
2384 }
2385 exec->index += len;
2386 /*
2387 * End of input: stop here
2388 */
2389 if (exec->inputString[exec->index] == 0) {
2390 exec->index -= len;
2391 break;
2392 }
2393 if (exec->transcount >= atom->min) {
2394 int transno = exec->transno;
2395 xmlRegStatePtr state = exec->state;
2396
2397 /*
2398 * The transition is acceptable save it
2399 */
2400 exec->transno = -1; /* trick */
2401 exec->state = to;
2402 xmlFARegExecSave(exec);
2403 exec->transno = transno;
2404 exec->state = state;
2405 }
2406 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2407 len);
2408 ret = xmlRegCheckCharacter(atom, codepoint);
2409 exec->transcount++;
2410 } while (ret == 1);
2411 if (exec->transcount < atom->min)
2412 ret = 0;
2413
2414 /*
2415 * If the last check failed but one transition was found
2416 * possible, rollback
2417 */
2418 if (ret < 0)
2419 ret = 0;
2420 if (ret == 0) {
2421 goto rollback;
2422 }
William M. Brack0e00b282004-04-26 15:40:47 +00002423 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
2424 /*
2425 * we don't match on the codepoint, but minOccurs of 0
2426 * says that's ok. Setting len to 0 inhibits stepping
2427 * over the codepoint.
2428 */
2429 exec->transcount = 1;
2430 len = 0;
2431 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00002432 }
William M. Brack0e00b282004-04-26 15:40:47 +00002433 } else if ((atom->min == 0) && (atom->max > 0)) {
2434 /* another spot to match when minOccurs is 0 */
2435 exec->transcount = 1;
2436 len = 0;
2437 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00002438 }
2439 if (ret == 1) {
2440 if (exec->state->nbTrans > exec->transno + 1) {
2441 xmlFARegExecSave(exec);
2442 }
2443 if (trans->counter >= 0) {
2444#ifdef DEBUG_REGEXP_EXEC
2445 printf("Increasing count %d\n", trans->counter);
2446#endif
2447 exec->counts[trans->counter]++;
2448 }
2449#ifdef DEBUG_REGEXP_EXEC
2450 printf("entering state %d\n", trans->to);
2451#endif
2452 exec->state = comp->states[trans->to];
2453 exec->transno = 0;
2454 if (trans->atom != NULL) {
2455 exec->index += len;
2456 }
2457 goto progress;
2458 } else if (ret < 0) {
2459 exec->status = -4;
2460 break;
2461 }
2462 }
2463 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2464rollback:
2465 /*
2466 * Failed to find a way out
2467 */
2468 exec->determinist = 0;
2469 xmlFARegExecRollBack(exec);
2470 }
2471progress:
2472 continue;
2473 }
2474 if (exec->rollbacks != NULL) {
2475 if (exec->counts != NULL) {
2476 int i;
2477
2478 for (i = 0;i < exec->maxRollbacks;i++)
2479 if (exec->rollbacks[i].counts != NULL)
2480 xmlFree(exec->rollbacks[i].counts);
2481 }
2482 xmlFree(exec->rollbacks);
2483 }
2484 if (exec->counts != NULL)
2485 xmlFree(exec->counts);
2486 if (exec->status == 0)
2487 return(1);
2488 if (exec->status == -1)
2489 return(0);
2490 return(exec->status);
2491}
2492
2493/************************************************************************
2494 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002495 * Progressive interface to the verifier one atom at a time *
Daniel Veillard4255d502002-04-16 15:50:10 +00002496 * *
2497 ************************************************************************/
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002498#ifdef DEBUG_ERR
2499static void testerr(xmlRegExecCtxtPtr exec);
2500#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00002501
2502/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002503 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00002504 * @comp: a precompiled regular expression
2505 * @callback: a callback function used for handling progresses in the
2506 * automata matching phase
2507 * @data: the context data associated to the callback in this context
2508 *
2509 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00002510 *
2511 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00002512 */
2513xmlRegExecCtxtPtr
2514xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
2515 xmlRegExecCtxtPtr exec;
2516
2517 if (comp == NULL)
2518 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00002519 if ((comp->compact == NULL) && (comp->states == NULL))
2520 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00002521 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
2522 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002523 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00002524 return(NULL);
2525 }
2526 memset(exec, 0, sizeof(xmlRegExecCtxt));
2527 exec->inputString = NULL;
2528 exec->index = 0;
2529 exec->determinist = 1;
2530 exec->maxRollbacks = 0;
2531 exec->nbRollbacks = 0;
2532 exec->rollbacks = NULL;
2533 exec->status = 0;
2534 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00002535 if (comp->compact == NULL)
2536 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00002537 exec->transno = 0;
2538 exec->transcount = 0;
2539 exec->callback = callback;
2540 exec->data = data;
2541 if (comp->nbCounters > 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002542 /*
2543 * For error handling, exec->counts is allocated twice the size
2544 * the second half is used to store the data in case of rollback
2545 */
2546 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
2547 * 2);
Daniel Veillard4255d502002-04-16 15:50:10 +00002548 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002549 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00002550 xmlFree(exec);
2551 return(NULL);
2552 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002553 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
2554 exec->errCounts = &exec->counts[comp->nbCounters];
2555 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00002556 exec->counts = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002557 exec->errCounts = NULL;
2558 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002559 exec->inputStackMax = 0;
2560 exec->inputStackNr = 0;
2561 exec->inputStack = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002562 exec->errStateNo = -1;
2563 exec->errString = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +00002564 return(exec);
2565}
2566
2567/**
2568 * xmlRegFreeExecCtxt:
2569 * @exec: a regular expression evaulation context
2570 *
2571 * Free the structures associated to a regular expression evaulation context.
2572 */
2573void
2574xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
2575 if (exec == NULL)
2576 return;
2577
2578 if (exec->rollbacks != NULL) {
2579 if (exec->counts != NULL) {
2580 int i;
2581
2582 for (i = 0;i < exec->maxRollbacks;i++)
2583 if (exec->rollbacks[i].counts != NULL)
2584 xmlFree(exec->rollbacks[i].counts);
2585 }
2586 xmlFree(exec->rollbacks);
2587 }
2588 if (exec->counts != NULL)
2589 xmlFree(exec->counts);
2590 if (exec->inputStack != NULL) {
2591 int i;
2592
Daniel Veillard32370232002-10-16 14:08:14 +00002593 for (i = 0;i < exec->inputStackNr;i++) {
2594 if (exec->inputStack[i].value != NULL)
2595 xmlFree(exec->inputStack[i].value);
2596 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002597 xmlFree(exec->inputStack);
2598 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002599 if (exec->errString != NULL)
2600 xmlFree(exec->errString);
Daniel Veillard4255d502002-04-16 15:50:10 +00002601 xmlFree(exec);
2602}
2603
2604static void
2605xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2606 void *data) {
2607#ifdef DEBUG_PUSH
2608 printf("saving value: %d:%s\n", exec->inputStackNr, value);
2609#endif
2610 if (exec->inputStackMax == 0) {
2611 exec->inputStackMax = 4;
2612 exec->inputStack = (xmlRegInputTokenPtr)
2613 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
2614 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002615 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00002616 exec->inputStackMax = 0;
2617 return;
2618 }
2619 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
2620 xmlRegInputTokenPtr tmp;
2621
2622 exec->inputStackMax *= 2;
2623 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
2624 exec->inputStackMax * sizeof(xmlRegInputToken));
2625 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002626 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00002627 exec->inputStackMax /= 2;
2628 return;
2629 }
2630 exec->inputStack = tmp;
2631 }
2632 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
2633 exec->inputStack[exec->inputStackNr].data = data;
2634 exec->inputStackNr++;
2635 exec->inputStack[exec->inputStackNr].value = NULL;
2636 exec->inputStack[exec->inputStackNr].data = NULL;
2637}
2638
Daniel Veillardc0826a72004-08-10 14:17:33 +00002639/**
2640 * xmlRegStrEqualWildcard:
2641 * @expStr: the string to be evaluated
2642 * @valStr: the validation string
2643 *
2644 * Checks if both strings are equal or have the same content. "*"
2645 * can be used as a wildcard in @valStr; "|" is used as a seperator of
2646 * substrings in both @expStr and @valStr.
2647 *
2648 * Returns 1 if the comparison is satisfied and the number of substrings
2649 * is equal, 0 otherwise.
2650 */
2651
2652static int
2653xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
2654 if (expStr == valStr) return(1);
2655 if (expStr == NULL) return(0);
2656 if (valStr == NULL) return(0);
2657 do {
2658 /*
2659 * Eval if we have a wildcard for the current item.
2660 */
2661 if (*expStr != *valStr) {
2662 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
2663 do {
2664 if (*valStr == XML_REG_STRING_SEPARATOR)
2665 break;
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00002666 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00002667 } while (*valStr != 0);
2668 continue;
2669 } else
2670 return(0);
2671 }
Kasimier T. Buchcikc0e833f2005-04-19 15:02:20 +00002672 expStr++;
2673 valStr++;
Daniel Veillardc0826a72004-08-10 14:17:33 +00002674 } while (*valStr != 0);
2675 if (*expStr != 0)
2676 return (0);
2677 else
2678 return (1);
2679}
Daniel Veillard4255d502002-04-16 15:50:10 +00002680
2681/**
Daniel Veillard23e73572002-09-19 19:56:43 +00002682 * xmlRegCompactPushString:
2683 * @exec: a regexp execution context
2684 * @comp: the precompiled exec with a compact table
2685 * @value: a string token input
2686 * @data: data associated to the token to reuse in callbacks
2687 *
2688 * Push one input token in the execution context
2689 *
2690 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2691 * a negative value in case of error.
2692 */
2693static int
2694xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
2695 xmlRegexpPtr comp,
2696 const xmlChar *value,
2697 void *data) {
2698 int state = exec->index;
2699 int i, target;
2700
2701 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
2702 return(-1);
2703
2704 if (value == NULL) {
2705 /*
2706 * are we at a final state ?
2707 */
2708 if (comp->compact[state * (comp->nbstrings + 1)] ==
2709 XML_REGEXP_FINAL_STATE)
2710 return(1);
2711 return(0);
2712 }
2713
2714#ifdef DEBUG_PUSH
2715 printf("value pushed: %s\n", value);
2716#endif
2717
2718 /*
William M. Brackddf71d62004-05-06 04:17:26 +00002719 * Examine all outside transitions from current state
Daniel Veillard23e73572002-09-19 19:56:43 +00002720 */
2721 for (i = 0;i < comp->nbstrings;i++) {
2722 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
2723 if ((target > 0) && (target <= comp->nbstates)) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00002724 target--; /* to avoid 0 */
2725 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
2726 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00002727 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
2728 exec->callback(exec->data, value,
2729 comp->transdata[state * comp->nbstrings + i], data);
2730 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002731#ifdef DEBUG_PUSH
2732 printf("entering state %d\n", target);
2733#endif
2734 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillardcc026dc2005-01-12 13:21:17 +00002735 XML_REGEXP_SINK_STATE)
2736 goto error;
2737
2738 if (comp->compact[target * (comp->nbstrings + 1)] ==
Daniel Veillard23e73572002-09-19 19:56:43 +00002739 XML_REGEXP_FINAL_STATE)
2740 return(1);
2741 return(0);
2742 }
2743 }
2744 }
2745 /*
2746 * Failed to find an exit transition out from current state for the
2747 * current token
2748 */
2749#ifdef DEBUG_PUSH
2750 printf("failed to find a transition for %s on state %d\n", value, state);
2751#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00002752error:
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002753 if (exec->errString != NULL)
2754 xmlFree(exec->errString);
2755 exec->errString = xmlStrdup(value);
2756 exec->errStateNo = state;
Daniel Veillard23e73572002-09-19 19:56:43 +00002757 exec->status = -1;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002758#ifdef DEBUG_ERR
2759 testerr(exec);
2760#endif
Daniel Veillard23e73572002-09-19 19:56:43 +00002761 return(-1);
2762}
2763
2764/**
Daniel Veillard4255d502002-04-16 15:50:10 +00002765 * xmlRegExecPushString:
Daniel Veillardea7751d2002-12-20 00:16:24 +00002766 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00002767 * @value: a string token input
2768 * @data: data associated to the token to reuse in callbacks
2769 *
2770 * Push one input token in the execution context
2771 *
2772 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2773 * a negative value in case of error.
2774 */
2775int
2776xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2777 void *data) {
2778 xmlRegTransPtr trans;
2779 xmlRegAtomPtr atom;
2780 int ret;
2781 int final = 0;
Daniel Veillard90700152005-01-08 22:05:09 +00002782 int progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00002783
2784 if (exec == NULL)
2785 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00002786 if (exec->comp == NULL)
2787 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00002788 if (exec->status != 0)
2789 return(exec->status);
2790
Daniel Veillard23e73572002-09-19 19:56:43 +00002791 if (exec->comp->compact != NULL)
2792 return(xmlRegCompactPushString(exec, exec->comp, value, data));
2793
Daniel Veillard4255d502002-04-16 15:50:10 +00002794 if (value == NULL) {
2795 if (exec->state->type == XML_REGEXP_FINAL_STATE)
2796 return(1);
2797 final = 1;
2798 }
2799
2800#ifdef DEBUG_PUSH
2801 printf("value pushed: %s\n", value);
2802#endif
2803 /*
2804 * If we have an active rollback stack push the new value there
2805 * and get back to where we were left
2806 */
2807 if ((value != NULL) && (exec->inputStackNr > 0)) {
2808 xmlFARegExecSaveInputString(exec, value, data);
2809 value = exec->inputStack[exec->index].value;
2810 data = exec->inputStack[exec->index].data;
2811#ifdef DEBUG_PUSH
2812 printf("value loaded: %s\n", value);
2813#endif
2814 }
2815
2816 while ((exec->status == 0) &&
2817 ((value != NULL) ||
2818 ((final == 1) &&
2819 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
2820
2821 /*
2822 * End of input on non-terminal state, rollback, however we may
2823 * still have epsilon like transition for counted transitions
2824 * on counters, in that case don't break too early.
2825 */
Daniel Veillardb509f152002-04-17 16:28:10 +00002826 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00002827 goto rollback;
2828
2829 exec->transcount = 0;
2830 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2831 trans = &exec->state->trans[exec->transno];
2832 if (trans->to < 0)
2833 continue;
2834 atom = trans->atom;
2835 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00002836 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
2837 int i;
2838 int count;
2839 xmlRegTransPtr t;
2840 xmlRegCounterPtr counter;
2841
2842 ret = 0;
2843
2844#ifdef DEBUG_PUSH
2845 printf("testing all lax %d\n", trans->count);
2846#endif
2847 /*
2848 * Check all counted transitions from the current state
2849 */
2850 if ((value == NULL) && (final)) {
2851 ret = 1;
2852 } else if (value != NULL) {
2853 for (i = 0;i < exec->state->nbTrans;i++) {
2854 t = &exec->state->trans[i];
2855 if ((t->counter < 0) || (t == trans))
2856 continue;
2857 counter = &exec->comp->counters[t->counter];
2858 count = exec->counts[t->counter];
2859 if ((count < counter->max) &&
2860 (t->atom != NULL) &&
2861 (xmlStrEqual(value, t->atom->valuep))) {
2862 ret = 0;
2863 break;
2864 }
2865 if ((count >= counter->min) &&
2866 (count < counter->max) &&
2867 (xmlStrEqual(value, t->atom->valuep))) {
2868 ret = 1;
2869 break;
2870 }
2871 }
2872 }
2873 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00002874 int i;
2875 int count;
2876 xmlRegTransPtr t;
2877 xmlRegCounterPtr counter;
2878
2879 ret = 1;
2880
2881#ifdef DEBUG_PUSH
2882 printf("testing all %d\n", trans->count);
2883#endif
2884 /*
2885 * Check all counted transitions from the current state
2886 */
2887 for (i = 0;i < exec->state->nbTrans;i++) {
2888 t = &exec->state->trans[i];
2889 if ((t->counter < 0) || (t == trans))
2890 continue;
2891 counter = &exec->comp->counters[t->counter];
2892 count = exec->counts[t->counter];
2893 if ((count < counter->min) || (count > counter->max)) {
2894 ret = 0;
2895 break;
2896 }
2897 }
2898 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002899 int count;
2900 xmlRegCounterPtr counter;
2901
2902 /*
2903 * A counted transition.
2904 */
2905
2906 count = exec->counts[trans->count];
2907 counter = &exec->comp->counters[trans->count];
2908#ifdef DEBUG_PUSH
2909 printf("testing count %d: val %d, min %d, max %d\n",
2910 trans->count, count, counter->min, counter->max);
2911#endif
2912 ret = ((count >= counter->min) && (count <= counter->max));
2913 } else if (atom == NULL) {
2914 fprintf(stderr, "epsilon transition left at runtime\n");
2915 exec->status = -2;
2916 break;
2917 } else if (value != NULL) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00002918 ret = xmlRegStrEqualWildcard(atom->valuep, value);
Daniel Veillard9efc4762005-07-19 14:33:55 +00002919 if (atom->neg)
2920 ret = !ret;
Daniel Veillard441bc322002-04-20 17:38:48 +00002921 if ((ret == 1) && (trans->counter >= 0)) {
2922 xmlRegCounterPtr counter;
2923 int count;
2924
2925 count = exec->counts[trans->counter];
2926 counter = &exec->comp->counters[trans->counter];
2927 if (count >= counter->max)
2928 ret = 0;
2929 }
2930
Daniel Veillard4255d502002-04-16 15:50:10 +00002931 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2932 xmlRegStatePtr to = exec->comp->states[trans->to];
2933
2934 /*
2935 * this is a multiple input sequence
2936 */
2937 if (exec->state->nbTrans > exec->transno + 1) {
2938 if (exec->inputStackNr <= 0) {
2939 xmlFARegExecSaveInputString(exec, value, data);
2940 }
2941 xmlFARegExecSave(exec);
2942 }
2943 exec->transcount = 1;
2944 do {
2945 /*
2946 * Try to progress as much as possible on the input
2947 */
2948 if (exec->transcount == atom->max) {
2949 break;
2950 }
2951 exec->index++;
2952 value = exec->inputStack[exec->index].value;
2953 data = exec->inputStack[exec->index].data;
2954#ifdef DEBUG_PUSH
2955 printf("value loaded: %s\n", value);
2956#endif
2957
2958 /*
2959 * End of input: stop here
2960 */
2961 if (value == NULL) {
2962 exec->index --;
2963 break;
2964 }
2965 if (exec->transcount >= atom->min) {
2966 int transno = exec->transno;
2967 xmlRegStatePtr state = exec->state;
2968
2969 /*
2970 * The transition is acceptable save it
2971 */
2972 exec->transno = -1; /* trick */
2973 exec->state = to;
2974 if (exec->inputStackNr <= 0) {
2975 xmlFARegExecSaveInputString(exec, value, data);
2976 }
2977 xmlFARegExecSave(exec);
2978 exec->transno = transno;
2979 exec->state = state;
2980 }
2981 ret = xmlStrEqual(value, atom->valuep);
2982 exec->transcount++;
2983 } while (ret == 1);
2984 if (exec->transcount < atom->min)
2985 ret = 0;
2986
2987 /*
2988 * If the last check failed but one transition was found
2989 * possible, rollback
2990 */
2991 if (ret < 0)
2992 ret = 0;
2993 if (ret == 0) {
2994 goto rollback;
2995 }
2996 }
2997 }
2998 if (ret == 1) {
William M. Brack98873952003-12-26 06:03:14 +00002999 if ((exec->callback != NULL) && (atom != NULL) &&
3000 (data != NULL)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003001 exec->callback(exec->data, atom->valuep,
3002 atom->data, data);
3003 }
3004 if (exec->state->nbTrans > exec->transno + 1) {
3005 if (exec->inputStackNr <= 0) {
3006 xmlFARegExecSaveInputString(exec, value, data);
3007 }
3008 xmlFARegExecSave(exec);
3009 }
3010 if (trans->counter >= 0) {
3011#ifdef DEBUG_PUSH
3012 printf("Increasing count %d\n", trans->counter);
3013#endif
3014 exec->counts[trans->counter]++;
3015 }
3016#ifdef DEBUG_PUSH
3017 printf("entering state %d\n", trans->to);
3018#endif
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003019 if ((exec->comp->states[trans->to] != NULL) &&
3020 (exec->comp->states[trans->to]->type ==
3021 XML_REGEXP_SINK_STATE)) {
3022 /*
3023 * entering a sink state, save the current state as error
3024 * state.
3025 */
3026 if (exec->errString != NULL)
3027 xmlFree(exec->errString);
3028 exec->errString = xmlStrdup(value);
3029 exec->errState = exec->state;
3030 memcpy(exec->errCounts, exec->counts,
3031 exec->comp->nbCounters * sizeof(int));
3032 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003033 exec->state = exec->comp->states[trans->to];
3034 exec->transno = 0;
3035 if (trans->atom != NULL) {
3036 if (exec->inputStack != NULL) {
3037 exec->index++;
3038 if (exec->index < exec->inputStackNr) {
3039 value = exec->inputStack[exec->index].value;
3040 data = exec->inputStack[exec->index].data;
3041#ifdef DEBUG_PUSH
3042 printf("value loaded: %s\n", value);
3043#endif
3044 } else {
3045 value = NULL;
3046 data = NULL;
3047#ifdef DEBUG_PUSH
3048 printf("end of input\n");
3049#endif
3050 }
3051 } else {
3052 value = NULL;
3053 data = NULL;
3054#ifdef DEBUG_PUSH
3055 printf("end of input\n");
3056#endif
3057 }
3058 }
3059 goto progress;
3060 } else if (ret < 0) {
3061 exec->status = -4;
3062 break;
3063 }
3064 }
3065 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3066rollback:
Daniel Veillard90700152005-01-08 22:05:09 +00003067 /*
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003068 * if we didn't yet rollback on the current input
3069 * store the current state as the error state.
Daniel Veillard90700152005-01-08 22:05:09 +00003070 */
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003071 if ((progress) && (exec->state != NULL) &&
3072 (exec->state->type != XML_REGEXP_SINK_STATE)) {
Daniel Veillard90700152005-01-08 22:05:09 +00003073 progress = 0;
3074 if (exec->errString != NULL)
3075 xmlFree(exec->errString);
3076 exec->errString = xmlStrdup(value);
3077 exec->errState = exec->state;
3078 memcpy(exec->errCounts, exec->counts,
3079 exec->comp->nbCounters * sizeof(int));
3080 }
3081
Daniel Veillard4255d502002-04-16 15:50:10 +00003082 /*
3083 * Failed to find a way out
3084 */
3085 exec->determinist = 0;
3086 xmlFARegExecRollBack(exec);
3087 if (exec->status == 0) {
3088 value = exec->inputStack[exec->index].value;
3089 data = exec->inputStack[exec->index].data;
3090#ifdef DEBUG_PUSH
3091 printf("value loaded: %s\n", value);
3092#endif
3093 }
3094 }
Daniel Veillard90700152005-01-08 22:05:09 +00003095 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00003096progress:
Daniel Veillard90700152005-01-08 22:05:09 +00003097 progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003098 continue;
3099 }
3100 if (exec->status == 0) {
3101 return(exec->state->type == XML_REGEXP_FINAL_STATE);
3102 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003103#ifdef DEBUG_ERR
Daniel Veillard90700152005-01-08 22:05:09 +00003104 if (exec->status < 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003105 testerr(exec);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003106 }
Daniel Veillard90700152005-01-08 22:05:09 +00003107#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003108 return(exec->status);
3109}
3110
Daniel Veillard52b48c72003-04-13 19:53:42 +00003111/**
3112 * xmlRegExecPushString2:
3113 * @exec: a regexp execution context or NULL to indicate the end
3114 * @value: the first string token input
3115 * @value2: the second string token input
3116 * @data: data associated to the token to reuse in callbacks
3117 *
3118 * Push one input token in the execution context
3119 *
3120 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3121 * a negative value in case of error.
3122 */
3123int
3124xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
3125 const xmlChar *value2, void *data) {
3126 xmlChar buf[150];
3127 int lenn, lenp, ret;
3128 xmlChar *str;
3129
3130 if (exec == NULL)
3131 return(-1);
3132 if (exec->comp == NULL)
3133 return(-1);
3134 if (exec->status != 0)
3135 return(exec->status);
3136
3137 if (value2 == NULL)
3138 return(xmlRegExecPushString(exec, value, data));
3139
3140 lenn = strlen((char *) value2);
3141 lenp = strlen((char *) value);
3142
3143 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003144 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003145 if (str == NULL) {
3146 exec->status = -1;
3147 return(-1);
3148 }
3149 } else {
3150 str = buf;
3151 }
3152 memcpy(&str[0], value, lenp);
Daniel Veillardc0826a72004-08-10 14:17:33 +00003153 str[lenp] = XML_REG_STRING_SEPARATOR;
Daniel Veillard52b48c72003-04-13 19:53:42 +00003154 memcpy(&str[lenp + 1], value2, lenn);
3155 str[lenn + lenp + 1] = 0;
3156
3157 if (exec->comp->compact != NULL)
3158 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
3159 else
3160 ret = xmlRegExecPushString(exec, str, data);
3161
3162 if (str != buf)
3163 xmlFree(buf);
3164 return(ret);
3165}
3166
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003167/**
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003168 * xmlRegExecGetalues:
3169 * @exec: a regexp execution context
3170 * @err: error extraction or normal one
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003171 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003172 * @nbneg: return number of negative transitions
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003173 * @values: pointer to the array of acceptable values
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003174 * @terminal: return value if this was a terminal state
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003175 *
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003176 * Extract informations from the regexp execution, internal routine to
3177 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003178 *
3179 * Returns: 0 in case of success or -1 in case of error.
3180 */
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003181static int
3182xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003183 int *nbval, int *nbneg,
3184 xmlChar **values, int *terminal) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003185 int maxval;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003186 int nb = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003187
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003188 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
3189 (values == NULL) || (*nbval <= 0))
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003190 return(-1);
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003191
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003192 maxval = *nbval;
3193 *nbval = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003194 *nbneg = 0;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003195 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
3196 xmlRegexpPtr comp;
3197 int target, i, state;
3198
3199 comp = exec->comp;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003200
3201 if (err) {
3202 if (exec->errStateNo == -1) return(-1);
3203 state = exec->errStateNo;
3204 } else {
3205 state = exec->index;
3206 }
3207 if (terminal != NULL) {
3208 if (comp->compact[state * (comp->nbstrings + 1)] ==
3209 XML_REGEXP_FINAL_STATE)
3210 *terminal = 1;
3211 else
3212 *terminal = 0;
3213 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003214 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003215 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003216 if ((target > 0) && (target <= comp->nbstates) &&
3217 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
3218 XML_REGEXP_SINK_STATE)) {
3219 values[nb++] = comp->stringMap[i];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003220 (*nbval)++;
3221 }
3222 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003223 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
3224 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3225 if ((target > 0) && (target <= comp->nbstates) &&
3226 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
3227 XML_REGEXP_SINK_STATE)) {
3228 values[nb++] = comp->stringMap[i];
3229 (*nbneg)++;
3230 }
3231 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003232 } else {
3233 int transno;
3234 xmlRegTransPtr trans;
3235 xmlRegAtomPtr atom;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003236 xmlRegStatePtr state;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003237
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003238 if (terminal != NULL) {
3239 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3240 *terminal = 1;
3241 else
3242 *terminal = 0;
3243 }
3244
3245 if (err) {
3246 if (exec->errState == NULL) return(-1);
3247 state = exec->errState;
3248 } else {
3249 if (exec->state == NULL) return(-1);
3250 state = exec->state;
3251 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003252 for (transno = 0;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003253 (transno < state->nbTrans) && (nb < maxval);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003254 transno++) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003255 trans = &state->trans[transno];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003256 if (trans->to < 0)
3257 continue;
3258 atom = trans->atom;
3259 if ((atom == NULL) || (atom->valuep == NULL))
3260 continue;
3261 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003262 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003263 TODO;
3264 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003265 /* this should not be reached but ... */
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003266 TODO;
3267 } else if (trans->counter >= 0) {
3268 xmlRegCounterPtr counter;
3269 int count;
3270
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003271 if (err)
3272 count = exec->errCounts[trans->counter];
3273 else
3274 count = exec->counts[trans->counter];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003275 counter = &exec->comp->counters[trans->counter];
3276 if (count < counter->max) {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003277 values[nb++] = (xmlChar *) atom->valuep;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003278 (*nbval)++;
3279 }
3280 } else {
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003281 if ((exec->comp->states[trans->to] != NULL) &&
3282 (exec->comp->states[trans->to]->type !=
3283 XML_REGEXP_SINK_STATE)) {
3284 values[nb++] = (xmlChar *) atom->valuep;
3285 (*nbval)++;
3286 }
3287 }
3288 }
3289 for (transno = 0;
3290 (transno < state->nbTrans) && (nb < maxval);
3291 transno++) {
3292 trans = &state->trans[transno];
3293 if (trans->to < 0)
3294 continue;
3295 atom = trans->atom;
3296 if ((atom == NULL) || (atom->valuep == NULL))
3297 continue;
3298 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3299 continue;
3300 } else if (trans->count == REGEXP_ALL_COUNTER) {
3301 continue;
3302 } else if (trans->counter >= 0) {
3303 continue;
3304 } else {
3305 if ((exec->comp->states[trans->to] != NULL) &&
3306 (exec->comp->states[trans->to]->type ==
3307 XML_REGEXP_SINK_STATE)) {
3308 values[nb++] = (xmlChar *) atom->valuep;
3309 (*nbneg)++;
3310 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003311 }
3312 }
3313 }
3314 return(0);
3315}
3316
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003317/**
3318 * xmlRegExecNextValues:
3319 * @exec: a regexp execution context
3320 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003321 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003322 * @values: pointer to the array of acceptable values
3323 * @terminal: return value if this was a terminal state
3324 *
3325 * Extract informations from the regexp execution,
3326 * the parameter @values must point to an array of @nbval string pointers
3327 * on return nbval will contain the number of possible strings in that
3328 * state and the @values array will be updated with them. The string values
3329 * returned will be freed with the @exec context and don't need to be
3330 * deallocated.
3331 *
3332 * Returns: 0 in case of success or -1 in case of error.
3333 */
3334int
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003335xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
3336 xmlChar **values, int *terminal) {
3337 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003338}
3339
3340/**
3341 * xmlRegExecErrInfo:
3342 * @exec: a regexp execution context generating an error
3343 * @string: return value for the error string
3344 * @nbval: pointer to the number of accepted values IN/OUT
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003345 * @nbneg: return number of negative transitions
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003346 * @values: pointer to the array of acceptable values
3347 * @terminal: return value if this was a terminal state
3348 *
3349 * Extract error informations from the regexp execution, the parameter
3350 * @string will be updated with the value pushed and not accepted,
3351 * the parameter @values must point to an array of @nbval string pointers
3352 * on return nbval will contain the number of possible strings in that
3353 * state and the @values array will be updated with them. The string values
3354 * returned will be freed with the @exec context and don't need to be
3355 * deallocated.
3356 *
3357 * Returns: 0 in case of success or -1 in case of error.
3358 */
3359int
3360xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003361 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003362 if (exec == NULL)
3363 return(-1);
3364 if (string != NULL) {
3365 if (exec->status != 0)
3366 *string = exec->errString;
3367 else
3368 *string = NULL;
3369 }
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003370 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003371}
3372
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003373#ifdef DEBUG_ERR
3374static void testerr(xmlRegExecCtxtPtr exec) {
3375 const xmlChar *string;
Daniel Veillardcee2b3a2005-01-25 00:22:52 +00003376 xmlChar *values[5];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003377 int nb = 5;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003378 int nbneg;
Daniel Veillardfc0b6f62005-01-09 17:48:02 +00003379 int terminal;
Daniel Veillardcc026dc2005-01-12 13:21:17 +00003380 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003381}
3382#endif
3383
Daniel Veillard4255d502002-04-16 15:50:10 +00003384#if 0
3385static int
3386xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
3387 xmlRegTransPtr trans;
3388 xmlRegAtomPtr atom;
3389 int ret;
3390 int codepoint, len;
3391
3392 if (exec == NULL)
3393 return(-1);
3394 if (exec->status != 0)
3395 return(exec->status);
3396
3397 while ((exec->status == 0) &&
3398 ((exec->inputString[exec->index] != 0) ||
3399 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
3400
3401 /*
3402 * End of input on non-terminal state, rollback, however we may
3403 * still have epsilon like transition for counted transitions
3404 * on counters, in that case don't break too early.
3405 */
3406 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
3407 goto rollback;
3408
3409 exec->transcount = 0;
3410 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3411 trans = &exec->state->trans[exec->transno];
3412 if (trans->to < 0)
3413 continue;
3414 atom = trans->atom;
3415 ret = 0;
3416 if (trans->count >= 0) {
3417 int count;
3418 xmlRegCounterPtr counter;
3419
3420 /*
3421 * A counted transition.
3422 */
3423
3424 count = exec->counts[trans->count];
3425 counter = &exec->comp->counters[trans->count];
3426#ifdef DEBUG_REGEXP_EXEC
3427 printf("testing count %d: val %d, min %d, max %d\n",
3428 trans->count, count, counter->min, counter->max);
3429#endif
3430 ret = ((count >= counter->min) && (count <= counter->max));
3431 } else if (atom == NULL) {
3432 fprintf(stderr, "epsilon transition left at runtime\n");
3433 exec->status = -2;
3434 break;
3435 } else if (exec->inputString[exec->index] != 0) {
3436 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3437 ret = xmlRegCheckCharacter(atom, codepoint);
3438 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3439 xmlRegStatePtr to = exec->comp->states[trans->to];
3440
3441 /*
3442 * this is a multiple input sequence
3443 */
3444 if (exec->state->nbTrans > exec->transno + 1) {
3445 xmlFARegExecSave(exec);
3446 }
3447 exec->transcount = 1;
3448 do {
3449 /*
3450 * Try to progress as much as possible on the input
3451 */
3452 if (exec->transcount == atom->max) {
3453 break;
3454 }
3455 exec->index += len;
3456 /*
3457 * End of input: stop here
3458 */
3459 if (exec->inputString[exec->index] == 0) {
3460 exec->index -= len;
3461 break;
3462 }
3463 if (exec->transcount >= atom->min) {
3464 int transno = exec->transno;
3465 xmlRegStatePtr state = exec->state;
3466
3467 /*
3468 * The transition is acceptable save it
3469 */
3470 exec->transno = -1; /* trick */
3471 exec->state = to;
3472 xmlFARegExecSave(exec);
3473 exec->transno = transno;
3474 exec->state = state;
3475 }
3476 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3477 len);
3478 ret = xmlRegCheckCharacter(atom, codepoint);
3479 exec->transcount++;
3480 } while (ret == 1);
3481 if (exec->transcount < atom->min)
3482 ret = 0;
3483
3484 /*
3485 * If the last check failed but one transition was found
3486 * possible, rollback
3487 */
3488 if (ret < 0)
3489 ret = 0;
3490 if (ret == 0) {
3491 goto rollback;
3492 }
3493 }
3494 }
3495 if (ret == 1) {
3496 if (exec->state->nbTrans > exec->transno + 1) {
3497 xmlFARegExecSave(exec);
3498 }
3499 if (trans->counter >= 0) {
3500#ifdef DEBUG_REGEXP_EXEC
3501 printf("Increasing count %d\n", trans->counter);
3502#endif
3503 exec->counts[trans->counter]++;
3504 }
3505#ifdef DEBUG_REGEXP_EXEC
3506 printf("entering state %d\n", trans->to);
3507#endif
3508 exec->state = exec->comp->states[trans->to];
3509 exec->transno = 0;
3510 if (trans->atom != NULL) {
3511 exec->index += len;
3512 }
3513 goto progress;
3514 } else if (ret < 0) {
3515 exec->status = -4;
3516 break;
3517 }
3518 }
3519 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3520rollback:
3521 /*
3522 * Failed to find a way out
3523 */
3524 exec->determinist = 0;
3525 xmlFARegExecRollBack(exec);
3526 }
3527progress:
3528 continue;
3529 }
3530}
3531#endif
3532/************************************************************************
3533 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003534 * Parser for the Schemas Datatype Regular Expressions *
Daniel Veillard4255d502002-04-16 15:50:10 +00003535 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
3536 * *
3537 ************************************************************************/
3538
3539/**
3540 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00003541 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003542 *
3543 * [10] Char ::= [^.\?*+()|#x5B#x5D]
3544 */
3545static int
3546xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
3547 int cur;
3548 int len;
3549
3550 cur = CUR_SCHAR(ctxt->cur, len);
3551 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
3552 (cur == '*') || (cur == '+') || (cur == '(') ||
3553 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
3554 (cur == 0x5D) || (cur == 0))
3555 return(-1);
3556 return(cur);
3557}
3558
3559/**
3560 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00003561 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003562 *
3563 * [27] charProp ::= IsCategory | IsBlock
3564 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
3565 * Separators | Symbols | Others
3566 * [29] Letters ::= 'L' [ultmo]?
3567 * [30] Marks ::= 'M' [nce]?
3568 * [31] Numbers ::= 'N' [dlo]?
3569 * [32] Punctuation ::= 'P' [cdseifo]?
3570 * [33] Separators ::= 'Z' [slp]?
3571 * [34] Symbols ::= 'S' [mcko]?
3572 * [35] Others ::= 'C' [cfon]?
3573 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
3574 */
3575static void
3576xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
3577 int cur;
William M. Brack779af002003-08-01 15:55:39 +00003578 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003579 xmlChar *blockName = NULL;
3580
3581 cur = CUR;
3582 if (cur == 'L') {
3583 NEXT;
3584 cur = CUR;
3585 if (cur == 'u') {
3586 NEXT;
3587 type = XML_REGEXP_LETTER_UPPERCASE;
3588 } else if (cur == 'l') {
3589 NEXT;
3590 type = XML_REGEXP_LETTER_LOWERCASE;
3591 } else if (cur == 't') {
3592 NEXT;
3593 type = XML_REGEXP_LETTER_TITLECASE;
3594 } else if (cur == 'm') {
3595 NEXT;
3596 type = XML_REGEXP_LETTER_MODIFIER;
3597 } else if (cur == 'o') {
3598 NEXT;
3599 type = XML_REGEXP_LETTER_OTHERS;
3600 } else {
3601 type = XML_REGEXP_LETTER;
3602 }
3603 } else if (cur == 'M') {
3604 NEXT;
3605 cur = CUR;
3606 if (cur == 'n') {
3607 NEXT;
3608 /* nonspacing */
3609 type = XML_REGEXP_MARK_NONSPACING;
3610 } else if (cur == 'c') {
3611 NEXT;
3612 /* spacing combining */
3613 type = XML_REGEXP_MARK_SPACECOMBINING;
3614 } else if (cur == 'e') {
3615 NEXT;
3616 /* enclosing */
3617 type = XML_REGEXP_MARK_ENCLOSING;
3618 } else {
3619 /* all marks */
3620 type = XML_REGEXP_MARK;
3621 }
3622 } else if (cur == 'N') {
3623 NEXT;
3624 cur = CUR;
3625 if (cur == 'd') {
3626 NEXT;
3627 /* digital */
3628 type = XML_REGEXP_NUMBER_DECIMAL;
3629 } else if (cur == 'l') {
3630 NEXT;
3631 /* letter */
3632 type = XML_REGEXP_NUMBER_LETTER;
3633 } else if (cur == 'o') {
3634 NEXT;
3635 /* other */
3636 type = XML_REGEXP_NUMBER_OTHERS;
3637 } else {
3638 /* all numbers */
3639 type = XML_REGEXP_NUMBER;
3640 }
3641 } else if (cur == 'P') {
3642 NEXT;
3643 cur = CUR;
3644 if (cur == 'c') {
3645 NEXT;
3646 /* connector */
3647 type = XML_REGEXP_PUNCT_CONNECTOR;
3648 } else if (cur == 'd') {
3649 NEXT;
3650 /* dash */
3651 type = XML_REGEXP_PUNCT_DASH;
3652 } else if (cur == 's') {
3653 NEXT;
3654 /* open */
3655 type = XML_REGEXP_PUNCT_OPEN;
3656 } else if (cur == 'e') {
3657 NEXT;
3658 /* close */
3659 type = XML_REGEXP_PUNCT_CLOSE;
3660 } else if (cur == 'i') {
3661 NEXT;
3662 /* initial quote */
3663 type = XML_REGEXP_PUNCT_INITQUOTE;
3664 } else if (cur == 'f') {
3665 NEXT;
3666 /* final quote */
3667 type = XML_REGEXP_PUNCT_FINQUOTE;
3668 } else if (cur == 'o') {
3669 NEXT;
3670 /* other */
3671 type = XML_REGEXP_PUNCT_OTHERS;
3672 } else {
3673 /* all punctuation */
3674 type = XML_REGEXP_PUNCT;
3675 }
3676 } else if (cur == 'Z') {
3677 NEXT;
3678 cur = CUR;
3679 if (cur == 's') {
3680 NEXT;
3681 /* space */
3682 type = XML_REGEXP_SEPAR_SPACE;
3683 } else if (cur == 'l') {
3684 NEXT;
3685 /* line */
3686 type = XML_REGEXP_SEPAR_LINE;
3687 } else if (cur == 'p') {
3688 NEXT;
3689 /* paragraph */
3690 type = XML_REGEXP_SEPAR_PARA;
3691 } else {
3692 /* all separators */
3693 type = XML_REGEXP_SEPAR;
3694 }
3695 } else if (cur == 'S') {
3696 NEXT;
3697 cur = CUR;
3698 if (cur == 'm') {
3699 NEXT;
3700 type = XML_REGEXP_SYMBOL_MATH;
3701 /* math */
3702 } else if (cur == 'c') {
3703 NEXT;
3704 type = XML_REGEXP_SYMBOL_CURRENCY;
3705 /* currency */
3706 } else if (cur == 'k') {
3707 NEXT;
3708 type = XML_REGEXP_SYMBOL_MODIFIER;
3709 /* modifiers */
3710 } else if (cur == 'o') {
3711 NEXT;
3712 type = XML_REGEXP_SYMBOL_OTHERS;
3713 /* other */
3714 } else {
3715 /* all symbols */
3716 type = XML_REGEXP_SYMBOL;
3717 }
3718 } else if (cur == 'C') {
3719 NEXT;
3720 cur = CUR;
3721 if (cur == 'c') {
3722 NEXT;
3723 /* control */
3724 type = XML_REGEXP_OTHER_CONTROL;
3725 } else if (cur == 'f') {
3726 NEXT;
3727 /* format */
3728 type = XML_REGEXP_OTHER_FORMAT;
3729 } else if (cur == 'o') {
3730 NEXT;
3731 /* private use */
3732 type = XML_REGEXP_OTHER_PRIVATE;
3733 } else if (cur == 'n') {
3734 NEXT;
3735 /* not assigned */
3736 type = XML_REGEXP_OTHER_NA;
3737 } else {
3738 /* all others */
3739 type = XML_REGEXP_OTHER;
3740 }
3741 } else if (cur == 'I') {
3742 const xmlChar *start;
3743 NEXT;
3744 cur = CUR;
3745 if (cur != 's') {
3746 ERROR("IsXXXX expected");
3747 return;
3748 }
3749 NEXT;
3750 start = ctxt->cur;
3751 cur = CUR;
3752 if (((cur >= 'a') && (cur <= 'z')) ||
3753 ((cur >= 'A') && (cur <= 'Z')) ||
3754 ((cur >= '0') && (cur <= '9')) ||
3755 (cur == 0x2D)) {
3756 NEXT;
3757 cur = CUR;
3758 while (((cur >= 'a') && (cur <= 'z')) ||
3759 ((cur >= 'A') && (cur <= 'Z')) ||
3760 ((cur >= '0') && (cur <= '9')) ||
3761 (cur == 0x2D)) {
3762 NEXT;
3763 cur = CUR;
3764 }
3765 }
3766 type = XML_REGEXP_BLOCK_NAME;
3767 blockName = xmlStrndup(start, ctxt->cur - start);
3768 } else {
3769 ERROR("Unknown char property");
3770 return;
3771 }
3772 if (ctxt->atom == NULL) {
3773 ctxt->atom = xmlRegNewAtom(ctxt, type);
3774 if (ctxt->atom != NULL)
3775 ctxt->atom->valuep = blockName;
3776 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3777 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3778 type, 0, 0, blockName);
3779 }
3780}
3781
3782/**
3783 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00003784 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003785 *
3786 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3787 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3788 * [25] catEsc ::= '\p{' charProp '}'
3789 * [26] complEsc ::= '\P{' charProp '}'
3790 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3791 */
3792static void
3793xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
3794 int cur;
3795
3796 if (CUR == '.') {
3797 if (ctxt->atom == NULL) {
3798 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
3799 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3800 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3801 XML_REGEXP_ANYCHAR, 0, 0, NULL);
3802 }
3803 NEXT;
3804 return;
3805 }
3806 if (CUR != '\\') {
3807 ERROR("Escaped sequence: expecting \\");
3808 return;
3809 }
3810 NEXT;
3811 cur = CUR;
3812 if (cur == 'p') {
3813 NEXT;
3814 if (CUR != '{') {
3815 ERROR("Expecting '{'");
3816 return;
3817 }
3818 NEXT;
3819 xmlFAParseCharProp(ctxt);
3820 if (CUR != '}') {
3821 ERROR("Expecting '}'");
3822 return;
3823 }
3824 NEXT;
3825 } else if (cur == 'P') {
3826 NEXT;
3827 if (CUR != '{') {
3828 ERROR("Expecting '{'");
3829 return;
3830 }
3831 NEXT;
3832 xmlFAParseCharProp(ctxt);
3833 ctxt->atom->neg = 1;
3834 if (CUR != '}') {
3835 ERROR("Expecting '}'");
3836 return;
3837 }
3838 NEXT;
3839 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
3840 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
3841 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
3842 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
3843 (cur == 0x5E)) {
3844 if (ctxt->atom == NULL) {
3845 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
Daniel Veillard99c394d2005-07-14 12:58:49 +00003846 if (ctxt->atom != NULL) {
3847 switch (cur) {
3848 case 'n':
3849 ctxt->atom->codepoint = '\n';
3850 break;
3851 case 'r':
3852 ctxt->atom->codepoint = '\r';
3853 break;
3854 case 't':
3855 ctxt->atom->codepoint = '\t';
3856 break;
3857 default:
3858 ctxt->atom->codepoint = cur;
3859 }
3860 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003861 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3862 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3863 XML_REGEXP_CHARVAL, cur, cur, NULL);
3864 }
3865 NEXT;
3866 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
3867 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
3868 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00003869 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00003870
3871 switch (cur) {
3872 case 's':
3873 type = XML_REGEXP_ANYSPACE;
3874 break;
3875 case 'S':
3876 type = XML_REGEXP_NOTSPACE;
3877 break;
3878 case 'i':
3879 type = XML_REGEXP_INITNAME;
3880 break;
3881 case 'I':
3882 type = XML_REGEXP_NOTINITNAME;
3883 break;
3884 case 'c':
3885 type = XML_REGEXP_NAMECHAR;
3886 break;
3887 case 'C':
3888 type = XML_REGEXP_NOTNAMECHAR;
3889 break;
3890 case 'd':
3891 type = XML_REGEXP_DECIMAL;
3892 break;
3893 case 'D':
3894 type = XML_REGEXP_NOTDECIMAL;
3895 break;
3896 case 'w':
3897 type = XML_REGEXP_REALCHAR;
3898 break;
3899 case 'W':
3900 type = XML_REGEXP_NOTREALCHAR;
3901 break;
3902 }
3903 NEXT;
3904 if (ctxt->atom == NULL) {
3905 ctxt->atom = xmlRegNewAtom(ctxt, type);
3906 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3907 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3908 type, 0, 0, NULL);
3909 }
3910 }
3911}
3912
3913/**
3914 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00003915 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003916 *
3917 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3918 */
3919static int
3920xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
3921 int ret = 0, cur;
3922
3923 if ((CUR != '&') || (NXT(1) != '#'))
3924 return(-1);
3925 NEXT;
3926 NEXT;
3927 cur = CUR;
3928 if (cur == 'x') {
3929 NEXT;
3930 cur = CUR;
3931 if (((cur >= '0') && (cur <= '9')) ||
3932 ((cur >= 'a') && (cur <= 'f')) ||
3933 ((cur >= 'A') && (cur <= 'F'))) {
3934 while (((cur >= '0') && (cur <= '9')) ||
3935 ((cur >= 'A') && (cur <= 'F'))) {
3936 if ((cur >= '0') && (cur <= '9'))
3937 ret = ret * 16 + cur - '0';
3938 else if ((cur >= 'a') && (cur <= 'f'))
3939 ret = ret * 16 + 10 + (cur - 'a');
3940 else
3941 ret = ret * 16 + 10 + (cur - 'A');
3942 NEXT;
3943 cur = CUR;
3944 }
3945 } else {
3946 ERROR("Char ref: expecting [0-9A-F]");
3947 return(-1);
3948 }
3949 } else {
3950 if ((cur >= '0') && (cur <= '9')) {
3951 while ((cur >= '0') && (cur <= '9')) {
3952 ret = ret * 10 + cur - '0';
3953 NEXT;
3954 cur = CUR;
3955 }
3956 } else {
3957 ERROR("Char ref: expecting [0-9]");
3958 return(-1);
3959 }
3960 }
3961 if (cur != ';') {
3962 ERROR("Char ref: expecting ';'");
3963 return(-1);
3964 } else {
3965 NEXT;
3966 }
3967 return(ret);
3968}
3969
3970/**
3971 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00003972 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003973 *
3974 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
3975 * [18] seRange ::= charOrEsc '-' charOrEsc
3976 * [20] charOrEsc ::= XmlChar | SingleCharEsc
3977 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
3978 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
3979 */
3980static void
3981xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
William M. Brackdc99df92003-12-27 01:54:25 +00003982 int cur, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00003983 int start = -1;
3984 int end = -1;
3985
3986 if ((CUR == '&') && (NXT(1) == '#')) {
3987 end = start = xmlFAParseCharRef(ctxt);
3988 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3989 XML_REGEXP_CHARVAL, start, end, NULL);
3990 return;
3991 }
3992 cur = CUR;
3993 if (cur == '\\') {
3994 NEXT;
3995 cur = CUR;
3996 switch (cur) {
3997 case 'n': start = 0xA; break;
3998 case 'r': start = 0xD; break;
3999 case 't': start = 0x9; break;
4000 case '\\': case '|': case '.': case '-': case '^': case '?':
4001 case '*': case '+': case '{': case '}': case '(': case ')':
4002 case '[': case ']':
4003 start = cur; break;
4004 default:
4005 ERROR("Invalid escape value");
4006 return;
4007 }
4008 end = start;
William M. Brackdc99df92003-12-27 01:54:25 +00004009 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004010 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00004011 end = start = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004012 } else {
4013 ERROR("Expecting a char range");
4014 return;
4015 }
William M. Brackdc99df92003-12-27 01:54:25 +00004016 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004017 if (start == '-') {
4018 return;
4019 }
4020 cur = CUR;
William M. Brack10f1ef42004-03-20 14:51:25 +00004021 if ((cur != '-') || (NXT(1) == ']')) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004022 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4023 XML_REGEXP_CHARVAL, start, end, NULL);
4024 return;
4025 }
4026 NEXT;
4027 cur = CUR;
4028 if (cur == '\\') {
4029 NEXT;
4030 cur = CUR;
4031 switch (cur) {
4032 case 'n': end = 0xA; break;
4033 case 'r': end = 0xD; break;
4034 case 't': end = 0x9; break;
4035 case '\\': case '|': case '.': case '-': case '^': case '?':
4036 case '*': case '+': case '{': case '}': case '(': case ')':
4037 case '[': case ']':
4038 end = cur; break;
4039 default:
4040 ERROR("Invalid escape value");
4041 return;
4042 }
William M. Brackdc99df92003-12-27 01:54:25 +00004043 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00004044 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00004045 end = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004046 } else {
4047 ERROR("Expecting the end of a char range");
4048 return;
4049 }
William M. Brackdc99df92003-12-27 01:54:25 +00004050 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00004051 /* TODO check that the values are acceptable character ranges for XML */
4052 if (end < start) {
4053 ERROR("End of range is before start of range");
4054 } else {
4055 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4056 XML_REGEXP_CHARVAL, start, end, NULL);
4057 }
4058 return;
4059}
4060
4061/**
4062 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00004063 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004064 *
4065 * [14] posCharGroup ::= ( charRange | charClassEsc )+
4066 */
4067static void
4068xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
4069 do {
4070 if ((CUR == '\\') || (CUR == '.')) {
4071 xmlFAParseCharClassEsc(ctxt);
4072 } else {
4073 xmlFAParseCharRange(ctxt);
4074 }
4075 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
4076 (ctxt->error == 0));
4077}
4078
4079/**
4080 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00004081 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004082 *
4083 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
4084 * [15] negCharGroup ::= '^' posCharGroup
4085 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
4086 * [12] charClassExpr ::= '[' charGroup ']'
4087 */
4088static void
4089xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
4090 int n = ctxt->neg;
4091 while ((CUR != ']') && (ctxt->error == 0)) {
4092 if (CUR == '^') {
4093 int neg = ctxt->neg;
4094
4095 NEXT;
4096 ctxt->neg = !ctxt->neg;
4097 xmlFAParsePosCharGroup(ctxt);
4098 ctxt->neg = neg;
William M. Brack10f1ef42004-03-20 14:51:25 +00004099 } else if ((CUR == '-') && (NXT(1) == '[')) {
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004100 int neg = ctxt->neg;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004101 ctxt->neg = 2;
William M. Brack10f1ef42004-03-20 14:51:25 +00004102 NEXT; /* eat the '-' */
4103 NEXT; /* eat the '[' */
Daniel Veillard4255d502002-04-16 15:50:10 +00004104 xmlFAParseCharGroup(ctxt);
4105 if (CUR == ']') {
4106 NEXT;
4107 } else {
4108 ERROR("charClassExpr: ']' expected");
4109 break;
4110 }
Daniel Veillardf8b9de32003-11-24 14:27:26 +00004111 ctxt->neg = neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00004112 break;
4113 } else if (CUR != ']') {
4114 xmlFAParsePosCharGroup(ctxt);
4115 }
4116 }
4117 ctxt->neg = n;
4118}
4119
4120/**
4121 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00004122 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004123 *
4124 * [11] charClass ::= charClassEsc | charClassExpr
4125 * [12] charClassExpr ::= '[' charGroup ']'
4126 */
4127static void
4128xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
4129 if (CUR == '[') {
4130 NEXT;
4131 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
4132 if (ctxt->atom == NULL)
4133 return;
4134 xmlFAParseCharGroup(ctxt);
4135 if (CUR == ']') {
4136 NEXT;
4137 } else {
4138 ERROR("xmlFAParseCharClass: ']' expected");
4139 }
4140 } else {
4141 xmlFAParseCharClassEsc(ctxt);
4142 }
4143}
4144
4145/**
4146 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00004147 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004148 *
4149 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004150 *
4151 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00004152 */
4153static int
4154xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
4155 int ret = 0;
4156 int ok = 0;
4157
4158 while ((CUR >= '0') && (CUR <= '9')) {
4159 ret = ret * 10 + (CUR - '0');
4160 ok = 1;
4161 NEXT;
4162 }
4163 if (ok != 1) {
4164 return(-1);
4165 }
4166 return(ret);
4167}
4168
4169/**
4170 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00004171 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004172 *
4173 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
4174 * [5] quantity ::= quantRange | quantMin | QuantExact
4175 * [6] quantRange ::= QuantExact ',' QuantExact
4176 * [7] quantMin ::= QuantExact ','
4177 * [8] QuantExact ::= [0-9]+
4178 */
4179static int
4180xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
4181 int cur;
4182
4183 cur = CUR;
4184 if ((cur == '?') || (cur == '*') || (cur == '+')) {
4185 if (ctxt->atom != NULL) {
4186 if (cur == '?')
4187 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
4188 else if (cur == '*')
4189 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
4190 else if (cur == '+')
4191 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
4192 }
4193 NEXT;
4194 return(1);
4195 }
4196 if (cur == '{') {
4197 int min = 0, max = 0;
4198
4199 NEXT;
4200 cur = xmlFAParseQuantExact(ctxt);
4201 if (cur >= 0)
4202 min = cur;
4203 if (CUR == ',') {
4204 NEXT;
Daniel Veillardebe48c62003-12-03 12:12:27 +00004205 if (CUR == '}')
4206 max = INT_MAX;
4207 else {
4208 cur = xmlFAParseQuantExact(ctxt);
4209 if (cur >= 0)
4210 max = cur;
4211 else {
4212 ERROR("Improper quantifier");
4213 }
4214 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004215 }
4216 if (CUR == '}') {
4217 NEXT;
4218 } else {
4219 ERROR("Unterminated quantifier");
4220 }
4221 if (max == 0)
4222 max = min;
4223 if (ctxt->atom != NULL) {
4224 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
4225 ctxt->atom->min = min;
4226 ctxt->atom->max = max;
4227 }
4228 return(1);
4229 }
4230 return(0);
4231}
4232
4233/**
4234 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00004235 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004236 *
4237 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
4238 */
4239static int
4240xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
4241 int codepoint, len;
4242
4243 codepoint = xmlFAIsChar(ctxt);
4244 if (codepoint > 0) {
4245 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4246 if (ctxt->atom == NULL)
4247 return(-1);
4248 codepoint = CUR_SCHAR(ctxt->cur, len);
4249 ctxt->atom->codepoint = codepoint;
4250 NEXTL(len);
4251 return(1);
4252 } else if (CUR == '|') {
4253 return(0);
4254 } else if (CUR == 0) {
4255 return(0);
4256 } else if (CUR == ')') {
4257 return(0);
4258 } else if (CUR == '(') {
4259 xmlRegStatePtr start, oldend;
4260
4261 NEXT;
4262 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
4263 start = ctxt->state;
4264 oldend = ctxt->end;
4265 ctxt->end = NULL;
4266 ctxt->atom = NULL;
4267 xmlFAParseRegExp(ctxt, 0);
4268 if (CUR == ')') {
4269 NEXT;
4270 } else {
4271 ERROR("xmlFAParseAtom: expecting ')'");
4272 }
4273 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
4274 if (ctxt->atom == NULL)
4275 return(-1);
4276 ctxt->atom->start = start;
4277 ctxt->atom->stop = ctxt->state;
4278 ctxt->end = oldend;
4279 return(1);
4280 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
4281 xmlFAParseCharClass(ctxt);
4282 return(1);
4283 }
4284 return(0);
4285}
4286
4287/**
4288 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00004289 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004290 *
4291 * [3] piece ::= atom quantifier?
4292 */
4293static int
4294xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
4295 int ret;
4296
4297 ctxt->atom = NULL;
4298 ret = xmlFAParseAtom(ctxt);
4299 if (ret == 0)
4300 return(0);
4301 if (ctxt->atom == NULL) {
4302 ERROR("internal: no atom generated");
4303 }
4304 xmlFAParseQuantifier(ctxt);
4305 return(1);
4306}
4307
4308/**
4309 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00004310 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004311 *
4312 * [2] branch ::= piece*
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004313 8
Daniel Veillard4255d502002-04-16 15:50:10 +00004314 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004315static int
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004316xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004317 xmlRegStatePtr previous;
Daniel Veillard4255d502002-04-16 15:50:10 +00004318 int ret;
4319
4320 previous = ctxt->state;
4321 ret = xmlFAParsePiece(ctxt);
4322 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004323 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0)
4324 return(-1);
4325 previous = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00004326 ctxt->atom = NULL;
4327 }
4328 while ((ret != 0) && (ctxt->error == 0)) {
4329 ret = xmlFAParsePiece(ctxt);
4330 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004331 if (xmlFAGenerateTransitions(ctxt, previous, NULL,
4332 ctxt->atom) < 0)
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004333 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00004334 previous = ctxt->state;
4335 ctxt->atom = NULL;
4336 }
4337 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004338 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00004339}
4340
4341/**
4342 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00004343 * @ctxt: a regexp parser context
William M. Brackddf71d62004-05-06 04:17:26 +00004344 * @top: is this the top-level expression ?
Daniel Veillard4255d502002-04-16 15:50:10 +00004345 *
4346 * [1] regExp ::= branch ( '|' branch )*
4347 */
4348static void
4349xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
Daniel Veillardc7e3cc42004-09-28 12:33:52 +00004350 xmlRegStatePtr start, end;
Daniel Veillard4255d502002-04-16 15:50:10 +00004351
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004352 /* if not top start should have been generated by an epsilon trans */
Daniel Veillard4255d502002-04-16 15:50:10 +00004353 start = ctxt->state;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004354 ctxt->end = NULL;
4355 xmlFAParseBranch(ctxt);
4356 if (top) {
4357#ifdef DEBUG_REGEXP_GRAPH
4358 printf("State %d is final\n", ctxt->state->no);
4359#endif
4360 ctxt->state->type = XML_REGEXP_FINAL_STATE;
4361 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004362 if (CUR != '|') {
4363 ctxt->end = ctxt->state;
4364 return;
4365 }
4366 end = ctxt->state;
4367 while ((CUR == '|') && (ctxt->error == 0)) {
4368 NEXT;
4369 ctxt->state = start;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004370 ctxt->end = NULL;
4371 xmlFAParseBranch(ctxt);
4372 if (top) {
4373 ctxt->state->type = XML_REGEXP_FINAL_STATE;
4374#ifdef DEBUG_REGEXP_GRAPH
4375 printf("State %d is final\n", ctxt->state->no);
4376#endif
4377 } else {
4378 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end);
4379 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004380 }
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004381 if (!top) {
4382 ctxt->state = end;
4383 ctxt->end = end;
4384 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004385}
4386
4387/************************************************************************
4388 * *
4389 * The basic API *
4390 * *
4391 ************************************************************************/
4392
4393/**
4394 * xmlRegexpPrint:
4395 * @output: the file for the output debug
4396 * @regexp: the compiled regexp
4397 *
4398 * Print the content of the compiled regular expression
4399 */
4400void
4401xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
4402 int i;
4403
Daniel Veillarda82b1822004-11-08 16:24:57 +00004404 if (output == NULL)
4405 return;
Daniel Veillard4255d502002-04-16 15:50:10 +00004406 fprintf(output, " regexp: ");
4407 if (regexp == NULL) {
4408 fprintf(output, "NULL\n");
4409 return;
4410 }
4411 fprintf(output, "'%s' ", regexp->string);
4412 fprintf(output, "\n");
4413 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
4414 for (i = 0;i < regexp->nbAtoms; i++) {
4415 fprintf(output, " %02d ", i);
4416 xmlRegPrintAtom(output, regexp->atoms[i]);
4417 }
4418 fprintf(output, "%d states:", regexp->nbStates);
4419 fprintf(output, "\n");
4420 for (i = 0;i < regexp->nbStates; i++) {
4421 xmlRegPrintState(output, regexp->states[i]);
4422 }
4423 fprintf(output, "%d counters:\n", regexp->nbCounters);
4424 for (i = 0;i < regexp->nbCounters; i++) {
4425 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
4426 regexp->counters[i].max);
4427 }
4428}
4429
4430/**
4431 * xmlRegexpCompile:
4432 * @regexp: a regular expression string
4433 *
4434 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
William M. Brackddf71d62004-05-06 04:17:26 +00004435 * Appendix F and builds an automata suitable for testing strings against
Daniel Veillard4255d502002-04-16 15:50:10 +00004436 * that regular expression
4437 *
4438 * Returns the compiled expression or NULL in case of error
4439 */
4440xmlRegexpPtr
4441xmlRegexpCompile(const xmlChar *regexp) {
4442 xmlRegexpPtr ret;
4443 xmlRegParserCtxtPtr ctxt;
4444
4445 ctxt = xmlRegNewParserCtxt(regexp);
4446 if (ctxt == NULL)
4447 return(NULL);
4448
4449 /* initialize the parser */
4450 ctxt->end = NULL;
4451 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
4452 xmlRegStatePush(ctxt, ctxt->start);
4453
4454 /* parse the expression building an automata */
4455 xmlFAParseRegExp(ctxt, 1);
4456 if (CUR != 0) {
4457 ERROR("xmlFAParseRegExp: extra characters");
4458 }
4459 ctxt->end = ctxt->state;
4460 ctxt->start->type = XML_REGEXP_START_STATE;
4461 ctxt->end->type = XML_REGEXP_FINAL_STATE;
4462
4463 /* remove the Epsilon except for counted transitions */
4464 xmlFAEliminateEpsilonTransitions(ctxt);
4465
4466
4467 if (ctxt->error != 0) {
4468 xmlRegFreeParserCtxt(ctxt);
4469 return(NULL);
4470 }
4471 ret = xmlRegEpxFromParse(ctxt);
4472 xmlRegFreeParserCtxt(ctxt);
4473 return(ret);
4474}
4475
4476/**
4477 * xmlRegexpExec:
4478 * @comp: the compiled regular expression
4479 * @content: the value to check against the regular expression
4480 *
William M. Brackddf71d62004-05-06 04:17:26 +00004481 * Check if the regular expression generates the value
Daniel Veillard4255d502002-04-16 15:50:10 +00004482 *
William M. Brackddf71d62004-05-06 04:17:26 +00004483 * Returns 1 if it matches, 0 if not and a negative value in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00004484 */
4485int
4486xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
4487 if ((comp == NULL) || (content == NULL))
4488 return(-1);
4489 return(xmlFARegExec(comp, content));
4490}
4491
4492/**
Daniel Veillard23e73572002-09-19 19:56:43 +00004493 * xmlRegexpIsDeterminist:
4494 * @comp: the compiled regular expression
4495 *
4496 * Check if the regular expression is determinist
4497 *
William M. Brackddf71d62004-05-06 04:17:26 +00004498 * Returns 1 if it yes, 0 if not and a negative value in case of error
Daniel Veillard23e73572002-09-19 19:56:43 +00004499 */
4500int
4501xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
4502 xmlAutomataPtr am;
4503 int ret;
4504
4505 if (comp == NULL)
4506 return(-1);
4507 if (comp->determinist != -1)
4508 return(comp->determinist);
4509
4510 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00004511 if (am->states != NULL) {
4512 int i;
4513
4514 for (i = 0;i < am->nbStates;i++)
4515 xmlRegFreeState(am->states[i]);
4516 xmlFree(am->states);
4517 }
Daniel Veillard23e73572002-09-19 19:56:43 +00004518 am->nbAtoms = comp->nbAtoms;
4519 am->atoms = comp->atoms;
4520 am->nbStates = comp->nbStates;
4521 am->states = comp->states;
4522 am->determinist = -1;
4523 ret = xmlFAComputesDeterminism(am);
4524 am->atoms = NULL;
4525 am->states = NULL;
4526 xmlFreeAutomata(am);
4527 return(ret);
4528}
4529
4530/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004531 * xmlRegFreeRegexp:
4532 * @regexp: the regexp
4533 *
4534 * Free a regexp
4535 */
4536void
4537xmlRegFreeRegexp(xmlRegexpPtr regexp) {
4538 int i;
4539 if (regexp == NULL)
4540 return;
4541
4542 if (regexp->string != NULL)
4543 xmlFree(regexp->string);
4544 if (regexp->states != NULL) {
4545 for (i = 0;i < regexp->nbStates;i++)
4546 xmlRegFreeState(regexp->states[i]);
4547 xmlFree(regexp->states);
4548 }
4549 if (regexp->atoms != NULL) {
4550 for (i = 0;i < regexp->nbAtoms;i++)
4551 xmlRegFreeAtom(regexp->atoms[i]);
4552 xmlFree(regexp->atoms);
4553 }
4554 if (regexp->counters != NULL)
4555 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00004556 if (regexp->compact != NULL)
4557 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00004558 if (regexp->transdata != NULL)
4559 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00004560 if (regexp->stringMap != NULL) {
4561 for (i = 0; i < regexp->nbstrings;i++)
4562 xmlFree(regexp->stringMap[i]);
4563 xmlFree(regexp->stringMap);
4564 }
4565
Daniel Veillard4255d502002-04-16 15:50:10 +00004566 xmlFree(regexp);
4567}
4568
4569#ifdef LIBXML_AUTOMATA_ENABLED
4570/************************************************************************
4571 * *
4572 * The Automata interface *
4573 * *
4574 ************************************************************************/
4575
4576/**
4577 * xmlNewAutomata:
4578 *
4579 * Create a new automata
4580 *
4581 * Returns the new object or NULL in case of failure
4582 */
4583xmlAutomataPtr
4584xmlNewAutomata(void) {
4585 xmlAutomataPtr ctxt;
4586
4587 ctxt = xmlRegNewParserCtxt(NULL);
4588 if (ctxt == NULL)
4589 return(NULL);
4590
4591 /* initialize the parser */
4592 ctxt->end = NULL;
4593 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004594 if (ctxt->start == NULL) {
4595 xmlFreeAutomata(ctxt);
4596 return(NULL);
4597 }
4598 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
4599 xmlRegFreeState(ctxt->start);
4600 xmlFreeAutomata(ctxt);
4601 return(NULL);
4602 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004603
4604 return(ctxt);
4605}
4606
4607/**
4608 * xmlFreeAutomata:
4609 * @am: an automata
4610 *
4611 * Free an automata
4612 */
4613void
4614xmlFreeAutomata(xmlAutomataPtr am) {
4615 if (am == NULL)
4616 return;
4617 xmlRegFreeParserCtxt(am);
4618}
4619
4620/**
4621 * xmlAutomataGetInitState:
4622 * @am: an automata
4623 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004624 * Initial state lookup
4625 *
Daniel Veillard4255d502002-04-16 15:50:10 +00004626 * Returns the initial state of the automata
4627 */
4628xmlAutomataStatePtr
4629xmlAutomataGetInitState(xmlAutomataPtr am) {
4630 if (am == NULL)
4631 return(NULL);
4632 return(am->start);
4633}
4634
4635/**
4636 * xmlAutomataSetFinalState:
4637 * @am: an automata
4638 * @state: a state in this automata
4639 *
4640 * Makes that state a final state
4641 *
4642 * Returns 0 or -1 in case of error
4643 */
4644int
4645xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
4646 if ((am == NULL) || (state == NULL))
4647 return(-1);
4648 state->type = XML_REGEXP_FINAL_STATE;
4649 return(0);
4650}
4651
4652/**
4653 * xmlAutomataNewTransition:
4654 * @am: an automata
4655 * @from: the starting point of the transition
4656 * @to: the target point of the transition or NULL
4657 * @token: the input string associated to that transition
4658 * @data: data passed to the callback function if the transition is activated
4659 *
William M. Brackddf71d62004-05-06 04:17:26 +00004660 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00004661 * and then adds a transition from the @from state to the target state
4662 * activated by the value of @token
4663 *
4664 * Returns the target state or NULL in case of error
4665 */
4666xmlAutomataStatePtr
4667xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
4668 xmlAutomataStatePtr to, const xmlChar *token,
4669 void *data) {
4670 xmlRegAtomPtr atom;
4671
4672 if ((am == NULL) || (from == NULL) || (token == NULL))
4673 return(NULL);
4674 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004675 if (atom == NULL)
4676 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00004677 atom->data = data;
4678 if (atom == NULL)
4679 return(NULL);
4680 atom->valuep = xmlStrdup(token);
4681
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004682 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4683 xmlRegFreeAtom(atom);
4684 return(NULL);
4685 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004686 if (to == NULL)
4687 return(am->state);
4688 return(to);
4689}
4690
4691/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00004692 * xmlAutomataNewTransition2:
4693 * @am: an automata
4694 * @from: the starting point of the transition
4695 * @to: the target point of the transition or NULL
4696 * @token: the first input string associated to that transition
4697 * @token2: the second input string associated to that transition
4698 * @data: data passed to the callback function if the transition is activated
4699 *
William M. Brackddf71d62004-05-06 04:17:26 +00004700 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard52b48c72003-04-13 19:53:42 +00004701 * and then adds a transition from the @from state to the target state
4702 * activated by the value of @token
4703 *
4704 * Returns the target state or NULL in case of error
4705 */
4706xmlAutomataStatePtr
4707xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4708 xmlAutomataStatePtr to, const xmlChar *token,
4709 const xmlChar *token2, void *data) {
4710 xmlRegAtomPtr atom;
4711
4712 if ((am == NULL) || (from == NULL) || (token == NULL))
4713 return(NULL);
4714 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4715 atom->data = data;
4716 if (atom == NULL)
4717 return(NULL);
4718 if ((token2 == NULL) || (*token2 == 0)) {
4719 atom->valuep = xmlStrdup(token);
4720 } else {
4721 int lenn, lenp;
4722 xmlChar *str;
4723
4724 lenn = strlen((char *) token2);
4725 lenp = strlen((char *) token);
4726
Daniel Veillard3c908dc2003-04-19 00:07:51 +00004727 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004728 if (str == NULL) {
4729 xmlRegFreeAtom(atom);
4730 return(NULL);
4731 }
4732 memcpy(&str[0], token, lenp);
4733 str[lenp] = '|';
4734 memcpy(&str[lenp + 1], token2, lenn);
4735 str[lenn + lenp + 1] = 0;
4736
4737 atom->valuep = str;
4738 }
4739
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004740 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4741 xmlRegFreeAtom(atom);
4742 return(NULL);
4743 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00004744 if (to == NULL)
4745 return(am->state);
4746 return(to);
4747}
4748
4749/**
Daniel Veillard9efc4762005-07-19 14:33:55 +00004750 * xmlAutomataNewNegTrans:
4751 * @am: an automata
4752 * @from: the starting point of the transition
4753 * @to: the target point of the transition or NULL
4754 * @token: the first input string associated to that transition
4755 * @token2: the second input string associated to that transition
4756 * @data: data passed to the callback function if the transition is activated
4757 *
4758 * If @to is NULL, this creates first a new target state in the automata
4759 * and then adds a transition from the @from state to the target state
4760 * activated by any value except (@token,@token2)
4761 *
4762 * Returns the target state or NULL in case of error
4763 */
4764xmlAutomataStatePtr
4765xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4766 xmlAutomataStatePtr to, const xmlChar *token,
4767 const xmlChar *token2, void *data) {
4768 xmlRegAtomPtr atom;
4769
4770 if ((am == NULL) || (from == NULL) || (token == NULL))
4771 return(NULL);
4772 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4773 if (atom == NULL)
4774 return(NULL);
4775 atom->data = data;
4776 atom->neg = 1;
4777 if ((token2 == NULL) || (*token2 == 0)) {
4778 atom->valuep = xmlStrdup(token);
4779 } else {
4780 int lenn, lenp;
4781 xmlChar *str;
4782
4783 lenn = strlen((char *) token2);
4784 lenp = strlen((char *) token);
4785
4786 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4787 if (str == NULL) {
4788 xmlRegFreeAtom(atom);
4789 return(NULL);
4790 }
4791 memcpy(&str[0], token, lenp);
4792 str[lenp] = '|';
4793 memcpy(&str[lenp + 1], token2, lenn);
4794 str[lenn + lenp + 1] = 0;
4795
4796 atom->valuep = str;
4797 }
4798
4799 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4800 xmlRegFreeAtom(atom);
4801 return(NULL);
4802 }
4803 if (to == NULL)
4804 return(am->state);
4805 return(to);
4806}
4807
4808/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00004809 * xmlAutomataNewCountTrans2:
4810 * @am: an automata
4811 * @from: the starting point of the transition
4812 * @to: the target point of the transition or NULL
4813 * @token: the input string associated to that transition
4814 * @token2: the second input string associated to that transition
4815 * @min: the minimum successive occurences of token
4816 * @max: the maximum successive occurences of token
4817 * @data: data associated to the transition
4818 *
4819 * If @to is NULL, this creates first a new target state in the automata
4820 * and then adds a transition from the @from state to the target state
4821 * activated by a succession of input of value @token and @token2 and
4822 * whose number is between @min and @max
4823 *
4824 * Returns the target state or NULL in case of error
4825 */
4826xmlAutomataStatePtr
4827xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4828 xmlAutomataStatePtr to, const xmlChar *token,
4829 const xmlChar *token2,
4830 int min, int max, void *data) {
4831 xmlRegAtomPtr atom;
4832 int counter;
4833
4834 if ((am == NULL) || (from == NULL) || (token == NULL))
4835 return(NULL);
4836 if (min < 0)
4837 return(NULL);
4838 if ((max < min) || (max < 1))
4839 return(NULL);
4840 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4841 if (atom == NULL)
4842 return(NULL);
4843 if ((token2 == NULL) || (*token2 == 0)) {
4844 atom->valuep = xmlStrdup(token);
4845 } else {
4846 int lenn, lenp;
4847 xmlChar *str;
4848
4849 lenn = strlen((char *) token2);
4850 lenp = strlen((char *) token);
4851
4852 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4853 if (str == NULL) {
4854 xmlRegFreeAtom(atom);
4855 return(NULL);
4856 }
4857 memcpy(&str[0], token, lenp);
4858 str[lenp] = '|';
4859 memcpy(&str[lenp + 1], token2, lenn);
4860 str[lenn + lenp + 1] = 0;
4861
4862 atom->valuep = str;
4863 }
4864 atom->data = data;
4865 if (min == 0)
4866 atom->min = 1;
4867 else
4868 atom->min = min;
4869 atom->max = max;
4870
4871 /*
4872 * associate a counter to the transition.
4873 */
4874 counter = xmlRegGetCounter(am);
4875 am->counters[counter].min = min;
4876 am->counters[counter].max = max;
4877
4878 /* xmlFAGenerateTransitions(am, from, to, atom); */
4879 if (to == NULL) {
4880 to = xmlRegNewState(am);
4881 xmlRegStatePush(am, to);
4882 }
4883 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4884 xmlRegAtomPush(am, atom);
4885 am->state = to;
4886
4887 if (to == NULL)
4888 to = am->state;
4889 if (to == NULL)
4890 return(NULL);
4891 if (min == 0)
4892 xmlFAGenerateEpsilonTransition(am, from, to);
4893 return(to);
4894}
4895
4896/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004897 * xmlAutomataNewCountTrans:
4898 * @am: an automata
4899 * @from: the starting point of the transition
4900 * @to: the target point of the transition or NULL
4901 * @token: the input string associated to that transition
4902 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004903 * @max: the maximum successive occurences of token
4904 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00004905 *
William M. Brackddf71d62004-05-06 04:17:26 +00004906 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00004907 * and then adds a transition from the @from state to the target state
4908 * activated by a succession of input of value @token and whose number
4909 * is between @min and @max
4910 *
4911 * Returns the target state or NULL in case of error
4912 */
4913xmlAutomataStatePtr
4914xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4915 xmlAutomataStatePtr to, const xmlChar *token,
4916 int min, int max, void *data) {
4917 xmlRegAtomPtr atom;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004918 int counter;
Daniel Veillard4255d502002-04-16 15:50:10 +00004919
4920 if ((am == NULL) || (from == NULL) || (token == NULL))
4921 return(NULL);
4922 if (min < 0)
4923 return(NULL);
4924 if ((max < min) || (max < 1))
4925 return(NULL);
4926 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4927 if (atom == NULL)
4928 return(NULL);
4929 atom->valuep = xmlStrdup(token);
4930 atom->data = data;
4931 if (min == 0)
4932 atom->min = 1;
4933 else
4934 atom->min = min;
4935 atom->max = max;
4936
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004937 /*
4938 * associate a counter to the transition.
4939 */
4940 counter = xmlRegGetCounter(am);
4941 am->counters[counter].min = min;
4942 am->counters[counter].max = max;
4943
4944 /* xmlFAGenerateTransitions(am, from, to, atom); */
4945 if (to == NULL) {
4946 to = xmlRegNewState(am);
4947 xmlRegStatePush(am, to);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004948 }
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004949 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4950 xmlRegAtomPush(am, atom);
4951 am->state = to;
4952
Daniel Veillard4255d502002-04-16 15:50:10 +00004953 if (to == NULL)
4954 to = am->state;
4955 if (to == NULL)
4956 return(NULL);
4957 if (min == 0)
4958 xmlFAGenerateEpsilonTransition(am, from, to);
4959 return(to);
4960}
4961
4962/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00004963 * xmlAutomataNewOnceTrans2:
4964 * @am: an automata
4965 * @from: the starting point of the transition
4966 * @to: the target point of the transition or NULL
4967 * @token: the input string associated to that transition
4968 * @token2: the second input string associated to that transition
4969 * @min: the minimum successive occurences of token
4970 * @max: the maximum successive occurences of token
4971 * @data: data associated to the transition
4972 *
4973 * If @to is NULL, this creates first a new target state in the automata
4974 * and then adds a transition from the @from state to the target state
4975 * activated by a succession of input of value @token and @token2 and whose
4976 * number is between @min and @max, moreover that transition can only be
4977 * crossed once.
4978 *
4979 * Returns the target state or NULL in case of error
4980 */
4981xmlAutomataStatePtr
4982xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4983 xmlAutomataStatePtr to, const xmlChar *token,
4984 const xmlChar *token2,
4985 int min, int max, void *data) {
4986 xmlRegAtomPtr atom;
4987 int counter;
4988
4989 if ((am == NULL) || (from == NULL) || (token == NULL))
4990 return(NULL);
4991 if (min < 1)
4992 return(NULL);
4993 if ((max < min) || (max < 1))
4994 return(NULL);
4995 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4996 if (atom == NULL)
4997 return(NULL);
4998 if ((token2 == NULL) || (*token2 == 0)) {
4999 atom->valuep = xmlStrdup(token);
5000 } else {
5001 int lenn, lenp;
5002 xmlChar *str;
5003
5004 lenn = strlen((char *) token2);
5005 lenp = strlen((char *) token);
5006
5007 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5008 if (str == NULL) {
5009 xmlRegFreeAtom(atom);
5010 return(NULL);
5011 }
5012 memcpy(&str[0], token, lenp);
5013 str[lenp] = '|';
5014 memcpy(&str[lenp + 1], token2, lenn);
5015 str[lenn + lenp + 1] = 0;
5016
5017 atom->valuep = str;
5018 }
5019 atom->data = data;
5020 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
5021 if (min == 0)
5022 atom->min = 1;
5023 else
5024 atom->min = min;
5025 atom->max = max;
5026 /*
5027 * associate a counter to the transition.
5028 */
5029 counter = xmlRegGetCounter(am);
5030 am->counters[counter].min = 1;
5031 am->counters[counter].max = 1;
5032
5033 /* xmlFAGenerateTransitions(am, from, to, atom); */
5034 if (to == NULL) {
5035 to = xmlRegNewState(am);
5036 xmlRegStatePush(am, to);
5037 }
5038 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5039 xmlRegAtomPush(am, atom);
5040 am->state = to;
5041 return(to);
5042}
5043
5044
5045
5046/**
Daniel Veillard7646b182002-04-20 06:41:40 +00005047 * xmlAutomataNewOnceTrans:
5048 * @am: an automata
5049 * @from: the starting point of the transition
5050 * @to: the target point of the transition or NULL
5051 * @token: the input string associated to that transition
5052 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005053 * @max: the maximum successive occurences of token
5054 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00005055 *
William M. Brackddf71d62004-05-06 04:17:26 +00005056 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00005057 * and then adds a transition from the @from state to the target state
5058 * activated by a succession of input of value @token and whose number
William M. Brackddf71d62004-05-06 04:17:26 +00005059 * is between @min and @max, moreover that transition can only be crossed
Daniel Veillard7646b182002-04-20 06:41:40 +00005060 * once.
5061 *
5062 * Returns the target state or NULL in case of error
5063 */
5064xmlAutomataStatePtr
5065xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5066 xmlAutomataStatePtr to, const xmlChar *token,
5067 int min, int max, void *data) {
5068 xmlRegAtomPtr atom;
5069 int counter;
5070
5071 if ((am == NULL) || (from == NULL) || (token == NULL))
5072 return(NULL);
5073 if (min < 1)
5074 return(NULL);
5075 if ((max < min) || (max < 1))
5076 return(NULL);
5077 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5078 if (atom == NULL)
5079 return(NULL);
5080 atom->valuep = xmlStrdup(token);
5081 atom->data = data;
5082 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
5083 if (min == 0)
5084 atom->min = 1;
5085 else
5086 atom->min = min;
5087 atom->max = max;
5088 /*
5089 * associate a counter to the transition.
5090 */
5091 counter = xmlRegGetCounter(am);
5092 am->counters[counter].min = 1;
5093 am->counters[counter].max = 1;
5094
5095 /* xmlFAGenerateTransitions(am, from, to, atom); */
5096 if (to == NULL) {
5097 to = xmlRegNewState(am);
5098 xmlRegStatePush(am, to);
5099 }
5100 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5101 xmlRegAtomPush(am, atom);
5102 am->state = to;
Daniel Veillard7646b182002-04-20 06:41:40 +00005103 return(to);
5104}
5105
5106/**
Daniel Veillard4255d502002-04-16 15:50:10 +00005107 * xmlAutomataNewState:
5108 * @am: an automata
5109 *
5110 * Create a new disconnected state in the automata
5111 *
5112 * Returns the new state or NULL in case of error
5113 */
5114xmlAutomataStatePtr
5115xmlAutomataNewState(xmlAutomataPtr am) {
5116 xmlAutomataStatePtr to;
5117
5118 if (am == NULL)
5119 return(NULL);
5120 to = xmlRegNewState(am);
5121 xmlRegStatePush(am, to);
5122 return(to);
5123}
5124
5125/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005126 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00005127 * @am: an automata
5128 * @from: the starting point of the transition
5129 * @to: the target point of the transition or NULL
5130 *
William M. Brackddf71d62004-05-06 04:17:26 +00005131 * If @to is NULL, this creates first a new target state in the automata
5132 * and then adds an epsilon transition from the @from state to the
Daniel Veillard4255d502002-04-16 15:50:10 +00005133 * target state
5134 *
5135 * Returns the target state or NULL in case of error
5136 */
5137xmlAutomataStatePtr
5138xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
5139 xmlAutomataStatePtr to) {
5140 if ((am == NULL) || (from == NULL))
5141 return(NULL);
5142 xmlFAGenerateEpsilonTransition(am, from, to);
5143 if (to == NULL)
5144 return(am->state);
5145 return(to);
5146}
5147
Daniel Veillardb509f152002-04-17 16:28:10 +00005148/**
Daniel Veillard7646b182002-04-20 06:41:40 +00005149 * xmlAutomataNewAllTrans:
5150 * @am: an automata
5151 * @from: the starting point of the transition
5152 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00005153 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00005154 *
William M. Brackddf71d62004-05-06 04:17:26 +00005155 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00005156 * and then adds a an ALL transition from the @from state to the
5157 * target state. That transition is an epsilon transition allowed only when
5158 * all transitions from the @from node have been activated.
5159 *
5160 * Returns the target state or NULL in case of error
5161 */
5162xmlAutomataStatePtr
5163xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00005164 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00005165 if ((am == NULL) || (from == NULL))
5166 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00005167 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00005168 if (to == NULL)
5169 return(am->state);
5170 return(to);
5171}
5172
5173/**
Daniel Veillardb509f152002-04-17 16:28:10 +00005174 * xmlAutomataNewCounter:
5175 * @am: an automata
5176 * @min: the minimal value on the counter
5177 * @max: the maximal value on the counter
5178 *
5179 * Create a new counter
5180 *
5181 * Returns the counter number or -1 in case of error
5182 */
5183int
5184xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
5185 int ret;
5186
5187 if (am == NULL)
5188 return(-1);
5189
5190 ret = xmlRegGetCounter(am);
5191 if (ret < 0)
5192 return(-1);
5193 am->counters[ret].min = min;
5194 am->counters[ret].max = max;
5195 return(ret);
5196}
5197
5198/**
5199 * xmlAutomataNewCountedTrans:
5200 * @am: an automata
5201 * @from: the starting point of the transition
5202 * @to: the target point of the transition or NULL
5203 * @counter: the counter associated to that transition
5204 *
William M. Brackddf71d62004-05-06 04:17:26 +00005205 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00005206 * and then adds an epsilon transition from the @from state to the target state
5207 * which will increment the counter provided
5208 *
5209 * Returns the target state or NULL in case of error
5210 */
5211xmlAutomataStatePtr
5212xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5213 xmlAutomataStatePtr to, int counter) {
5214 if ((am == NULL) || (from == NULL) || (counter < 0))
5215 return(NULL);
5216 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
5217 if (to == NULL)
5218 return(am->state);
5219 return(to);
5220}
5221
5222/**
5223 * xmlAutomataNewCounterTrans:
5224 * @am: an automata
5225 * @from: the starting point of the transition
5226 * @to: the target point of the transition or NULL
5227 * @counter: the counter associated to that transition
5228 *
William M. Brackddf71d62004-05-06 04:17:26 +00005229 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00005230 * and then adds an epsilon transition from the @from state to the target state
5231 * which will be allowed only if the counter is within the right range.
5232 *
5233 * Returns the target state or NULL in case of error
5234 */
5235xmlAutomataStatePtr
5236xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5237 xmlAutomataStatePtr to, int counter) {
5238 if ((am == NULL) || (from == NULL) || (counter < 0))
5239 return(NULL);
5240 xmlFAGenerateCountedTransition(am, from, to, counter);
5241 if (to == NULL)
5242 return(am->state);
5243 return(to);
5244}
Daniel Veillard4255d502002-04-16 15:50:10 +00005245
5246/**
5247 * xmlAutomataCompile:
5248 * @am: an automata
5249 *
5250 * Compile the automata into a Reg Exp ready for being executed.
5251 * The automata should be free after this point.
5252 *
5253 * Returns the compiled regexp or NULL in case of error
5254 */
5255xmlRegexpPtr
5256xmlAutomataCompile(xmlAutomataPtr am) {
5257 xmlRegexpPtr ret;
5258
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005259 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00005260 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00005261 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00005262 ret = xmlRegEpxFromParse(am);
5263
5264 return(ret);
5265}
Daniel Veillarde19fc232002-04-22 16:01:24 +00005266
5267/**
5268 * xmlAutomataIsDeterminist:
5269 * @am: an automata
5270 *
5271 * Checks if an automata is determinist.
5272 *
5273 * Returns 1 if true, 0 if not, and -1 in case of error
5274 */
5275int
5276xmlAutomataIsDeterminist(xmlAutomataPtr am) {
5277 int ret;
5278
5279 if (am == NULL)
5280 return(-1);
5281
5282 ret = xmlFAComputesDeterminism(am);
5283 return(ret);
5284}
Daniel Veillard4255d502002-04-16 15:50:10 +00005285#endif /* LIBXML_AUTOMATA_ENABLED */
Daniel Veillard5d4644e2005-04-01 13:11:58 +00005286#define bottom_xmlregexp
5287#include "elfgcchack.h"
Daniel Veillard4255d502002-04-16 15:50:10 +00005288#endif /* LIBXML_REGEXP_ENABLED */