blob: 0af5a7a335bf92db9622eadc6a47c0626ef9f159 [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
6 * XML related specifications thise includes:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22#include <stdio.h>
23#include <string.h>
24#include <libxml/tree.h>
25#include <libxml/parserInternals.h>
26#include <libxml/xmlregexp.h>
27#include <libxml/xmlautomata.h>
28#include <libxml/xmlunicode.h>
29
30/* #define DEBUG_REGEXP_GRAPH */
31/* #define DEBUG_REGEXP_EXEC */
32/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000033/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000034
Daniel Veillardff46a042003-10-08 08:53:17 +000035#define ERROR(str) \
36 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
37 xmlRegexpErrCompile(ctxt, str);
Daniel Veillard4255d502002-04-16 15:50:10 +000038#define NEXT ctxt->cur++
39#define CUR (*(ctxt->cur))
40#define NXT(index) (ctxt->cur[index])
41
42#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
43#define NEXTL(l) ctxt->cur += l;
44
Daniel Veillarde19fc232002-04-22 16:01:24 +000045/**
46 * TODO:
47 *
48 * macro to flag unimplemented blocks
49 */
50#define TODO \
51 xmlGenericError(xmlGenericErrorContext, \
52 "Unimplemented block at %s:%d\n", \
53 __FILE__, __LINE__);
54
Daniel Veillard4255d502002-04-16 15:50:10 +000055/************************************************************************
56 * *
57 * Datatypes and structures *
58 * *
59 ************************************************************************/
60
61typedef enum {
62 XML_REGEXP_EPSILON = 1,
63 XML_REGEXP_CHARVAL,
64 XML_REGEXP_RANGES,
65 XML_REGEXP_SUBREG,
66 XML_REGEXP_STRING,
67 XML_REGEXP_ANYCHAR, /* . */
68 XML_REGEXP_ANYSPACE, /* \s */
69 XML_REGEXP_NOTSPACE, /* \S */
70 XML_REGEXP_INITNAME, /* \l */
71 XML_REGEXP_NOTINITNAME, /* \l */
72 XML_REGEXP_NAMECHAR, /* \c */
73 XML_REGEXP_NOTNAMECHAR, /* \C */
74 XML_REGEXP_DECIMAL, /* \d */
75 XML_REGEXP_NOTDECIMAL, /* \d */
76 XML_REGEXP_REALCHAR, /* \w */
77 XML_REGEXP_NOTREALCHAR, /* \w */
78 XML_REGEXP_LETTER,
79 XML_REGEXP_LETTER_UPPERCASE,
80 XML_REGEXP_LETTER_LOWERCASE,
81 XML_REGEXP_LETTER_TITLECASE,
82 XML_REGEXP_LETTER_MODIFIER,
83 XML_REGEXP_LETTER_OTHERS,
84 XML_REGEXP_MARK,
85 XML_REGEXP_MARK_NONSPACING,
86 XML_REGEXP_MARK_SPACECOMBINING,
87 XML_REGEXP_MARK_ENCLOSING,
88 XML_REGEXP_NUMBER,
89 XML_REGEXP_NUMBER_DECIMAL,
90 XML_REGEXP_NUMBER_LETTER,
91 XML_REGEXP_NUMBER_OTHERS,
92 XML_REGEXP_PUNCT,
93 XML_REGEXP_PUNCT_CONNECTOR,
94 XML_REGEXP_PUNCT_DASH,
95 XML_REGEXP_PUNCT_OPEN,
96 XML_REGEXP_PUNCT_CLOSE,
97 XML_REGEXP_PUNCT_INITQUOTE,
98 XML_REGEXP_PUNCT_FINQUOTE,
99 XML_REGEXP_PUNCT_OTHERS,
100 XML_REGEXP_SEPAR,
101 XML_REGEXP_SEPAR_SPACE,
102 XML_REGEXP_SEPAR_LINE,
103 XML_REGEXP_SEPAR_PARA,
104 XML_REGEXP_SYMBOL,
105 XML_REGEXP_SYMBOL_MATH,
106 XML_REGEXP_SYMBOL_CURRENCY,
107 XML_REGEXP_SYMBOL_MODIFIER,
108 XML_REGEXP_SYMBOL_OTHERS,
109 XML_REGEXP_OTHER,
110 XML_REGEXP_OTHER_CONTROL,
111 XML_REGEXP_OTHER_FORMAT,
112 XML_REGEXP_OTHER_PRIVATE,
113 XML_REGEXP_OTHER_NA,
114 XML_REGEXP_BLOCK_NAME
115} xmlRegAtomType;
116
117typedef enum {
118 XML_REGEXP_QUANT_EPSILON = 1,
119 XML_REGEXP_QUANT_ONCE,
120 XML_REGEXP_QUANT_OPT,
121 XML_REGEXP_QUANT_MULT,
122 XML_REGEXP_QUANT_PLUS,
Daniel Veillard7646b182002-04-20 06:41:40 +0000123 XML_REGEXP_QUANT_ONCEONLY,
124 XML_REGEXP_QUANT_ALL,
Daniel Veillard4255d502002-04-16 15:50:10 +0000125 XML_REGEXP_QUANT_RANGE
126} xmlRegQuantType;
127
128typedef enum {
129 XML_REGEXP_START_STATE = 1,
130 XML_REGEXP_FINAL_STATE,
131 XML_REGEXP_TRANS_STATE
132} xmlRegStateType;
133
134typedef enum {
135 XML_REGEXP_MARK_NORMAL = 0,
136 XML_REGEXP_MARK_START,
137 XML_REGEXP_MARK_VISITED
138} xmlRegMarkedType;
139
140typedef struct _xmlRegRange xmlRegRange;
141typedef xmlRegRange *xmlRegRangePtr;
142
143struct _xmlRegRange {
144 int neg;
145 xmlRegAtomType type;
146 int start;
147 int end;
148 xmlChar *blockName;
149};
150
151typedef struct _xmlRegAtom xmlRegAtom;
152typedef xmlRegAtom *xmlRegAtomPtr;
153
154typedef struct _xmlAutomataState xmlRegState;
155typedef xmlRegState *xmlRegStatePtr;
156
157struct _xmlRegAtom {
158 int no;
159 xmlRegAtomType type;
160 xmlRegQuantType quant;
161 int min;
162 int max;
163
164 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000165 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000166 int neg;
167 int codepoint;
168 xmlRegStatePtr start;
169 xmlRegStatePtr stop;
170 int maxRanges;
171 int nbRanges;
172 xmlRegRangePtr *ranges;
173 void *data;
174};
175
176typedef struct _xmlRegCounter xmlRegCounter;
177typedef xmlRegCounter *xmlRegCounterPtr;
178
179struct _xmlRegCounter {
180 int min;
181 int max;
182};
183
184typedef struct _xmlRegTrans xmlRegTrans;
185typedef xmlRegTrans *xmlRegTransPtr;
186
187struct _xmlRegTrans {
188 xmlRegAtomPtr atom;
189 int to;
190 int counter;
191 int count;
192};
193
194struct _xmlAutomataState {
195 xmlRegStateType type;
196 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000197 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000198 int no;
199
200 int maxTrans;
201 int nbTrans;
202 xmlRegTrans *trans;
203};
204
205typedef struct _xmlAutomata xmlRegParserCtxt;
206typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
207
208struct _xmlAutomata {
209 xmlChar *string;
210 xmlChar *cur;
211
212 int error;
213 int neg;
214
215 xmlRegStatePtr start;
216 xmlRegStatePtr end;
217 xmlRegStatePtr state;
218
219 xmlRegAtomPtr atom;
220
221 int maxAtoms;
222 int nbAtoms;
223 xmlRegAtomPtr *atoms;
224
225 int maxStates;
226 int nbStates;
227 xmlRegStatePtr *states;
228
229 int maxCounters;
230 int nbCounters;
231 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000232
233 int determinist;
Daniel Veillard4255d502002-04-16 15:50:10 +0000234};
235
236struct _xmlRegexp {
237 xmlChar *string;
238 int nbStates;
239 xmlRegStatePtr *states;
240 int nbAtoms;
241 xmlRegAtomPtr *atoms;
242 int nbCounters;
243 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000244 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000245 /*
246 * That's the compact form for determinists automatas
247 */
248 int nbstates;
249 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000250 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000251 int nbstrings;
252 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000253};
254
255typedef struct _xmlRegExecRollback xmlRegExecRollback;
256typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
257
258struct _xmlRegExecRollback {
259 xmlRegStatePtr state;/* the current state */
260 int index; /* the index in the input stack */
261 int nextbranch; /* the next transition to explore in that state */
262 int *counts; /* save the automate state if it has some */
263};
264
265typedef struct _xmlRegInputToken xmlRegInputToken;
266typedef xmlRegInputToken *xmlRegInputTokenPtr;
267
268struct _xmlRegInputToken {
269 xmlChar *value;
270 void *data;
271};
272
273struct _xmlRegExecCtxt {
274 int status; /* execution status != 0 indicate an error */
275 int determinist; /* did we found an inderterministic behaviour */
276 xmlRegexpPtr comp; /* the compiled regexp */
277 xmlRegExecCallbacks callback;
278 void *data;
279
280 xmlRegStatePtr state;/* the current state */
281 int transno; /* the current transition on that state */
282 int transcount; /* the number of char in char counted transitions */
283
284 /*
285 * A stack of rollback states
286 */
287 int maxRollbacks;
288 int nbRollbacks;
289 xmlRegExecRollback *rollbacks;
290
291 /*
292 * The state of the automata if any
293 */
294 int *counts;
295
296 /*
297 * The input stack
298 */
299 int inputStackMax;
300 int inputStackNr;
301 int index;
302 int *charStack;
303 const xmlChar *inputString; /* when operating on characters */
304 xmlRegInputTokenPtr inputStack;/* when operating on strings */
305
306};
307
Daniel Veillard441bc322002-04-20 17:38:48 +0000308#define REGEXP_ALL_COUNTER 0x123456
309#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000310
Daniel Veillard4255d502002-04-16 15:50:10 +0000311static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000312static void xmlRegFreeState(xmlRegStatePtr state);
313static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard4255d502002-04-16 15:50:10 +0000314
315/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000316 * *
317 * Regexp memory error handler *
318 * *
319 ************************************************************************/
320/**
321 * xmlRegexpErrMemory:
322 * @extra: extra informations
323 *
324 * Handle an out of memory condition
325 */
326static void
327xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
328{
329 const char *regexp = NULL;
330 if (ctxt != NULL) {
331 regexp = (const char *) ctxt->string;
332 ctxt->error = XML_ERR_NO_MEMORY;
333 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000334 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000335 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
336 regexp, NULL, 0, 0,
337 "Memory allocation failed : %s\n", extra);
338}
339
340/**
341 * xmlRegexpErrCompile:
342 * @extra: extra informations
343 *
344 * Handle an compilation failure
345 */
346static void
347xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
348{
349 const char *regexp = NULL;
350 int idx = 0;
351
352 if (ctxt != NULL) {
353 regexp = (const char *) ctxt->string;
354 idx = ctxt->cur - ctxt->string;
355 ctxt->error = XML_REGEXP_COMPILE_ERROR;
356 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000357 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000358 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
359 regexp, NULL, idx, 0,
360 "failed to compile: %s\n", extra);
361}
362
363/************************************************************************
Daniel Veillard4255d502002-04-16 15:50:10 +0000364 * *
365 * Allocation/Deallocation *
366 * *
367 ************************************************************************/
368
Daniel Veillard23e73572002-09-19 19:56:43 +0000369static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000370/**
371 * xmlRegEpxFromParse:
372 * @ctxt: the parser context used to build it
373 *
374 * Allocate a new regexp and fill it with the reult from the parser
375 *
376 * Returns the new regexp or NULL in case of error
377 */
378static xmlRegexpPtr
379xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
380 xmlRegexpPtr ret;
381
382 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000383 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000384 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000385 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000386 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000387 memset(ret, 0, sizeof(xmlRegexp));
388 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000389 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000390 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000391 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000392 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000393 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000394 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000395 ret->determinist = ctxt->determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000396
397 if ((ret->determinist != 0) &&
398 (ret->nbCounters == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000399 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000400 (ret->atoms[0] != NULL) &&
401 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
402 int i, j, nbstates = 0, nbatoms = 0;
403 int *stateRemap;
404 int *stringRemap;
405 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000406 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000407 xmlChar **stringMap;
408 xmlChar *value;
409
410 /*
411 * Switch to a compact representation
412 * 1/ counting the effective number of states left
413 * 2/ conting the unique number of atoms, and check that
414 * they are all of the string type
415 * 3/ build a table state x atom for the transitions
416 */
417
418 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000419 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000420 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000421 xmlFree(ret);
422 return(NULL);
423 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000424 for (i = 0;i < ret->nbStates;i++) {
425 if (ret->states[i] != NULL) {
426 stateRemap[i] = nbstates;
427 nbstates++;
428 } else {
429 stateRemap[i] = -1;
430 }
431 }
432#ifdef DEBUG_COMPACTION
433 printf("Final: %d states\n", nbstates);
434#endif
435 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000436 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000437 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000438 xmlFree(stateRemap);
439 xmlFree(ret);
440 return(NULL);
441 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000442 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000443 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000444 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000445 xmlFree(stringMap);
446 xmlFree(stateRemap);
447 xmlFree(ret);
448 return(NULL);
449 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000450 for (i = 0;i < ret->nbAtoms;i++) {
451 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
452 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
453 value = ret->atoms[i]->valuep;
454 for (j = 0;j < nbatoms;j++) {
455 if (xmlStrEqual(stringMap[j], value)) {
456 stringRemap[i] = j;
457 break;
458 }
459 }
460 if (j >= nbatoms) {
461 stringRemap[i] = nbatoms;
462 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000463 if (stringMap[nbatoms] == NULL) {
464 for (i = 0;i < nbatoms;i++)
465 xmlFree(stringMap[i]);
466 xmlFree(stringRemap);
467 xmlFree(stringMap);
468 xmlFree(stateRemap);
469 xmlFree(ret);
470 return(NULL);
471 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000472 nbatoms++;
473 }
474 } else {
475 xmlFree(stateRemap);
476 xmlFree(stringRemap);
477 for (i = 0;i < nbatoms;i++)
478 xmlFree(stringMap[i]);
479 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000480 xmlFree(ret);
481 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000482 }
483 }
484#ifdef DEBUG_COMPACTION
485 printf("Final: %d atoms\n", nbatoms);
486#endif
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000487 transitions = (int *) xmlMalloc((nbstates + 1) *
488 (nbatoms + 1) * sizeof(int));
489 if (transitions == NULL) {
490 xmlFree(stateRemap);
491 xmlFree(stringRemap);
492 xmlFree(stringMap);
493 xmlFree(ret);
494 return(NULL);
495 }
496 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
Daniel Veillard23e73572002-09-19 19:56:43 +0000497
498 /*
499 * Allocate the transition table. The first entry for each
500 * state correspond to the state type.
501 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000502 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000503
504 for (i = 0;i < ret->nbStates;i++) {
505 int stateno, atomno, targetno, prev;
506 xmlRegStatePtr state;
507 xmlRegTransPtr trans;
508
509 stateno = stateRemap[i];
510 if (stateno == -1)
511 continue;
512 state = ret->states[i];
513
514 transitions[stateno * (nbatoms + 1)] = state->type;
515
516 for (j = 0;j < state->nbTrans;j++) {
517 trans = &(state->trans[j]);
518 if ((trans->to == -1) || (trans->atom == NULL))
519 continue;
520 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000521 if ((trans->atom->data != NULL) && (transdata == NULL)) {
522 transdata = (void **) xmlMalloc(nbstates * nbatoms *
523 sizeof(void *));
524 if (transdata != NULL)
525 memset(transdata, 0,
526 nbstates * nbatoms * sizeof(void *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000527 else {
Daniel Veillardff46a042003-10-08 08:53:17 +0000528 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000529 break;
530 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000531 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000532 targetno = stateRemap[trans->to];
533 /*
534 * if the same atome can generate transition to 2 different
535 * states then it means the automata is not determinist and
536 * the compact form can't be used !
537 */
538 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
539 if (prev != 0) {
540 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000541 ret->determinist = 0;
542#ifdef DEBUG_COMPACTION
543 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
544 i, j, trans->atom->no, trans->to, atomno, targetno);
545 printf(" previous to is %d\n", prev);
546#endif
547 ret->determinist = 0;
Daniel Veillard118aed72002-09-24 14:13:13 +0000548 if (transdata != NULL)
549 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000550 xmlFree(transitions);
551 xmlFree(stateRemap);
552 xmlFree(stringRemap);
553 for (i = 0;i < nbatoms;i++)
554 xmlFree(stringMap[i]);
555 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000556 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000557 }
558 } else {
559#if 0
560 printf("State %d trans %d: atom %d to %d : %d to %d\n",
561 i, j, trans->atom->no, trans->to, atomno, targetno);
562#endif
563 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000564 targetno + 1; /* to avoid 0 */
565 if (transdata != NULL)
566 transdata[stateno * nbatoms + atomno] =
567 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000568 }
569 }
570 }
571 ret->determinist = 1;
572#ifdef DEBUG_COMPACTION
573 /*
574 * Debug
575 */
576 for (i = 0;i < nbstates;i++) {
577 for (j = 0;j < nbatoms + 1;j++) {
578 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
579 }
580 printf("\n");
581 }
582 printf("\n");
583#endif
584 /*
585 * Cleanup of the old data
586 */
587 if (ret->states != NULL) {
588 for (i = 0;i < ret->nbStates;i++)
589 xmlRegFreeState(ret->states[i]);
590 xmlFree(ret->states);
591 }
592 ret->states = NULL;
593 ret->nbStates = 0;
594 if (ret->atoms != NULL) {
595 for (i = 0;i < ret->nbAtoms;i++)
596 xmlRegFreeAtom(ret->atoms[i]);
597 xmlFree(ret->atoms);
598 }
599 ret->atoms = NULL;
600 ret->nbAtoms = 0;
601
602 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000603 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000604 ret->stringMap = stringMap;
605 ret->nbstrings = nbatoms;
606 ret->nbstates = nbstates;
607 xmlFree(stateRemap);
608 xmlFree(stringRemap);
609 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000610not_determ:
611 ctxt->string = NULL;
612 ctxt->nbStates = 0;
613 ctxt->states = NULL;
614 ctxt->nbAtoms = 0;
615 ctxt->atoms = NULL;
616 ctxt->nbCounters = 0;
617 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000618 return(ret);
619}
620
621/**
622 * xmlRegNewParserCtxt:
623 * @string: the string to parse
624 *
625 * Allocate a new regexp parser context
626 *
627 * Returns the new context or NULL in case of error
628 */
629static xmlRegParserCtxtPtr
630xmlRegNewParserCtxt(const xmlChar *string) {
631 xmlRegParserCtxtPtr ret;
632
633 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
634 if (ret == NULL)
635 return(NULL);
636 memset(ret, 0, sizeof(xmlRegParserCtxt));
637 if (string != NULL)
638 ret->string = xmlStrdup(string);
639 ret->cur = ret->string;
640 ret->neg = 0;
641 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000642 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000643 return(ret);
644}
645
646/**
647 * xmlRegNewRange:
648 * @ctxt: the regexp parser context
649 * @neg: is that negative
650 * @type: the type of range
651 * @start: the start codepoint
652 * @end: the end codepoint
653 *
654 * Allocate a new regexp range
655 *
656 * Returns the new range or NULL in case of error
657 */
658static xmlRegRangePtr
659xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
660 int neg, xmlRegAtomType type, int start, int end) {
661 xmlRegRangePtr ret;
662
663 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
664 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000665 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000666 return(NULL);
667 }
668 ret->neg = neg;
669 ret->type = type;
670 ret->start = start;
671 ret->end = end;
672 return(ret);
673}
674
675/**
676 * xmlRegFreeRange:
677 * @range: the regexp range
678 *
679 * Free a regexp range
680 */
681static void
682xmlRegFreeRange(xmlRegRangePtr range) {
683 if (range == NULL)
684 return;
685
686 if (range->blockName != NULL)
687 xmlFree(range->blockName);
688 xmlFree(range);
689}
690
691/**
692 * xmlRegNewAtom:
693 * @ctxt: the regexp parser context
694 * @type: the type of atom
695 *
696 * Allocate a new regexp range
697 *
698 * Returns the new atom or NULL in case of error
699 */
700static xmlRegAtomPtr
701xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
702 xmlRegAtomPtr ret;
703
704 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
705 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000706 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000707 return(NULL);
708 }
709 memset(ret, 0, sizeof(xmlRegAtom));
710 ret->type = type;
711 ret->quant = XML_REGEXP_QUANT_ONCE;
712 ret->min = 0;
713 ret->max = 0;
714 return(ret);
715}
716
717/**
718 * xmlRegFreeAtom:
719 * @atom: the regexp atom
720 *
721 * Free a regexp atom
722 */
723static void
724xmlRegFreeAtom(xmlRegAtomPtr atom) {
725 int i;
726
727 if (atom == NULL)
728 return;
729
730 for (i = 0;i < atom->nbRanges;i++)
731 xmlRegFreeRange(atom->ranges[i]);
732 if (atom->ranges != NULL)
733 xmlFree(atom->ranges);
734 if (atom->type == XML_REGEXP_STRING)
735 xmlFree(atom->valuep);
736 xmlFree(atom);
737}
738
739static xmlRegStatePtr
740xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
741 xmlRegStatePtr ret;
742
743 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
744 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000745 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000746 return(NULL);
747 }
748 memset(ret, 0, sizeof(xmlRegState));
749 ret->type = XML_REGEXP_TRANS_STATE;
750 ret->mark = XML_REGEXP_MARK_NORMAL;
751 return(ret);
752}
753
754/**
755 * xmlRegFreeState:
756 * @state: the regexp state
757 *
758 * Free a regexp state
759 */
760static void
761xmlRegFreeState(xmlRegStatePtr state) {
762 if (state == NULL)
763 return;
764
765 if (state->trans != NULL)
766 xmlFree(state->trans);
767 xmlFree(state);
768}
769
770/**
771 * xmlRegFreeParserCtxt:
772 * @ctxt: the regexp parser context
773 *
774 * Free a regexp parser context
775 */
776static void
777xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
778 int i;
779 if (ctxt == NULL)
780 return;
781
782 if (ctxt->string != NULL)
783 xmlFree(ctxt->string);
784 if (ctxt->states != NULL) {
785 for (i = 0;i < ctxt->nbStates;i++)
786 xmlRegFreeState(ctxt->states[i]);
787 xmlFree(ctxt->states);
788 }
789 if (ctxt->atoms != NULL) {
790 for (i = 0;i < ctxt->nbAtoms;i++)
791 xmlRegFreeAtom(ctxt->atoms[i]);
792 xmlFree(ctxt->atoms);
793 }
794 if (ctxt->counters != NULL)
795 xmlFree(ctxt->counters);
796 xmlFree(ctxt);
797}
798
799/************************************************************************
800 * *
801 * Display of Data structures *
802 * *
803 ************************************************************************/
804
805static void
806xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
807 switch (type) {
808 case XML_REGEXP_EPSILON:
809 fprintf(output, "epsilon "); break;
810 case XML_REGEXP_CHARVAL:
811 fprintf(output, "charval "); break;
812 case XML_REGEXP_RANGES:
813 fprintf(output, "ranges "); break;
814 case XML_REGEXP_SUBREG:
815 fprintf(output, "subexpr "); break;
816 case XML_REGEXP_STRING:
817 fprintf(output, "string "); break;
818 case XML_REGEXP_ANYCHAR:
819 fprintf(output, "anychar "); break;
820 case XML_REGEXP_ANYSPACE:
821 fprintf(output, "anyspace "); break;
822 case XML_REGEXP_NOTSPACE:
823 fprintf(output, "notspace "); break;
824 case XML_REGEXP_INITNAME:
825 fprintf(output, "initname "); break;
826 case XML_REGEXP_NOTINITNAME:
827 fprintf(output, "notinitname "); break;
828 case XML_REGEXP_NAMECHAR:
829 fprintf(output, "namechar "); break;
830 case XML_REGEXP_NOTNAMECHAR:
831 fprintf(output, "notnamechar "); break;
832 case XML_REGEXP_DECIMAL:
833 fprintf(output, "decimal "); break;
834 case XML_REGEXP_NOTDECIMAL:
835 fprintf(output, "notdecimal "); break;
836 case XML_REGEXP_REALCHAR:
837 fprintf(output, "realchar "); break;
838 case XML_REGEXP_NOTREALCHAR:
839 fprintf(output, "notrealchar "); break;
840 case XML_REGEXP_LETTER:
841 fprintf(output, "LETTER "); break;
842 case XML_REGEXP_LETTER_UPPERCASE:
843 fprintf(output, "LETTER_UPPERCASE "); break;
844 case XML_REGEXP_LETTER_LOWERCASE:
845 fprintf(output, "LETTER_LOWERCASE "); break;
846 case XML_REGEXP_LETTER_TITLECASE:
847 fprintf(output, "LETTER_TITLECASE "); break;
848 case XML_REGEXP_LETTER_MODIFIER:
849 fprintf(output, "LETTER_MODIFIER "); break;
850 case XML_REGEXP_LETTER_OTHERS:
851 fprintf(output, "LETTER_OTHERS "); break;
852 case XML_REGEXP_MARK:
853 fprintf(output, "MARK "); break;
854 case XML_REGEXP_MARK_NONSPACING:
855 fprintf(output, "MARK_NONSPACING "); break;
856 case XML_REGEXP_MARK_SPACECOMBINING:
857 fprintf(output, "MARK_SPACECOMBINING "); break;
858 case XML_REGEXP_MARK_ENCLOSING:
859 fprintf(output, "MARK_ENCLOSING "); break;
860 case XML_REGEXP_NUMBER:
861 fprintf(output, "NUMBER "); break;
862 case XML_REGEXP_NUMBER_DECIMAL:
863 fprintf(output, "NUMBER_DECIMAL "); break;
864 case XML_REGEXP_NUMBER_LETTER:
865 fprintf(output, "NUMBER_LETTER "); break;
866 case XML_REGEXP_NUMBER_OTHERS:
867 fprintf(output, "NUMBER_OTHERS "); break;
868 case XML_REGEXP_PUNCT:
869 fprintf(output, "PUNCT "); break;
870 case XML_REGEXP_PUNCT_CONNECTOR:
871 fprintf(output, "PUNCT_CONNECTOR "); break;
872 case XML_REGEXP_PUNCT_DASH:
873 fprintf(output, "PUNCT_DASH "); break;
874 case XML_REGEXP_PUNCT_OPEN:
875 fprintf(output, "PUNCT_OPEN "); break;
876 case XML_REGEXP_PUNCT_CLOSE:
877 fprintf(output, "PUNCT_CLOSE "); break;
878 case XML_REGEXP_PUNCT_INITQUOTE:
879 fprintf(output, "PUNCT_INITQUOTE "); break;
880 case XML_REGEXP_PUNCT_FINQUOTE:
881 fprintf(output, "PUNCT_FINQUOTE "); break;
882 case XML_REGEXP_PUNCT_OTHERS:
883 fprintf(output, "PUNCT_OTHERS "); break;
884 case XML_REGEXP_SEPAR:
885 fprintf(output, "SEPAR "); break;
886 case XML_REGEXP_SEPAR_SPACE:
887 fprintf(output, "SEPAR_SPACE "); break;
888 case XML_REGEXP_SEPAR_LINE:
889 fprintf(output, "SEPAR_LINE "); break;
890 case XML_REGEXP_SEPAR_PARA:
891 fprintf(output, "SEPAR_PARA "); break;
892 case XML_REGEXP_SYMBOL:
893 fprintf(output, "SYMBOL "); break;
894 case XML_REGEXP_SYMBOL_MATH:
895 fprintf(output, "SYMBOL_MATH "); break;
896 case XML_REGEXP_SYMBOL_CURRENCY:
897 fprintf(output, "SYMBOL_CURRENCY "); break;
898 case XML_REGEXP_SYMBOL_MODIFIER:
899 fprintf(output, "SYMBOL_MODIFIER "); break;
900 case XML_REGEXP_SYMBOL_OTHERS:
901 fprintf(output, "SYMBOL_OTHERS "); break;
902 case XML_REGEXP_OTHER:
903 fprintf(output, "OTHER "); break;
904 case XML_REGEXP_OTHER_CONTROL:
905 fprintf(output, "OTHER_CONTROL "); break;
906 case XML_REGEXP_OTHER_FORMAT:
907 fprintf(output, "OTHER_FORMAT "); break;
908 case XML_REGEXP_OTHER_PRIVATE:
909 fprintf(output, "OTHER_PRIVATE "); break;
910 case XML_REGEXP_OTHER_NA:
911 fprintf(output, "OTHER_NA "); break;
912 case XML_REGEXP_BLOCK_NAME:
913 fprintf(output, "BLOCK "); break;
914 }
915}
916
917static void
918xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
919 switch (type) {
920 case XML_REGEXP_QUANT_EPSILON:
921 fprintf(output, "epsilon "); break;
922 case XML_REGEXP_QUANT_ONCE:
923 fprintf(output, "once "); break;
924 case XML_REGEXP_QUANT_OPT:
925 fprintf(output, "? "); break;
926 case XML_REGEXP_QUANT_MULT:
927 fprintf(output, "* "); break;
928 case XML_REGEXP_QUANT_PLUS:
929 fprintf(output, "+ "); break;
930 case XML_REGEXP_QUANT_RANGE:
931 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000932 case XML_REGEXP_QUANT_ONCEONLY:
933 fprintf(output, "onceonly "); break;
934 case XML_REGEXP_QUANT_ALL:
935 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000936 }
937}
938static void
939xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
940 fprintf(output, " range: ");
941 if (range->neg)
942 fprintf(output, "negative ");
943 xmlRegPrintAtomType(output, range->type);
944 fprintf(output, "%c - %c\n", range->start, range->end);
945}
946
947static void
948xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
949 fprintf(output, " atom: ");
950 if (atom == NULL) {
951 fprintf(output, "NULL\n");
952 return;
953 }
954 xmlRegPrintAtomType(output, atom->type);
955 xmlRegPrintQuantType(output, atom->quant);
956 if (atom->quant == XML_REGEXP_QUANT_RANGE)
957 fprintf(output, "%d-%d ", atom->min, atom->max);
958 if (atom->type == XML_REGEXP_STRING)
959 fprintf(output, "'%s' ", (char *) atom->valuep);
960 if (atom->type == XML_REGEXP_CHARVAL)
961 fprintf(output, "char %c\n", atom->codepoint);
962 else if (atom->type == XML_REGEXP_RANGES) {
963 int i;
964 fprintf(output, "%d entries\n", atom->nbRanges);
965 for (i = 0; i < atom->nbRanges;i++)
966 xmlRegPrintRange(output, atom->ranges[i]);
967 } else if (atom->type == XML_REGEXP_SUBREG) {
968 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
969 } else {
970 fprintf(output, "\n");
971 }
972}
973
974static void
975xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
976 fprintf(output, " trans: ");
977 if (trans == NULL) {
978 fprintf(output, "NULL\n");
979 return;
980 }
981 if (trans->to < 0) {
982 fprintf(output, "removed\n");
983 return;
984 }
985 if (trans->counter >= 0) {
986 fprintf(output, "counted %d, ", trans->counter);
987 }
Daniel Veillard8a001f62002-04-20 07:24:11 +0000988 if (trans->count == REGEXP_ALL_COUNTER) {
989 fprintf(output, "all transition, ");
990 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +0000991 fprintf(output, "count based %d, ", trans->count);
992 }
993 if (trans->atom == NULL) {
994 fprintf(output, "epsilon to %d\n", trans->to);
995 return;
996 }
997 if (trans->atom->type == XML_REGEXP_CHARVAL)
998 fprintf(output, "char %c ", trans->atom->codepoint);
999 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1000}
1001
1002static void
1003xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1004 int i;
1005
1006 fprintf(output, " state: ");
1007 if (state == NULL) {
1008 fprintf(output, "NULL\n");
1009 return;
1010 }
1011 if (state->type == XML_REGEXP_START_STATE)
1012 fprintf(output, "START ");
1013 if (state->type == XML_REGEXP_FINAL_STATE)
1014 fprintf(output, "FINAL ");
1015
1016 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1017 for (i = 0;i < state->nbTrans; i++) {
1018 xmlRegPrintTrans(output, &(state->trans[i]));
1019 }
1020}
1021
Daniel Veillard23e73572002-09-19 19:56:43 +00001022#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard4255d502002-04-16 15:50:10 +00001023static void
1024xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1025 int i;
1026
1027 fprintf(output, " ctxt: ");
1028 if (ctxt == NULL) {
1029 fprintf(output, "NULL\n");
1030 return;
1031 }
1032 fprintf(output, "'%s' ", ctxt->string);
1033 if (ctxt->error)
1034 fprintf(output, "error ");
1035 if (ctxt->neg)
1036 fprintf(output, "neg ");
1037 fprintf(output, "\n");
1038 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1039 for (i = 0;i < ctxt->nbAtoms; i++) {
1040 fprintf(output, " %02d ", i);
1041 xmlRegPrintAtom(output, ctxt->atoms[i]);
1042 }
1043 if (ctxt->atom != NULL) {
1044 fprintf(output, "current atom:\n");
1045 xmlRegPrintAtom(output, ctxt->atom);
1046 }
1047 fprintf(output, "%d states:", ctxt->nbStates);
1048 if (ctxt->start != NULL)
1049 fprintf(output, " start: %d", ctxt->start->no);
1050 if (ctxt->end != NULL)
1051 fprintf(output, " end: %d", ctxt->end->no);
1052 fprintf(output, "\n");
1053 for (i = 0;i < ctxt->nbStates; i++) {
1054 xmlRegPrintState(output, ctxt->states[i]);
1055 }
1056 fprintf(output, "%d counters:\n", ctxt->nbCounters);
1057 for (i = 0;i < ctxt->nbCounters; i++) {
1058 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1059 ctxt->counters[i].max);
1060 }
1061}
Daniel Veillard23e73572002-09-19 19:56:43 +00001062#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001063
1064/************************************************************************
1065 * *
1066 * Finite Automata structures manipulations *
1067 * *
1068 ************************************************************************/
1069
1070static void
1071xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1072 int neg, xmlRegAtomType type, int start, int end,
1073 xmlChar *blockName) {
1074 xmlRegRangePtr range;
1075
1076 if (atom == NULL) {
1077 ERROR("add range: atom is NULL");
1078 return;
1079 }
1080 if (atom->type != XML_REGEXP_RANGES) {
1081 ERROR("add range: atom is not ranges");
1082 return;
1083 }
1084 if (atom->maxRanges == 0) {
1085 atom->maxRanges = 4;
1086 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1087 sizeof(xmlRegRangePtr));
1088 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001089 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001090 atom->maxRanges = 0;
1091 return;
1092 }
1093 } else if (atom->nbRanges >= atom->maxRanges) {
1094 xmlRegRangePtr *tmp;
1095 atom->maxRanges *= 2;
1096 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1097 sizeof(xmlRegRangePtr));
1098 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001099 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001100 atom->maxRanges /= 2;
1101 return;
1102 }
1103 atom->ranges = tmp;
1104 }
1105 range = xmlRegNewRange(ctxt, neg, type, start, end);
1106 if (range == NULL)
1107 return;
1108 range->blockName = blockName;
1109 atom->ranges[atom->nbRanges++] = range;
1110
1111}
1112
1113static int
1114xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1115 if (ctxt->maxCounters == 0) {
1116 ctxt->maxCounters = 4;
1117 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1118 sizeof(xmlRegCounter));
1119 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001120 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001121 ctxt->maxCounters = 0;
1122 return(-1);
1123 }
1124 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1125 xmlRegCounter *tmp;
1126 ctxt->maxCounters *= 2;
1127 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1128 sizeof(xmlRegCounter));
1129 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001130 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001131 ctxt->maxCounters /= 2;
1132 return(-1);
1133 }
1134 ctxt->counters = tmp;
1135 }
1136 ctxt->counters[ctxt->nbCounters].min = -1;
1137 ctxt->counters[ctxt->nbCounters].max = -1;
1138 return(ctxt->nbCounters++);
1139}
1140
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001141static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001142xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1143 if (atom == NULL) {
1144 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001145 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001146 }
1147 if (ctxt->maxAtoms == 0) {
1148 ctxt->maxAtoms = 4;
1149 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1150 sizeof(xmlRegAtomPtr));
1151 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001152 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001153 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001154 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001155 }
1156 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1157 xmlRegAtomPtr *tmp;
1158 ctxt->maxAtoms *= 2;
1159 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1160 sizeof(xmlRegAtomPtr));
1161 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001162 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001163 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001164 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001165 }
1166 ctxt->atoms = tmp;
1167 }
1168 atom->no = ctxt->nbAtoms;
1169 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001170 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001171}
1172
1173static void
1174xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1175 xmlRegAtomPtr atom, xmlRegStatePtr target,
1176 int counter, int count) {
1177 if (state == NULL) {
1178 ERROR("add state: state is NULL");
1179 return;
1180 }
1181 if (target == NULL) {
1182 ERROR("add state: target is NULL");
1183 return;
1184 }
1185 if (state->maxTrans == 0) {
1186 state->maxTrans = 4;
1187 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1188 sizeof(xmlRegTrans));
1189 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001190 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001191 state->maxTrans = 0;
1192 return;
1193 }
1194 } else if (state->nbTrans >= state->maxTrans) {
1195 xmlRegTrans *tmp;
1196 state->maxTrans *= 2;
1197 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1198 sizeof(xmlRegTrans));
1199 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001200 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001201 state->maxTrans /= 2;
1202 return;
1203 }
1204 state->trans = tmp;
1205 }
1206#ifdef DEBUG_REGEXP_GRAPH
1207 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001208 if (count == REGEXP_ALL_COUNTER)
1209 printf("all transition");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001210 else if (count >= 0)
Daniel Veillard4255d502002-04-16 15:50:10 +00001211 printf("count based %d", count);
1212 else if (counter >= 0)
1213 printf("counted %d", counter);
1214 else if (atom == NULL)
1215 printf("epsilon transition");
1216 printf("\n");
1217#endif
1218
1219 state->trans[state->nbTrans].atom = atom;
1220 state->trans[state->nbTrans].to = target->no;
1221 state->trans[state->nbTrans].counter = counter;
1222 state->trans[state->nbTrans].count = count;
1223 state->nbTrans++;
1224}
1225
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001226static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001227xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001228 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001229 if (ctxt->maxStates == 0) {
1230 ctxt->maxStates = 4;
1231 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1232 sizeof(xmlRegStatePtr));
1233 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001234 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001235 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001236 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001237 }
1238 } else if (ctxt->nbStates >= ctxt->maxStates) {
1239 xmlRegStatePtr *tmp;
1240 ctxt->maxStates *= 2;
1241 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1242 sizeof(xmlRegStatePtr));
1243 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001244 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001245 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001246 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001247 }
1248 ctxt->states = tmp;
1249 }
1250 state->no = ctxt->nbStates;
1251 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001252 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001253}
1254
1255/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001256 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001257 * @ctxt: a regexp parser context
1258 * @from: the from state
1259 * @to: the target state or NULL for building a new one
1260 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001261 *
1262 */
1263static void
1264xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001265 xmlRegStatePtr from, xmlRegStatePtr to,
1266 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001267 if (to == NULL) {
1268 to = xmlRegNewState(ctxt);
1269 xmlRegStatePush(ctxt, to);
1270 ctxt->state = to;
1271 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001272 if (lax)
1273 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1274 else
1275 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001276}
1277
1278/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001279 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001280 * @ctxt: a regexp parser context
1281 * @from: the from state
1282 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001283 *
1284 */
1285static void
1286xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1287 xmlRegStatePtr from, xmlRegStatePtr to) {
1288 if (to == NULL) {
1289 to = xmlRegNewState(ctxt);
1290 xmlRegStatePush(ctxt, to);
1291 ctxt->state = to;
1292 }
1293 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1294}
1295
1296/**
1297 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001298 * @ctxt: a regexp parser context
1299 * @from: the from state
1300 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001301 * counter: the counter for that transition
1302 *
1303 */
1304static void
1305xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1306 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1307 if (to == NULL) {
1308 to = xmlRegNewState(ctxt);
1309 xmlRegStatePush(ctxt, to);
1310 ctxt->state = to;
1311 }
1312 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1313}
1314
1315/**
1316 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001317 * @ctxt: a regexp parser context
1318 * @from: the from state
1319 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001320 * counter: the counter for that transition
1321 *
1322 */
1323static void
1324xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1325 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1326 if (to == NULL) {
1327 to = xmlRegNewState(ctxt);
1328 xmlRegStatePush(ctxt, to);
1329 ctxt->state = to;
1330 }
1331 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1332}
1333
1334/**
1335 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001336 * @ctxt: a regexp parser context
1337 * @from: the from state
1338 * @to: the target state or NULL for building a new one
1339 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001340 *
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001341 * Returns 0 if succes and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001342 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001343static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001344xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1345 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1346 if (atom == NULL) {
1347 ERROR("genrate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001348 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001349 }
1350 if (atom->type == XML_REGEXP_SUBREG) {
1351 /*
1352 * this is a subexpression handling one should not need to
1353 * create a new node excep for XML_REGEXP_QUANT_RANGE.
1354 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001355 if (xmlRegAtomPush(ctxt, atom) < 0) {
1356 return(-1);
1357 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001358 if ((to != NULL) && (atom->stop != to) &&
1359 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1360 /*
1361 * Generate an epsilon transition to link to the target
1362 */
1363 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1364 }
1365 switch (atom->quant) {
1366 case XML_REGEXP_QUANT_OPT:
1367 atom->quant = XML_REGEXP_QUANT_ONCE;
1368 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1369 break;
1370 case XML_REGEXP_QUANT_MULT:
1371 atom->quant = XML_REGEXP_QUANT_ONCE;
1372 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1373 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1374 break;
1375 case XML_REGEXP_QUANT_PLUS:
1376 atom->quant = XML_REGEXP_QUANT_ONCE;
1377 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1378 break;
1379 case XML_REGEXP_QUANT_RANGE: {
1380 int counter;
1381 xmlRegStatePtr newstate;
1382
1383 /*
1384 * This one is nasty:
1385 * 1/ register a new counter
1386 * 2/ register an epsilon transition associated to
1387 * this counter going from atom->stop to atom->start
1388 * 3/ create a new state
1389 * 4/ generate a counted transition from atom->stop to
1390 * that state
1391 */
1392 counter = xmlRegGetCounter(ctxt);
1393 ctxt->counters[counter].min = atom->min - 1;
1394 ctxt->counters[counter].max = atom->max - 1;
1395 atom->min = 0;
1396 atom->max = 0;
1397 atom->quant = XML_REGEXP_QUANT_ONCE;
1398 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1399 atom->start, counter);
1400 if (to != NULL) {
1401 newstate = to;
1402 } else {
1403 newstate = xmlRegNewState(ctxt);
1404 xmlRegStatePush(ctxt, newstate);
1405 ctxt->state = newstate;
1406 }
1407 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1408 newstate, counter);
1409 }
1410 default:
1411 break;
1412 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001413 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001414 } else {
1415 if (to == NULL) {
1416 to = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001417 if (to != NULL)
1418 xmlRegStatePush(ctxt, to);
1419 else {
1420 return(-1);
1421 }
1422 }
1423 if (xmlRegAtomPush(ctxt, atom) < 0) {
1424 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001425 }
1426 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001427 ctxt->state = to;
1428 }
1429 switch (atom->quant) {
1430 case XML_REGEXP_QUANT_OPT:
1431 atom->quant = XML_REGEXP_QUANT_ONCE;
1432 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1433 break;
1434 case XML_REGEXP_QUANT_MULT:
1435 atom->quant = XML_REGEXP_QUANT_ONCE;
1436 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1437 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1438 break;
1439 case XML_REGEXP_QUANT_PLUS:
1440 atom->quant = XML_REGEXP_QUANT_ONCE;
1441 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1442 break;
1443 default:
1444 break;
1445 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001446 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001447}
1448
1449/**
1450 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001451 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001452 * @fromnr: the from state
1453 * @tonr: the to state
1454 * @cpunter: should that transition be associted to a counted
1455 *
1456 */
1457static void
1458xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1459 int tonr, int counter) {
1460 int transnr;
1461 xmlRegStatePtr from;
1462 xmlRegStatePtr to;
1463
1464#ifdef DEBUG_REGEXP_GRAPH
1465 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1466#endif
1467 from = ctxt->states[fromnr];
1468 if (from == NULL)
1469 return;
1470 to = ctxt->states[tonr];
1471 if (to == NULL)
1472 return;
1473 if ((to->mark == XML_REGEXP_MARK_START) ||
1474 (to->mark == XML_REGEXP_MARK_VISITED))
1475 return;
1476
1477 to->mark = XML_REGEXP_MARK_VISITED;
1478 if (to->type == XML_REGEXP_FINAL_STATE) {
1479#ifdef DEBUG_REGEXP_GRAPH
1480 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1481#endif
1482 from->type = XML_REGEXP_FINAL_STATE;
1483 }
1484 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1485 if (to->trans[transnr].atom == NULL) {
1486 /*
1487 * Don't remove counted transitions
1488 * Don't loop either
1489 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001490 if (to->trans[transnr].to != fromnr) {
1491 if (to->trans[transnr].count >= 0) {
1492 int newto = to->trans[transnr].to;
1493
1494 xmlRegStateAddTrans(ctxt, from, NULL,
1495 ctxt->states[newto],
1496 -1, to->trans[transnr].count);
1497 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001498#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001499 printf("Found epsilon trans %d from %d to %d\n",
1500 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001501#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001502 if (to->trans[transnr].counter >= 0) {
1503 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1504 to->trans[transnr].to,
1505 to->trans[transnr].counter);
1506 } else {
1507 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1508 to->trans[transnr].to,
1509 counter);
1510 }
1511 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001512 }
1513 } else {
1514 int newto = to->trans[transnr].to;
1515
Daniel Veillardb509f152002-04-17 16:28:10 +00001516 if (to->trans[transnr].counter >= 0) {
1517 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1518 ctxt->states[newto],
1519 to->trans[transnr].counter, -1);
1520 } else {
1521 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1522 ctxt->states[newto], counter, -1);
1523 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001524 }
1525 }
1526 to->mark = XML_REGEXP_MARK_NORMAL;
1527}
1528
1529/**
1530 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001531 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001532 *
1533 */
1534static void
1535xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1536 int statenr, transnr;
1537 xmlRegStatePtr state;
1538
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001539 if (ctxt->states == NULL) return;
1540
1541
Daniel Veillard4255d502002-04-16 15:50:10 +00001542 /*
1543 * build the completed transitions bypassing the epsilons
1544 * Use a marking algorithm to avoid loops
1545 */
1546 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1547 state = ctxt->states[statenr];
1548 if (state == NULL)
1549 continue;
1550 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1551 if ((state->trans[transnr].atom == NULL) &&
1552 (state->trans[transnr].to >= 0)) {
1553 if (state->trans[transnr].to == statenr) {
1554 state->trans[transnr].to = -1;
1555#ifdef DEBUG_REGEXP_GRAPH
1556 printf("Removed loopback epsilon trans %d on %d\n",
1557 transnr, statenr);
1558#endif
1559 } else if (state->trans[transnr].count < 0) {
1560 int newto = state->trans[transnr].to;
1561
1562#ifdef DEBUG_REGEXP_GRAPH
1563 printf("Found epsilon trans %d from %d to %d\n",
1564 transnr, statenr, newto);
1565#endif
1566 state->mark = XML_REGEXP_MARK_START;
1567 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1568 newto, state->trans[transnr].counter);
1569 state->mark = XML_REGEXP_MARK_NORMAL;
1570#ifdef DEBUG_REGEXP_GRAPH
1571 } else {
1572 printf("Found counted transition %d on %d\n",
1573 transnr, statenr);
1574#endif
1575 }
1576 }
1577 }
1578 }
1579 /*
1580 * Eliminate the epsilon transitions
1581 */
1582 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1583 state = ctxt->states[statenr];
1584 if (state == NULL)
1585 continue;
1586 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1587 if ((state->trans[transnr].atom == NULL) &&
1588 (state->trans[transnr].count < 0) &&
1589 (state->trans[transnr].to >= 0)) {
1590 state->trans[transnr].to = -1;
1591 }
1592 }
1593 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001594
1595 /*
1596 * Use this pass to detect unreachable states too
1597 */
1598 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1599 state = ctxt->states[statenr];
1600 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001601 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00001602 }
1603 state = ctxt->states[0];
1604 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001605 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001606 while (state != NULL) {
1607 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00001608 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00001609 /*
1610 * Mark all state reachable from the current reachable state
1611 */
1612 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1613 if ((state->trans[transnr].to >= 0) &&
1614 ((state->trans[transnr].atom != NULL) ||
1615 (state->trans[transnr].count >= 0))) {
1616 int newto = state->trans[transnr].to;
1617
1618 if (ctxt->states[newto] == NULL)
1619 continue;
William M. Brack779af002003-08-01 15:55:39 +00001620 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1621 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001622 target = ctxt->states[newto];
1623 }
1624 }
1625 }
1626 /*
1627 * find the next accessible state not explored
1628 */
1629 if (target == NULL) {
1630 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1631 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001632 if ((state != NULL) && (state->reached ==
1633 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001634 target = state;
1635 break;
1636 }
1637 }
1638 }
1639 state = target;
1640 }
1641 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1642 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001643 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001644#ifdef DEBUG_REGEXP_GRAPH
1645 printf("Removed unreachable state %d\n", statenr);
1646#endif
1647 xmlRegFreeState(state);
1648 ctxt->states[statenr] = NULL;
1649 }
1650 }
1651
Daniel Veillard4255d502002-04-16 15:50:10 +00001652}
1653
Daniel Veillarde19fc232002-04-22 16:01:24 +00001654/**
1655 * xmlFACompareAtoms:
1656 * @atom1: an atom
1657 * @atom2: an atom
1658 *
1659 * Compares two atoms to check whether they are equivatents
1660 *
1661 * Returns 1 if yes and 0 otherwise
1662 */
1663static int
1664xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
1665 if (atom1 == atom2)
1666 return(1);
1667 if ((atom1 == NULL) || (atom2 == NULL))
1668 return(0);
1669
1670 if (atom1->type != atom2->type)
1671 return(0);
1672 switch (atom1->type) {
1673 case XML_REGEXP_STRING:
1674 return(xmlStrEqual((xmlChar *)atom1->valuep,
1675 (xmlChar *)atom2->valuep));
1676 case XML_REGEXP_EPSILON:
1677 return(1);
1678 case XML_REGEXP_CHARVAL:
1679 return(atom1->codepoint == atom2->codepoint);
1680 case XML_REGEXP_RANGES:
1681 TODO;
1682 return(0);
1683 default:
1684 break;
1685 }
1686 return(1);
1687}
1688
1689/**
1690 * xmlFARecurseDeterminism:
1691 * @ctxt: a regexp parser context
1692 *
1693 * Check whether the associated regexp is determinist,
1694 * should be called after xmlFAEliminateEpsilonTransitions()
1695 *
1696 */
1697static int
1698xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1699 int to, xmlRegAtomPtr atom) {
1700 int ret = 1;
1701 int transnr;
1702 xmlRegTransPtr t1;
1703
1704 if (state == NULL)
1705 return(ret);
1706 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1707 t1 = &(state->trans[transnr]);
1708 /*
1709 * check transitions conflicting with the one looked at
1710 */
1711 if (t1->atom == NULL) {
1712 if (t1->to == -1)
1713 continue;
1714 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1715 to, atom);
1716 if (ret == 0)
1717 return(0);
1718 continue;
1719 }
1720 if (t1->to != to)
1721 continue;
1722 if (xmlFACompareAtoms(t1->atom, atom))
1723 return(0);
1724 }
1725 return(ret);
1726}
1727
1728/**
1729 * xmlFAComputesDeterminism:
1730 * @ctxt: a regexp parser context
1731 *
1732 * Check whether the associated regexp is determinist,
1733 * should be called after xmlFAEliminateEpsilonTransitions()
1734 *
1735 */
1736static int
1737xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
1738 int statenr, transnr;
1739 xmlRegStatePtr state;
1740 xmlRegTransPtr t1, t2;
1741 int i;
1742 int ret = 1;
1743
Daniel Veillard4402ab42002-09-12 16:02:56 +00001744#ifdef DEBUG_REGEXP_GRAPH
1745 printf("xmlFAComputesDeterminism\n");
1746 xmlRegPrintCtxt(stdout, ctxt);
1747#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00001748 if (ctxt->determinist != -1)
1749 return(ctxt->determinist);
1750
1751 /*
1752 * Check for all states that there isn't 2 transitions
1753 * with the same atom and a different target.
1754 */
1755 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1756 state = ctxt->states[statenr];
1757 if (state == NULL)
1758 continue;
1759 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1760 t1 = &(state->trans[transnr]);
1761 /*
1762 * Determinism checks in case of counted or all transitions
1763 * will have to be handled separately
1764 */
1765 if (t1->atom == NULL)
1766 continue;
1767 if (t1->to == -1) /* eliminated */
1768 continue;
1769 for (i = 0;i < transnr;i++) {
1770 t2 = &(state->trans[i]);
1771 if (t2->to == -1) /* eliminated */
1772 continue;
1773 if (t2->atom != NULL) {
1774 if (t1->to == t2->to) {
1775 if (xmlFACompareAtoms(t1->atom, t2->atom))
1776 t2->to = -1; /* eliminate */
1777 } else {
1778 /* not determinist ! */
1779 if (xmlFACompareAtoms(t1->atom, t2->atom))
1780 ret = 0;
1781 }
1782 } else if (t1->to != -1) {
1783 /*
1784 * do the closure in case of remaining specific
1785 * epsilon transitions like choices or all
1786 */
1787 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1788 t2->to, t2->atom);
1789 if (ret == 0)
1790 return(0);
1791 }
1792 }
1793 if (ret == 0)
1794 break;
1795 }
1796 if (ret == 0)
1797 break;
1798 }
1799 ctxt->determinist = ret;
1800 return(ret);
1801}
1802
Daniel Veillard4255d502002-04-16 15:50:10 +00001803/************************************************************************
1804 * *
1805 * Routines to check input against transition atoms *
1806 * *
1807 ************************************************************************/
1808
1809static int
1810xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
1811 int start, int end, const xmlChar *blockName) {
1812 int ret = 0;
1813
1814 switch (type) {
1815 case XML_REGEXP_STRING:
1816 case XML_REGEXP_SUBREG:
1817 case XML_REGEXP_RANGES:
1818 case XML_REGEXP_EPSILON:
1819 return(-1);
1820 case XML_REGEXP_ANYCHAR:
1821 ret = ((codepoint != '\n') && (codepoint != '\r'));
1822 break;
1823 case XML_REGEXP_CHARVAL:
1824 ret = ((codepoint >= start) && (codepoint <= end));
1825 break;
1826 case XML_REGEXP_NOTSPACE:
1827 neg = !neg;
1828 case XML_REGEXP_ANYSPACE:
1829 ret = ((codepoint == '\n') || (codepoint == '\r') ||
1830 (codepoint == '\t') || (codepoint == ' '));
1831 break;
1832 case XML_REGEXP_NOTINITNAME:
1833 neg = !neg;
1834 case XML_REGEXP_INITNAME:
William M. Brack871611b2003-10-18 04:53:14 +00001835 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00001836 (codepoint == '_') || (codepoint == ':'));
1837 break;
1838 case XML_REGEXP_NOTNAMECHAR:
1839 neg = !neg;
1840 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00001841 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00001842 (codepoint == '.') || (codepoint == '-') ||
1843 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00001844 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00001845 break;
1846 case XML_REGEXP_NOTDECIMAL:
1847 neg = !neg;
1848 case XML_REGEXP_DECIMAL:
1849 ret = xmlUCSIsCatNd(codepoint);
1850 break;
1851 case XML_REGEXP_REALCHAR:
1852 neg = !neg;
1853 case XML_REGEXP_NOTREALCHAR:
1854 ret = xmlUCSIsCatP(codepoint);
1855 if (ret == 0)
1856 ret = xmlUCSIsCatZ(codepoint);
1857 if (ret == 0)
1858 ret = xmlUCSIsCatC(codepoint);
1859 break;
1860 case XML_REGEXP_LETTER:
1861 ret = xmlUCSIsCatL(codepoint);
1862 break;
1863 case XML_REGEXP_LETTER_UPPERCASE:
1864 ret = xmlUCSIsCatLu(codepoint);
1865 break;
1866 case XML_REGEXP_LETTER_LOWERCASE:
1867 ret = xmlUCSIsCatLl(codepoint);
1868 break;
1869 case XML_REGEXP_LETTER_TITLECASE:
1870 ret = xmlUCSIsCatLt(codepoint);
1871 break;
1872 case XML_REGEXP_LETTER_MODIFIER:
1873 ret = xmlUCSIsCatLm(codepoint);
1874 break;
1875 case XML_REGEXP_LETTER_OTHERS:
1876 ret = xmlUCSIsCatLo(codepoint);
1877 break;
1878 case XML_REGEXP_MARK:
1879 ret = xmlUCSIsCatM(codepoint);
1880 break;
1881 case XML_REGEXP_MARK_NONSPACING:
1882 ret = xmlUCSIsCatMn(codepoint);
1883 break;
1884 case XML_REGEXP_MARK_SPACECOMBINING:
1885 ret = xmlUCSIsCatMc(codepoint);
1886 break;
1887 case XML_REGEXP_MARK_ENCLOSING:
1888 ret = xmlUCSIsCatMe(codepoint);
1889 break;
1890 case XML_REGEXP_NUMBER:
1891 ret = xmlUCSIsCatN(codepoint);
1892 break;
1893 case XML_REGEXP_NUMBER_DECIMAL:
1894 ret = xmlUCSIsCatNd(codepoint);
1895 break;
1896 case XML_REGEXP_NUMBER_LETTER:
1897 ret = xmlUCSIsCatNl(codepoint);
1898 break;
1899 case XML_REGEXP_NUMBER_OTHERS:
1900 ret = xmlUCSIsCatNo(codepoint);
1901 break;
1902 case XML_REGEXP_PUNCT:
1903 ret = xmlUCSIsCatP(codepoint);
1904 break;
1905 case XML_REGEXP_PUNCT_CONNECTOR:
1906 ret = xmlUCSIsCatPc(codepoint);
1907 break;
1908 case XML_REGEXP_PUNCT_DASH:
1909 ret = xmlUCSIsCatPd(codepoint);
1910 break;
1911 case XML_REGEXP_PUNCT_OPEN:
1912 ret = xmlUCSIsCatPs(codepoint);
1913 break;
1914 case XML_REGEXP_PUNCT_CLOSE:
1915 ret = xmlUCSIsCatPe(codepoint);
1916 break;
1917 case XML_REGEXP_PUNCT_INITQUOTE:
1918 ret = xmlUCSIsCatPi(codepoint);
1919 break;
1920 case XML_REGEXP_PUNCT_FINQUOTE:
1921 ret = xmlUCSIsCatPf(codepoint);
1922 break;
1923 case XML_REGEXP_PUNCT_OTHERS:
1924 ret = xmlUCSIsCatPo(codepoint);
1925 break;
1926 case XML_REGEXP_SEPAR:
1927 ret = xmlUCSIsCatZ(codepoint);
1928 break;
1929 case XML_REGEXP_SEPAR_SPACE:
1930 ret = xmlUCSIsCatZs(codepoint);
1931 break;
1932 case XML_REGEXP_SEPAR_LINE:
1933 ret = xmlUCSIsCatZl(codepoint);
1934 break;
1935 case XML_REGEXP_SEPAR_PARA:
1936 ret = xmlUCSIsCatZp(codepoint);
1937 break;
1938 case XML_REGEXP_SYMBOL:
1939 ret = xmlUCSIsCatS(codepoint);
1940 break;
1941 case XML_REGEXP_SYMBOL_MATH:
1942 ret = xmlUCSIsCatSm(codepoint);
1943 break;
1944 case XML_REGEXP_SYMBOL_CURRENCY:
1945 ret = xmlUCSIsCatSc(codepoint);
1946 break;
1947 case XML_REGEXP_SYMBOL_MODIFIER:
1948 ret = xmlUCSIsCatSk(codepoint);
1949 break;
1950 case XML_REGEXP_SYMBOL_OTHERS:
1951 ret = xmlUCSIsCatSo(codepoint);
1952 break;
1953 case XML_REGEXP_OTHER:
1954 ret = xmlUCSIsCatC(codepoint);
1955 break;
1956 case XML_REGEXP_OTHER_CONTROL:
1957 ret = xmlUCSIsCatCc(codepoint);
1958 break;
1959 case XML_REGEXP_OTHER_FORMAT:
1960 ret = xmlUCSIsCatCf(codepoint);
1961 break;
1962 case XML_REGEXP_OTHER_PRIVATE:
1963 ret = xmlUCSIsCatCo(codepoint);
1964 break;
1965 case XML_REGEXP_OTHER_NA:
1966 /* ret = xmlUCSIsCatCn(codepoint); */
1967 /* Seems it doesn't exist anymore in recent Unicode releases */
1968 ret = 0;
1969 break;
1970 case XML_REGEXP_BLOCK_NAME:
1971 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
1972 break;
1973 }
1974 if (neg)
1975 return(!ret);
1976 return(ret);
1977}
1978
1979static int
1980xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
1981 int i, ret = 0;
1982 xmlRegRangePtr range;
1983
William M. Brack871611b2003-10-18 04:53:14 +00001984 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00001985 return(-1);
1986
1987 switch (atom->type) {
1988 case XML_REGEXP_SUBREG:
1989 case XML_REGEXP_EPSILON:
1990 return(-1);
1991 case XML_REGEXP_CHARVAL:
1992 return(codepoint == atom->codepoint);
1993 case XML_REGEXP_RANGES: {
1994 int accept = 0;
1995 for (i = 0;i < atom->nbRanges;i++) {
1996 range = atom->ranges[i];
1997 if (range->neg) {
1998 ret = xmlRegCheckCharacterRange(range->type, codepoint,
1999 0, range->start, range->end,
2000 range->blockName);
2001 if (ret != 0)
2002 return(0); /* excluded char */
2003 } else {
2004 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2005 0, range->start, range->end,
2006 range->blockName);
2007 if (ret != 0)
2008 accept = 1; /* might still be excluded */
2009 }
2010 }
2011 return(accept);
2012 }
2013 case XML_REGEXP_STRING:
2014 printf("TODO: XML_REGEXP_STRING\n");
2015 return(-1);
2016 case XML_REGEXP_ANYCHAR:
2017 case XML_REGEXP_ANYSPACE:
2018 case XML_REGEXP_NOTSPACE:
2019 case XML_REGEXP_INITNAME:
2020 case XML_REGEXP_NOTINITNAME:
2021 case XML_REGEXP_NAMECHAR:
2022 case XML_REGEXP_NOTNAMECHAR:
2023 case XML_REGEXP_DECIMAL:
2024 case XML_REGEXP_NOTDECIMAL:
2025 case XML_REGEXP_REALCHAR:
2026 case XML_REGEXP_NOTREALCHAR:
2027 case XML_REGEXP_LETTER:
2028 case XML_REGEXP_LETTER_UPPERCASE:
2029 case XML_REGEXP_LETTER_LOWERCASE:
2030 case XML_REGEXP_LETTER_TITLECASE:
2031 case XML_REGEXP_LETTER_MODIFIER:
2032 case XML_REGEXP_LETTER_OTHERS:
2033 case XML_REGEXP_MARK:
2034 case XML_REGEXP_MARK_NONSPACING:
2035 case XML_REGEXP_MARK_SPACECOMBINING:
2036 case XML_REGEXP_MARK_ENCLOSING:
2037 case XML_REGEXP_NUMBER:
2038 case XML_REGEXP_NUMBER_DECIMAL:
2039 case XML_REGEXP_NUMBER_LETTER:
2040 case XML_REGEXP_NUMBER_OTHERS:
2041 case XML_REGEXP_PUNCT:
2042 case XML_REGEXP_PUNCT_CONNECTOR:
2043 case XML_REGEXP_PUNCT_DASH:
2044 case XML_REGEXP_PUNCT_OPEN:
2045 case XML_REGEXP_PUNCT_CLOSE:
2046 case XML_REGEXP_PUNCT_INITQUOTE:
2047 case XML_REGEXP_PUNCT_FINQUOTE:
2048 case XML_REGEXP_PUNCT_OTHERS:
2049 case XML_REGEXP_SEPAR:
2050 case XML_REGEXP_SEPAR_SPACE:
2051 case XML_REGEXP_SEPAR_LINE:
2052 case XML_REGEXP_SEPAR_PARA:
2053 case XML_REGEXP_SYMBOL:
2054 case XML_REGEXP_SYMBOL_MATH:
2055 case XML_REGEXP_SYMBOL_CURRENCY:
2056 case XML_REGEXP_SYMBOL_MODIFIER:
2057 case XML_REGEXP_SYMBOL_OTHERS:
2058 case XML_REGEXP_OTHER:
2059 case XML_REGEXP_OTHER_CONTROL:
2060 case XML_REGEXP_OTHER_FORMAT:
2061 case XML_REGEXP_OTHER_PRIVATE:
2062 case XML_REGEXP_OTHER_NA:
2063 case XML_REGEXP_BLOCK_NAME:
2064 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
2065 (const xmlChar *)atom->valuep);
2066 if (atom->neg)
2067 ret = !ret;
2068 break;
2069 }
2070 return(ret);
2071}
2072
2073/************************************************************************
2074 * *
2075 * Saving an restoring state of an execution context *
2076 * *
2077 ************************************************************************/
2078
2079#ifdef DEBUG_REGEXP_EXEC
2080static void
2081xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
2082 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
2083 if (exec->inputStack != NULL) {
2084 int i;
2085 printf(": ");
2086 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
2087 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
2088 } else {
2089 printf(": %s", &(exec->inputString[exec->index]));
2090 }
2091 printf("\n");
2092}
2093#endif
2094
2095static void
2096xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
2097#ifdef DEBUG_REGEXP_EXEC
2098 printf("saving ");
2099 exec->transno++;
2100 xmlFARegDebugExec(exec);
2101 exec->transno--;
2102#endif
2103
2104 if (exec->maxRollbacks == 0) {
2105 exec->maxRollbacks = 4;
2106 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2107 sizeof(xmlRegExecRollback));
2108 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002109 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002110 exec->maxRollbacks = 0;
2111 return;
2112 }
2113 memset(exec->rollbacks, 0,
2114 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2115 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2116 xmlRegExecRollback *tmp;
2117 int len = exec->maxRollbacks;
2118
2119 exec->maxRollbacks *= 2;
2120 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2121 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2122 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002123 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002124 exec->maxRollbacks /= 2;
2125 return;
2126 }
2127 exec->rollbacks = tmp;
2128 tmp = &exec->rollbacks[len];
2129 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2130 }
2131 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2132 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2133 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2134 if (exec->comp->nbCounters > 0) {
2135 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2136 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2137 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2138 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002139 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002140 exec->status = -5;
2141 return;
2142 }
2143 }
2144 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2145 exec->comp->nbCounters * sizeof(int));
2146 }
2147 exec->nbRollbacks++;
2148}
2149
2150static void
2151xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2152 if (exec->nbRollbacks <= 0) {
2153 exec->status = -1;
2154#ifdef DEBUG_REGEXP_EXEC
2155 printf("rollback failed on empty stack\n");
2156#endif
2157 return;
2158 }
2159 exec->nbRollbacks--;
2160 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2161 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2162 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2163 if (exec->comp->nbCounters > 0) {
2164 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2165 fprintf(stderr, "exec save: allocation failed");
2166 exec->status = -6;
2167 return;
2168 }
2169 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2170 exec->comp->nbCounters * sizeof(int));
2171 }
2172
2173#ifdef DEBUG_REGEXP_EXEC
2174 printf("restored ");
2175 xmlFARegDebugExec(exec);
2176#endif
2177}
2178
2179/************************************************************************
2180 * *
2181 * Verifyer, running an input against a compiled regexp *
2182 * *
2183 ************************************************************************/
2184
2185static int
2186xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2187 xmlRegExecCtxt execval;
2188 xmlRegExecCtxtPtr exec = &execval;
2189 int ret, codepoint, len;
2190
2191 exec->inputString = content;
2192 exec->index = 0;
2193 exec->determinist = 1;
2194 exec->maxRollbacks = 0;
2195 exec->nbRollbacks = 0;
2196 exec->rollbacks = NULL;
2197 exec->status = 0;
2198 exec->comp = comp;
2199 exec->state = comp->states[0];
2200 exec->transno = 0;
2201 exec->transcount = 0;
2202 if (comp->nbCounters > 0) {
2203 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00002204 if (exec->counts == NULL) {
2205 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002206 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00002207 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002208 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2209 } else
2210 exec->counts = NULL;
2211 while ((exec->status == 0) &&
2212 ((exec->inputString[exec->index] != 0) ||
2213 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2214 xmlRegTransPtr trans;
2215 xmlRegAtomPtr atom;
2216
2217 /*
2218 * End of input on non-terminal state, rollback, however we may
2219 * still have epsilon like transition for counted transitions
2220 * on counters, in that case don't break too early.
2221 */
2222 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2223 goto rollback;
2224
2225 exec->transcount = 0;
2226 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2227 trans = &exec->state->trans[exec->transno];
2228 if (trans->to < 0)
2229 continue;
2230 atom = trans->atom;
2231 ret = 0;
2232 if (trans->count >= 0) {
2233 int count;
2234 xmlRegCounterPtr counter;
2235
2236 /*
2237 * A counted transition.
2238 */
2239
2240 count = exec->counts[trans->count];
2241 counter = &exec->comp->counters[trans->count];
2242#ifdef DEBUG_REGEXP_EXEC
2243 printf("testing count %d: val %d, min %d, max %d\n",
2244 trans->count, count, counter->min, counter->max);
2245#endif
2246 ret = ((count >= counter->min) && (count <= counter->max));
2247 } else if (atom == NULL) {
2248 fprintf(stderr, "epsilon transition left at runtime\n");
2249 exec->status = -2;
2250 break;
2251 } else if (exec->inputString[exec->index] != 0) {
2252 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2253 ret = xmlRegCheckCharacter(atom, codepoint);
2254 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2255 xmlRegStatePtr to = comp->states[trans->to];
2256
2257 /*
2258 * this is a multiple input sequence
2259 */
2260 if (exec->state->nbTrans > exec->transno + 1) {
2261 xmlFARegExecSave(exec);
2262 }
2263 exec->transcount = 1;
2264 do {
2265 /*
2266 * Try to progress as much as possible on the input
2267 */
2268 if (exec->transcount == atom->max) {
2269 break;
2270 }
2271 exec->index += len;
2272 /*
2273 * End of input: stop here
2274 */
2275 if (exec->inputString[exec->index] == 0) {
2276 exec->index -= len;
2277 break;
2278 }
2279 if (exec->transcount >= atom->min) {
2280 int transno = exec->transno;
2281 xmlRegStatePtr state = exec->state;
2282
2283 /*
2284 * The transition is acceptable save it
2285 */
2286 exec->transno = -1; /* trick */
2287 exec->state = to;
2288 xmlFARegExecSave(exec);
2289 exec->transno = transno;
2290 exec->state = state;
2291 }
2292 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2293 len);
2294 ret = xmlRegCheckCharacter(atom, codepoint);
2295 exec->transcount++;
2296 } while (ret == 1);
2297 if (exec->transcount < atom->min)
2298 ret = 0;
2299
2300 /*
2301 * If the last check failed but one transition was found
2302 * possible, rollback
2303 */
2304 if (ret < 0)
2305 ret = 0;
2306 if (ret == 0) {
2307 goto rollback;
2308 }
2309 }
2310 }
2311 if (ret == 1) {
2312 if (exec->state->nbTrans > exec->transno + 1) {
2313 xmlFARegExecSave(exec);
2314 }
2315 if (trans->counter >= 0) {
2316#ifdef DEBUG_REGEXP_EXEC
2317 printf("Increasing count %d\n", trans->counter);
2318#endif
2319 exec->counts[trans->counter]++;
2320 }
2321#ifdef DEBUG_REGEXP_EXEC
2322 printf("entering state %d\n", trans->to);
2323#endif
2324 exec->state = comp->states[trans->to];
2325 exec->transno = 0;
2326 if (trans->atom != NULL) {
2327 exec->index += len;
2328 }
2329 goto progress;
2330 } else if (ret < 0) {
2331 exec->status = -4;
2332 break;
2333 }
2334 }
2335 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2336rollback:
2337 /*
2338 * Failed to find a way out
2339 */
2340 exec->determinist = 0;
2341 xmlFARegExecRollBack(exec);
2342 }
2343progress:
2344 continue;
2345 }
2346 if (exec->rollbacks != NULL) {
2347 if (exec->counts != NULL) {
2348 int i;
2349
2350 for (i = 0;i < exec->maxRollbacks;i++)
2351 if (exec->rollbacks[i].counts != NULL)
2352 xmlFree(exec->rollbacks[i].counts);
2353 }
2354 xmlFree(exec->rollbacks);
2355 }
2356 if (exec->counts != NULL)
2357 xmlFree(exec->counts);
2358 if (exec->status == 0)
2359 return(1);
2360 if (exec->status == -1)
2361 return(0);
2362 return(exec->status);
2363}
2364
2365/************************************************************************
2366 * *
2367 * Progressive interface to the verifyer one atom at a time *
2368 * *
2369 ************************************************************************/
2370
2371/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002372 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00002373 * @comp: a precompiled regular expression
2374 * @callback: a callback function used for handling progresses in the
2375 * automata matching phase
2376 * @data: the context data associated to the callback in this context
2377 *
2378 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00002379 *
2380 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00002381 */
2382xmlRegExecCtxtPtr
2383xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
2384 xmlRegExecCtxtPtr exec;
2385
2386 if (comp == NULL)
2387 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00002388 if ((comp->compact == NULL) && (comp->states == NULL))
2389 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00002390 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
2391 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002392 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00002393 return(NULL);
2394 }
2395 memset(exec, 0, sizeof(xmlRegExecCtxt));
2396 exec->inputString = NULL;
2397 exec->index = 0;
2398 exec->determinist = 1;
2399 exec->maxRollbacks = 0;
2400 exec->nbRollbacks = 0;
2401 exec->rollbacks = NULL;
2402 exec->status = 0;
2403 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00002404 if (comp->compact == NULL)
2405 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00002406 exec->transno = 0;
2407 exec->transcount = 0;
2408 exec->callback = callback;
2409 exec->data = data;
2410 if (comp->nbCounters > 0) {
2411 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2412 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002413 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00002414 xmlFree(exec);
2415 return(NULL);
2416 }
2417 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2418 } else
2419 exec->counts = NULL;
2420 exec->inputStackMax = 0;
2421 exec->inputStackNr = 0;
2422 exec->inputStack = NULL;
2423 return(exec);
2424}
2425
2426/**
2427 * xmlRegFreeExecCtxt:
2428 * @exec: a regular expression evaulation context
2429 *
2430 * Free the structures associated to a regular expression evaulation context.
2431 */
2432void
2433xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
2434 if (exec == NULL)
2435 return;
2436
2437 if (exec->rollbacks != NULL) {
2438 if (exec->counts != NULL) {
2439 int i;
2440
2441 for (i = 0;i < exec->maxRollbacks;i++)
2442 if (exec->rollbacks[i].counts != NULL)
2443 xmlFree(exec->rollbacks[i].counts);
2444 }
2445 xmlFree(exec->rollbacks);
2446 }
2447 if (exec->counts != NULL)
2448 xmlFree(exec->counts);
2449 if (exec->inputStack != NULL) {
2450 int i;
2451
Daniel Veillard32370232002-10-16 14:08:14 +00002452 for (i = 0;i < exec->inputStackNr;i++) {
2453 if (exec->inputStack[i].value != NULL)
2454 xmlFree(exec->inputStack[i].value);
2455 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002456 xmlFree(exec->inputStack);
2457 }
2458 xmlFree(exec);
2459}
2460
2461static void
2462xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2463 void *data) {
2464#ifdef DEBUG_PUSH
2465 printf("saving value: %d:%s\n", exec->inputStackNr, value);
2466#endif
2467 if (exec->inputStackMax == 0) {
2468 exec->inputStackMax = 4;
2469 exec->inputStack = (xmlRegInputTokenPtr)
2470 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
2471 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002472 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00002473 exec->inputStackMax = 0;
2474 return;
2475 }
2476 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
2477 xmlRegInputTokenPtr tmp;
2478
2479 exec->inputStackMax *= 2;
2480 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
2481 exec->inputStackMax * sizeof(xmlRegInputToken));
2482 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002483 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00002484 exec->inputStackMax /= 2;
2485 return;
2486 }
2487 exec->inputStack = tmp;
2488 }
2489 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
2490 exec->inputStack[exec->inputStackNr].data = data;
2491 exec->inputStackNr++;
2492 exec->inputStack[exec->inputStackNr].value = NULL;
2493 exec->inputStack[exec->inputStackNr].data = NULL;
2494}
2495
2496
2497/**
Daniel Veillard23e73572002-09-19 19:56:43 +00002498 * xmlRegCompactPushString:
2499 * @exec: a regexp execution context
2500 * @comp: the precompiled exec with a compact table
2501 * @value: a string token input
2502 * @data: data associated to the token to reuse in callbacks
2503 *
2504 * Push one input token in the execution context
2505 *
2506 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2507 * a negative value in case of error.
2508 */
2509static int
2510xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
2511 xmlRegexpPtr comp,
2512 const xmlChar *value,
2513 void *data) {
2514 int state = exec->index;
2515 int i, target;
2516
2517 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
2518 return(-1);
2519
2520 if (value == NULL) {
2521 /*
2522 * are we at a final state ?
2523 */
2524 if (comp->compact[state * (comp->nbstrings + 1)] ==
2525 XML_REGEXP_FINAL_STATE)
2526 return(1);
2527 return(0);
2528 }
2529
2530#ifdef DEBUG_PUSH
2531 printf("value pushed: %s\n", value);
2532#endif
2533
2534 /*
2535 * Examine all outside transition from current state
2536 */
2537 for (i = 0;i < comp->nbstrings;i++) {
2538 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
2539 if ((target > 0) && (target <= comp->nbstates)) {
2540 target--; /* to avoid 0 */
2541 if (xmlStrEqual(comp->stringMap[i], value)) {
2542 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00002543 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
2544 exec->callback(exec->data, value,
2545 comp->transdata[state * comp->nbstrings + i], data);
2546 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002547#ifdef DEBUG_PUSH
2548 printf("entering state %d\n", target);
2549#endif
2550 if (comp->compact[target * (comp->nbstrings + 1)] ==
2551 XML_REGEXP_FINAL_STATE)
2552 return(1);
2553 return(0);
2554 }
2555 }
2556 }
2557 /*
2558 * Failed to find an exit transition out from current state for the
2559 * current token
2560 */
2561#ifdef DEBUG_PUSH
2562 printf("failed to find a transition for %s on state %d\n", value, state);
2563#endif
2564 exec->status = -1;
2565 return(-1);
2566}
2567
2568/**
Daniel Veillard4255d502002-04-16 15:50:10 +00002569 * xmlRegExecPushString:
Daniel Veillardea7751d2002-12-20 00:16:24 +00002570 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00002571 * @value: a string token input
2572 * @data: data associated to the token to reuse in callbacks
2573 *
2574 * Push one input token in the execution context
2575 *
2576 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2577 * a negative value in case of error.
2578 */
2579int
2580xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2581 void *data) {
2582 xmlRegTransPtr trans;
2583 xmlRegAtomPtr atom;
2584 int ret;
2585 int final = 0;
2586
2587 if (exec == NULL)
2588 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00002589 if (exec->comp == NULL)
2590 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00002591 if (exec->status != 0)
2592 return(exec->status);
2593
Daniel Veillard23e73572002-09-19 19:56:43 +00002594 if (exec->comp->compact != NULL)
2595 return(xmlRegCompactPushString(exec, exec->comp, value, data));
2596
Daniel Veillard4255d502002-04-16 15:50:10 +00002597 if (value == NULL) {
2598 if (exec->state->type == XML_REGEXP_FINAL_STATE)
2599 return(1);
2600 final = 1;
2601 }
2602
2603#ifdef DEBUG_PUSH
2604 printf("value pushed: %s\n", value);
2605#endif
2606 /*
2607 * If we have an active rollback stack push the new value there
2608 * and get back to where we were left
2609 */
2610 if ((value != NULL) && (exec->inputStackNr > 0)) {
2611 xmlFARegExecSaveInputString(exec, value, data);
2612 value = exec->inputStack[exec->index].value;
2613 data = exec->inputStack[exec->index].data;
2614#ifdef DEBUG_PUSH
2615 printf("value loaded: %s\n", value);
2616#endif
2617 }
2618
2619 while ((exec->status == 0) &&
2620 ((value != NULL) ||
2621 ((final == 1) &&
2622 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
2623
2624 /*
2625 * End of input on non-terminal state, rollback, however we may
2626 * still have epsilon like transition for counted transitions
2627 * on counters, in that case don't break too early.
2628 */
Daniel Veillardb509f152002-04-17 16:28:10 +00002629 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00002630 goto rollback;
2631
2632 exec->transcount = 0;
2633 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2634 trans = &exec->state->trans[exec->transno];
2635 if (trans->to < 0)
2636 continue;
2637 atom = trans->atom;
2638 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00002639 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
2640 int i;
2641 int count;
2642 xmlRegTransPtr t;
2643 xmlRegCounterPtr counter;
2644
2645 ret = 0;
2646
2647#ifdef DEBUG_PUSH
2648 printf("testing all lax %d\n", trans->count);
2649#endif
2650 /*
2651 * Check all counted transitions from the current state
2652 */
2653 if ((value == NULL) && (final)) {
2654 ret = 1;
2655 } else if (value != NULL) {
2656 for (i = 0;i < exec->state->nbTrans;i++) {
2657 t = &exec->state->trans[i];
2658 if ((t->counter < 0) || (t == trans))
2659 continue;
2660 counter = &exec->comp->counters[t->counter];
2661 count = exec->counts[t->counter];
2662 if ((count < counter->max) &&
2663 (t->atom != NULL) &&
2664 (xmlStrEqual(value, t->atom->valuep))) {
2665 ret = 0;
2666 break;
2667 }
2668 if ((count >= counter->min) &&
2669 (count < counter->max) &&
2670 (xmlStrEqual(value, t->atom->valuep))) {
2671 ret = 1;
2672 break;
2673 }
2674 }
2675 }
2676 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00002677 int i;
2678 int count;
2679 xmlRegTransPtr t;
2680 xmlRegCounterPtr counter;
2681
2682 ret = 1;
2683
2684#ifdef DEBUG_PUSH
2685 printf("testing all %d\n", trans->count);
2686#endif
2687 /*
2688 * Check all counted transitions from the current state
2689 */
2690 for (i = 0;i < exec->state->nbTrans;i++) {
2691 t = &exec->state->trans[i];
2692 if ((t->counter < 0) || (t == trans))
2693 continue;
2694 counter = &exec->comp->counters[t->counter];
2695 count = exec->counts[t->counter];
2696 if ((count < counter->min) || (count > counter->max)) {
2697 ret = 0;
2698 break;
2699 }
2700 }
2701 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002702 int count;
2703 xmlRegCounterPtr counter;
2704
2705 /*
2706 * A counted transition.
2707 */
2708
2709 count = exec->counts[trans->count];
2710 counter = &exec->comp->counters[trans->count];
2711#ifdef DEBUG_PUSH
2712 printf("testing count %d: val %d, min %d, max %d\n",
2713 trans->count, count, counter->min, counter->max);
2714#endif
2715 ret = ((count >= counter->min) && (count <= counter->max));
2716 } else if (atom == NULL) {
2717 fprintf(stderr, "epsilon transition left at runtime\n");
2718 exec->status = -2;
2719 break;
2720 } else if (value != NULL) {
2721 ret = xmlStrEqual(value, atom->valuep);
Daniel Veillard441bc322002-04-20 17:38:48 +00002722 if ((ret == 1) && (trans->counter >= 0)) {
2723 xmlRegCounterPtr counter;
2724 int count;
2725
2726 count = exec->counts[trans->counter];
2727 counter = &exec->comp->counters[trans->counter];
2728 if (count >= counter->max)
2729 ret = 0;
2730 }
2731
Daniel Veillard4255d502002-04-16 15:50:10 +00002732 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2733 xmlRegStatePtr to = exec->comp->states[trans->to];
2734
2735 /*
2736 * this is a multiple input sequence
2737 */
2738 if (exec->state->nbTrans > exec->transno + 1) {
2739 if (exec->inputStackNr <= 0) {
2740 xmlFARegExecSaveInputString(exec, value, data);
2741 }
2742 xmlFARegExecSave(exec);
2743 }
2744 exec->transcount = 1;
2745 do {
2746 /*
2747 * Try to progress as much as possible on the input
2748 */
2749 if (exec->transcount == atom->max) {
2750 break;
2751 }
2752 exec->index++;
2753 value = exec->inputStack[exec->index].value;
2754 data = exec->inputStack[exec->index].data;
2755#ifdef DEBUG_PUSH
2756 printf("value loaded: %s\n", value);
2757#endif
2758
2759 /*
2760 * End of input: stop here
2761 */
2762 if (value == NULL) {
2763 exec->index --;
2764 break;
2765 }
2766 if (exec->transcount >= atom->min) {
2767 int transno = exec->transno;
2768 xmlRegStatePtr state = exec->state;
2769
2770 /*
2771 * The transition is acceptable save it
2772 */
2773 exec->transno = -1; /* trick */
2774 exec->state = to;
2775 if (exec->inputStackNr <= 0) {
2776 xmlFARegExecSaveInputString(exec, value, data);
2777 }
2778 xmlFARegExecSave(exec);
2779 exec->transno = transno;
2780 exec->state = state;
2781 }
2782 ret = xmlStrEqual(value, atom->valuep);
2783 exec->transcount++;
2784 } while (ret == 1);
2785 if (exec->transcount < atom->min)
2786 ret = 0;
2787
2788 /*
2789 * If the last check failed but one transition was found
2790 * possible, rollback
2791 */
2792 if (ret < 0)
2793 ret = 0;
2794 if (ret == 0) {
2795 goto rollback;
2796 }
2797 }
2798 }
2799 if (ret == 1) {
2800 if ((exec->callback != NULL) && (atom != NULL)) {
2801 exec->callback(exec->data, atom->valuep,
2802 atom->data, data);
2803 }
2804 if (exec->state->nbTrans > exec->transno + 1) {
2805 if (exec->inputStackNr <= 0) {
2806 xmlFARegExecSaveInputString(exec, value, data);
2807 }
2808 xmlFARegExecSave(exec);
2809 }
2810 if (trans->counter >= 0) {
2811#ifdef DEBUG_PUSH
2812 printf("Increasing count %d\n", trans->counter);
2813#endif
2814 exec->counts[trans->counter]++;
2815 }
2816#ifdef DEBUG_PUSH
2817 printf("entering state %d\n", trans->to);
2818#endif
2819 exec->state = exec->comp->states[trans->to];
2820 exec->transno = 0;
2821 if (trans->atom != NULL) {
2822 if (exec->inputStack != NULL) {
2823 exec->index++;
2824 if (exec->index < exec->inputStackNr) {
2825 value = exec->inputStack[exec->index].value;
2826 data = exec->inputStack[exec->index].data;
2827#ifdef DEBUG_PUSH
2828 printf("value loaded: %s\n", value);
2829#endif
2830 } else {
2831 value = NULL;
2832 data = NULL;
2833#ifdef DEBUG_PUSH
2834 printf("end of input\n");
2835#endif
2836 }
2837 } else {
2838 value = NULL;
2839 data = NULL;
2840#ifdef DEBUG_PUSH
2841 printf("end of input\n");
2842#endif
2843 }
2844 }
2845 goto progress;
2846 } else if (ret < 0) {
2847 exec->status = -4;
2848 break;
2849 }
2850 }
2851 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2852rollback:
2853 /*
2854 * Failed to find a way out
2855 */
2856 exec->determinist = 0;
2857 xmlFARegExecRollBack(exec);
2858 if (exec->status == 0) {
2859 value = exec->inputStack[exec->index].value;
2860 data = exec->inputStack[exec->index].data;
2861#ifdef DEBUG_PUSH
2862 printf("value loaded: %s\n", value);
2863#endif
2864 }
2865 }
2866progress:
2867 continue;
2868 }
2869 if (exec->status == 0) {
2870 return(exec->state->type == XML_REGEXP_FINAL_STATE);
2871 }
2872 return(exec->status);
2873}
2874
Daniel Veillard52b48c72003-04-13 19:53:42 +00002875/**
2876 * xmlRegExecPushString2:
2877 * @exec: a regexp execution context or NULL to indicate the end
2878 * @value: the first string token input
2879 * @value2: the second string token input
2880 * @data: data associated to the token to reuse in callbacks
2881 *
2882 * Push one input token in the execution context
2883 *
2884 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2885 * a negative value in case of error.
2886 */
2887int
2888xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
2889 const xmlChar *value2, void *data) {
2890 xmlChar buf[150];
2891 int lenn, lenp, ret;
2892 xmlChar *str;
2893
2894 if (exec == NULL)
2895 return(-1);
2896 if (exec->comp == NULL)
2897 return(-1);
2898 if (exec->status != 0)
2899 return(exec->status);
2900
2901 if (value2 == NULL)
2902 return(xmlRegExecPushString(exec, value, data));
2903
2904 lenn = strlen((char *) value2);
2905 lenp = strlen((char *) value);
2906
2907 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002908 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00002909 if (str == NULL) {
2910 exec->status = -1;
2911 return(-1);
2912 }
2913 } else {
2914 str = buf;
2915 }
2916 memcpy(&str[0], value, lenp);
2917 str[lenp] = '|';
2918 memcpy(&str[lenp + 1], value2, lenn);
2919 str[lenn + lenp + 1] = 0;
2920
2921 if (exec->comp->compact != NULL)
2922 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
2923 else
2924 ret = xmlRegExecPushString(exec, str, data);
2925
2926 if (str != buf)
2927 xmlFree(buf);
2928 return(ret);
2929}
2930
Daniel Veillard4255d502002-04-16 15:50:10 +00002931#if 0
2932static int
2933xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
2934 xmlRegTransPtr trans;
2935 xmlRegAtomPtr atom;
2936 int ret;
2937 int codepoint, len;
2938
2939 if (exec == NULL)
2940 return(-1);
2941 if (exec->status != 0)
2942 return(exec->status);
2943
2944 while ((exec->status == 0) &&
2945 ((exec->inputString[exec->index] != 0) ||
2946 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2947
2948 /*
2949 * End of input on non-terminal state, rollback, however we may
2950 * still have epsilon like transition for counted transitions
2951 * on counters, in that case don't break too early.
2952 */
2953 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2954 goto rollback;
2955
2956 exec->transcount = 0;
2957 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2958 trans = &exec->state->trans[exec->transno];
2959 if (trans->to < 0)
2960 continue;
2961 atom = trans->atom;
2962 ret = 0;
2963 if (trans->count >= 0) {
2964 int count;
2965 xmlRegCounterPtr counter;
2966
2967 /*
2968 * A counted transition.
2969 */
2970
2971 count = exec->counts[trans->count];
2972 counter = &exec->comp->counters[trans->count];
2973#ifdef DEBUG_REGEXP_EXEC
2974 printf("testing count %d: val %d, min %d, max %d\n",
2975 trans->count, count, counter->min, counter->max);
2976#endif
2977 ret = ((count >= counter->min) && (count <= counter->max));
2978 } else if (atom == NULL) {
2979 fprintf(stderr, "epsilon transition left at runtime\n");
2980 exec->status = -2;
2981 break;
2982 } else if (exec->inputString[exec->index] != 0) {
2983 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2984 ret = xmlRegCheckCharacter(atom, codepoint);
2985 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2986 xmlRegStatePtr to = exec->comp->states[trans->to];
2987
2988 /*
2989 * this is a multiple input sequence
2990 */
2991 if (exec->state->nbTrans > exec->transno + 1) {
2992 xmlFARegExecSave(exec);
2993 }
2994 exec->transcount = 1;
2995 do {
2996 /*
2997 * Try to progress as much as possible on the input
2998 */
2999 if (exec->transcount == atom->max) {
3000 break;
3001 }
3002 exec->index += len;
3003 /*
3004 * End of input: stop here
3005 */
3006 if (exec->inputString[exec->index] == 0) {
3007 exec->index -= len;
3008 break;
3009 }
3010 if (exec->transcount >= atom->min) {
3011 int transno = exec->transno;
3012 xmlRegStatePtr state = exec->state;
3013
3014 /*
3015 * The transition is acceptable save it
3016 */
3017 exec->transno = -1; /* trick */
3018 exec->state = to;
3019 xmlFARegExecSave(exec);
3020 exec->transno = transno;
3021 exec->state = state;
3022 }
3023 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3024 len);
3025 ret = xmlRegCheckCharacter(atom, codepoint);
3026 exec->transcount++;
3027 } while (ret == 1);
3028 if (exec->transcount < atom->min)
3029 ret = 0;
3030
3031 /*
3032 * If the last check failed but one transition was found
3033 * possible, rollback
3034 */
3035 if (ret < 0)
3036 ret = 0;
3037 if (ret == 0) {
3038 goto rollback;
3039 }
3040 }
3041 }
3042 if (ret == 1) {
3043 if (exec->state->nbTrans > exec->transno + 1) {
3044 xmlFARegExecSave(exec);
3045 }
3046 if (trans->counter >= 0) {
3047#ifdef DEBUG_REGEXP_EXEC
3048 printf("Increasing count %d\n", trans->counter);
3049#endif
3050 exec->counts[trans->counter]++;
3051 }
3052#ifdef DEBUG_REGEXP_EXEC
3053 printf("entering state %d\n", trans->to);
3054#endif
3055 exec->state = exec->comp->states[trans->to];
3056 exec->transno = 0;
3057 if (trans->atom != NULL) {
3058 exec->index += len;
3059 }
3060 goto progress;
3061 } else if (ret < 0) {
3062 exec->status = -4;
3063 break;
3064 }
3065 }
3066 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3067rollback:
3068 /*
3069 * Failed to find a way out
3070 */
3071 exec->determinist = 0;
3072 xmlFARegExecRollBack(exec);
3073 }
3074progress:
3075 continue;
3076 }
3077}
3078#endif
3079/************************************************************************
3080 * *
3081 * Parser for the Shemas Datatype Regular Expressions *
3082 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
3083 * *
3084 ************************************************************************/
3085
3086/**
3087 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00003088 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003089 *
3090 * [10] Char ::= [^.\?*+()|#x5B#x5D]
3091 */
3092static int
3093xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
3094 int cur;
3095 int len;
3096
3097 cur = CUR_SCHAR(ctxt->cur, len);
3098 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
3099 (cur == '*') || (cur == '+') || (cur == '(') ||
3100 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
3101 (cur == 0x5D) || (cur == 0))
3102 return(-1);
3103 return(cur);
3104}
3105
3106/**
3107 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00003108 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003109 *
3110 * [27] charProp ::= IsCategory | IsBlock
3111 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
3112 * Separators | Symbols | Others
3113 * [29] Letters ::= 'L' [ultmo]?
3114 * [30] Marks ::= 'M' [nce]?
3115 * [31] Numbers ::= 'N' [dlo]?
3116 * [32] Punctuation ::= 'P' [cdseifo]?
3117 * [33] Separators ::= 'Z' [slp]?
3118 * [34] Symbols ::= 'S' [mcko]?
3119 * [35] Others ::= 'C' [cfon]?
3120 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
3121 */
3122static void
3123xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
3124 int cur;
William M. Brack779af002003-08-01 15:55:39 +00003125 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003126 xmlChar *blockName = NULL;
3127
3128 cur = CUR;
3129 if (cur == 'L') {
3130 NEXT;
3131 cur = CUR;
3132 if (cur == 'u') {
3133 NEXT;
3134 type = XML_REGEXP_LETTER_UPPERCASE;
3135 } else if (cur == 'l') {
3136 NEXT;
3137 type = XML_REGEXP_LETTER_LOWERCASE;
3138 } else if (cur == 't') {
3139 NEXT;
3140 type = XML_REGEXP_LETTER_TITLECASE;
3141 } else if (cur == 'm') {
3142 NEXT;
3143 type = XML_REGEXP_LETTER_MODIFIER;
3144 } else if (cur == 'o') {
3145 NEXT;
3146 type = XML_REGEXP_LETTER_OTHERS;
3147 } else {
3148 type = XML_REGEXP_LETTER;
3149 }
3150 } else if (cur == 'M') {
3151 NEXT;
3152 cur = CUR;
3153 if (cur == 'n') {
3154 NEXT;
3155 /* nonspacing */
3156 type = XML_REGEXP_MARK_NONSPACING;
3157 } else if (cur == 'c') {
3158 NEXT;
3159 /* spacing combining */
3160 type = XML_REGEXP_MARK_SPACECOMBINING;
3161 } else if (cur == 'e') {
3162 NEXT;
3163 /* enclosing */
3164 type = XML_REGEXP_MARK_ENCLOSING;
3165 } else {
3166 /* all marks */
3167 type = XML_REGEXP_MARK;
3168 }
3169 } else if (cur == 'N') {
3170 NEXT;
3171 cur = CUR;
3172 if (cur == 'd') {
3173 NEXT;
3174 /* digital */
3175 type = XML_REGEXP_NUMBER_DECIMAL;
3176 } else if (cur == 'l') {
3177 NEXT;
3178 /* letter */
3179 type = XML_REGEXP_NUMBER_LETTER;
3180 } else if (cur == 'o') {
3181 NEXT;
3182 /* other */
3183 type = XML_REGEXP_NUMBER_OTHERS;
3184 } else {
3185 /* all numbers */
3186 type = XML_REGEXP_NUMBER;
3187 }
3188 } else if (cur == 'P') {
3189 NEXT;
3190 cur = CUR;
3191 if (cur == 'c') {
3192 NEXT;
3193 /* connector */
3194 type = XML_REGEXP_PUNCT_CONNECTOR;
3195 } else if (cur == 'd') {
3196 NEXT;
3197 /* dash */
3198 type = XML_REGEXP_PUNCT_DASH;
3199 } else if (cur == 's') {
3200 NEXT;
3201 /* open */
3202 type = XML_REGEXP_PUNCT_OPEN;
3203 } else if (cur == 'e') {
3204 NEXT;
3205 /* close */
3206 type = XML_REGEXP_PUNCT_CLOSE;
3207 } else if (cur == 'i') {
3208 NEXT;
3209 /* initial quote */
3210 type = XML_REGEXP_PUNCT_INITQUOTE;
3211 } else if (cur == 'f') {
3212 NEXT;
3213 /* final quote */
3214 type = XML_REGEXP_PUNCT_FINQUOTE;
3215 } else if (cur == 'o') {
3216 NEXT;
3217 /* other */
3218 type = XML_REGEXP_PUNCT_OTHERS;
3219 } else {
3220 /* all punctuation */
3221 type = XML_REGEXP_PUNCT;
3222 }
3223 } else if (cur == 'Z') {
3224 NEXT;
3225 cur = CUR;
3226 if (cur == 's') {
3227 NEXT;
3228 /* space */
3229 type = XML_REGEXP_SEPAR_SPACE;
3230 } else if (cur == 'l') {
3231 NEXT;
3232 /* line */
3233 type = XML_REGEXP_SEPAR_LINE;
3234 } else if (cur == 'p') {
3235 NEXT;
3236 /* paragraph */
3237 type = XML_REGEXP_SEPAR_PARA;
3238 } else {
3239 /* all separators */
3240 type = XML_REGEXP_SEPAR;
3241 }
3242 } else if (cur == 'S') {
3243 NEXT;
3244 cur = CUR;
3245 if (cur == 'm') {
3246 NEXT;
3247 type = XML_REGEXP_SYMBOL_MATH;
3248 /* math */
3249 } else if (cur == 'c') {
3250 NEXT;
3251 type = XML_REGEXP_SYMBOL_CURRENCY;
3252 /* currency */
3253 } else if (cur == 'k') {
3254 NEXT;
3255 type = XML_REGEXP_SYMBOL_MODIFIER;
3256 /* modifiers */
3257 } else if (cur == 'o') {
3258 NEXT;
3259 type = XML_REGEXP_SYMBOL_OTHERS;
3260 /* other */
3261 } else {
3262 /* all symbols */
3263 type = XML_REGEXP_SYMBOL;
3264 }
3265 } else if (cur == 'C') {
3266 NEXT;
3267 cur = CUR;
3268 if (cur == 'c') {
3269 NEXT;
3270 /* control */
3271 type = XML_REGEXP_OTHER_CONTROL;
3272 } else if (cur == 'f') {
3273 NEXT;
3274 /* format */
3275 type = XML_REGEXP_OTHER_FORMAT;
3276 } else if (cur == 'o') {
3277 NEXT;
3278 /* private use */
3279 type = XML_REGEXP_OTHER_PRIVATE;
3280 } else if (cur == 'n') {
3281 NEXT;
3282 /* not assigned */
3283 type = XML_REGEXP_OTHER_NA;
3284 } else {
3285 /* all others */
3286 type = XML_REGEXP_OTHER;
3287 }
3288 } else if (cur == 'I') {
3289 const xmlChar *start;
3290 NEXT;
3291 cur = CUR;
3292 if (cur != 's') {
3293 ERROR("IsXXXX expected");
3294 return;
3295 }
3296 NEXT;
3297 start = ctxt->cur;
3298 cur = CUR;
3299 if (((cur >= 'a') && (cur <= 'z')) ||
3300 ((cur >= 'A') && (cur <= 'Z')) ||
3301 ((cur >= '0') && (cur <= '9')) ||
3302 (cur == 0x2D)) {
3303 NEXT;
3304 cur = CUR;
3305 while (((cur >= 'a') && (cur <= 'z')) ||
3306 ((cur >= 'A') && (cur <= 'Z')) ||
3307 ((cur >= '0') && (cur <= '9')) ||
3308 (cur == 0x2D)) {
3309 NEXT;
3310 cur = CUR;
3311 }
3312 }
3313 type = XML_REGEXP_BLOCK_NAME;
3314 blockName = xmlStrndup(start, ctxt->cur - start);
3315 } else {
3316 ERROR("Unknown char property");
3317 return;
3318 }
3319 if (ctxt->atom == NULL) {
3320 ctxt->atom = xmlRegNewAtom(ctxt, type);
3321 if (ctxt->atom != NULL)
3322 ctxt->atom->valuep = blockName;
3323 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3324 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3325 type, 0, 0, blockName);
3326 }
3327}
3328
3329/**
3330 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00003331 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003332 *
3333 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3334 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3335 * [25] catEsc ::= '\p{' charProp '}'
3336 * [26] complEsc ::= '\P{' charProp '}'
3337 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3338 */
3339static void
3340xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
3341 int cur;
3342
3343 if (CUR == '.') {
3344 if (ctxt->atom == NULL) {
3345 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
3346 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3347 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3348 XML_REGEXP_ANYCHAR, 0, 0, NULL);
3349 }
3350 NEXT;
3351 return;
3352 }
3353 if (CUR != '\\') {
3354 ERROR("Escaped sequence: expecting \\");
3355 return;
3356 }
3357 NEXT;
3358 cur = CUR;
3359 if (cur == 'p') {
3360 NEXT;
3361 if (CUR != '{') {
3362 ERROR("Expecting '{'");
3363 return;
3364 }
3365 NEXT;
3366 xmlFAParseCharProp(ctxt);
3367 if (CUR != '}') {
3368 ERROR("Expecting '}'");
3369 return;
3370 }
3371 NEXT;
3372 } else if (cur == 'P') {
3373 NEXT;
3374 if (CUR != '{') {
3375 ERROR("Expecting '{'");
3376 return;
3377 }
3378 NEXT;
3379 xmlFAParseCharProp(ctxt);
3380 ctxt->atom->neg = 1;
3381 if (CUR != '}') {
3382 ERROR("Expecting '}'");
3383 return;
3384 }
3385 NEXT;
3386 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
3387 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
3388 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
3389 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
3390 (cur == 0x5E)) {
3391 if (ctxt->atom == NULL) {
3392 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3393 if (ctxt->atom != NULL)
3394 ctxt->atom->codepoint = cur;
3395 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3396 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3397 XML_REGEXP_CHARVAL, cur, cur, NULL);
3398 }
3399 NEXT;
3400 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
3401 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
3402 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00003403 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00003404
3405 switch (cur) {
3406 case 's':
3407 type = XML_REGEXP_ANYSPACE;
3408 break;
3409 case 'S':
3410 type = XML_REGEXP_NOTSPACE;
3411 break;
3412 case 'i':
3413 type = XML_REGEXP_INITNAME;
3414 break;
3415 case 'I':
3416 type = XML_REGEXP_NOTINITNAME;
3417 break;
3418 case 'c':
3419 type = XML_REGEXP_NAMECHAR;
3420 break;
3421 case 'C':
3422 type = XML_REGEXP_NOTNAMECHAR;
3423 break;
3424 case 'd':
3425 type = XML_REGEXP_DECIMAL;
3426 break;
3427 case 'D':
3428 type = XML_REGEXP_NOTDECIMAL;
3429 break;
3430 case 'w':
3431 type = XML_REGEXP_REALCHAR;
3432 break;
3433 case 'W':
3434 type = XML_REGEXP_NOTREALCHAR;
3435 break;
3436 }
3437 NEXT;
3438 if (ctxt->atom == NULL) {
3439 ctxt->atom = xmlRegNewAtom(ctxt, type);
3440 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3441 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3442 type, 0, 0, NULL);
3443 }
3444 }
3445}
3446
3447/**
3448 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00003449 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003450 *
3451 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3452 */
3453static int
3454xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
3455 int ret = 0, cur;
3456
3457 if ((CUR != '&') || (NXT(1) != '#'))
3458 return(-1);
3459 NEXT;
3460 NEXT;
3461 cur = CUR;
3462 if (cur == 'x') {
3463 NEXT;
3464 cur = CUR;
3465 if (((cur >= '0') && (cur <= '9')) ||
3466 ((cur >= 'a') && (cur <= 'f')) ||
3467 ((cur >= 'A') && (cur <= 'F'))) {
3468 while (((cur >= '0') && (cur <= '9')) ||
3469 ((cur >= 'A') && (cur <= 'F'))) {
3470 if ((cur >= '0') && (cur <= '9'))
3471 ret = ret * 16 + cur - '0';
3472 else if ((cur >= 'a') && (cur <= 'f'))
3473 ret = ret * 16 + 10 + (cur - 'a');
3474 else
3475 ret = ret * 16 + 10 + (cur - 'A');
3476 NEXT;
3477 cur = CUR;
3478 }
3479 } else {
3480 ERROR("Char ref: expecting [0-9A-F]");
3481 return(-1);
3482 }
3483 } else {
3484 if ((cur >= '0') && (cur <= '9')) {
3485 while ((cur >= '0') && (cur <= '9')) {
3486 ret = ret * 10 + cur - '0';
3487 NEXT;
3488 cur = CUR;
3489 }
3490 } else {
3491 ERROR("Char ref: expecting [0-9]");
3492 return(-1);
3493 }
3494 }
3495 if (cur != ';') {
3496 ERROR("Char ref: expecting ';'");
3497 return(-1);
3498 } else {
3499 NEXT;
3500 }
3501 return(ret);
3502}
3503
3504/**
3505 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00003506 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003507 *
3508 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
3509 * [18] seRange ::= charOrEsc '-' charOrEsc
3510 * [20] charOrEsc ::= XmlChar | SingleCharEsc
3511 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
3512 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
3513 */
3514static void
3515xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
3516 int cur;
3517 int start = -1;
3518 int end = -1;
3519
3520 if ((CUR == '&') && (NXT(1) == '#')) {
3521 end = start = xmlFAParseCharRef(ctxt);
3522 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3523 XML_REGEXP_CHARVAL, start, end, NULL);
3524 return;
3525 }
3526 cur = CUR;
3527 if (cur == '\\') {
3528 NEXT;
3529 cur = CUR;
3530 switch (cur) {
3531 case 'n': start = 0xA; break;
3532 case 'r': start = 0xD; break;
3533 case 't': start = 0x9; break;
3534 case '\\': case '|': case '.': case '-': case '^': case '?':
3535 case '*': case '+': case '{': case '}': case '(': case ')':
3536 case '[': case ']':
3537 start = cur; break;
3538 default:
3539 ERROR("Invalid escape value");
3540 return;
3541 }
3542 end = start;
3543 } else if ((cur != 0x5B) && (cur != 0x5D)) {
3544 end = start = cur;
3545 } else {
3546 ERROR("Expecting a char range");
3547 return;
3548 }
3549 NEXT;
3550 if (start == '-') {
3551 return;
3552 }
3553 cur = CUR;
3554 if (cur != '-') {
3555 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3556 XML_REGEXP_CHARVAL, start, end, NULL);
3557 return;
3558 }
3559 NEXT;
3560 cur = CUR;
3561 if (cur == '\\') {
3562 NEXT;
3563 cur = CUR;
3564 switch (cur) {
3565 case 'n': end = 0xA; break;
3566 case 'r': end = 0xD; break;
3567 case 't': end = 0x9; break;
3568 case '\\': case '|': case '.': case '-': case '^': case '?':
3569 case '*': case '+': case '{': case '}': case '(': case ')':
3570 case '[': case ']':
3571 end = cur; break;
3572 default:
3573 ERROR("Invalid escape value");
3574 return;
3575 }
3576 } else if ((cur != 0x5B) && (cur != 0x5D)) {
3577 end = cur;
3578 } else {
3579 ERROR("Expecting the end of a char range");
3580 return;
3581 }
3582 NEXT;
3583 /* TODO check that the values are acceptable character ranges for XML */
3584 if (end < start) {
3585 ERROR("End of range is before start of range");
3586 } else {
3587 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3588 XML_REGEXP_CHARVAL, start, end, NULL);
3589 }
3590 return;
3591}
3592
3593/**
3594 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003595 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003596 *
3597 * [14] posCharGroup ::= ( charRange | charClassEsc )+
3598 */
3599static void
3600xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
3601 do {
3602 if ((CUR == '\\') || (CUR == '.')) {
3603 xmlFAParseCharClassEsc(ctxt);
3604 } else {
3605 xmlFAParseCharRange(ctxt);
3606 }
3607 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
3608 (ctxt->error == 0));
3609}
3610
3611/**
3612 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003613 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003614 *
3615 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
3616 * [15] negCharGroup ::= '^' posCharGroup
3617 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
3618 * [12] charClassExpr ::= '[' charGroup ']'
3619 */
3620static void
3621xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
3622 int n = ctxt->neg;
3623 while ((CUR != ']') && (ctxt->error == 0)) {
3624 if (CUR == '^') {
3625 int neg = ctxt->neg;
3626
3627 NEXT;
3628 ctxt->neg = !ctxt->neg;
3629 xmlFAParsePosCharGroup(ctxt);
3630 ctxt->neg = neg;
3631 } else if (CUR == '-') {
3632 NEXT;
3633 ctxt->neg = !ctxt->neg;
3634 if (CUR != '[') {
3635 ERROR("charClassExpr: '[' expected");
3636 break;
3637 }
3638 NEXT;
3639 xmlFAParseCharGroup(ctxt);
3640 if (CUR == ']') {
3641 NEXT;
3642 } else {
3643 ERROR("charClassExpr: ']' expected");
3644 break;
3645 }
3646 break;
3647 } else if (CUR != ']') {
3648 xmlFAParsePosCharGroup(ctxt);
3649 }
3650 }
3651 ctxt->neg = n;
3652}
3653
3654/**
3655 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00003656 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003657 *
3658 * [11] charClass ::= charClassEsc | charClassExpr
3659 * [12] charClassExpr ::= '[' charGroup ']'
3660 */
3661static void
3662xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
3663 if (CUR == '[') {
3664 NEXT;
3665 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
3666 if (ctxt->atom == NULL)
3667 return;
3668 xmlFAParseCharGroup(ctxt);
3669 if (CUR == ']') {
3670 NEXT;
3671 } else {
3672 ERROR("xmlFAParseCharClass: ']' expected");
3673 }
3674 } else {
3675 xmlFAParseCharClassEsc(ctxt);
3676 }
3677}
3678
3679/**
3680 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00003681 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003682 *
3683 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003684 *
3685 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00003686 */
3687static int
3688xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
3689 int ret = 0;
3690 int ok = 0;
3691
3692 while ((CUR >= '0') && (CUR <= '9')) {
3693 ret = ret * 10 + (CUR - '0');
3694 ok = 1;
3695 NEXT;
3696 }
3697 if (ok != 1) {
3698 return(-1);
3699 }
3700 return(ret);
3701}
3702
3703/**
3704 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00003705 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003706 *
3707 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
3708 * [5] quantity ::= quantRange | quantMin | QuantExact
3709 * [6] quantRange ::= QuantExact ',' QuantExact
3710 * [7] quantMin ::= QuantExact ','
3711 * [8] QuantExact ::= [0-9]+
3712 */
3713static int
3714xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
3715 int cur;
3716
3717 cur = CUR;
3718 if ((cur == '?') || (cur == '*') || (cur == '+')) {
3719 if (ctxt->atom != NULL) {
3720 if (cur == '?')
3721 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
3722 else if (cur == '*')
3723 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
3724 else if (cur == '+')
3725 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
3726 }
3727 NEXT;
3728 return(1);
3729 }
3730 if (cur == '{') {
3731 int min = 0, max = 0;
3732
3733 NEXT;
3734 cur = xmlFAParseQuantExact(ctxt);
3735 if (cur >= 0)
3736 min = cur;
3737 if (CUR == ',') {
3738 NEXT;
3739 cur = xmlFAParseQuantExact(ctxt);
3740 if (cur >= 0)
3741 max = cur;
3742 }
3743 if (CUR == '}') {
3744 NEXT;
3745 } else {
3746 ERROR("Unterminated quantifier");
3747 }
3748 if (max == 0)
3749 max = min;
3750 if (ctxt->atom != NULL) {
3751 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
3752 ctxt->atom->min = min;
3753 ctxt->atom->max = max;
3754 }
3755 return(1);
3756 }
3757 return(0);
3758}
3759
3760/**
3761 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00003762 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003763 *
3764 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
3765 */
3766static int
3767xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
3768 int codepoint, len;
3769
3770 codepoint = xmlFAIsChar(ctxt);
3771 if (codepoint > 0) {
3772 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3773 if (ctxt->atom == NULL)
3774 return(-1);
3775 codepoint = CUR_SCHAR(ctxt->cur, len);
3776 ctxt->atom->codepoint = codepoint;
3777 NEXTL(len);
3778 return(1);
3779 } else if (CUR == '|') {
3780 return(0);
3781 } else if (CUR == 0) {
3782 return(0);
3783 } else if (CUR == ')') {
3784 return(0);
3785 } else if (CUR == '(') {
3786 xmlRegStatePtr start, oldend;
3787
3788 NEXT;
3789 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
3790 start = ctxt->state;
3791 oldend = ctxt->end;
3792 ctxt->end = NULL;
3793 ctxt->atom = NULL;
3794 xmlFAParseRegExp(ctxt, 0);
3795 if (CUR == ')') {
3796 NEXT;
3797 } else {
3798 ERROR("xmlFAParseAtom: expecting ')'");
3799 }
3800 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
3801 if (ctxt->atom == NULL)
3802 return(-1);
3803 ctxt->atom->start = start;
3804 ctxt->atom->stop = ctxt->state;
3805 ctxt->end = oldend;
3806 return(1);
3807 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
3808 xmlFAParseCharClass(ctxt);
3809 return(1);
3810 }
3811 return(0);
3812}
3813
3814/**
3815 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00003816 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003817 *
3818 * [3] piece ::= atom quantifier?
3819 */
3820static int
3821xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
3822 int ret;
3823
3824 ctxt->atom = NULL;
3825 ret = xmlFAParseAtom(ctxt);
3826 if (ret == 0)
3827 return(0);
3828 if (ctxt->atom == NULL) {
3829 ERROR("internal: no atom generated");
3830 }
3831 xmlFAParseQuantifier(ctxt);
3832 return(1);
3833}
3834
3835/**
3836 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00003837 * @ctxt: a regexp parser context
3838 * @first: is taht the first
Daniel Veillard4255d502002-04-16 15:50:10 +00003839 *
3840 * [2] branch ::= piece*
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003841 8
Daniel Veillard4255d502002-04-16 15:50:10 +00003842 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003843static int
Daniel Veillard4255d502002-04-16 15:50:10 +00003844xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) {
3845 xmlRegStatePtr previous;
3846 xmlRegAtomPtr prevatom = NULL;
3847 int ret;
3848
3849 previous = ctxt->state;
3850 ret = xmlFAParsePiece(ctxt);
3851 if (ret != 0) {
3852 if (first) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003853 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0)
3854 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003855 previous = ctxt->state;
3856 } else {
3857 prevatom = ctxt->atom;
3858 }
3859 ctxt->atom = NULL;
3860 }
3861 while ((ret != 0) && (ctxt->error == 0)) {
3862 ret = xmlFAParsePiece(ctxt);
3863 if (ret != 0) {
3864 if (first) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003865 if (xmlFAGenerateTransitions(ctxt, previous, NULL,
3866 ctxt->atom) < 0)
3867 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003868 } else {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003869 if (xmlFAGenerateTransitions(ctxt, previous, NULL,
3870 prevatom) < 0)
3871 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003872 prevatom = ctxt->atom;
3873 }
3874 previous = ctxt->state;
3875 ctxt->atom = NULL;
3876 }
3877 }
3878 if (!first) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003879 if (xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom) < 0)
3880 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00003881 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003882 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00003883}
3884
3885/**
3886 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00003887 * @ctxt: a regexp parser context
3888 * @top: is that the top-level expressions ?
Daniel Veillard4255d502002-04-16 15:50:10 +00003889 *
3890 * [1] regExp ::= branch ( '|' branch )*
3891 */
3892static void
3893xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
3894 xmlRegStatePtr start, end, oldend;
3895
3896 oldend = ctxt->end;
3897
3898 start = ctxt->state;
3899 xmlFAParseBranch(ctxt, (ctxt->end == NULL));
3900 if (CUR != '|') {
3901 ctxt->end = ctxt->state;
3902 return;
3903 }
3904 end = ctxt->state;
3905 while ((CUR == '|') && (ctxt->error == 0)) {
3906 NEXT;
3907 ctxt->state = start;
3908 ctxt->end = end;
3909 xmlFAParseBranch(ctxt, 0);
3910 }
3911 if (!top)
3912 ctxt->end = oldend;
3913}
3914
3915/************************************************************************
3916 * *
3917 * The basic API *
3918 * *
3919 ************************************************************************/
3920
3921/**
3922 * xmlRegexpPrint:
3923 * @output: the file for the output debug
3924 * @regexp: the compiled regexp
3925 *
3926 * Print the content of the compiled regular expression
3927 */
3928void
3929xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
3930 int i;
3931
3932 fprintf(output, " regexp: ");
3933 if (regexp == NULL) {
3934 fprintf(output, "NULL\n");
3935 return;
3936 }
3937 fprintf(output, "'%s' ", regexp->string);
3938 fprintf(output, "\n");
3939 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
3940 for (i = 0;i < regexp->nbAtoms; i++) {
3941 fprintf(output, " %02d ", i);
3942 xmlRegPrintAtom(output, regexp->atoms[i]);
3943 }
3944 fprintf(output, "%d states:", regexp->nbStates);
3945 fprintf(output, "\n");
3946 for (i = 0;i < regexp->nbStates; i++) {
3947 xmlRegPrintState(output, regexp->states[i]);
3948 }
3949 fprintf(output, "%d counters:\n", regexp->nbCounters);
3950 for (i = 0;i < regexp->nbCounters; i++) {
3951 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
3952 regexp->counters[i].max);
3953 }
3954}
3955
3956/**
3957 * xmlRegexpCompile:
3958 * @regexp: a regular expression string
3959 *
3960 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
3961 * Appendix F and build an automata suitable for testing strings against
3962 * that regular expression
3963 *
3964 * Returns the compiled expression or NULL in case of error
3965 */
3966xmlRegexpPtr
3967xmlRegexpCompile(const xmlChar *regexp) {
3968 xmlRegexpPtr ret;
3969 xmlRegParserCtxtPtr ctxt;
3970
3971 ctxt = xmlRegNewParserCtxt(regexp);
3972 if (ctxt == NULL)
3973 return(NULL);
3974
3975 /* initialize the parser */
3976 ctxt->end = NULL;
3977 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
3978 xmlRegStatePush(ctxt, ctxt->start);
3979
3980 /* parse the expression building an automata */
3981 xmlFAParseRegExp(ctxt, 1);
3982 if (CUR != 0) {
3983 ERROR("xmlFAParseRegExp: extra characters");
3984 }
3985 ctxt->end = ctxt->state;
3986 ctxt->start->type = XML_REGEXP_START_STATE;
3987 ctxt->end->type = XML_REGEXP_FINAL_STATE;
3988
3989 /* remove the Epsilon except for counted transitions */
3990 xmlFAEliminateEpsilonTransitions(ctxt);
3991
3992
3993 if (ctxt->error != 0) {
3994 xmlRegFreeParserCtxt(ctxt);
3995 return(NULL);
3996 }
3997 ret = xmlRegEpxFromParse(ctxt);
3998 xmlRegFreeParserCtxt(ctxt);
3999 return(ret);
4000}
4001
4002/**
4003 * xmlRegexpExec:
4004 * @comp: the compiled regular expression
4005 * @content: the value to check against the regular expression
4006 *
4007 * Check if the regular expression generate the value
4008 *
4009 * Returns 1 if it matches, 0 if not and a negativa value in case of error
4010 */
4011int
4012xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
4013 if ((comp == NULL) || (content == NULL))
4014 return(-1);
4015 return(xmlFARegExec(comp, content));
4016}
4017
4018/**
Daniel Veillard23e73572002-09-19 19:56:43 +00004019 * xmlRegexpIsDeterminist:
4020 * @comp: the compiled regular expression
4021 *
4022 * Check if the regular expression is determinist
4023 *
4024 * Returns 1 if it yes, 0 if not and a negativa value in case of error
4025 */
4026int
4027xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
4028 xmlAutomataPtr am;
4029 int ret;
4030
4031 if (comp == NULL)
4032 return(-1);
4033 if (comp->determinist != -1)
4034 return(comp->determinist);
4035
4036 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00004037 if (am->states != NULL) {
4038 int i;
4039
4040 for (i = 0;i < am->nbStates;i++)
4041 xmlRegFreeState(am->states[i]);
4042 xmlFree(am->states);
4043 }
Daniel Veillard23e73572002-09-19 19:56:43 +00004044 am->nbAtoms = comp->nbAtoms;
4045 am->atoms = comp->atoms;
4046 am->nbStates = comp->nbStates;
4047 am->states = comp->states;
4048 am->determinist = -1;
4049 ret = xmlFAComputesDeterminism(am);
4050 am->atoms = NULL;
4051 am->states = NULL;
4052 xmlFreeAutomata(am);
4053 return(ret);
4054}
4055
4056/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004057 * xmlRegFreeRegexp:
4058 * @regexp: the regexp
4059 *
4060 * Free a regexp
4061 */
4062void
4063xmlRegFreeRegexp(xmlRegexpPtr regexp) {
4064 int i;
4065 if (regexp == NULL)
4066 return;
4067
4068 if (regexp->string != NULL)
4069 xmlFree(regexp->string);
4070 if (regexp->states != NULL) {
4071 for (i = 0;i < regexp->nbStates;i++)
4072 xmlRegFreeState(regexp->states[i]);
4073 xmlFree(regexp->states);
4074 }
4075 if (regexp->atoms != NULL) {
4076 for (i = 0;i < regexp->nbAtoms;i++)
4077 xmlRegFreeAtom(regexp->atoms[i]);
4078 xmlFree(regexp->atoms);
4079 }
4080 if (regexp->counters != NULL)
4081 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00004082 if (regexp->compact != NULL)
4083 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00004084 if (regexp->transdata != NULL)
4085 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00004086 if (regexp->stringMap != NULL) {
4087 for (i = 0; i < regexp->nbstrings;i++)
4088 xmlFree(regexp->stringMap[i]);
4089 xmlFree(regexp->stringMap);
4090 }
4091
Daniel Veillard4255d502002-04-16 15:50:10 +00004092 xmlFree(regexp);
4093}
4094
4095#ifdef LIBXML_AUTOMATA_ENABLED
4096/************************************************************************
4097 * *
4098 * The Automata interface *
4099 * *
4100 ************************************************************************/
4101
4102/**
4103 * xmlNewAutomata:
4104 *
4105 * Create a new automata
4106 *
4107 * Returns the new object or NULL in case of failure
4108 */
4109xmlAutomataPtr
4110xmlNewAutomata(void) {
4111 xmlAutomataPtr ctxt;
4112
4113 ctxt = xmlRegNewParserCtxt(NULL);
4114 if (ctxt == NULL)
4115 return(NULL);
4116
4117 /* initialize the parser */
4118 ctxt->end = NULL;
4119 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004120 if (ctxt->start == NULL) {
4121 xmlFreeAutomata(ctxt);
4122 return(NULL);
4123 }
4124 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
4125 xmlRegFreeState(ctxt->start);
4126 xmlFreeAutomata(ctxt);
4127 return(NULL);
4128 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004129
4130 return(ctxt);
4131}
4132
4133/**
4134 * xmlFreeAutomata:
4135 * @am: an automata
4136 *
4137 * Free an automata
4138 */
4139void
4140xmlFreeAutomata(xmlAutomataPtr am) {
4141 if (am == NULL)
4142 return;
4143 xmlRegFreeParserCtxt(am);
4144}
4145
4146/**
4147 * xmlAutomataGetInitState:
4148 * @am: an automata
4149 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004150 * Initial state lookup
4151 *
Daniel Veillard4255d502002-04-16 15:50:10 +00004152 * Returns the initial state of the automata
4153 */
4154xmlAutomataStatePtr
4155xmlAutomataGetInitState(xmlAutomataPtr am) {
4156 if (am == NULL)
4157 return(NULL);
4158 return(am->start);
4159}
4160
4161/**
4162 * xmlAutomataSetFinalState:
4163 * @am: an automata
4164 * @state: a state in this automata
4165 *
4166 * Makes that state a final state
4167 *
4168 * Returns 0 or -1 in case of error
4169 */
4170int
4171xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
4172 if ((am == NULL) || (state == NULL))
4173 return(-1);
4174 state->type = XML_REGEXP_FINAL_STATE;
4175 return(0);
4176}
4177
4178/**
4179 * xmlAutomataNewTransition:
4180 * @am: an automata
4181 * @from: the starting point of the transition
4182 * @to: the target point of the transition or NULL
4183 * @token: the input string associated to that transition
4184 * @data: data passed to the callback function if the transition is activated
4185 *
4186 * If @to is NULL, this create first a new target state in the automata
4187 * and then adds a transition from the @from state to the target state
4188 * activated by the value of @token
4189 *
4190 * Returns the target state or NULL in case of error
4191 */
4192xmlAutomataStatePtr
4193xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
4194 xmlAutomataStatePtr to, const xmlChar *token,
4195 void *data) {
4196 xmlRegAtomPtr atom;
4197
4198 if ((am == NULL) || (from == NULL) || (token == NULL))
4199 return(NULL);
4200 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004201 if (atom == NULL)
4202 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00004203 atom->data = data;
4204 if (atom == NULL)
4205 return(NULL);
4206 atom->valuep = xmlStrdup(token);
4207
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004208 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4209 xmlRegFreeAtom(atom);
4210 return(NULL);
4211 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004212 if (to == NULL)
4213 return(am->state);
4214 return(to);
4215}
4216
4217/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00004218 * xmlAutomataNewTransition2:
4219 * @am: an automata
4220 * @from: the starting point of the transition
4221 * @to: the target point of the transition or NULL
4222 * @token: the first input string associated to that transition
4223 * @token2: the second input string associated to that transition
4224 * @data: data passed to the callback function if the transition is activated
4225 *
4226 * If @to is NULL, this create first a new target state in the automata
4227 * and then adds a transition from the @from state to the target state
4228 * activated by the value of @token
4229 *
4230 * Returns the target state or NULL in case of error
4231 */
4232xmlAutomataStatePtr
4233xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4234 xmlAutomataStatePtr to, const xmlChar *token,
4235 const xmlChar *token2, void *data) {
4236 xmlRegAtomPtr atom;
4237
4238 if ((am == NULL) || (from == NULL) || (token == NULL))
4239 return(NULL);
4240 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4241 atom->data = data;
4242 if (atom == NULL)
4243 return(NULL);
4244 if ((token2 == NULL) || (*token2 == 0)) {
4245 atom->valuep = xmlStrdup(token);
4246 } else {
4247 int lenn, lenp;
4248 xmlChar *str;
4249
4250 lenn = strlen((char *) token2);
4251 lenp = strlen((char *) token);
4252
Daniel Veillard3c908dc2003-04-19 00:07:51 +00004253 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004254 if (str == NULL) {
4255 xmlRegFreeAtom(atom);
4256 return(NULL);
4257 }
4258 memcpy(&str[0], token, lenp);
4259 str[lenp] = '|';
4260 memcpy(&str[lenp + 1], token2, lenn);
4261 str[lenn + lenp + 1] = 0;
4262
4263 atom->valuep = str;
4264 }
4265
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004266 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4267 xmlRegFreeAtom(atom);
4268 return(NULL);
4269 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00004270 if (to == NULL)
4271 return(am->state);
4272 return(to);
4273}
4274
4275/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004276 * xmlAutomataNewCountTrans:
4277 * @am: an automata
4278 * @from: the starting point of the transition
4279 * @to: the target point of the transition or NULL
4280 * @token: the input string associated to that transition
4281 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004282 * @max: the maximum successive occurences of token
4283 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00004284 *
4285 * If @to is NULL, this create first a new target state in the automata
4286 * and then adds a transition from the @from state to the target state
4287 * activated by a succession of input of value @token and whose number
4288 * is between @min and @max
4289 *
4290 * Returns the target state or NULL in case of error
4291 */
4292xmlAutomataStatePtr
4293xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4294 xmlAutomataStatePtr to, const xmlChar *token,
4295 int min, int max, void *data) {
4296 xmlRegAtomPtr atom;
4297
4298 if ((am == NULL) || (from == NULL) || (token == NULL))
4299 return(NULL);
4300 if (min < 0)
4301 return(NULL);
4302 if ((max < min) || (max < 1))
4303 return(NULL);
4304 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4305 if (atom == NULL)
4306 return(NULL);
4307 atom->valuep = xmlStrdup(token);
4308 atom->data = data;
4309 if (min == 0)
4310 atom->min = 1;
4311 else
4312 atom->min = min;
4313 atom->max = max;
4314
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004315 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4316 xmlRegFreeAtom(atom);
4317 return(NULL);
4318 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004319 if (to == NULL)
4320 to = am->state;
4321 if (to == NULL)
4322 return(NULL);
4323 if (min == 0)
4324 xmlFAGenerateEpsilonTransition(am, from, to);
4325 return(to);
4326}
4327
4328/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004329 * xmlAutomataNewOnceTrans:
4330 * @am: an automata
4331 * @from: the starting point of the transition
4332 * @to: the target point of the transition or NULL
4333 * @token: the input string associated to that transition
4334 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004335 * @max: the maximum successive occurences of token
4336 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00004337 *
4338 * If @to is NULL, this create first a new target state in the automata
4339 * and then adds a transition from the @from state to the target state
4340 * activated by a succession of input of value @token and whose number
4341 * is between @min and @max, moreover that transistion can only be crossed
4342 * once.
4343 *
4344 * Returns the target state or NULL in case of error
4345 */
4346xmlAutomataStatePtr
4347xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4348 xmlAutomataStatePtr to, const xmlChar *token,
4349 int min, int max, void *data) {
4350 xmlRegAtomPtr atom;
4351 int counter;
4352
4353 if ((am == NULL) || (from == NULL) || (token == NULL))
4354 return(NULL);
4355 if (min < 1)
4356 return(NULL);
4357 if ((max < min) || (max < 1))
4358 return(NULL);
4359 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4360 if (atom == NULL)
4361 return(NULL);
4362 atom->valuep = xmlStrdup(token);
4363 atom->data = data;
4364 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4365 if (min == 0)
4366 atom->min = 1;
4367 else
4368 atom->min = min;
4369 atom->max = max;
4370 /*
4371 * associate a counter to the transition.
4372 */
4373 counter = xmlRegGetCounter(am);
4374 am->counters[counter].min = 1;
4375 am->counters[counter].max = 1;
4376
4377 /* xmlFAGenerateTransitions(am, from, to, atom); */
4378 if (to == NULL) {
4379 to = xmlRegNewState(am);
4380 xmlRegStatePush(am, to);
4381 }
4382 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4383 xmlRegAtomPush(am, atom);
4384 am->state = to;
4385 if (to == NULL)
4386 to = am->state;
4387 if (to == NULL)
4388 return(NULL);
4389 return(to);
4390}
4391
4392/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004393 * xmlAutomataNewState:
4394 * @am: an automata
4395 *
4396 * Create a new disconnected state in the automata
4397 *
4398 * Returns the new state or NULL in case of error
4399 */
4400xmlAutomataStatePtr
4401xmlAutomataNewState(xmlAutomataPtr am) {
4402 xmlAutomataStatePtr to;
4403
4404 if (am == NULL)
4405 return(NULL);
4406 to = xmlRegNewState(am);
4407 xmlRegStatePush(am, to);
4408 return(to);
4409}
4410
4411/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004412 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00004413 * @am: an automata
4414 * @from: the starting point of the transition
4415 * @to: the target point of the transition or NULL
4416 *
4417 * If @to is NULL, this create first a new target state in the automata
4418 * and then adds a an epsilon transition from the @from state to the
4419 * target state
4420 *
4421 * Returns the target state or NULL in case of error
4422 */
4423xmlAutomataStatePtr
4424xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
4425 xmlAutomataStatePtr to) {
4426 if ((am == NULL) || (from == NULL))
4427 return(NULL);
4428 xmlFAGenerateEpsilonTransition(am, from, to);
4429 if (to == NULL)
4430 return(am->state);
4431 return(to);
4432}
4433
Daniel Veillardb509f152002-04-17 16:28:10 +00004434/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004435 * xmlAutomataNewAllTrans:
4436 * @am: an automata
4437 * @from: the starting point of the transition
4438 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004439 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00004440 *
4441 * If @to is NULL, this create first a new target state in the automata
4442 * and then adds a an ALL transition from the @from state to the
4443 * target state. That transition is an epsilon transition allowed only when
4444 * all transitions from the @from node have been activated.
4445 *
4446 * Returns the target state or NULL in case of error
4447 */
4448xmlAutomataStatePtr
4449xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00004450 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00004451 if ((am == NULL) || (from == NULL))
4452 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00004453 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00004454 if (to == NULL)
4455 return(am->state);
4456 return(to);
4457}
4458
4459/**
Daniel Veillardb509f152002-04-17 16:28:10 +00004460 * xmlAutomataNewCounter:
4461 * @am: an automata
4462 * @min: the minimal value on the counter
4463 * @max: the maximal value on the counter
4464 *
4465 * Create a new counter
4466 *
4467 * Returns the counter number or -1 in case of error
4468 */
4469int
4470xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
4471 int ret;
4472
4473 if (am == NULL)
4474 return(-1);
4475
4476 ret = xmlRegGetCounter(am);
4477 if (ret < 0)
4478 return(-1);
4479 am->counters[ret].min = min;
4480 am->counters[ret].max = max;
4481 return(ret);
4482}
4483
4484/**
4485 * xmlAutomataNewCountedTrans:
4486 * @am: an automata
4487 * @from: the starting point of the transition
4488 * @to: the target point of the transition or NULL
4489 * @counter: the counter associated to that transition
4490 *
4491 * If @to is NULL, this create first a new target state in the automata
4492 * and then adds an epsilon transition from the @from state to the target state
4493 * which will increment the counter provided
4494 *
4495 * Returns the target state or NULL in case of error
4496 */
4497xmlAutomataStatePtr
4498xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4499 xmlAutomataStatePtr to, int counter) {
4500 if ((am == NULL) || (from == NULL) || (counter < 0))
4501 return(NULL);
4502 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
4503 if (to == NULL)
4504 return(am->state);
4505 return(to);
4506}
4507
4508/**
4509 * xmlAutomataNewCounterTrans:
4510 * @am: an automata
4511 * @from: the starting point of the transition
4512 * @to: the target point of the transition or NULL
4513 * @counter: the counter associated to that transition
4514 *
4515 * If @to is NULL, this create first a new target state in the automata
4516 * and then adds an epsilon transition from the @from state to the target state
4517 * which will be allowed only if the counter is within the right range.
4518 *
4519 * Returns the target state or NULL in case of error
4520 */
4521xmlAutomataStatePtr
4522xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4523 xmlAutomataStatePtr to, int counter) {
4524 if ((am == NULL) || (from == NULL) || (counter < 0))
4525 return(NULL);
4526 xmlFAGenerateCountedTransition(am, from, to, counter);
4527 if (to == NULL)
4528 return(am->state);
4529 return(to);
4530}
Daniel Veillard4255d502002-04-16 15:50:10 +00004531
4532/**
4533 * xmlAutomataCompile:
4534 * @am: an automata
4535 *
4536 * Compile the automata into a Reg Exp ready for being executed.
4537 * The automata should be free after this point.
4538 *
4539 * Returns the compiled regexp or NULL in case of error
4540 */
4541xmlRegexpPtr
4542xmlAutomataCompile(xmlAutomataPtr am) {
4543 xmlRegexpPtr ret;
4544
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004545 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00004546 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00004547 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00004548 ret = xmlRegEpxFromParse(am);
4549
4550 return(ret);
4551}
Daniel Veillarde19fc232002-04-22 16:01:24 +00004552
4553/**
4554 * xmlAutomataIsDeterminist:
4555 * @am: an automata
4556 *
4557 * Checks if an automata is determinist.
4558 *
4559 * Returns 1 if true, 0 if not, and -1 in case of error
4560 */
4561int
4562xmlAutomataIsDeterminist(xmlAutomataPtr am) {
4563 int ret;
4564
4565 if (am == NULL)
4566 return(-1);
4567
4568 ret = xmlFAComputesDeterminism(am);
4569 return(ret);
4570}
Daniel Veillard4255d502002-04-16 15:50:10 +00004571#endif /* LIBXML_AUTOMATA_ENABLED */
4572#endif /* LIBXML_REGEXP_ENABLED */