blob: 6ef0d9eb58d068269fd5c18bf73efb3336336293 [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
William M. Brackddf71d62004-05-06 04:17:26 +00006 * XML related specifications these include:
Daniel Veillard4255d502002-04-16 15:50:10 +00007 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22#include <stdio.h>
23#include <string.h>
Daniel Veillardebe48c62003-12-03 12:12:27 +000024#ifdef HAVE_LIMITS_H
25#include <limits.h>
26#endif
27
Daniel Veillard4255d502002-04-16 15:50:10 +000028#include <libxml/tree.h>
29#include <libxml/parserInternals.h>
30#include <libxml/xmlregexp.h>
31#include <libxml/xmlautomata.h>
32#include <libxml/xmlunicode.h>
33
Daniel Veillardebe48c62003-12-03 12:12:27 +000034#ifndef INT_MAX
35#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
36#endif
37
Daniel Veillardc0826a72004-08-10 14:17:33 +000038/* #define DEBUG_REGEXP_GRAPH */
39/* #define DEBUG_REGEXP_EXEC */
Daniel Veillard4255d502002-04-16 15:50:10 +000040/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000041/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000042
Daniel Veillardff46a042003-10-08 08:53:17 +000043#define ERROR(str) \
44 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
45 xmlRegexpErrCompile(ctxt, str);
Daniel Veillard4255d502002-04-16 15:50:10 +000046#define NEXT ctxt->cur++
47#define CUR (*(ctxt->cur))
48#define NXT(index) (ctxt->cur[index])
49
50#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
51#define NEXTL(l) ctxt->cur += l;
Daniel Veillardc0826a72004-08-10 14:17:33 +000052#define XML_REG_STRING_SEPARATOR '|'
Daniel Veillard4255d502002-04-16 15:50:10 +000053
Daniel Veillarde19fc232002-04-22 16:01:24 +000054/**
55 * TODO:
56 *
57 * macro to flag unimplemented blocks
58 */
59#define TODO \
60 xmlGenericError(xmlGenericErrorContext, \
61 "Unimplemented block at %s:%d\n", \
62 __FILE__, __LINE__);
63
Daniel Veillard4255d502002-04-16 15:50:10 +000064/************************************************************************
65 * *
66 * Datatypes and structures *
67 * *
68 ************************************************************************/
69
70typedef enum {
71 XML_REGEXP_EPSILON = 1,
72 XML_REGEXP_CHARVAL,
73 XML_REGEXP_RANGES,
74 XML_REGEXP_SUBREG,
75 XML_REGEXP_STRING,
76 XML_REGEXP_ANYCHAR, /* . */
77 XML_REGEXP_ANYSPACE, /* \s */
78 XML_REGEXP_NOTSPACE, /* \S */
79 XML_REGEXP_INITNAME, /* \l */
80 XML_REGEXP_NOTINITNAME, /* \l */
81 XML_REGEXP_NAMECHAR, /* \c */
82 XML_REGEXP_NOTNAMECHAR, /* \C */
83 XML_REGEXP_DECIMAL, /* \d */
84 XML_REGEXP_NOTDECIMAL, /* \d */
85 XML_REGEXP_REALCHAR, /* \w */
86 XML_REGEXP_NOTREALCHAR, /* \w */
87 XML_REGEXP_LETTER,
88 XML_REGEXP_LETTER_UPPERCASE,
89 XML_REGEXP_LETTER_LOWERCASE,
90 XML_REGEXP_LETTER_TITLECASE,
91 XML_REGEXP_LETTER_MODIFIER,
92 XML_REGEXP_LETTER_OTHERS,
93 XML_REGEXP_MARK,
94 XML_REGEXP_MARK_NONSPACING,
95 XML_REGEXP_MARK_SPACECOMBINING,
96 XML_REGEXP_MARK_ENCLOSING,
97 XML_REGEXP_NUMBER,
98 XML_REGEXP_NUMBER_DECIMAL,
99 XML_REGEXP_NUMBER_LETTER,
100 XML_REGEXP_NUMBER_OTHERS,
101 XML_REGEXP_PUNCT,
102 XML_REGEXP_PUNCT_CONNECTOR,
103 XML_REGEXP_PUNCT_DASH,
104 XML_REGEXP_PUNCT_OPEN,
105 XML_REGEXP_PUNCT_CLOSE,
106 XML_REGEXP_PUNCT_INITQUOTE,
107 XML_REGEXP_PUNCT_FINQUOTE,
108 XML_REGEXP_PUNCT_OTHERS,
109 XML_REGEXP_SEPAR,
110 XML_REGEXP_SEPAR_SPACE,
111 XML_REGEXP_SEPAR_LINE,
112 XML_REGEXP_SEPAR_PARA,
113 XML_REGEXP_SYMBOL,
114 XML_REGEXP_SYMBOL_MATH,
115 XML_REGEXP_SYMBOL_CURRENCY,
116 XML_REGEXP_SYMBOL_MODIFIER,
117 XML_REGEXP_SYMBOL_OTHERS,
118 XML_REGEXP_OTHER,
119 XML_REGEXP_OTHER_CONTROL,
120 XML_REGEXP_OTHER_FORMAT,
121 XML_REGEXP_OTHER_PRIVATE,
122 XML_REGEXP_OTHER_NA,
123 XML_REGEXP_BLOCK_NAME
124} xmlRegAtomType;
125
126typedef enum {
127 XML_REGEXP_QUANT_EPSILON = 1,
128 XML_REGEXP_QUANT_ONCE,
129 XML_REGEXP_QUANT_OPT,
130 XML_REGEXP_QUANT_MULT,
131 XML_REGEXP_QUANT_PLUS,
Daniel Veillard7646b182002-04-20 06:41:40 +0000132 XML_REGEXP_QUANT_ONCEONLY,
133 XML_REGEXP_QUANT_ALL,
Daniel Veillard4255d502002-04-16 15:50:10 +0000134 XML_REGEXP_QUANT_RANGE
135} xmlRegQuantType;
136
137typedef enum {
138 XML_REGEXP_START_STATE = 1,
139 XML_REGEXP_FINAL_STATE,
140 XML_REGEXP_TRANS_STATE
141} xmlRegStateType;
142
143typedef enum {
144 XML_REGEXP_MARK_NORMAL = 0,
145 XML_REGEXP_MARK_START,
146 XML_REGEXP_MARK_VISITED
147} xmlRegMarkedType;
148
149typedef struct _xmlRegRange xmlRegRange;
150typedef xmlRegRange *xmlRegRangePtr;
151
152struct _xmlRegRange {
Daniel Veillardf8b9de32003-11-24 14:27:26 +0000153 int neg; /* 0 normal, 1 not, 2 exclude */
Daniel Veillard4255d502002-04-16 15:50:10 +0000154 xmlRegAtomType type;
155 int start;
156 int end;
157 xmlChar *blockName;
158};
159
160typedef struct _xmlRegAtom xmlRegAtom;
161typedef xmlRegAtom *xmlRegAtomPtr;
162
163typedef struct _xmlAutomataState xmlRegState;
164typedef xmlRegState *xmlRegStatePtr;
165
166struct _xmlRegAtom {
167 int no;
168 xmlRegAtomType type;
169 xmlRegQuantType quant;
170 int min;
171 int max;
172
173 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000174 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000175 int neg;
176 int codepoint;
177 xmlRegStatePtr start;
178 xmlRegStatePtr stop;
179 int maxRanges;
180 int nbRanges;
181 xmlRegRangePtr *ranges;
182 void *data;
183};
184
185typedef struct _xmlRegCounter xmlRegCounter;
186typedef xmlRegCounter *xmlRegCounterPtr;
187
188struct _xmlRegCounter {
189 int min;
190 int max;
191};
192
193typedef struct _xmlRegTrans xmlRegTrans;
194typedef xmlRegTrans *xmlRegTransPtr;
195
196struct _xmlRegTrans {
197 xmlRegAtomPtr atom;
198 int to;
199 int counter;
200 int count;
201};
202
203struct _xmlAutomataState {
204 xmlRegStateType type;
205 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000206 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000207 int no;
208
209 int maxTrans;
210 int nbTrans;
211 xmlRegTrans *trans;
212};
213
214typedef struct _xmlAutomata xmlRegParserCtxt;
215typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
216
217struct _xmlAutomata {
218 xmlChar *string;
219 xmlChar *cur;
220
221 int error;
222 int neg;
223
224 xmlRegStatePtr start;
225 xmlRegStatePtr end;
226 xmlRegStatePtr state;
227
228 xmlRegAtomPtr atom;
229
230 int maxAtoms;
231 int nbAtoms;
232 xmlRegAtomPtr *atoms;
233
234 int maxStates;
235 int nbStates;
236 xmlRegStatePtr *states;
237
238 int maxCounters;
239 int nbCounters;
240 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000241
242 int determinist;
Daniel Veillard4255d502002-04-16 15:50:10 +0000243};
244
245struct _xmlRegexp {
246 xmlChar *string;
247 int nbStates;
248 xmlRegStatePtr *states;
249 int nbAtoms;
250 xmlRegAtomPtr *atoms;
251 int nbCounters;
252 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000253 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000254 /*
255 * That's the compact form for determinists automatas
256 */
257 int nbstates;
258 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000259 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000260 int nbstrings;
261 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000262};
263
264typedef struct _xmlRegExecRollback xmlRegExecRollback;
265typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
266
267struct _xmlRegExecRollback {
268 xmlRegStatePtr state;/* the current state */
269 int index; /* the index in the input stack */
270 int nextbranch; /* the next transition to explore in that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000271 int *counts; /* save the automata state if it has some */
Daniel Veillard4255d502002-04-16 15:50:10 +0000272};
273
274typedef struct _xmlRegInputToken xmlRegInputToken;
275typedef xmlRegInputToken *xmlRegInputTokenPtr;
276
277struct _xmlRegInputToken {
278 xmlChar *value;
279 void *data;
280};
281
282struct _xmlRegExecCtxt {
283 int status; /* execution status != 0 indicate an error */
William M. Brackddf71d62004-05-06 04:17:26 +0000284 int determinist; /* did we find an indeterministic behaviour */
Daniel Veillard4255d502002-04-16 15:50:10 +0000285 xmlRegexpPtr comp; /* the compiled regexp */
286 xmlRegExecCallbacks callback;
287 void *data;
288
289 xmlRegStatePtr state;/* the current state */
290 int transno; /* the current transition on that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000291 int transcount; /* the number of chars in char counted transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +0000292
293 /*
294 * A stack of rollback states
295 */
296 int maxRollbacks;
297 int nbRollbacks;
298 xmlRegExecRollback *rollbacks;
299
300 /*
301 * The state of the automata if any
302 */
303 int *counts;
304
305 /*
306 * The input stack
307 */
308 int inputStackMax;
309 int inputStackNr;
310 int index;
311 int *charStack;
312 const xmlChar *inputString; /* when operating on characters */
313 xmlRegInputTokenPtr inputStack;/* when operating on strings */
314
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +0000315 /*
316 * error handling
317 */
318 int errStateNo; /* the error state number */
319 xmlRegStatePtr errState; /* the error state */
320 xmlChar *errString; /* the string raising the error */
321 int *errCounts; /* counters at the error state */
Daniel Veillard4255d502002-04-16 15:50:10 +0000322};
323
Daniel Veillard441bc322002-04-20 17:38:48 +0000324#define REGEXP_ALL_COUNTER 0x123456
325#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000326
Daniel Veillard4255d502002-04-16 15:50:10 +0000327static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000328static void xmlRegFreeState(xmlRegStatePtr state);
329static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard4255d502002-04-16 15:50:10 +0000330
331/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000332 * *
333 * Regexp memory error handler *
334 * *
335 ************************************************************************/
336/**
337 * xmlRegexpErrMemory:
William M. Brackddf71d62004-05-06 04:17:26 +0000338 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000339 *
340 * Handle an out of memory condition
341 */
342static void
343xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
344{
345 const char *regexp = NULL;
346 if (ctxt != NULL) {
347 regexp = (const char *) ctxt->string;
348 ctxt->error = XML_ERR_NO_MEMORY;
349 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000350 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000351 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
352 regexp, NULL, 0, 0,
353 "Memory allocation failed : %s\n", extra);
354}
355
356/**
357 * xmlRegexpErrCompile:
William M. Brackddf71d62004-05-06 04:17:26 +0000358 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000359 *
William M. Brackddf71d62004-05-06 04:17:26 +0000360 * Handle a compilation failure
Daniel Veillardff46a042003-10-08 08:53:17 +0000361 */
362static void
363xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
364{
365 const char *regexp = NULL;
366 int idx = 0;
367
368 if (ctxt != NULL) {
369 regexp = (const char *) ctxt->string;
370 idx = ctxt->cur - ctxt->string;
371 ctxt->error = XML_REGEXP_COMPILE_ERROR;
372 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000373 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000374 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
375 regexp, NULL, idx, 0,
376 "failed to compile: %s\n", extra);
377}
378
379/************************************************************************
Daniel Veillard4255d502002-04-16 15:50:10 +0000380 * *
381 * Allocation/Deallocation *
382 * *
383 ************************************************************************/
384
Daniel Veillard23e73572002-09-19 19:56:43 +0000385static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000386/**
387 * xmlRegEpxFromParse:
388 * @ctxt: the parser context used to build it
389 *
William M. Brackddf71d62004-05-06 04:17:26 +0000390 * Allocate a new regexp and fill it with the result from the parser
Daniel Veillard4255d502002-04-16 15:50:10 +0000391 *
392 * Returns the new regexp or NULL in case of error
393 */
394static xmlRegexpPtr
395xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
396 xmlRegexpPtr ret;
397
398 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000399 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000400 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000401 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000402 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000403 memset(ret, 0, sizeof(xmlRegexp));
404 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000405 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000406 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000407 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000408 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000409 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000410 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000411 ret->determinist = ctxt->determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000412
413 if ((ret->determinist != 0) &&
414 (ret->nbCounters == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000415 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000416 (ret->atoms[0] != NULL) &&
417 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
418 int i, j, nbstates = 0, nbatoms = 0;
419 int *stateRemap;
420 int *stringRemap;
421 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000422 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000423 xmlChar **stringMap;
424 xmlChar *value;
425
426 /*
427 * Switch to a compact representation
428 * 1/ counting the effective number of states left
William M. Brackddf71d62004-05-06 04:17:26 +0000429 * 2/ counting the unique number of atoms, and check that
Daniel Veillard23e73572002-09-19 19:56:43 +0000430 * they are all of the string type
431 * 3/ build a table state x atom for the transitions
432 */
433
434 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000435 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000436 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000437 xmlFree(ret);
438 return(NULL);
439 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000440 for (i = 0;i < ret->nbStates;i++) {
441 if (ret->states[i] != NULL) {
442 stateRemap[i] = nbstates;
443 nbstates++;
444 } else {
445 stateRemap[i] = -1;
446 }
447 }
448#ifdef DEBUG_COMPACTION
449 printf("Final: %d states\n", nbstates);
450#endif
451 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000452 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000453 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000454 xmlFree(stateRemap);
455 xmlFree(ret);
456 return(NULL);
457 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000458 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000459 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000460 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000461 xmlFree(stringMap);
462 xmlFree(stateRemap);
463 xmlFree(ret);
464 return(NULL);
465 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000466 for (i = 0;i < ret->nbAtoms;i++) {
467 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
468 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
469 value = ret->atoms[i]->valuep;
470 for (j = 0;j < nbatoms;j++) {
471 if (xmlStrEqual(stringMap[j], value)) {
472 stringRemap[i] = j;
473 break;
474 }
475 }
476 if (j >= nbatoms) {
477 stringRemap[i] = nbatoms;
478 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000479 if (stringMap[nbatoms] == NULL) {
480 for (i = 0;i < nbatoms;i++)
481 xmlFree(stringMap[i]);
482 xmlFree(stringRemap);
483 xmlFree(stringMap);
484 xmlFree(stateRemap);
485 xmlFree(ret);
486 return(NULL);
487 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000488 nbatoms++;
489 }
490 } else {
491 xmlFree(stateRemap);
492 xmlFree(stringRemap);
493 for (i = 0;i < nbatoms;i++)
494 xmlFree(stringMap[i]);
495 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000496 xmlFree(ret);
497 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000498 }
499 }
500#ifdef DEBUG_COMPACTION
501 printf("Final: %d atoms\n", nbatoms);
502#endif
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000503 transitions = (int *) xmlMalloc((nbstates + 1) *
504 (nbatoms + 1) * sizeof(int));
505 if (transitions == NULL) {
506 xmlFree(stateRemap);
507 xmlFree(stringRemap);
508 xmlFree(stringMap);
509 xmlFree(ret);
510 return(NULL);
511 }
512 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
Daniel Veillard23e73572002-09-19 19:56:43 +0000513
514 /*
515 * Allocate the transition table. The first entry for each
William M. Brackddf71d62004-05-06 04:17:26 +0000516 * state corresponds to the state type.
Daniel Veillard23e73572002-09-19 19:56:43 +0000517 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000518 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000519
520 for (i = 0;i < ret->nbStates;i++) {
521 int stateno, atomno, targetno, prev;
522 xmlRegStatePtr state;
523 xmlRegTransPtr trans;
524
525 stateno = stateRemap[i];
526 if (stateno == -1)
527 continue;
528 state = ret->states[i];
529
530 transitions[stateno * (nbatoms + 1)] = state->type;
531
532 for (j = 0;j < state->nbTrans;j++) {
533 trans = &(state->trans[j]);
534 if ((trans->to == -1) || (trans->atom == NULL))
535 continue;
536 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000537 if ((trans->atom->data != NULL) && (transdata == NULL)) {
538 transdata = (void **) xmlMalloc(nbstates * nbatoms *
539 sizeof(void *));
540 if (transdata != NULL)
541 memset(transdata, 0,
542 nbstates * nbatoms * sizeof(void *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000543 else {
Daniel Veillardff46a042003-10-08 08:53:17 +0000544 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000545 break;
546 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000547 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000548 targetno = stateRemap[trans->to];
549 /*
William M. Brackddf71d62004-05-06 04:17:26 +0000550 * if the same atom can generate transitions to 2 different
Daniel Veillard23e73572002-09-19 19:56:43 +0000551 * states then it means the automata is not determinist and
552 * the compact form can't be used !
553 */
554 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
555 if (prev != 0) {
556 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000557 ret->determinist = 0;
558#ifdef DEBUG_COMPACTION
559 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
560 i, j, trans->atom->no, trans->to, atomno, targetno);
561 printf(" previous to is %d\n", prev);
562#endif
563 ret->determinist = 0;
Daniel Veillard118aed72002-09-24 14:13:13 +0000564 if (transdata != NULL)
565 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000566 xmlFree(transitions);
567 xmlFree(stateRemap);
568 xmlFree(stringRemap);
569 for (i = 0;i < nbatoms;i++)
570 xmlFree(stringMap[i]);
571 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000572 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000573 }
574 } else {
575#if 0
576 printf("State %d trans %d: atom %d to %d : %d to %d\n",
577 i, j, trans->atom->no, trans->to, atomno, targetno);
578#endif
579 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000580 targetno + 1; /* to avoid 0 */
581 if (transdata != NULL)
582 transdata[stateno * nbatoms + atomno] =
583 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000584 }
585 }
586 }
587 ret->determinist = 1;
588#ifdef DEBUG_COMPACTION
589 /*
590 * Debug
591 */
592 for (i = 0;i < nbstates;i++) {
593 for (j = 0;j < nbatoms + 1;j++) {
594 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
595 }
596 printf("\n");
597 }
598 printf("\n");
599#endif
600 /*
601 * Cleanup of the old data
602 */
603 if (ret->states != NULL) {
604 for (i = 0;i < ret->nbStates;i++)
605 xmlRegFreeState(ret->states[i]);
606 xmlFree(ret->states);
607 }
608 ret->states = NULL;
609 ret->nbStates = 0;
610 if (ret->atoms != NULL) {
611 for (i = 0;i < ret->nbAtoms;i++)
612 xmlRegFreeAtom(ret->atoms[i]);
613 xmlFree(ret->atoms);
614 }
615 ret->atoms = NULL;
616 ret->nbAtoms = 0;
617
618 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000619 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000620 ret->stringMap = stringMap;
621 ret->nbstrings = nbatoms;
622 ret->nbstates = nbstates;
623 xmlFree(stateRemap);
624 xmlFree(stringRemap);
625 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000626not_determ:
627 ctxt->string = NULL;
628 ctxt->nbStates = 0;
629 ctxt->states = NULL;
630 ctxt->nbAtoms = 0;
631 ctxt->atoms = NULL;
632 ctxt->nbCounters = 0;
633 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000634 return(ret);
635}
636
637/**
638 * xmlRegNewParserCtxt:
639 * @string: the string to parse
640 *
641 * Allocate a new regexp parser context
642 *
643 * Returns the new context or NULL in case of error
644 */
645static xmlRegParserCtxtPtr
646xmlRegNewParserCtxt(const xmlChar *string) {
647 xmlRegParserCtxtPtr ret;
648
649 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
650 if (ret == NULL)
651 return(NULL);
652 memset(ret, 0, sizeof(xmlRegParserCtxt));
653 if (string != NULL)
654 ret->string = xmlStrdup(string);
655 ret->cur = ret->string;
656 ret->neg = 0;
657 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000658 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000659 return(ret);
660}
661
662/**
663 * xmlRegNewRange:
664 * @ctxt: the regexp parser context
665 * @neg: is that negative
666 * @type: the type of range
667 * @start: the start codepoint
668 * @end: the end codepoint
669 *
670 * Allocate a new regexp range
671 *
672 * Returns the new range or NULL in case of error
673 */
674static xmlRegRangePtr
675xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
676 int neg, xmlRegAtomType type, int start, int end) {
677 xmlRegRangePtr ret;
678
679 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
680 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000681 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000682 return(NULL);
683 }
684 ret->neg = neg;
685 ret->type = type;
686 ret->start = start;
687 ret->end = end;
688 return(ret);
689}
690
691/**
692 * xmlRegFreeRange:
693 * @range: the regexp range
694 *
695 * Free a regexp range
696 */
697static void
698xmlRegFreeRange(xmlRegRangePtr range) {
699 if (range == NULL)
700 return;
701
702 if (range->blockName != NULL)
703 xmlFree(range->blockName);
704 xmlFree(range);
705}
706
707/**
708 * xmlRegNewAtom:
709 * @ctxt: the regexp parser context
710 * @type: the type of atom
711 *
712 * Allocate a new regexp range
713 *
714 * Returns the new atom or NULL in case of error
715 */
716static xmlRegAtomPtr
717xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
718 xmlRegAtomPtr ret;
719
720 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
721 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000722 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000723 return(NULL);
724 }
725 memset(ret, 0, sizeof(xmlRegAtom));
726 ret->type = type;
727 ret->quant = XML_REGEXP_QUANT_ONCE;
728 ret->min = 0;
729 ret->max = 0;
730 return(ret);
731}
732
733/**
734 * xmlRegFreeAtom:
735 * @atom: the regexp atom
736 *
737 * Free a regexp atom
738 */
739static void
740xmlRegFreeAtom(xmlRegAtomPtr atom) {
741 int i;
742
743 if (atom == NULL)
744 return;
745
746 for (i = 0;i < atom->nbRanges;i++)
747 xmlRegFreeRange(atom->ranges[i]);
748 if (atom->ranges != NULL)
749 xmlFree(atom->ranges);
750 if (atom->type == XML_REGEXP_STRING)
751 xmlFree(atom->valuep);
752 xmlFree(atom);
753}
754
755static xmlRegStatePtr
756xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
757 xmlRegStatePtr ret;
758
759 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
760 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000761 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000762 return(NULL);
763 }
764 memset(ret, 0, sizeof(xmlRegState));
765 ret->type = XML_REGEXP_TRANS_STATE;
766 ret->mark = XML_REGEXP_MARK_NORMAL;
767 return(ret);
768}
769
770/**
771 * xmlRegFreeState:
772 * @state: the regexp state
773 *
774 * Free a regexp state
775 */
776static void
777xmlRegFreeState(xmlRegStatePtr state) {
778 if (state == NULL)
779 return;
780
781 if (state->trans != NULL)
782 xmlFree(state->trans);
783 xmlFree(state);
784}
785
786/**
787 * xmlRegFreeParserCtxt:
788 * @ctxt: the regexp parser context
789 *
790 * Free a regexp parser context
791 */
792static void
793xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
794 int i;
795 if (ctxt == NULL)
796 return;
797
798 if (ctxt->string != NULL)
799 xmlFree(ctxt->string);
800 if (ctxt->states != NULL) {
801 for (i = 0;i < ctxt->nbStates;i++)
802 xmlRegFreeState(ctxt->states[i]);
803 xmlFree(ctxt->states);
804 }
805 if (ctxt->atoms != NULL) {
806 for (i = 0;i < ctxt->nbAtoms;i++)
807 xmlRegFreeAtom(ctxt->atoms[i]);
808 xmlFree(ctxt->atoms);
809 }
810 if (ctxt->counters != NULL)
811 xmlFree(ctxt->counters);
812 xmlFree(ctxt);
813}
814
815/************************************************************************
816 * *
817 * Display of Data structures *
818 * *
819 ************************************************************************/
820
821static void
822xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
823 switch (type) {
824 case XML_REGEXP_EPSILON:
825 fprintf(output, "epsilon "); break;
826 case XML_REGEXP_CHARVAL:
827 fprintf(output, "charval "); break;
828 case XML_REGEXP_RANGES:
829 fprintf(output, "ranges "); break;
830 case XML_REGEXP_SUBREG:
831 fprintf(output, "subexpr "); break;
832 case XML_REGEXP_STRING:
833 fprintf(output, "string "); break;
834 case XML_REGEXP_ANYCHAR:
835 fprintf(output, "anychar "); break;
836 case XML_REGEXP_ANYSPACE:
837 fprintf(output, "anyspace "); break;
838 case XML_REGEXP_NOTSPACE:
839 fprintf(output, "notspace "); break;
840 case XML_REGEXP_INITNAME:
841 fprintf(output, "initname "); break;
842 case XML_REGEXP_NOTINITNAME:
843 fprintf(output, "notinitname "); break;
844 case XML_REGEXP_NAMECHAR:
845 fprintf(output, "namechar "); break;
846 case XML_REGEXP_NOTNAMECHAR:
847 fprintf(output, "notnamechar "); break;
848 case XML_REGEXP_DECIMAL:
849 fprintf(output, "decimal "); break;
850 case XML_REGEXP_NOTDECIMAL:
851 fprintf(output, "notdecimal "); break;
852 case XML_REGEXP_REALCHAR:
853 fprintf(output, "realchar "); break;
854 case XML_REGEXP_NOTREALCHAR:
855 fprintf(output, "notrealchar "); break;
856 case XML_REGEXP_LETTER:
857 fprintf(output, "LETTER "); break;
858 case XML_REGEXP_LETTER_UPPERCASE:
859 fprintf(output, "LETTER_UPPERCASE "); break;
860 case XML_REGEXP_LETTER_LOWERCASE:
861 fprintf(output, "LETTER_LOWERCASE "); break;
862 case XML_REGEXP_LETTER_TITLECASE:
863 fprintf(output, "LETTER_TITLECASE "); break;
864 case XML_REGEXP_LETTER_MODIFIER:
865 fprintf(output, "LETTER_MODIFIER "); break;
866 case XML_REGEXP_LETTER_OTHERS:
867 fprintf(output, "LETTER_OTHERS "); break;
868 case XML_REGEXP_MARK:
869 fprintf(output, "MARK "); break;
870 case XML_REGEXP_MARK_NONSPACING:
871 fprintf(output, "MARK_NONSPACING "); break;
872 case XML_REGEXP_MARK_SPACECOMBINING:
873 fprintf(output, "MARK_SPACECOMBINING "); break;
874 case XML_REGEXP_MARK_ENCLOSING:
875 fprintf(output, "MARK_ENCLOSING "); break;
876 case XML_REGEXP_NUMBER:
877 fprintf(output, "NUMBER "); break;
878 case XML_REGEXP_NUMBER_DECIMAL:
879 fprintf(output, "NUMBER_DECIMAL "); break;
880 case XML_REGEXP_NUMBER_LETTER:
881 fprintf(output, "NUMBER_LETTER "); break;
882 case XML_REGEXP_NUMBER_OTHERS:
883 fprintf(output, "NUMBER_OTHERS "); break;
884 case XML_REGEXP_PUNCT:
885 fprintf(output, "PUNCT "); break;
886 case XML_REGEXP_PUNCT_CONNECTOR:
887 fprintf(output, "PUNCT_CONNECTOR "); break;
888 case XML_REGEXP_PUNCT_DASH:
889 fprintf(output, "PUNCT_DASH "); break;
890 case XML_REGEXP_PUNCT_OPEN:
891 fprintf(output, "PUNCT_OPEN "); break;
892 case XML_REGEXP_PUNCT_CLOSE:
893 fprintf(output, "PUNCT_CLOSE "); break;
894 case XML_REGEXP_PUNCT_INITQUOTE:
895 fprintf(output, "PUNCT_INITQUOTE "); break;
896 case XML_REGEXP_PUNCT_FINQUOTE:
897 fprintf(output, "PUNCT_FINQUOTE "); break;
898 case XML_REGEXP_PUNCT_OTHERS:
899 fprintf(output, "PUNCT_OTHERS "); break;
900 case XML_REGEXP_SEPAR:
901 fprintf(output, "SEPAR "); break;
902 case XML_REGEXP_SEPAR_SPACE:
903 fprintf(output, "SEPAR_SPACE "); break;
904 case XML_REGEXP_SEPAR_LINE:
905 fprintf(output, "SEPAR_LINE "); break;
906 case XML_REGEXP_SEPAR_PARA:
907 fprintf(output, "SEPAR_PARA "); break;
908 case XML_REGEXP_SYMBOL:
909 fprintf(output, "SYMBOL "); break;
910 case XML_REGEXP_SYMBOL_MATH:
911 fprintf(output, "SYMBOL_MATH "); break;
912 case XML_REGEXP_SYMBOL_CURRENCY:
913 fprintf(output, "SYMBOL_CURRENCY "); break;
914 case XML_REGEXP_SYMBOL_MODIFIER:
915 fprintf(output, "SYMBOL_MODIFIER "); break;
916 case XML_REGEXP_SYMBOL_OTHERS:
917 fprintf(output, "SYMBOL_OTHERS "); break;
918 case XML_REGEXP_OTHER:
919 fprintf(output, "OTHER "); break;
920 case XML_REGEXP_OTHER_CONTROL:
921 fprintf(output, "OTHER_CONTROL "); break;
922 case XML_REGEXP_OTHER_FORMAT:
923 fprintf(output, "OTHER_FORMAT "); break;
924 case XML_REGEXP_OTHER_PRIVATE:
925 fprintf(output, "OTHER_PRIVATE "); break;
926 case XML_REGEXP_OTHER_NA:
927 fprintf(output, "OTHER_NA "); break;
928 case XML_REGEXP_BLOCK_NAME:
929 fprintf(output, "BLOCK "); break;
930 }
931}
932
933static void
934xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
935 switch (type) {
936 case XML_REGEXP_QUANT_EPSILON:
937 fprintf(output, "epsilon "); break;
938 case XML_REGEXP_QUANT_ONCE:
939 fprintf(output, "once "); break;
940 case XML_REGEXP_QUANT_OPT:
941 fprintf(output, "? "); break;
942 case XML_REGEXP_QUANT_MULT:
943 fprintf(output, "* "); break;
944 case XML_REGEXP_QUANT_PLUS:
945 fprintf(output, "+ "); break;
946 case XML_REGEXP_QUANT_RANGE:
947 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000948 case XML_REGEXP_QUANT_ONCEONLY:
949 fprintf(output, "onceonly "); break;
950 case XML_REGEXP_QUANT_ALL:
951 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000952 }
953}
954static void
955xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
956 fprintf(output, " range: ");
957 if (range->neg)
958 fprintf(output, "negative ");
959 xmlRegPrintAtomType(output, range->type);
960 fprintf(output, "%c - %c\n", range->start, range->end);
961}
962
963static void
964xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
965 fprintf(output, " atom: ");
966 if (atom == NULL) {
967 fprintf(output, "NULL\n");
968 return;
969 }
970 xmlRegPrintAtomType(output, atom->type);
971 xmlRegPrintQuantType(output, atom->quant);
972 if (atom->quant == XML_REGEXP_QUANT_RANGE)
973 fprintf(output, "%d-%d ", atom->min, atom->max);
974 if (atom->type == XML_REGEXP_STRING)
975 fprintf(output, "'%s' ", (char *) atom->valuep);
976 if (atom->type == XML_REGEXP_CHARVAL)
977 fprintf(output, "char %c\n", atom->codepoint);
978 else if (atom->type == XML_REGEXP_RANGES) {
979 int i;
980 fprintf(output, "%d entries\n", atom->nbRanges);
981 for (i = 0; i < atom->nbRanges;i++)
982 xmlRegPrintRange(output, atom->ranges[i]);
983 } else if (atom->type == XML_REGEXP_SUBREG) {
984 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
985 } else {
986 fprintf(output, "\n");
987 }
988}
989
990static void
991xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
992 fprintf(output, " trans: ");
993 if (trans == NULL) {
994 fprintf(output, "NULL\n");
995 return;
996 }
997 if (trans->to < 0) {
998 fprintf(output, "removed\n");
999 return;
1000 }
1001 if (trans->counter >= 0) {
1002 fprintf(output, "counted %d, ", trans->counter);
1003 }
Daniel Veillard8a001f62002-04-20 07:24:11 +00001004 if (trans->count == REGEXP_ALL_COUNTER) {
1005 fprintf(output, "all transition, ");
1006 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001007 fprintf(output, "count based %d, ", trans->count);
1008 }
1009 if (trans->atom == NULL) {
1010 fprintf(output, "epsilon to %d\n", trans->to);
1011 return;
1012 }
1013 if (trans->atom->type == XML_REGEXP_CHARVAL)
1014 fprintf(output, "char %c ", trans->atom->codepoint);
1015 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1016}
1017
1018static void
1019xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1020 int i;
1021
1022 fprintf(output, " state: ");
1023 if (state == NULL) {
1024 fprintf(output, "NULL\n");
1025 return;
1026 }
1027 if (state->type == XML_REGEXP_START_STATE)
1028 fprintf(output, "START ");
1029 if (state->type == XML_REGEXP_FINAL_STATE)
1030 fprintf(output, "FINAL ");
1031
1032 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1033 for (i = 0;i < state->nbTrans; i++) {
1034 xmlRegPrintTrans(output, &(state->trans[i]));
1035 }
1036}
1037
Daniel Veillard23e73572002-09-19 19:56:43 +00001038#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard4255d502002-04-16 15:50:10 +00001039static void
1040xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1041 int i;
1042
1043 fprintf(output, " ctxt: ");
1044 if (ctxt == NULL) {
1045 fprintf(output, "NULL\n");
1046 return;
1047 }
1048 fprintf(output, "'%s' ", ctxt->string);
1049 if (ctxt->error)
1050 fprintf(output, "error ");
1051 if (ctxt->neg)
1052 fprintf(output, "neg ");
1053 fprintf(output, "\n");
1054 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1055 for (i = 0;i < ctxt->nbAtoms; i++) {
1056 fprintf(output, " %02d ", i);
1057 xmlRegPrintAtom(output, ctxt->atoms[i]);
1058 }
1059 if (ctxt->atom != NULL) {
1060 fprintf(output, "current atom:\n");
1061 xmlRegPrintAtom(output, ctxt->atom);
1062 }
1063 fprintf(output, "%d states:", ctxt->nbStates);
1064 if (ctxt->start != NULL)
1065 fprintf(output, " start: %d", ctxt->start->no);
1066 if (ctxt->end != NULL)
1067 fprintf(output, " end: %d", ctxt->end->no);
1068 fprintf(output, "\n");
1069 for (i = 0;i < ctxt->nbStates; i++) {
1070 xmlRegPrintState(output, ctxt->states[i]);
1071 }
1072 fprintf(output, "%d counters:\n", ctxt->nbCounters);
1073 for (i = 0;i < ctxt->nbCounters; i++) {
1074 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1075 ctxt->counters[i].max);
1076 }
1077}
Daniel Veillard23e73572002-09-19 19:56:43 +00001078#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001079
1080/************************************************************************
1081 * *
1082 * Finite Automata structures manipulations *
1083 * *
1084 ************************************************************************/
1085
1086static void
1087xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1088 int neg, xmlRegAtomType type, int start, int end,
1089 xmlChar *blockName) {
1090 xmlRegRangePtr range;
1091
1092 if (atom == NULL) {
1093 ERROR("add range: atom is NULL");
1094 return;
1095 }
1096 if (atom->type != XML_REGEXP_RANGES) {
1097 ERROR("add range: atom is not ranges");
1098 return;
1099 }
1100 if (atom->maxRanges == 0) {
1101 atom->maxRanges = 4;
1102 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1103 sizeof(xmlRegRangePtr));
1104 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001105 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001106 atom->maxRanges = 0;
1107 return;
1108 }
1109 } else if (atom->nbRanges >= atom->maxRanges) {
1110 xmlRegRangePtr *tmp;
1111 atom->maxRanges *= 2;
1112 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1113 sizeof(xmlRegRangePtr));
1114 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001115 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001116 atom->maxRanges /= 2;
1117 return;
1118 }
1119 atom->ranges = tmp;
1120 }
1121 range = xmlRegNewRange(ctxt, neg, type, start, end);
1122 if (range == NULL)
1123 return;
1124 range->blockName = blockName;
1125 atom->ranges[atom->nbRanges++] = range;
1126
1127}
1128
1129static int
1130xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1131 if (ctxt->maxCounters == 0) {
1132 ctxt->maxCounters = 4;
1133 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1134 sizeof(xmlRegCounter));
1135 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001136 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001137 ctxt->maxCounters = 0;
1138 return(-1);
1139 }
1140 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1141 xmlRegCounter *tmp;
1142 ctxt->maxCounters *= 2;
1143 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1144 sizeof(xmlRegCounter));
1145 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001146 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001147 ctxt->maxCounters /= 2;
1148 return(-1);
1149 }
1150 ctxt->counters = tmp;
1151 }
1152 ctxt->counters[ctxt->nbCounters].min = -1;
1153 ctxt->counters[ctxt->nbCounters].max = -1;
1154 return(ctxt->nbCounters++);
1155}
1156
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001157static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001158xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1159 if (atom == NULL) {
1160 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001161 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001162 }
1163 if (ctxt->maxAtoms == 0) {
1164 ctxt->maxAtoms = 4;
1165 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1166 sizeof(xmlRegAtomPtr));
1167 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001168 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001169 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001170 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001171 }
1172 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1173 xmlRegAtomPtr *tmp;
1174 ctxt->maxAtoms *= 2;
1175 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1176 sizeof(xmlRegAtomPtr));
1177 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001178 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001179 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001180 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001181 }
1182 ctxt->atoms = tmp;
1183 }
1184 atom->no = ctxt->nbAtoms;
1185 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001186 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001187}
1188
1189static void
1190xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1191 xmlRegAtomPtr atom, xmlRegStatePtr target,
1192 int counter, int count) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001193
1194 int nrtrans;
1195
Daniel Veillard4255d502002-04-16 15:50:10 +00001196 if (state == NULL) {
1197 ERROR("add state: state is NULL");
1198 return;
1199 }
1200 if (target == NULL) {
1201 ERROR("add state: target is NULL");
1202 return;
1203 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001204 /*
1205 * Other routines follow the philosophy 'When in doubt, add a transition'
1206 * so we check here whether such a transition is already present and, if
1207 * so, silently ignore this request.
1208 */
1209
1210 for (nrtrans=0; nrtrans<state->nbTrans; nrtrans++) {
1211 if ((state->trans[nrtrans].atom == atom) &&
1212 (state->trans[nrtrans].to == target->no) &&
1213 (state->trans[nrtrans].counter == counter) &&
1214 (state->trans[nrtrans].count == count)) {
1215#ifdef DEBUG_REGEXP_GRAPH
1216 printf("Ignoring duplicate transition from %d to %d\n",
1217 state->no, target->no);
1218#endif
1219 return;
1220 }
1221 }
1222
Daniel Veillard4255d502002-04-16 15:50:10 +00001223 if (state->maxTrans == 0) {
1224 state->maxTrans = 4;
1225 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1226 sizeof(xmlRegTrans));
1227 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001228 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001229 state->maxTrans = 0;
1230 return;
1231 }
1232 } else if (state->nbTrans >= state->maxTrans) {
1233 xmlRegTrans *tmp;
1234 state->maxTrans *= 2;
1235 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1236 sizeof(xmlRegTrans));
1237 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001238 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001239 state->maxTrans /= 2;
1240 return;
1241 }
1242 state->trans = tmp;
1243 }
1244#ifdef DEBUG_REGEXP_GRAPH
1245 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001246 if (count == REGEXP_ALL_COUNTER)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001247 printf("all transition\n");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001248 else if (count >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001249 printf("count based %d\n", count);
Daniel Veillard4255d502002-04-16 15:50:10 +00001250 else if (counter >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001251 printf("counted %d\n", counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001252 else if (atom == NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001253 printf("epsilon transition\n");
1254 else if (atom != NULL)
1255 xmlRegPrintAtom(stdout, atom);
Daniel Veillard4255d502002-04-16 15:50:10 +00001256#endif
1257
1258 state->trans[state->nbTrans].atom = atom;
1259 state->trans[state->nbTrans].to = target->no;
1260 state->trans[state->nbTrans].counter = counter;
1261 state->trans[state->nbTrans].count = count;
1262 state->nbTrans++;
1263}
1264
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001265static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001266xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001267 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001268 if (ctxt->maxStates == 0) {
1269 ctxt->maxStates = 4;
1270 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1271 sizeof(xmlRegStatePtr));
1272 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001273 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001274 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001275 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001276 }
1277 } else if (ctxt->nbStates >= ctxt->maxStates) {
1278 xmlRegStatePtr *tmp;
1279 ctxt->maxStates *= 2;
1280 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1281 sizeof(xmlRegStatePtr));
1282 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001283 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001284 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001285 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001286 }
1287 ctxt->states = tmp;
1288 }
1289 state->no = ctxt->nbStates;
1290 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001291 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001292}
1293
1294/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001295 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001296 * @ctxt: a regexp parser context
1297 * @from: the from state
1298 * @to: the target state or NULL for building a new one
1299 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001300 *
1301 */
1302static void
1303xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001304 xmlRegStatePtr from, xmlRegStatePtr to,
1305 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001306 if (to == NULL) {
1307 to = xmlRegNewState(ctxt);
1308 xmlRegStatePush(ctxt, to);
1309 ctxt->state = to;
1310 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001311 if (lax)
1312 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1313 else
1314 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001315}
1316
1317/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001318 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001319 * @ctxt: a regexp parser context
1320 * @from: the from state
1321 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001322 *
1323 */
1324static void
1325xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1326 xmlRegStatePtr from, xmlRegStatePtr to) {
1327 if (to == NULL) {
1328 to = xmlRegNewState(ctxt);
1329 xmlRegStatePush(ctxt, to);
1330 ctxt->state = to;
1331 }
1332 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1333}
1334
1335/**
1336 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001337 * @ctxt: a regexp parser context
1338 * @from: the from state
1339 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001340 * counter: the counter for that transition
1341 *
1342 */
1343static void
1344xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1345 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1346 if (to == NULL) {
1347 to = xmlRegNewState(ctxt);
1348 xmlRegStatePush(ctxt, to);
1349 ctxt->state = to;
1350 }
1351 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1352}
1353
1354/**
1355 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001356 * @ctxt: a regexp parser context
1357 * @from: the from state
1358 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001359 * counter: the counter for that transition
1360 *
1361 */
1362static void
1363xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1364 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1365 if (to == NULL) {
1366 to = xmlRegNewState(ctxt);
1367 xmlRegStatePush(ctxt, to);
1368 ctxt->state = to;
1369 }
1370 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1371}
1372
1373/**
1374 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001375 * @ctxt: a regexp parser context
1376 * @from: the from state
1377 * @to: the target state or NULL for building a new one
1378 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001379 *
William M. Brackddf71d62004-05-06 04:17:26 +00001380 * Returns 0 if success and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001381 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001382static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001383xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1384 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1385 if (atom == NULL) {
1386 ERROR("genrate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001387 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001388 }
1389 if (atom->type == XML_REGEXP_SUBREG) {
1390 /*
1391 * this is a subexpression handling one should not need to
William M. Brackddf71d62004-05-06 04:17:26 +00001392 * create a new node except for XML_REGEXP_QUANT_RANGE.
Daniel Veillard4255d502002-04-16 15:50:10 +00001393 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001394 if (xmlRegAtomPush(ctxt, atom) < 0) {
1395 return(-1);
1396 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001397 if ((to != NULL) && (atom->stop != to) &&
1398 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1399 /*
1400 * Generate an epsilon transition to link to the target
1401 */
1402 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1403 }
1404 switch (atom->quant) {
1405 case XML_REGEXP_QUANT_OPT:
1406 atom->quant = XML_REGEXP_QUANT_ONCE;
1407 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1408 break;
1409 case XML_REGEXP_QUANT_MULT:
1410 atom->quant = XML_REGEXP_QUANT_ONCE;
1411 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1412 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1413 break;
1414 case XML_REGEXP_QUANT_PLUS:
1415 atom->quant = XML_REGEXP_QUANT_ONCE;
1416 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1417 break;
1418 case XML_REGEXP_QUANT_RANGE: {
1419 int counter;
1420 xmlRegStatePtr newstate;
1421
1422 /*
1423 * This one is nasty:
William M. Brackddf71d62004-05-06 04:17:26 +00001424 * 1/ if range has minOccurs == 0, create a new state
1425 * and create epsilon transitions from atom->start
1426 * to atom->stop, as well as atom->start to the new
1427 * state
1428 * 2/ register a new counter
1429 * 3/ register an epsilon transition associated to
Daniel Veillard4255d502002-04-16 15:50:10 +00001430 * this counter going from atom->stop to atom->start
William M. Brackddf71d62004-05-06 04:17:26 +00001431 * 4/ create a new state
1432 * 5/ generate a counted transition from atom->stop to
Daniel Veillard4255d502002-04-16 15:50:10 +00001433 * that state
1434 */
William M. Brackddf71d62004-05-06 04:17:26 +00001435 if (atom->min == 0) {
1436 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1437 atom->stop);
1438 newstate = xmlRegNewState(ctxt);
1439 xmlRegStatePush(ctxt, newstate);
1440 ctxt->state = newstate;
1441 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1442 newstate);
1443 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001444 counter = xmlRegGetCounter(ctxt);
1445 ctxt->counters[counter].min = atom->min - 1;
1446 ctxt->counters[counter].max = atom->max - 1;
1447 atom->min = 0;
1448 atom->max = 0;
1449 atom->quant = XML_REGEXP_QUANT_ONCE;
1450 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1451 atom->start, counter);
1452 if (to != NULL) {
1453 newstate = to;
1454 } else {
1455 newstate = xmlRegNewState(ctxt);
1456 xmlRegStatePush(ctxt, newstate);
1457 ctxt->state = newstate;
1458 }
1459 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1460 newstate, counter);
1461 }
1462 default:
1463 break;
1464 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001465 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001466 } else {
1467 if (to == NULL) {
1468 to = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001469 if (to != NULL)
1470 xmlRegStatePush(ctxt, to);
1471 else {
1472 return(-1);
1473 }
1474 }
1475 if (xmlRegAtomPush(ctxt, atom) < 0) {
1476 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001477 }
1478 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001479 ctxt->state = to;
1480 }
1481 switch (atom->quant) {
1482 case XML_REGEXP_QUANT_OPT:
1483 atom->quant = XML_REGEXP_QUANT_ONCE;
1484 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1485 break;
1486 case XML_REGEXP_QUANT_MULT:
1487 atom->quant = XML_REGEXP_QUANT_ONCE;
1488 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1489 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1490 break;
1491 case XML_REGEXP_QUANT_PLUS:
1492 atom->quant = XML_REGEXP_QUANT_ONCE;
1493 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1494 break;
1495 default:
1496 break;
1497 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001498 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001499}
1500
1501/**
1502 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001503 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001504 * @fromnr: the from state
1505 * @tonr: the to state
William M. Brackddf71d62004-05-06 04:17:26 +00001506 * @counter: should that transition be associated to a counted
Daniel Veillard4255d502002-04-16 15:50:10 +00001507 *
1508 */
1509static void
1510xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1511 int tonr, int counter) {
1512 int transnr;
1513 xmlRegStatePtr from;
1514 xmlRegStatePtr to;
1515
1516#ifdef DEBUG_REGEXP_GRAPH
1517 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1518#endif
1519 from = ctxt->states[fromnr];
1520 if (from == NULL)
1521 return;
1522 to = ctxt->states[tonr];
1523 if (to == NULL)
1524 return;
1525 if ((to->mark == XML_REGEXP_MARK_START) ||
1526 (to->mark == XML_REGEXP_MARK_VISITED))
1527 return;
1528
1529 to->mark = XML_REGEXP_MARK_VISITED;
1530 if (to->type == XML_REGEXP_FINAL_STATE) {
1531#ifdef DEBUG_REGEXP_GRAPH
1532 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1533#endif
1534 from->type = XML_REGEXP_FINAL_STATE;
1535 }
1536 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1537 if (to->trans[transnr].atom == NULL) {
1538 /*
1539 * Don't remove counted transitions
1540 * Don't loop either
1541 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001542 if (to->trans[transnr].to != fromnr) {
1543 if (to->trans[transnr].count >= 0) {
1544 int newto = to->trans[transnr].to;
1545
1546 xmlRegStateAddTrans(ctxt, from, NULL,
1547 ctxt->states[newto],
1548 -1, to->trans[transnr].count);
1549 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001550#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001551 printf("Found epsilon trans %d from %d to %d\n",
1552 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001553#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001554 if (to->trans[transnr].counter >= 0) {
1555 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1556 to->trans[transnr].to,
1557 to->trans[transnr].counter);
1558 } else {
1559 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1560 to->trans[transnr].to,
1561 counter);
1562 }
1563 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001564 }
1565 } else {
1566 int newto = to->trans[transnr].to;
1567
Daniel Veillardb509f152002-04-17 16:28:10 +00001568 if (to->trans[transnr].counter >= 0) {
1569 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1570 ctxt->states[newto],
1571 to->trans[transnr].counter, -1);
1572 } else {
1573 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1574 ctxt->states[newto], counter, -1);
1575 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001576 }
1577 }
1578 to->mark = XML_REGEXP_MARK_NORMAL;
1579}
1580
1581/**
1582 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001583 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001584 *
1585 */
1586static void
1587xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1588 int statenr, transnr;
1589 xmlRegStatePtr state;
1590
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001591 if (ctxt->states == NULL) return;
1592
1593
Daniel Veillard4255d502002-04-16 15:50:10 +00001594 /*
1595 * build the completed transitions bypassing the epsilons
1596 * Use a marking algorithm to avoid loops
1597 */
1598 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1599 state = ctxt->states[statenr];
1600 if (state == NULL)
1601 continue;
1602 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1603 if ((state->trans[transnr].atom == NULL) &&
1604 (state->trans[transnr].to >= 0)) {
1605 if (state->trans[transnr].to == statenr) {
1606 state->trans[transnr].to = -1;
1607#ifdef DEBUG_REGEXP_GRAPH
1608 printf("Removed loopback epsilon trans %d on %d\n",
1609 transnr, statenr);
1610#endif
1611 } else if (state->trans[transnr].count < 0) {
1612 int newto = state->trans[transnr].to;
1613
1614#ifdef DEBUG_REGEXP_GRAPH
1615 printf("Found epsilon trans %d from %d to %d\n",
1616 transnr, statenr, newto);
1617#endif
1618 state->mark = XML_REGEXP_MARK_START;
1619 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1620 newto, state->trans[transnr].counter);
1621 state->mark = XML_REGEXP_MARK_NORMAL;
1622#ifdef DEBUG_REGEXP_GRAPH
1623 } else {
1624 printf("Found counted transition %d on %d\n",
1625 transnr, statenr);
1626#endif
1627 }
1628 }
1629 }
1630 }
1631 /*
1632 * Eliminate the epsilon transitions
1633 */
1634 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1635 state = ctxt->states[statenr];
1636 if (state == NULL)
1637 continue;
1638 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1639 if ((state->trans[transnr].atom == NULL) &&
1640 (state->trans[transnr].count < 0) &&
1641 (state->trans[transnr].to >= 0)) {
1642 state->trans[transnr].to = -1;
1643 }
1644 }
1645 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001646
1647 /*
1648 * Use this pass to detect unreachable states too
1649 */
1650 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1651 state = ctxt->states[statenr];
1652 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001653 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00001654 }
1655 state = ctxt->states[0];
1656 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001657 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001658 while (state != NULL) {
1659 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00001660 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00001661 /*
William M. Brackddf71d62004-05-06 04:17:26 +00001662 * Mark all states reachable from the current reachable state
Daniel Veillard23e73572002-09-19 19:56:43 +00001663 */
1664 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1665 if ((state->trans[transnr].to >= 0) &&
1666 ((state->trans[transnr].atom != NULL) ||
1667 (state->trans[transnr].count >= 0))) {
1668 int newto = state->trans[transnr].to;
1669
1670 if (ctxt->states[newto] == NULL)
1671 continue;
William M. Brack779af002003-08-01 15:55:39 +00001672 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1673 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001674 target = ctxt->states[newto];
1675 }
1676 }
1677 }
1678 /*
1679 * find the next accessible state not explored
1680 */
1681 if (target == NULL) {
1682 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1683 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001684 if ((state != NULL) && (state->reached ==
1685 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001686 target = state;
1687 break;
1688 }
1689 }
1690 }
1691 state = target;
1692 }
1693 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1694 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001695 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001696#ifdef DEBUG_REGEXP_GRAPH
1697 printf("Removed unreachable state %d\n", statenr);
1698#endif
1699 xmlRegFreeState(state);
1700 ctxt->states[statenr] = NULL;
1701 }
1702 }
1703
Daniel Veillard4255d502002-04-16 15:50:10 +00001704}
1705
Daniel Veillarde19fc232002-04-22 16:01:24 +00001706/**
1707 * xmlFACompareAtoms:
1708 * @atom1: an atom
1709 * @atom2: an atom
1710 *
William M. Brackddf71d62004-05-06 04:17:26 +00001711 * Compares two atoms to check whether they are equivalents
Daniel Veillarde19fc232002-04-22 16:01:24 +00001712 *
1713 * Returns 1 if yes and 0 otherwise
1714 */
1715static int
1716xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
1717 if (atom1 == atom2)
1718 return(1);
1719 if ((atom1 == NULL) || (atom2 == NULL))
1720 return(0);
1721
1722 if (atom1->type != atom2->type)
1723 return(0);
1724 switch (atom1->type) {
1725 case XML_REGEXP_STRING:
1726 return(xmlStrEqual((xmlChar *)atom1->valuep,
1727 (xmlChar *)atom2->valuep));
1728 case XML_REGEXP_EPSILON:
1729 return(1);
1730 case XML_REGEXP_CHARVAL:
1731 return(atom1->codepoint == atom2->codepoint);
1732 case XML_REGEXP_RANGES:
1733 TODO;
1734 return(0);
1735 default:
1736 break;
1737 }
1738 return(1);
1739}
1740
1741/**
1742 * xmlFARecurseDeterminism:
1743 * @ctxt: a regexp parser context
1744 *
1745 * Check whether the associated regexp is determinist,
1746 * should be called after xmlFAEliminateEpsilonTransitions()
1747 *
1748 */
1749static int
1750xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1751 int to, xmlRegAtomPtr atom) {
1752 int ret = 1;
1753 int transnr;
1754 xmlRegTransPtr t1;
1755
1756 if (state == NULL)
1757 return(ret);
1758 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1759 t1 = &(state->trans[transnr]);
1760 /*
1761 * check transitions conflicting with the one looked at
1762 */
1763 if (t1->atom == NULL) {
1764 if (t1->to == -1)
1765 continue;
1766 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1767 to, atom);
1768 if (ret == 0)
1769 return(0);
1770 continue;
1771 }
1772 if (t1->to != to)
1773 continue;
1774 if (xmlFACompareAtoms(t1->atom, atom))
1775 return(0);
1776 }
1777 return(ret);
1778}
1779
1780/**
1781 * xmlFAComputesDeterminism:
1782 * @ctxt: a regexp parser context
1783 *
1784 * Check whether the associated regexp is determinist,
1785 * should be called after xmlFAEliminateEpsilonTransitions()
1786 *
1787 */
1788static int
1789xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
1790 int statenr, transnr;
1791 xmlRegStatePtr state;
1792 xmlRegTransPtr t1, t2;
1793 int i;
1794 int ret = 1;
1795
Daniel Veillard4402ab42002-09-12 16:02:56 +00001796#ifdef DEBUG_REGEXP_GRAPH
1797 printf("xmlFAComputesDeterminism\n");
1798 xmlRegPrintCtxt(stdout, ctxt);
1799#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00001800 if (ctxt->determinist != -1)
1801 return(ctxt->determinist);
1802
1803 /*
William M. Brackddf71d62004-05-06 04:17:26 +00001804 * Check for all states that there aren't 2 transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00001805 * with the same atom and a different target.
1806 */
1807 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1808 state = ctxt->states[statenr];
1809 if (state == NULL)
1810 continue;
1811 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1812 t1 = &(state->trans[transnr]);
1813 /*
1814 * Determinism checks in case of counted or all transitions
1815 * will have to be handled separately
1816 */
1817 if (t1->atom == NULL)
1818 continue;
1819 if (t1->to == -1) /* eliminated */
1820 continue;
1821 for (i = 0;i < transnr;i++) {
1822 t2 = &(state->trans[i]);
1823 if (t2->to == -1) /* eliminated */
1824 continue;
1825 if (t2->atom != NULL) {
1826 if (t1->to == t2->to) {
1827 if (xmlFACompareAtoms(t1->atom, t2->atom))
William M. Brackddf71d62004-05-06 04:17:26 +00001828 t2->to = -1; /* eliminated */
Daniel Veillarde19fc232002-04-22 16:01:24 +00001829 } else {
1830 /* not determinist ! */
1831 if (xmlFACompareAtoms(t1->atom, t2->atom))
1832 ret = 0;
1833 }
1834 } else if (t1->to != -1) {
1835 /*
1836 * do the closure in case of remaining specific
1837 * epsilon transitions like choices or all
1838 */
1839 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1840 t2->to, t2->atom);
1841 if (ret == 0)
1842 return(0);
1843 }
1844 }
1845 if (ret == 0)
1846 break;
1847 }
1848 if (ret == 0)
1849 break;
1850 }
1851 ctxt->determinist = ret;
1852 return(ret);
1853}
1854
Daniel Veillard4255d502002-04-16 15:50:10 +00001855/************************************************************************
1856 * *
1857 * Routines to check input against transition atoms *
1858 * *
1859 ************************************************************************/
1860
1861static int
1862xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
1863 int start, int end, const xmlChar *blockName) {
1864 int ret = 0;
1865
1866 switch (type) {
1867 case XML_REGEXP_STRING:
1868 case XML_REGEXP_SUBREG:
1869 case XML_REGEXP_RANGES:
1870 case XML_REGEXP_EPSILON:
1871 return(-1);
1872 case XML_REGEXP_ANYCHAR:
1873 ret = ((codepoint != '\n') && (codepoint != '\r'));
1874 break;
1875 case XML_REGEXP_CHARVAL:
1876 ret = ((codepoint >= start) && (codepoint <= end));
1877 break;
1878 case XML_REGEXP_NOTSPACE:
1879 neg = !neg;
1880 case XML_REGEXP_ANYSPACE:
1881 ret = ((codepoint == '\n') || (codepoint == '\r') ||
1882 (codepoint == '\t') || (codepoint == ' '));
1883 break;
1884 case XML_REGEXP_NOTINITNAME:
1885 neg = !neg;
1886 case XML_REGEXP_INITNAME:
William M. Brack871611b2003-10-18 04:53:14 +00001887 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00001888 (codepoint == '_') || (codepoint == ':'));
1889 break;
1890 case XML_REGEXP_NOTNAMECHAR:
1891 neg = !neg;
1892 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00001893 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00001894 (codepoint == '.') || (codepoint == '-') ||
1895 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00001896 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00001897 break;
1898 case XML_REGEXP_NOTDECIMAL:
1899 neg = !neg;
1900 case XML_REGEXP_DECIMAL:
1901 ret = xmlUCSIsCatNd(codepoint);
1902 break;
1903 case XML_REGEXP_REALCHAR:
1904 neg = !neg;
1905 case XML_REGEXP_NOTREALCHAR:
1906 ret = xmlUCSIsCatP(codepoint);
1907 if (ret == 0)
1908 ret = xmlUCSIsCatZ(codepoint);
1909 if (ret == 0)
1910 ret = xmlUCSIsCatC(codepoint);
1911 break;
1912 case XML_REGEXP_LETTER:
1913 ret = xmlUCSIsCatL(codepoint);
1914 break;
1915 case XML_REGEXP_LETTER_UPPERCASE:
1916 ret = xmlUCSIsCatLu(codepoint);
1917 break;
1918 case XML_REGEXP_LETTER_LOWERCASE:
1919 ret = xmlUCSIsCatLl(codepoint);
1920 break;
1921 case XML_REGEXP_LETTER_TITLECASE:
1922 ret = xmlUCSIsCatLt(codepoint);
1923 break;
1924 case XML_REGEXP_LETTER_MODIFIER:
1925 ret = xmlUCSIsCatLm(codepoint);
1926 break;
1927 case XML_REGEXP_LETTER_OTHERS:
1928 ret = xmlUCSIsCatLo(codepoint);
1929 break;
1930 case XML_REGEXP_MARK:
1931 ret = xmlUCSIsCatM(codepoint);
1932 break;
1933 case XML_REGEXP_MARK_NONSPACING:
1934 ret = xmlUCSIsCatMn(codepoint);
1935 break;
1936 case XML_REGEXP_MARK_SPACECOMBINING:
1937 ret = xmlUCSIsCatMc(codepoint);
1938 break;
1939 case XML_REGEXP_MARK_ENCLOSING:
1940 ret = xmlUCSIsCatMe(codepoint);
1941 break;
1942 case XML_REGEXP_NUMBER:
1943 ret = xmlUCSIsCatN(codepoint);
1944 break;
1945 case XML_REGEXP_NUMBER_DECIMAL:
1946 ret = xmlUCSIsCatNd(codepoint);
1947 break;
1948 case XML_REGEXP_NUMBER_LETTER:
1949 ret = xmlUCSIsCatNl(codepoint);
1950 break;
1951 case XML_REGEXP_NUMBER_OTHERS:
1952 ret = xmlUCSIsCatNo(codepoint);
1953 break;
1954 case XML_REGEXP_PUNCT:
1955 ret = xmlUCSIsCatP(codepoint);
1956 break;
1957 case XML_REGEXP_PUNCT_CONNECTOR:
1958 ret = xmlUCSIsCatPc(codepoint);
1959 break;
1960 case XML_REGEXP_PUNCT_DASH:
1961 ret = xmlUCSIsCatPd(codepoint);
1962 break;
1963 case XML_REGEXP_PUNCT_OPEN:
1964 ret = xmlUCSIsCatPs(codepoint);
1965 break;
1966 case XML_REGEXP_PUNCT_CLOSE:
1967 ret = xmlUCSIsCatPe(codepoint);
1968 break;
1969 case XML_REGEXP_PUNCT_INITQUOTE:
1970 ret = xmlUCSIsCatPi(codepoint);
1971 break;
1972 case XML_REGEXP_PUNCT_FINQUOTE:
1973 ret = xmlUCSIsCatPf(codepoint);
1974 break;
1975 case XML_REGEXP_PUNCT_OTHERS:
1976 ret = xmlUCSIsCatPo(codepoint);
1977 break;
1978 case XML_REGEXP_SEPAR:
1979 ret = xmlUCSIsCatZ(codepoint);
1980 break;
1981 case XML_REGEXP_SEPAR_SPACE:
1982 ret = xmlUCSIsCatZs(codepoint);
1983 break;
1984 case XML_REGEXP_SEPAR_LINE:
1985 ret = xmlUCSIsCatZl(codepoint);
1986 break;
1987 case XML_REGEXP_SEPAR_PARA:
1988 ret = xmlUCSIsCatZp(codepoint);
1989 break;
1990 case XML_REGEXP_SYMBOL:
1991 ret = xmlUCSIsCatS(codepoint);
1992 break;
1993 case XML_REGEXP_SYMBOL_MATH:
1994 ret = xmlUCSIsCatSm(codepoint);
1995 break;
1996 case XML_REGEXP_SYMBOL_CURRENCY:
1997 ret = xmlUCSIsCatSc(codepoint);
1998 break;
1999 case XML_REGEXP_SYMBOL_MODIFIER:
2000 ret = xmlUCSIsCatSk(codepoint);
2001 break;
2002 case XML_REGEXP_SYMBOL_OTHERS:
2003 ret = xmlUCSIsCatSo(codepoint);
2004 break;
2005 case XML_REGEXP_OTHER:
2006 ret = xmlUCSIsCatC(codepoint);
2007 break;
2008 case XML_REGEXP_OTHER_CONTROL:
2009 ret = xmlUCSIsCatCc(codepoint);
2010 break;
2011 case XML_REGEXP_OTHER_FORMAT:
2012 ret = xmlUCSIsCatCf(codepoint);
2013 break;
2014 case XML_REGEXP_OTHER_PRIVATE:
2015 ret = xmlUCSIsCatCo(codepoint);
2016 break;
2017 case XML_REGEXP_OTHER_NA:
2018 /* ret = xmlUCSIsCatCn(codepoint); */
2019 /* Seems it doesn't exist anymore in recent Unicode releases */
2020 ret = 0;
2021 break;
2022 case XML_REGEXP_BLOCK_NAME:
2023 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2024 break;
2025 }
2026 if (neg)
2027 return(!ret);
2028 return(ret);
2029}
2030
2031static int
2032xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2033 int i, ret = 0;
2034 xmlRegRangePtr range;
2035
William M. Brack871611b2003-10-18 04:53:14 +00002036 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00002037 return(-1);
2038
2039 switch (atom->type) {
2040 case XML_REGEXP_SUBREG:
2041 case XML_REGEXP_EPSILON:
2042 return(-1);
2043 case XML_REGEXP_CHARVAL:
2044 return(codepoint == atom->codepoint);
2045 case XML_REGEXP_RANGES: {
2046 int accept = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002047
Daniel Veillard4255d502002-04-16 15:50:10 +00002048 for (i = 0;i < atom->nbRanges;i++) {
2049 range = atom->ranges[i];
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002050 if (range->neg == 2) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002051 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2052 0, range->start, range->end,
2053 range->blockName);
2054 if (ret != 0)
2055 return(0); /* excluded char */
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002056 } else if (range->neg) {
2057 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2058 0, range->start, range->end,
2059 range->blockName);
2060 if (ret == 0)
Daniel Veillardf2a12832003-11-24 13:04:35 +00002061 accept = 1;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002062 else
2063 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00002064 } else {
2065 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2066 0, range->start, range->end,
2067 range->blockName);
2068 if (ret != 0)
2069 accept = 1; /* might still be excluded */
2070 }
2071 }
2072 return(accept);
2073 }
2074 case XML_REGEXP_STRING:
2075 printf("TODO: XML_REGEXP_STRING\n");
2076 return(-1);
2077 case XML_REGEXP_ANYCHAR:
2078 case XML_REGEXP_ANYSPACE:
2079 case XML_REGEXP_NOTSPACE:
2080 case XML_REGEXP_INITNAME:
2081 case XML_REGEXP_NOTINITNAME:
2082 case XML_REGEXP_NAMECHAR:
2083 case XML_REGEXP_NOTNAMECHAR:
2084 case XML_REGEXP_DECIMAL:
2085 case XML_REGEXP_NOTDECIMAL:
2086 case XML_REGEXP_REALCHAR:
2087 case XML_REGEXP_NOTREALCHAR:
2088 case XML_REGEXP_LETTER:
2089 case XML_REGEXP_LETTER_UPPERCASE:
2090 case XML_REGEXP_LETTER_LOWERCASE:
2091 case XML_REGEXP_LETTER_TITLECASE:
2092 case XML_REGEXP_LETTER_MODIFIER:
2093 case XML_REGEXP_LETTER_OTHERS:
2094 case XML_REGEXP_MARK:
2095 case XML_REGEXP_MARK_NONSPACING:
2096 case XML_REGEXP_MARK_SPACECOMBINING:
2097 case XML_REGEXP_MARK_ENCLOSING:
2098 case XML_REGEXP_NUMBER:
2099 case XML_REGEXP_NUMBER_DECIMAL:
2100 case XML_REGEXP_NUMBER_LETTER:
2101 case XML_REGEXP_NUMBER_OTHERS:
2102 case XML_REGEXP_PUNCT:
2103 case XML_REGEXP_PUNCT_CONNECTOR:
2104 case XML_REGEXP_PUNCT_DASH:
2105 case XML_REGEXP_PUNCT_OPEN:
2106 case XML_REGEXP_PUNCT_CLOSE:
2107 case XML_REGEXP_PUNCT_INITQUOTE:
2108 case XML_REGEXP_PUNCT_FINQUOTE:
2109 case XML_REGEXP_PUNCT_OTHERS:
2110 case XML_REGEXP_SEPAR:
2111 case XML_REGEXP_SEPAR_SPACE:
2112 case XML_REGEXP_SEPAR_LINE:
2113 case XML_REGEXP_SEPAR_PARA:
2114 case XML_REGEXP_SYMBOL:
2115 case XML_REGEXP_SYMBOL_MATH:
2116 case XML_REGEXP_SYMBOL_CURRENCY:
2117 case XML_REGEXP_SYMBOL_MODIFIER:
2118 case XML_REGEXP_SYMBOL_OTHERS:
2119 case XML_REGEXP_OTHER:
2120 case XML_REGEXP_OTHER_CONTROL:
2121 case XML_REGEXP_OTHER_FORMAT:
2122 case XML_REGEXP_OTHER_PRIVATE:
2123 case XML_REGEXP_OTHER_NA:
2124 case XML_REGEXP_BLOCK_NAME:
2125 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
2126 (const xmlChar *)atom->valuep);
2127 if (atom->neg)
2128 ret = !ret;
2129 break;
2130 }
2131 return(ret);
2132}
2133
2134/************************************************************************
2135 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002136 * Saving and restoring state of an execution context *
Daniel Veillard4255d502002-04-16 15:50:10 +00002137 * *
2138 ************************************************************************/
2139
2140#ifdef DEBUG_REGEXP_EXEC
2141static void
2142xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
2143 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
2144 if (exec->inputStack != NULL) {
2145 int i;
2146 printf(": ");
2147 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
2148 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
2149 } else {
2150 printf(": %s", &(exec->inputString[exec->index]));
2151 }
2152 printf("\n");
2153}
2154#endif
2155
2156static void
2157xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
2158#ifdef DEBUG_REGEXP_EXEC
2159 printf("saving ");
2160 exec->transno++;
2161 xmlFARegDebugExec(exec);
2162 exec->transno--;
2163#endif
2164
2165 if (exec->maxRollbacks == 0) {
2166 exec->maxRollbacks = 4;
2167 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2168 sizeof(xmlRegExecRollback));
2169 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002170 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002171 exec->maxRollbacks = 0;
2172 return;
2173 }
2174 memset(exec->rollbacks, 0,
2175 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2176 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2177 xmlRegExecRollback *tmp;
2178 int len = exec->maxRollbacks;
2179
2180 exec->maxRollbacks *= 2;
2181 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2182 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2183 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002184 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002185 exec->maxRollbacks /= 2;
2186 return;
2187 }
2188 exec->rollbacks = tmp;
2189 tmp = &exec->rollbacks[len];
2190 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2191 }
2192 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2193 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2194 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2195 if (exec->comp->nbCounters > 0) {
2196 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2197 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2198 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2199 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002200 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002201 exec->status = -5;
2202 return;
2203 }
2204 }
2205 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2206 exec->comp->nbCounters * sizeof(int));
2207 }
2208 exec->nbRollbacks++;
2209}
2210
2211static void
2212xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2213 if (exec->nbRollbacks <= 0) {
2214 exec->status = -1;
2215#ifdef DEBUG_REGEXP_EXEC
2216 printf("rollback failed on empty stack\n");
2217#endif
2218 return;
2219 }
2220 exec->nbRollbacks--;
2221 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2222 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2223 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2224 if (exec->comp->nbCounters > 0) {
2225 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2226 fprintf(stderr, "exec save: allocation failed");
2227 exec->status = -6;
2228 return;
2229 }
2230 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2231 exec->comp->nbCounters * sizeof(int));
2232 }
2233
2234#ifdef DEBUG_REGEXP_EXEC
2235 printf("restored ");
2236 xmlFARegDebugExec(exec);
2237#endif
2238}
2239
2240/************************************************************************
2241 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002242 * Verifier, running an input against a compiled regexp *
Daniel Veillard4255d502002-04-16 15:50:10 +00002243 * *
2244 ************************************************************************/
2245
2246static int
2247xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2248 xmlRegExecCtxt execval;
2249 xmlRegExecCtxtPtr exec = &execval;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002250 int ret, codepoint = 0, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00002251
2252 exec->inputString = content;
2253 exec->index = 0;
2254 exec->determinist = 1;
2255 exec->maxRollbacks = 0;
2256 exec->nbRollbacks = 0;
2257 exec->rollbacks = NULL;
2258 exec->status = 0;
2259 exec->comp = comp;
2260 exec->state = comp->states[0];
2261 exec->transno = 0;
2262 exec->transcount = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002263 exec->inputStack = NULL;
2264 exec->inputStackMax = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00002265 if (comp->nbCounters > 0) {
2266 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00002267 if (exec->counts == NULL) {
2268 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002269 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00002270 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002271 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2272 } else
2273 exec->counts = NULL;
2274 while ((exec->status == 0) &&
2275 ((exec->inputString[exec->index] != 0) ||
2276 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2277 xmlRegTransPtr trans;
2278 xmlRegAtomPtr atom;
2279
2280 /*
William M. Brack0e00b282004-04-26 15:40:47 +00002281 * If end of input on non-terminal state, rollback, however we may
Daniel Veillard4255d502002-04-16 15:50:10 +00002282 * still have epsilon like transition for counted transitions
William M. Brack0e00b282004-04-26 15:40:47 +00002283 * on counters, in that case don't break too early. Additionally,
2284 * if we are working on a range like "AB{0,2}", where B is not present,
2285 * we don't want to break.
Daniel Veillard4255d502002-04-16 15:50:10 +00002286 */
William M. Brack0e00b282004-04-26 15:40:47 +00002287 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
William M. Brackddf71d62004-05-06 04:17:26 +00002288 /*
2289 * if there is a transition, we must check if
2290 * atom allows minOccurs of 0
2291 */
2292 if (exec->transno < exec->state->nbTrans) {
William M. Brack0e00b282004-04-26 15:40:47 +00002293 trans = &exec->state->trans[exec->transno];
2294 if (trans->to >=0) {
2295 atom = trans->atom;
2296 if (!((atom->min == 0) && (atom->max > 0)))
2297 goto rollback;
2298 }
2299 } else
2300 goto rollback;
2301 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002302
2303 exec->transcount = 0;
2304 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2305 trans = &exec->state->trans[exec->transno];
2306 if (trans->to < 0)
2307 continue;
2308 atom = trans->atom;
2309 ret = 0;
2310 if (trans->count >= 0) {
2311 int count;
2312 xmlRegCounterPtr counter;
2313
2314 /*
2315 * A counted transition.
2316 */
2317
2318 count = exec->counts[trans->count];
2319 counter = &exec->comp->counters[trans->count];
2320#ifdef DEBUG_REGEXP_EXEC
2321 printf("testing count %d: val %d, min %d, max %d\n",
2322 trans->count, count, counter->min, counter->max);
2323#endif
2324 ret = ((count >= counter->min) && (count <= counter->max));
2325 } else if (atom == NULL) {
2326 fprintf(stderr, "epsilon transition left at runtime\n");
2327 exec->status = -2;
2328 break;
2329 } else if (exec->inputString[exec->index] != 0) {
2330 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2331 ret = xmlRegCheckCharacter(atom, codepoint);
William M. Brack0e00b282004-04-26 15:40:47 +00002332 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002333 xmlRegStatePtr to = comp->states[trans->to];
2334
2335 /*
2336 * this is a multiple input sequence
2337 */
2338 if (exec->state->nbTrans > exec->transno + 1) {
2339 xmlFARegExecSave(exec);
2340 }
2341 exec->transcount = 1;
2342 do {
2343 /*
2344 * Try to progress as much as possible on the input
2345 */
2346 if (exec->transcount == atom->max) {
2347 break;
2348 }
2349 exec->index += len;
2350 /*
2351 * End of input: stop here
2352 */
2353 if (exec->inputString[exec->index] == 0) {
2354 exec->index -= len;
2355 break;
2356 }
2357 if (exec->transcount >= atom->min) {
2358 int transno = exec->transno;
2359 xmlRegStatePtr state = exec->state;
2360
2361 /*
2362 * The transition is acceptable save it
2363 */
2364 exec->transno = -1; /* trick */
2365 exec->state = to;
2366 xmlFARegExecSave(exec);
2367 exec->transno = transno;
2368 exec->state = state;
2369 }
2370 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2371 len);
2372 ret = xmlRegCheckCharacter(atom, codepoint);
2373 exec->transcount++;
2374 } while (ret == 1);
2375 if (exec->transcount < atom->min)
2376 ret = 0;
2377
2378 /*
2379 * If the last check failed but one transition was found
2380 * possible, rollback
2381 */
2382 if (ret < 0)
2383 ret = 0;
2384 if (ret == 0) {
2385 goto rollback;
2386 }
William M. Brack0e00b282004-04-26 15:40:47 +00002387 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
2388 /*
2389 * we don't match on the codepoint, but minOccurs of 0
2390 * says that's ok. Setting len to 0 inhibits stepping
2391 * over the codepoint.
2392 */
2393 exec->transcount = 1;
2394 len = 0;
2395 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00002396 }
William M. Brack0e00b282004-04-26 15:40:47 +00002397 } else if ((atom->min == 0) && (atom->max > 0)) {
2398 /* another spot to match when minOccurs is 0 */
2399 exec->transcount = 1;
2400 len = 0;
2401 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00002402 }
2403 if (ret == 1) {
2404 if (exec->state->nbTrans > exec->transno + 1) {
2405 xmlFARegExecSave(exec);
2406 }
2407 if (trans->counter >= 0) {
2408#ifdef DEBUG_REGEXP_EXEC
2409 printf("Increasing count %d\n", trans->counter);
2410#endif
2411 exec->counts[trans->counter]++;
2412 }
2413#ifdef DEBUG_REGEXP_EXEC
2414 printf("entering state %d\n", trans->to);
2415#endif
2416 exec->state = comp->states[trans->to];
2417 exec->transno = 0;
2418 if (trans->atom != NULL) {
2419 exec->index += len;
2420 }
2421 goto progress;
2422 } else if (ret < 0) {
2423 exec->status = -4;
2424 break;
2425 }
2426 }
2427 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2428rollback:
2429 /*
2430 * Failed to find a way out
2431 */
2432 exec->determinist = 0;
2433 xmlFARegExecRollBack(exec);
2434 }
2435progress:
2436 continue;
2437 }
2438 if (exec->rollbacks != NULL) {
2439 if (exec->counts != NULL) {
2440 int i;
2441
2442 for (i = 0;i < exec->maxRollbacks;i++)
2443 if (exec->rollbacks[i].counts != NULL)
2444 xmlFree(exec->rollbacks[i].counts);
2445 }
2446 xmlFree(exec->rollbacks);
2447 }
2448 if (exec->counts != NULL)
2449 xmlFree(exec->counts);
2450 if (exec->status == 0)
2451 return(1);
2452 if (exec->status == -1)
2453 return(0);
2454 return(exec->status);
2455}
2456
2457/************************************************************************
2458 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002459 * Progressive interface to the verifier one atom at a time *
Daniel Veillard4255d502002-04-16 15:50:10 +00002460 * *
2461 ************************************************************************/
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002462#ifdef DEBUG_ERR
2463static void testerr(xmlRegExecCtxtPtr exec);
2464#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00002465
2466/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002467 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00002468 * @comp: a precompiled regular expression
2469 * @callback: a callback function used for handling progresses in the
2470 * automata matching phase
2471 * @data: the context data associated to the callback in this context
2472 *
2473 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00002474 *
2475 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00002476 */
2477xmlRegExecCtxtPtr
2478xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
2479 xmlRegExecCtxtPtr exec;
2480
2481 if (comp == NULL)
2482 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00002483 if ((comp->compact == NULL) && (comp->states == NULL))
2484 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00002485 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
2486 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002487 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00002488 return(NULL);
2489 }
2490 memset(exec, 0, sizeof(xmlRegExecCtxt));
2491 exec->inputString = NULL;
2492 exec->index = 0;
2493 exec->determinist = 1;
2494 exec->maxRollbacks = 0;
2495 exec->nbRollbacks = 0;
2496 exec->rollbacks = NULL;
2497 exec->status = 0;
2498 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00002499 if (comp->compact == NULL)
2500 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00002501 exec->transno = 0;
2502 exec->transcount = 0;
2503 exec->callback = callback;
2504 exec->data = data;
2505 if (comp->nbCounters > 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002506 /*
2507 * For error handling, exec->counts is allocated twice the size
2508 * the second half is used to store the data in case of rollback
2509 */
2510 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
2511 * 2);
Daniel Veillard4255d502002-04-16 15:50:10 +00002512 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002513 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00002514 xmlFree(exec);
2515 return(NULL);
2516 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002517 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
2518 exec->errCounts = &exec->counts[comp->nbCounters];
2519 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00002520 exec->counts = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002521 exec->errCounts = NULL;
2522 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002523 exec->inputStackMax = 0;
2524 exec->inputStackNr = 0;
2525 exec->inputStack = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002526 exec->errStateNo = -1;
2527 exec->errString = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +00002528 return(exec);
2529}
2530
2531/**
2532 * xmlRegFreeExecCtxt:
2533 * @exec: a regular expression evaulation context
2534 *
2535 * Free the structures associated to a regular expression evaulation context.
2536 */
2537void
2538xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
2539 if (exec == NULL)
2540 return;
2541
2542 if (exec->rollbacks != NULL) {
2543 if (exec->counts != NULL) {
2544 int i;
2545
2546 for (i = 0;i < exec->maxRollbacks;i++)
2547 if (exec->rollbacks[i].counts != NULL)
2548 xmlFree(exec->rollbacks[i].counts);
2549 }
2550 xmlFree(exec->rollbacks);
2551 }
2552 if (exec->counts != NULL)
2553 xmlFree(exec->counts);
2554 if (exec->inputStack != NULL) {
2555 int i;
2556
Daniel Veillard32370232002-10-16 14:08:14 +00002557 for (i = 0;i < exec->inputStackNr;i++) {
2558 if (exec->inputStack[i].value != NULL)
2559 xmlFree(exec->inputStack[i].value);
2560 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002561 xmlFree(exec->inputStack);
2562 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002563 if (exec->errString != NULL)
2564 xmlFree(exec->errString);
Daniel Veillard4255d502002-04-16 15:50:10 +00002565 xmlFree(exec);
2566}
2567
2568static void
2569xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2570 void *data) {
2571#ifdef DEBUG_PUSH
2572 printf("saving value: %d:%s\n", exec->inputStackNr, value);
2573#endif
2574 if (exec->inputStackMax == 0) {
2575 exec->inputStackMax = 4;
2576 exec->inputStack = (xmlRegInputTokenPtr)
2577 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
2578 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002579 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00002580 exec->inputStackMax = 0;
2581 return;
2582 }
2583 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
2584 xmlRegInputTokenPtr tmp;
2585
2586 exec->inputStackMax *= 2;
2587 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
2588 exec->inputStackMax * sizeof(xmlRegInputToken));
2589 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002590 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00002591 exec->inputStackMax /= 2;
2592 return;
2593 }
2594 exec->inputStack = tmp;
2595 }
2596 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
2597 exec->inputStack[exec->inputStackNr].data = data;
2598 exec->inputStackNr++;
2599 exec->inputStack[exec->inputStackNr].value = NULL;
2600 exec->inputStack[exec->inputStackNr].data = NULL;
2601}
2602
Daniel Veillardc0826a72004-08-10 14:17:33 +00002603/**
2604 * xmlRegStrEqualWildcard:
2605 * @expStr: the string to be evaluated
2606 * @valStr: the validation string
2607 *
2608 * Checks if both strings are equal or have the same content. "*"
2609 * can be used as a wildcard in @valStr; "|" is used as a seperator of
2610 * substrings in both @expStr and @valStr.
2611 *
2612 * Returns 1 if the comparison is satisfied and the number of substrings
2613 * is equal, 0 otherwise.
2614 */
2615
2616static int
2617xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
2618 if (expStr == valStr) return(1);
2619 if (expStr == NULL) return(0);
2620 if (valStr == NULL) return(0);
2621 do {
2622 /*
2623 * Eval if we have a wildcard for the current item.
2624 */
2625 if (*expStr != *valStr) {
2626 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
2627 do {
2628 if (*valStr == XML_REG_STRING_SEPARATOR)
2629 break;
2630 *valStr++;
2631 } while (*valStr != 0);
2632 continue;
2633 } else
2634 return(0);
2635 }
2636 *expStr++;
2637 *valStr++;
2638 } while (*valStr != 0);
2639 if (*expStr != 0)
2640 return (0);
2641 else
2642 return (1);
2643}
Daniel Veillard4255d502002-04-16 15:50:10 +00002644
2645/**
Daniel Veillard23e73572002-09-19 19:56:43 +00002646 * xmlRegCompactPushString:
2647 * @exec: a regexp execution context
2648 * @comp: the precompiled exec with a compact table
2649 * @value: a string token input
2650 * @data: data associated to the token to reuse in callbacks
2651 *
2652 * Push one input token in the execution context
2653 *
2654 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2655 * a negative value in case of error.
2656 */
2657static int
2658xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
2659 xmlRegexpPtr comp,
2660 const xmlChar *value,
2661 void *data) {
2662 int state = exec->index;
2663 int i, target;
2664
2665 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
2666 return(-1);
2667
2668 if (value == NULL) {
2669 /*
2670 * are we at a final state ?
2671 */
2672 if (comp->compact[state * (comp->nbstrings + 1)] ==
2673 XML_REGEXP_FINAL_STATE)
2674 return(1);
2675 return(0);
2676 }
2677
2678#ifdef DEBUG_PUSH
2679 printf("value pushed: %s\n", value);
2680#endif
2681
2682 /*
William M. Brackddf71d62004-05-06 04:17:26 +00002683 * Examine all outside transitions from current state
Daniel Veillard23e73572002-09-19 19:56:43 +00002684 */
2685 for (i = 0;i < comp->nbstrings;i++) {
2686 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
2687 if ((target > 0) && (target <= comp->nbstates)) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00002688 target--; /* to avoid 0 */
2689 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
2690 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00002691 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
2692 exec->callback(exec->data, value,
2693 comp->transdata[state * comp->nbstrings + i], data);
2694 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002695#ifdef DEBUG_PUSH
2696 printf("entering state %d\n", target);
2697#endif
2698 if (comp->compact[target * (comp->nbstrings + 1)] ==
2699 XML_REGEXP_FINAL_STATE)
2700 return(1);
2701 return(0);
2702 }
2703 }
2704 }
2705 /*
2706 * Failed to find an exit transition out from current state for the
2707 * current token
2708 */
2709#ifdef DEBUG_PUSH
2710 printf("failed to find a transition for %s on state %d\n", value, state);
2711#endif
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002712 if (exec->errString != NULL)
2713 xmlFree(exec->errString);
2714 exec->errString = xmlStrdup(value);
2715 exec->errStateNo = state;
Daniel Veillard23e73572002-09-19 19:56:43 +00002716 exec->status = -1;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002717#ifdef DEBUG_ERR
2718 testerr(exec);
2719#endif
Daniel Veillard23e73572002-09-19 19:56:43 +00002720 return(-1);
2721}
2722
2723/**
Daniel Veillard4255d502002-04-16 15:50:10 +00002724 * xmlRegExecPushString:
Daniel Veillardea7751d2002-12-20 00:16:24 +00002725 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00002726 * @value: a string token input
2727 * @data: data associated to the token to reuse in callbacks
2728 *
2729 * Push one input token in the execution context
2730 *
2731 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2732 * a negative value in case of error.
2733 */
2734int
2735xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2736 void *data) {
2737 xmlRegTransPtr trans;
2738 xmlRegAtomPtr atom;
2739 int ret;
2740 int final = 0;
Daniel Veillard90700152005-01-08 22:05:09 +00002741 int progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00002742
2743 if (exec == NULL)
2744 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00002745 if (exec->comp == NULL)
2746 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00002747 if (exec->status != 0)
2748 return(exec->status);
2749
Daniel Veillard23e73572002-09-19 19:56:43 +00002750 if (exec->comp->compact != NULL)
2751 return(xmlRegCompactPushString(exec, exec->comp, value, data));
2752
Daniel Veillard4255d502002-04-16 15:50:10 +00002753 if (value == NULL) {
2754 if (exec->state->type == XML_REGEXP_FINAL_STATE)
2755 return(1);
2756 final = 1;
2757 }
2758
2759#ifdef DEBUG_PUSH
2760 printf("value pushed: %s\n", value);
2761#endif
2762 /*
2763 * If we have an active rollback stack push the new value there
2764 * and get back to where we were left
2765 */
2766 if ((value != NULL) && (exec->inputStackNr > 0)) {
2767 xmlFARegExecSaveInputString(exec, value, data);
2768 value = exec->inputStack[exec->index].value;
2769 data = exec->inputStack[exec->index].data;
2770#ifdef DEBUG_PUSH
2771 printf("value loaded: %s\n", value);
2772#endif
2773 }
2774
2775 while ((exec->status == 0) &&
2776 ((value != NULL) ||
2777 ((final == 1) &&
2778 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
2779
2780 /*
2781 * End of input on non-terminal state, rollback, however we may
2782 * still have epsilon like transition for counted transitions
2783 * on counters, in that case don't break too early.
2784 */
Daniel Veillardb509f152002-04-17 16:28:10 +00002785 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00002786 goto rollback;
2787
2788 exec->transcount = 0;
2789 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2790 trans = &exec->state->trans[exec->transno];
2791 if (trans->to < 0)
2792 continue;
2793 atom = trans->atom;
2794 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00002795 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
2796 int i;
2797 int count;
2798 xmlRegTransPtr t;
2799 xmlRegCounterPtr counter;
2800
2801 ret = 0;
2802
2803#ifdef DEBUG_PUSH
2804 printf("testing all lax %d\n", trans->count);
2805#endif
2806 /*
2807 * Check all counted transitions from the current state
2808 */
2809 if ((value == NULL) && (final)) {
2810 ret = 1;
2811 } else if (value != NULL) {
2812 for (i = 0;i < exec->state->nbTrans;i++) {
2813 t = &exec->state->trans[i];
2814 if ((t->counter < 0) || (t == trans))
2815 continue;
2816 counter = &exec->comp->counters[t->counter];
2817 count = exec->counts[t->counter];
2818 if ((count < counter->max) &&
2819 (t->atom != NULL) &&
2820 (xmlStrEqual(value, t->atom->valuep))) {
2821 ret = 0;
2822 break;
2823 }
2824 if ((count >= counter->min) &&
2825 (count < counter->max) &&
2826 (xmlStrEqual(value, t->atom->valuep))) {
2827 ret = 1;
2828 break;
2829 }
2830 }
2831 }
2832 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00002833 int i;
2834 int count;
2835 xmlRegTransPtr t;
2836 xmlRegCounterPtr counter;
2837
2838 ret = 1;
2839
2840#ifdef DEBUG_PUSH
2841 printf("testing all %d\n", trans->count);
2842#endif
2843 /*
2844 * Check all counted transitions from the current state
2845 */
2846 for (i = 0;i < exec->state->nbTrans;i++) {
2847 t = &exec->state->trans[i];
2848 if ((t->counter < 0) || (t == trans))
2849 continue;
2850 counter = &exec->comp->counters[t->counter];
2851 count = exec->counts[t->counter];
2852 if ((count < counter->min) || (count > counter->max)) {
2853 ret = 0;
2854 break;
2855 }
2856 }
2857 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002858 int count;
2859 xmlRegCounterPtr counter;
2860
2861 /*
2862 * A counted transition.
2863 */
2864
2865 count = exec->counts[trans->count];
2866 counter = &exec->comp->counters[trans->count];
2867#ifdef DEBUG_PUSH
2868 printf("testing count %d: val %d, min %d, max %d\n",
2869 trans->count, count, counter->min, counter->max);
2870#endif
2871 ret = ((count >= counter->min) && (count <= counter->max));
2872 } else if (atom == NULL) {
2873 fprintf(stderr, "epsilon transition left at runtime\n");
2874 exec->status = -2;
2875 break;
2876 } else if (value != NULL) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00002877 ret = xmlRegStrEqualWildcard(atom->valuep, value);
Daniel Veillard441bc322002-04-20 17:38:48 +00002878 if ((ret == 1) && (trans->counter >= 0)) {
2879 xmlRegCounterPtr counter;
2880 int count;
2881
2882 count = exec->counts[trans->counter];
2883 counter = &exec->comp->counters[trans->counter];
2884 if (count >= counter->max)
2885 ret = 0;
2886 }
2887
Daniel Veillard4255d502002-04-16 15:50:10 +00002888 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2889 xmlRegStatePtr to = exec->comp->states[trans->to];
2890
2891 /*
2892 * this is a multiple input sequence
2893 */
2894 if (exec->state->nbTrans > exec->transno + 1) {
2895 if (exec->inputStackNr <= 0) {
2896 xmlFARegExecSaveInputString(exec, value, data);
2897 }
2898 xmlFARegExecSave(exec);
2899 }
2900 exec->transcount = 1;
2901 do {
2902 /*
2903 * Try to progress as much as possible on the input
2904 */
2905 if (exec->transcount == atom->max) {
2906 break;
2907 }
2908 exec->index++;
2909 value = exec->inputStack[exec->index].value;
2910 data = exec->inputStack[exec->index].data;
2911#ifdef DEBUG_PUSH
2912 printf("value loaded: %s\n", value);
2913#endif
2914
2915 /*
2916 * End of input: stop here
2917 */
2918 if (value == NULL) {
2919 exec->index --;
2920 break;
2921 }
2922 if (exec->transcount >= atom->min) {
2923 int transno = exec->transno;
2924 xmlRegStatePtr state = exec->state;
2925
2926 /*
2927 * The transition is acceptable save it
2928 */
2929 exec->transno = -1; /* trick */
2930 exec->state = to;
2931 if (exec->inputStackNr <= 0) {
2932 xmlFARegExecSaveInputString(exec, value, data);
2933 }
2934 xmlFARegExecSave(exec);
2935 exec->transno = transno;
2936 exec->state = state;
2937 }
2938 ret = xmlStrEqual(value, atom->valuep);
2939 exec->transcount++;
2940 } while (ret == 1);
2941 if (exec->transcount < atom->min)
2942 ret = 0;
2943
2944 /*
2945 * If the last check failed but one transition was found
2946 * possible, rollback
2947 */
2948 if (ret < 0)
2949 ret = 0;
2950 if (ret == 0) {
2951 goto rollback;
2952 }
2953 }
2954 }
2955 if (ret == 1) {
William M. Brack98873952003-12-26 06:03:14 +00002956 if ((exec->callback != NULL) && (atom != NULL) &&
2957 (data != NULL)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002958 exec->callback(exec->data, atom->valuep,
2959 atom->data, data);
2960 }
2961 if (exec->state->nbTrans > exec->transno + 1) {
2962 if (exec->inputStackNr <= 0) {
2963 xmlFARegExecSaveInputString(exec, value, data);
2964 }
2965 xmlFARegExecSave(exec);
2966 }
2967 if (trans->counter >= 0) {
2968#ifdef DEBUG_PUSH
2969 printf("Increasing count %d\n", trans->counter);
2970#endif
2971 exec->counts[trans->counter]++;
2972 }
2973#ifdef DEBUG_PUSH
2974 printf("entering state %d\n", trans->to);
2975#endif
2976 exec->state = exec->comp->states[trans->to];
2977 exec->transno = 0;
2978 if (trans->atom != NULL) {
2979 if (exec->inputStack != NULL) {
2980 exec->index++;
2981 if (exec->index < exec->inputStackNr) {
2982 value = exec->inputStack[exec->index].value;
2983 data = exec->inputStack[exec->index].data;
2984#ifdef DEBUG_PUSH
2985 printf("value loaded: %s\n", value);
2986#endif
2987 } else {
2988 value = NULL;
2989 data = NULL;
2990#ifdef DEBUG_PUSH
2991 printf("end of input\n");
2992#endif
2993 }
2994 } else {
2995 value = NULL;
2996 data = NULL;
2997#ifdef DEBUG_PUSH
2998 printf("end of input\n");
2999#endif
3000 }
3001 }
3002 goto progress;
3003 } else if (ret < 0) {
3004 exec->status = -4;
3005 break;
3006 }
3007 }
3008 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3009rollback:
Daniel Veillard90700152005-01-08 22:05:09 +00003010 /*
3011 * if we didn't yet rollback on the current input store
3012 * the current state as the error state.
3013 */
3014 if (progress) {
3015 progress = 0;
3016 if (exec->errString != NULL)
3017 xmlFree(exec->errString);
3018 exec->errString = xmlStrdup(value);
3019 exec->errState = exec->state;
3020 memcpy(exec->errCounts, exec->counts,
3021 exec->comp->nbCounters * sizeof(int));
3022 }
3023
Daniel Veillard4255d502002-04-16 15:50:10 +00003024 /*
3025 * Failed to find a way out
3026 */
3027 exec->determinist = 0;
3028 xmlFARegExecRollBack(exec);
3029 if (exec->status == 0) {
3030 value = exec->inputStack[exec->index].value;
3031 data = exec->inputStack[exec->index].data;
3032#ifdef DEBUG_PUSH
3033 printf("value loaded: %s\n", value);
3034#endif
3035 }
3036 }
Daniel Veillard90700152005-01-08 22:05:09 +00003037 continue;
Daniel Veillard4255d502002-04-16 15:50:10 +00003038progress:
Daniel Veillard90700152005-01-08 22:05:09 +00003039 progress = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003040 continue;
3041 }
3042 if (exec->status == 0) {
3043 return(exec->state->type == XML_REGEXP_FINAL_STATE);
3044 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003045#ifdef DEBUG_ERR
Daniel Veillard90700152005-01-08 22:05:09 +00003046 if (exec->status < 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003047 testerr(exec);
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003048 }
Daniel Veillard90700152005-01-08 22:05:09 +00003049#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00003050 return(exec->status);
3051}
3052
Daniel Veillard52b48c72003-04-13 19:53:42 +00003053/**
3054 * xmlRegExecPushString2:
3055 * @exec: a regexp execution context or NULL to indicate the end
3056 * @value: the first string token input
3057 * @value2: the second string token input
3058 * @data: data associated to the token to reuse in callbacks
3059 *
3060 * Push one input token in the execution context
3061 *
3062 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3063 * a negative value in case of error.
3064 */
3065int
3066xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
3067 const xmlChar *value2, void *data) {
3068 xmlChar buf[150];
3069 int lenn, lenp, ret;
3070 xmlChar *str;
3071
3072 if (exec == NULL)
3073 return(-1);
3074 if (exec->comp == NULL)
3075 return(-1);
3076 if (exec->status != 0)
3077 return(exec->status);
3078
3079 if (value2 == NULL)
3080 return(xmlRegExecPushString(exec, value, data));
3081
3082 lenn = strlen((char *) value2);
3083 lenp = strlen((char *) value);
3084
3085 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003086 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003087 if (str == NULL) {
3088 exec->status = -1;
3089 return(-1);
3090 }
3091 } else {
3092 str = buf;
3093 }
3094 memcpy(&str[0], value, lenp);
Daniel Veillardc0826a72004-08-10 14:17:33 +00003095 str[lenp] = XML_REG_STRING_SEPARATOR;
Daniel Veillard52b48c72003-04-13 19:53:42 +00003096 memcpy(&str[lenp + 1], value2, lenn);
3097 str[lenn + lenp + 1] = 0;
3098
3099 if (exec->comp->compact != NULL)
3100 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
3101 else
3102 ret = xmlRegExecPushString(exec, str, data);
3103
3104 if (str != buf)
3105 xmlFree(buf);
3106 return(ret);
3107}
3108
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003109/**
3110 * xmlRegExecErrInfo:
3111 * @exec: a regexp execution context generating an error
3112 * @string: return value for the error string
3113 * @nbval: pointer to the number of accepted values IN/OUT
3114 * @values: pointer to the array of acceptable values
3115 *
3116 * Extract error informations from the regexp execution, the parameter
3117 * @string will be updated with the value pushed and not accepted,
3118 * the parameter @values must point to an array of @nbval string pointers
3119 * on return nbval will contain the number of possible strings in that
3120 * state and the @values array will be updated with them. The string values
Daniel Veillard90700152005-01-08 22:05:09 +00003121 * returned will be freed with the @exec context and don't need to be
3122 * deallocated.
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003123 *
3124 * Returns: 0 in case of success or -1 in case of error.
3125 */
3126int
3127xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
3128 int *nbval, xmlChar **values) {
3129 int maxval;
3130
3131 if (exec == NULL)
3132 return(-1);
3133 if (string != NULL) {
3134 if (exec->status != 0)
3135 *string = exec->errString;
3136 else
3137 *string = NULL;
3138 }
3139 if ((nbval == NULL) || (values == NULL) || (*nbval <= 0))
3140 return(-1);
3141 maxval = *nbval;
3142 *nbval = 0;
3143 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
3144 xmlRegexpPtr comp;
3145 int target, i, state;
3146
3147 comp = exec->comp;
3148 if (exec->errStateNo == -1) return(-1);
3149 state = exec->errStateNo;
3150 for (i = 0;(i < comp->nbstrings) && (*nbval < maxval);i++) {
3151 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3152 if ((target > 0) && (target <= comp->nbstates)) {
3153 values[*nbval] = comp->stringMap[i];
3154 (*nbval)++;
3155 }
3156 }
3157 } else {
3158 int transno;
3159 xmlRegTransPtr trans;
3160 xmlRegAtomPtr atom;
3161
3162 if (exec->errState == NULL) return(-1);
3163 for (transno = 0;
3164 (transno < exec->errState->nbTrans) && (*nbval < maxval);
3165 transno++) {
3166 trans = &exec->errState->trans[transno];
3167 if (trans->to < 0)
3168 continue;
3169 atom = trans->atom;
3170 if ((atom == NULL) || (atom->valuep == NULL))
3171 continue;
3172 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3173 TODO;
3174 } else if (trans->count == REGEXP_ALL_COUNTER) {
3175 TODO;
3176 } else if (trans->counter >= 0) {
3177 xmlRegCounterPtr counter;
3178 int count;
3179
Daniel Veillard90700152005-01-08 22:05:09 +00003180 count = exec->errCounts[trans->counter];
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003181 counter = &exec->comp->counters[trans->counter];
3182 if (count < counter->max) {
3183 values[*nbval] = (const xmlChar *) atom->valuep;
3184 (*nbval)++;
3185 }
3186 } else {
3187 values[*nbval] = (const xmlChar *) atom->valuep;
3188 (*nbval)++;
3189 }
3190 }
3191 }
3192 return(0);
3193}
3194
3195#ifdef DEBUG_ERR
3196static void testerr(xmlRegExecCtxtPtr exec) {
3197 const xmlChar *string;
3198 const xmlChar *values[5];
3199 int nb = 5;
3200 xmlRegExecErrInfo(exec, &string, &nb, &values[0]);
3201}
3202#endif
3203
Daniel Veillard4255d502002-04-16 15:50:10 +00003204#if 0
3205static int
3206xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
3207 xmlRegTransPtr trans;
3208 xmlRegAtomPtr atom;
3209 int ret;
3210 int codepoint, len;
3211
3212 if (exec == NULL)
3213 return(-1);
3214 if (exec->status != 0)
3215 return(exec->status);
3216
3217 while ((exec->status == 0) &&
3218 ((exec->inputString[exec->index] != 0) ||
3219 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
3220
3221 /*
3222 * End of input on non-terminal state, rollback, however we may
3223 * still have epsilon like transition for counted transitions
3224 * on counters, in that case don't break too early.
3225 */
3226 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
3227 goto rollback;
3228
3229 exec->transcount = 0;
3230 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3231 trans = &exec->state->trans[exec->transno];
3232 if (trans->to < 0)
3233 continue;
3234 atom = trans->atom;
3235 ret = 0;
3236 if (trans->count >= 0) {
3237 int count;
3238 xmlRegCounterPtr counter;
3239
3240 /*
3241 * A counted transition.
3242 */
3243
3244 count = exec->counts[trans->count];
3245 counter = &exec->comp->counters[trans->count];
3246#ifdef DEBUG_REGEXP_EXEC
3247 printf("testing count %d: val %d, min %d, max %d\n",
3248 trans->count, count, counter->min, counter->max);
3249#endif
3250 ret = ((count >= counter->min) && (count <= counter->max));
3251 } else if (atom == NULL) {
3252 fprintf(stderr, "epsilon transition left at runtime\n");
3253 exec->status = -2;
3254 break;
3255 } else if (exec->inputString[exec->index] != 0) {
3256 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3257 ret = xmlRegCheckCharacter(atom, codepoint);
3258 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3259 xmlRegStatePtr to = exec->comp->states[trans->to];
3260
3261 /*
3262 * this is a multiple input sequence
3263 */
3264 if (exec->state->nbTrans > exec->transno + 1) {
3265 xmlFARegExecSave(exec);
3266 }
3267 exec->transcount = 1;
3268 do {
3269 /*
3270 * Try to progress as much as possible on the input
3271 */
3272 if (exec->transcount == atom->max) {
3273 break;
3274 }
3275 exec->index += len;
3276 /*
3277 * End of input: stop here
3278 */
3279 if (exec->inputString[exec->index] == 0) {
3280 exec->index -= len;
3281 break;
3282 }
3283 if (exec->transcount >= atom->min) {
3284 int transno = exec->transno;
3285 xmlRegStatePtr state = exec->state;
3286
3287 /*
3288 * The transition is acceptable save it
3289 */
3290 exec->transno = -1; /* trick */
3291 exec->state = to;
3292 xmlFARegExecSave(exec);
3293 exec->transno = transno;
3294 exec->state = state;
3295 }
3296 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3297 len);
3298 ret = xmlRegCheckCharacter(atom, codepoint);
3299 exec->transcount++;
3300 } while (ret == 1);
3301 if (exec->transcount < atom->min)
3302 ret = 0;
3303
3304 /*
3305 * If the last check failed but one transition was found
3306 * possible, rollback
3307 */
3308 if (ret < 0)
3309 ret = 0;
3310 if (ret == 0) {
3311 goto rollback;
3312 }
3313 }
3314 }
3315 if (ret == 1) {
3316 if (exec->state->nbTrans > exec->transno + 1) {
3317 xmlFARegExecSave(exec);
3318 }
3319 if (trans->counter >= 0) {
3320#ifdef DEBUG_REGEXP_EXEC
3321 printf("Increasing count %d\n", trans->counter);
3322#endif
3323 exec->counts[trans->counter]++;
3324 }
3325#ifdef DEBUG_REGEXP_EXEC
3326 printf("entering state %d\n", trans->to);
3327#endif
3328 exec->state = exec->comp->states[trans->to];
3329 exec->transno = 0;
3330 if (trans->atom != NULL) {
3331 exec->index += len;
3332 }
3333 goto progress;
3334 } else if (ret < 0) {
3335 exec->status = -4;
3336 break;
3337 }
3338 }
3339 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3340rollback:
3341 /*
3342 * Failed to find a way out
3343 */
3344 exec->determinist = 0;
3345 xmlFARegExecRollBack(exec);
3346 }
3347progress:
3348 continue;
3349 }
3350}
3351#endif
3352/************************************************************************
3353 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003354 * Parser for the Schemas Datatype Regular Expressions *
Daniel Veillard4255d502002-04-16 15:50:10 +00003355 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
3356 * *
3357 ************************************************************************/
3358
3359/**
3360 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00003361 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003362 *
3363 * [10] Char ::= [^.\?*+()|#x5B#x5D]
3364 */
3365static int
3366xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
3367 int cur;
3368 int len;
3369
3370 cur = CUR_SCHAR(ctxt->cur, len);
3371 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
3372 (cur == '*') || (cur == '+') || (cur == '(') ||
3373 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
3374 (cur == 0x5D) || (cur == 0))
3375 return(-1);
3376 return(cur);
3377}
3378
3379/**
3380 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00003381 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003382 *
3383 * [27] charProp ::= IsCategory | IsBlock
3384 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
3385 * Separators | Symbols | Others
3386 * [29] Letters ::= 'L' [ultmo]?
3387 * [30] Marks ::= 'M' [nce]?
3388 * [31] Numbers ::= 'N' [dlo]?
3389 * [32] Punctuation ::= 'P' [cdseifo]?
3390 * [33] Separators ::= 'Z' [slp]?
3391 * [34] Symbols ::= 'S' [mcko]?
3392 * [35] Others ::= 'C' [cfon]?
3393 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
3394 */
3395static void
3396xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
3397 int cur;
William M. Brack779af002003-08-01 15:55:39 +00003398 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003399 xmlChar *blockName = NULL;
3400
3401 cur = CUR;
3402 if (cur == 'L') {
3403 NEXT;
3404 cur = CUR;
3405 if (cur == 'u') {
3406 NEXT;
3407 type = XML_REGEXP_LETTER_UPPERCASE;
3408 } else if (cur == 'l') {
3409 NEXT;
3410 type = XML_REGEXP_LETTER_LOWERCASE;
3411 } else if (cur == 't') {
3412 NEXT;
3413 type = XML_REGEXP_LETTER_TITLECASE;
3414 } else if (cur == 'm') {
3415 NEXT;
3416 type = XML_REGEXP_LETTER_MODIFIER;
3417 } else if (cur == 'o') {
3418 NEXT;
3419 type = XML_REGEXP_LETTER_OTHERS;
3420 } else {
3421 type = XML_REGEXP_LETTER;
3422 }
3423 } else if (cur == 'M') {
3424 NEXT;
3425 cur = CUR;
3426 if (cur == 'n') {
3427 NEXT;
3428 /* nonspacing */
3429 type = XML_REGEXP_MARK_NONSPACING;
3430 } else if (cur == 'c') {
3431 NEXT;
3432 /* spacing combining */
3433 type = XML_REGEXP_MARK_SPACECOMBINING;
3434 } else if (cur == 'e') {
3435 NEXT;
3436 /* enclosing */
3437 type = XML_REGEXP_MARK_ENCLOSING;
3438 } else {
3439 /* all marks */
3440 type = XML_REGEXP_MARK;
3441 }
3442 } else if (cur == 'N') {
3443 NEXT;
3444 cur = CUR;
3445 if (cur == 'd') {
3446 NEXT;
3447 /* digital */
3448 type = XML_REGEXP_NUMBER_DECIMAL;
3449 } else if (cur == 'l') {
3450 NEXT;
3451 /* letter */
3452 type = XML_REGEXP_NUMBER_LETTER;
3453 } else if (cur == 'o') {
3454 NEXT;
3455 /* other */
3456 type = XML_REGEXP_NUMBER_OTHERS;
3457 } else {
3458 /* all numbers */
3459 type = XML_REGEXP_NUMBER;
3460 }
3461 } else if (cur == 'P') {
3462 NEXT;
3463 cur = CUR;
3464 if (cur == 'c') {
3465 NEXT;
3466 /* connector */
3467 type = XML_REGEXP_PUNCT_CONNECTOR;
3468 } else if (cur == 'd') {
3469 NEXT;
3470 /* dash */
3471 type = XML_REGEXP_PUNCT_DASH;
3472 } else if (cur == 's') {
3473 NEXT;
3474 /* open */
3475 type = XML_REGEXP_PUNCT_OPEN;
3476 } else if (cur == 'e') {
3477 NEXT;
3478 /* close */
3479 type = XML_REGEXP_PUNCT_CLOSE;
3480 } else if (cur == 'i') {
3481 NEXT;
3482 /* initial quote */
3483 type = XML_REGEXP_PUNCT_INITQUOTE;
3484 } else if (cur == 'f') {
3485 NEXT;
3486 /* final quote */
3487 type = XML_REGEXP_PUNCT_FINQUOTE;
3488 } else if (cur == 'o') {
3489 NEXT;
3490 /* other */
3491 type = XML_REGEXP_PUNCT_OTHERS;
3492 } else {
3493 /* all punctuation */
3494 type = XML_REGEXP_PUNCT;
3495 }
3496 } else if (cur == 'Z') {
3497 NEXT;
3498 cur = CUR;
3499 if (cur == 's') {
3500 NEXT;
3501 /* space */
3502 type = XML_REGEXP_SEPAR_SPACE;
3503 } else if (cur == 'l') {
3504 NEXT;
3505 /* line */
3506 type = XML_REGEXP_SEPAR_LINE;
3507 } else if (cur == 'p') {
3508 NEXT;
3509 /* paragraph */
3510 type = XML_REGEXP_SEPAR_PARA;
3511 } else {
3512 /* all separators */
3513 type = XML_REGEXP_SEPAR;
3514 }
3515 } else if (cur == 'S') {
3516 NEXT;
3517 cur = CUR;
3518 if (cur == 'm') {
3519 NEXT;
3520 type = XML_REGEXP_SYMBOL_MATH;
3521 /* math */
3522 } else if (cur == 'c') {
3523 NEXT;
3524 type = XML_REGEXP_SYMBOL_CURRENCY;
3525 /* currency */
3526 } else if (cur == 'k') {
3527 NEXT;
3528 type = XML_REGEXP_SYMBOL_MODIFIER;
3529 /* modifiers */
3530 } else if (cur == 'o') {
3531 NEXT;
3532 type = XML_REGEXP_SYMBOL_OTHERS;
3533 /* other */
3534 } else {
3535 /* all symbols */
3536 type = XML_REGEXP_SYMBOL;
3537 }
3538 } else if (cur == 'C') {
3539 NEXT;
3540 cur = CUR;
3541 if (cur == 'c') {
3542 NEXT;
3543 /* control */
3544 type = XML_REGEXP_OTHER_CONTROL;
3545 } else if (cur == 'f') {
3546 NEXT;
3547 /* format */
3548 type = XML_REGEXP_OTHER_FORMAT;
3549 } else if (cur == 'o') {
3550 NEXT;
3551 /* private use */
3552 type = XML_REGEXP_OTHER_PRIVATE;
3553 } else if (cur == 'n') {
3554 NEXT;
3555 /* not assigned */
3556 type = XML_REGEXP_OTHER_NA;
3557 } else {
3558 /* all others */
3559 type = XML_REGEXP_OTHER;
3560 }
3561 } else if (cur == 'I') {
3562 const xmlChar *start;
3563 NEXT;
3564 cur = CUR;
3565 if (cur != 's') {
3566 ERROR("IsXXXX expected");
3567 return;
3568 }
3569 NEXT;
3570 start = ctxt->cur;
3571 cur = CUR;
3572 if (((cur >= 'a') && (cur <= 'z')) ||
3573 ((cur >= 'A') && (cur <= 'Z')) ||
3574 ((cur >= '0') && (cur <= '9')) ||
3575 (cur == 0x2D)) {
3576 NEXT;
3577 cur = CUR;
3578 while (((cur >= 'a') && (cur <= 'z')) ||
3579 ((cur >= 'A') && (cur <= 'Z')) ||
3580 ((cur >= '0') && (cur <= '9')) ||
3581 (cur == 0x2D)) {
3582 NEXT;
3583 cur = CUR;
3584 }
3585 }
3586 type = XML_REGEXP_BLOCK_NAME;
3587 blockName = xmlStrndup(start, ctxt->cur - start);
3588 } else {
3589 ERROR("Unknown char property");
3590 return;
3591 }
3592 if (ctxt->atom == NULL) {
3593 ctxt->atom = xmlRegNewAtom(ctxt, type);
3594 if (ctxt->atom != NULL)
3595 ctxt->atom->valuep = blockName;
3596 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3597 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3598 type, 0, 0, blockName);
3599 }
3600}
3601
3602/**
3603 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00003604 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003605 *
3606 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3607 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3608 * [25] catEsc ::= '\p{' charProp '}'
3609 * [26] complEsc ::= '\P{' charProp '}'
3610 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3611 */
3612static void
3613xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
3614 int cur;
3615
3616 if (CUR == '.') {
3617 if (ctxt->atom == NULL) {
3618 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
3619 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3620 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3621 XML_REGEXP_ANYCHAR, 0, 0, NULL);
3622 }
3623 NEXT;
3624 return;
3625 }
3626 if (CUR != '\\') {
3627 ERROR("Escaped sequence: expecting \\");
3628 return;
3629 }
3630 NEXT;
3631 cur = CUR;
3632 if (cur == 'p') {
3633 NEXT;
3634 if (CUR != '{') {
3635 ERROR("Expecting '{'");
3636 return;
3637 }
3638 NEXT;
3639 xmlFAParseCharProp(ctxt);
3640 if (CUR != '}') {
3641 ERROR("Expecting '}'");
3642 return;
3643 }
3644 NEXT;
3645 } else if (cur == 'P') {
3646 NEXT;
3647 if (CUR != '{') {
3648 ERROR("Expecting '{'");
3649 return;
3650 }
3651 NEXT;
3652 xmlFAParseCharProp(ctxt);
3653 ctxt->atom->neg = 1;
3654 if (CUR != '}') {
3655 ERROR("Expecting '}'");
3656 return;
3657 }
3658 NEXT;
3659 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
3660 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
3661 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
3662 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
3663 (cur == 0x5E)) {
3664 if (ctxt->atom == NULL) {
3665 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3666 if (ctxt->atom != NULL)
3667 ctxt->atom->codepoint = cur;
3668 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3669 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3670 XML_REGEXP_CHARVAL, cur, cur, NULL);
3671 }
3672 NEXT;
3673 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
3674 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
3675 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00003676 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00003677
3678 switch (cur) {
3679 case 's':
3680 type = XML_REGEXP_ANYSPACE;
3681 break;
3682 case 'S':
3683 type = XML_REGEXP_NOTSPACE;
3684 break;
3685 case 'i':
3686 type = XML_REGEXP_INITNAME;
3687 break;
3688 case 'I':
3689 type = XML_REGEXP_NOTINITNAME;
3690 break;
3691 case 'c':
3692 type = XML_REGEXP_NAMECHAR;
3693 break;
3694 case 'C':
3695 type = XML_REGEXP_NOTNAMECHAR;
3696 break;
3697 case 'd':
3698 type = XML_REGEXP_DECIMAL;
3699 break;
3700 case 'D':
3701 type = XML_REGEXP_NOTDECIMAL;
3702 break;
3703 case 'w':
3704 type = XML_REGEXP_REALCHAR;
3705 break;
3706 case 'W':
3707 type = XML_REGEXP_NOTREALCHAR;
3708 break;
3709 }
3710 NEXT;
3711 if (ctxt->atom == NULL) {
3712 ctxt->atom = xmlRegNewAtom(ctxt, type);
3713 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3714 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3715 type, 0, 0, NULL);
3716 }
3717 }
3718}
3719
3720/**
3721 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00003722 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003723 *
3724 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3725 */
3726static int
3727xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
3728 int ret = 0, cur;
3729
3730 if ((CUR != '&') || (NXT(1) != '#'))
3731 return(-1);
3732 NEXT;
3733 NEXT;
3734 cur = CUR;
3735 if (cur == 'x') {
3736 NEXT;
3737 cur = CUR;
3738 if (((cur >= '0') && (cur <= '9')) ||
3739 ((cur >= 'a') && (cur <= 'f')) ||
3740 ((cur >= 'A') && (cur <= 'F'))) {
3741 while (((cur >= '0') && (cur <= '9')) ||
3742 ((cur >= 'A') && (cur <= 'F'))) {
3743 if ((cur >= '0') && (cur <= '9'))
3744 ret = ret * 16 + cur - '0';
3745 else if ((cur >= 'a') && (cur <= 'f'))
3746 ret = ret * 16 + 10 + (cur - 'a');
3747 else
3748 ret = ret * 16 + 10 + (cur - 'A');
3749 NEXT;
3750 cur = CUR;
3751 }
3752 } else {
3753 ERROR("Char ref: expecting [0-9A-F]");
3754 return(-1);
3755 }
3756 } else {
3757 if ((cur >= '0') && (cur <= '9')) {
3758 while ((cur >= '0') && (cur <= '9')) {
3759 ret = ret * 10 + cur - '0';
3760 NEXT;
3761 cur = CUR;
3762 }
3763 } else {
3764 ERROR("Char ref: expecting [0-9]");
3765 return(-1);
3766 }
3767 }
3768 if (cur != ';') {
3769 ERROR("Char ref: expecting ';'");
3770 return(-1);
3771 } else {
3772 NEXT;
3773 }
3774 return(ret);
3775}
3776
3777/**
3778 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00003779 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003780 *
3781 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
3782 * [18] seRange ::= charOrEsc '-' charOrEsc
3783 * [20] charOrEsc ::= XmlChar | SingleCharEsc
3784 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
3785 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
3786 */
3787static void
3788xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
William M. Brackdc99df92003-12-27 01:54:25 +00003789 int cur, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00003790 int start = -1;
3791 int end = -1;
3792
3793 if ((CUR == '&') && (NXT(1) == '#')) {
3794 end = start = xmlFAParseCharRef(ctxt);
3795 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3796 XML_REGEXP_CHARVAL, start, end, NULL);
3797 return;
3798 }
3799 cur = CUR;
3800 if (cur == '\\') {
3801 NEXT;
3802 cur = CUR;
3803 switch (cur) {
3804 case 'n': start = 0xA; break;
3805 case 'r': start = 0xD; break;
3806 case 't': start = 0x9; break;
3807 case '\\': case '|': case '.': case '-': case '^': case '?':
3808 case '*': case '+': case '{': case '}': case '(': case ')':
3809 case '[': case ']':
3810 start = cur; break;
3811 default:
3812 ERROR("Invalid escape value");
3813 return;
3814 }
3815 end = start;
William M. Brackdc99df92003-12-27 01:54:25 +00003816 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003817 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00003818 end = start = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00003819 } else {
3820 ERROR("Expecting a char range");
3821 return;
3822 }
William M. Brackdc99df92003-12-27 01:54:25 +00003823 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00003824 if (start == '-') {
3825 return;
3826 }
3827 cur = CUR;
William M. Brack10f1ef42004-03-20 14:51:25 +00003828 if ((cur != '-') || (NXT(1) == ']')) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003829 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3830 XML_REGEXP_CHARVAL, start, end, NULL);
3831 return;
3832 }
3833 NEXT;
3834 cur = CUR;
3835 if (cur == '\\') {
3836 NEXT;
3837 cur = CUR;
3838 switch (cur) {
3839 case 'n': end = 0xA; break;
3840 case 'r': end = 0xD; break;
3841 case 't': end = 0x9; break;
3842 case '\\': case '|': case '.': case '-': case '^': case '?':
3843 case '*': case '+': case '{': case '}': case '(': case ')':
3844 case '[': case ']':
3845 end = cur; break;
3846 default:
3847 ERROR("Invalid escape value");
3848 return;
3849 }
William M. Brackdc99df92003-12-27 01:54:25 +00003850 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003851 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00003852 end = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00003853 } else {
3854 ERROR("Expecting the end of a char range");
3855 return;
3856 }
William M. Brackdc99df92003-12-27 01:54:25 +00003857 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00003858 /* TODO check that the values are acceptable character ranges for XML */
3859 if (end < start) {
3860 ERROR("End of range is before start of range");
3861 } else {
3862 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3863 XML_REGEXP_CHARVAL, start, end, NULL);
3864 }
3865 return;
3866}
3867
3868/**
3869 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003870 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003871 *
3872 * [14] posCharGroup ::= ( charRange | charClassEsc )+
3873 */
3874static void
3875xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
3876 do {
3877 if ((CUR == '\\') || (CUR == '.')) {
3878 xmlFAParseCharClassEsc(ctxt);
3879 } else {
3880 xmlFAParseCharRange(ctxt);
3881 }
3882 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
3883 (ctxt->error == 0));
3884}
3885
3886/**
3887 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003888 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003889 *
3890 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
3891 * [15] negCharGroup ::= '^' posCharGroup
3892 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
3893 * [12] charClassExpr ::= '[' charGroup ']'
3894 */
3895static void
3896xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
3897 int n = ctxt->neg;
3898 while ((CUR != ']') && (ctxt->error == 0)) {
3899 if (CUR == '^') {
3900 int neg = ctxt->neg;
3901
3902 NEXT;
3903 ctxt->neg = !ctxt->neg;
3904 xmlFAParsePosCharGroup(ctxt);
3905 ctxt->neg = neg;
William M. Brack10f1ef42004-03-20 14:51:25 +00003906 } else if ((CUR == '-') && (NXT(1) == '[')) {
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003907 int neg = ctxt->neg;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003908 ctxt->neg = 2;
William M. Brack10f1ef42004-03-20 14:51:25 +00003909 NEXT; /* eat the '-' */
3910 NEXT; /* eat the '[' */
Daniel Veillard4255d502002-04-16 15:50:10 +00003911 xmlFAParseCharGroup(ctxt);
3912 if (CUR == ']') {
3913 NEXT;
3914 } else {
3915 ERROR("charClassExpr: ']' expected");
3916 break;
3917 }
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003918 ctxt->neg = neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00003919 break;
3920 } else if (CUR != ']') {
3921 xmlFAParsePosCharGroup(ctxt);
3922 }
3923 }
3924 ctxt->neg = n;
3925}
3926
3927/**
3928 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00003929 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003930 *
3931 * [11] charClass ::= charClassEsc | charClassExpr
3932 * [12] charClassExpr ::= '[' charGroup ']'
3933 */
3934static void
3935xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
3936 if (CUR == '[') {
3937 NEXT;
3938 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
3939 if (ctxt->atom == NULL)
3940 return;
3941 xmlFAParseCharGroup(ctxt);
3942 if (CUR == ']') {
3943 NEXT;
3944 } else {
3945 ERROR("xmlFAParseCharClass: ']' expected");
3946 }
3947 } else {
3948 xmlFAParseCharClassEsc(ctxt);
3949 }
3950}
3951
3952/**
3953 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00003954 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003955 *
3956 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003957 *
3958 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00003959 */
3960static int
3961xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
3962 int ret = 0;
3963 int ok = 0;
3964
3965 while ((CUR >= '0') && (CUR <= '9')) {
3966 ret = ret * 10 + (CUR - '0');
3967 ok = 1;
3968 NEXT;
3969 }
3970 if (ok != 1) {
3971 return(-1);
3972 }
3973 return(ret);
3974}
3975
3976/**
3977 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00003978 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003979 *
3980 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
3981 * [5] quantity ::= quantRange | quantMin | QuantExact
3982 * [6] quantRange ::= QuantExact ',' QuantExact
3983 * [7] quantMin ::= QuantExact ','
3984 * [8] QuantExact ::= [0-9]+
3985 */
3986static int
3987xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
3988 int cur;
3989
3990 cur = CUR;
3991 if ((cur == '?') || (cur == '*') || (cur == '+')) {
3992 if (ctxt->atom != NULL) {
3993 if (cur == '?')
3994 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
3995 else if (cur == '*')
3996 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
3997 else if (cur == '+')
3998 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
3999 }
4000 NEXT;
4001 return(1);
4002 }
4003 if (cur == '{') {
4004 int min = 0, max = 0;
4005
4006 NEXT;
4007 cur = xmlFAParseQuantExact(ctxt);
4008 if (cur >= 0)
4009 min = cur;
4010 if (CUR == ',') {
4011 NEXT;
Daniel Veillardebe48c62003-12-03 12:12:27 +00004012 if (CUR == '}')
4013 max = INT_MAX;
4014 else {
4015 cur = xmlFAParseQuantExact(ctxt);
4016 if (cur >= 0)
4017 max = cur;
4018 else {
4019 ERROR("Improper quantifier");
4020 }
4021 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004022 }
4023 if (CUR == '}') {
4024 NEXT;
4025 } else {
4026 ERROR("Unterminated quantifier");
4027 }
4028 if (max == 0)
4029 max = min;
4030 if (ctxt->atom != NULL) {
4031 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
4032 ctxt->atom->min = min;
4033 ctxt->atom->max = max;
4034 }
4035 return(1);
4036 }
4037 return(0);
4038}
4039
4040/**
4041 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00004042 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004043 *
4044 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
4045 */
4046static int
4047xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
4048 int codepoint, len;
4049
4050 codepoint = xmlFAIsChar(ctxt);
4051 if (codepoint > 0) {
4052 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4053 if (ctxt->atom == NULL)
4054 return(-1);
4055 codepoint = CUR_SCHAR(ctxt->cur, len);
4056 ctxt->atom->codepoint = codepoint;
4057 NEXTL(len);
4058 return(1);
4059 } else if (CUR == '|') {
4060 return(0);
4061 } else if (CUR == 0) {
4062 return(0);
4063 } else if (CUR == ')') {
4064 return(0);
4065 } else if (CUR == '(') {
4066 xmlRegStatePtr start, oldend;
4067
4068 NEXT;
4069 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
4070 start = ctxt->state;
4071 oldend = ctxt->end;
4072 ctxt->end = NULL;
4073 ctxt->atom = NULL;
4074 xmlFAParseRegExp(ctxt, 0);
4075 if (CUR == ')') {
4076 NEXT;
4077 } else {
4078 ERROR("xmlFAParseAtom: expecting ')'");
4079 }
4080 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
4081 if (ctxt->atom == NULL)
4082 return(-1);
4083 ctxt->atom->start = start;
4084 ctxt->atom->stop = ctxt->state;
4085 ctxt->end = oldend;
4086 return(1);
4087 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
4088 xmlFAParseCharClass(ctxt);
4089 return(1);
4090 }
4091 return(0);
4092}
4093
4094/**
4095 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00004096 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004097 *
4098 * [3] piece ::= atom quantifier?
4099 */
4100static int
4101xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
4102 int ret;
4103
4104 ctxt->atom = NULL;
4105 ret = xmlFAParseAtom(ctxt);
4106 if (ret == 0)
4107 return(0);
4108 if (ctxt->atom == NULL) {
4109 ERROR("internal: no atom generated");
4110 }
4111 xmlFAParseQuantifier(ctxt);
4112 return(1);
4113}
4114
4115/**
4116 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00004117 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004118 *
4119 * [2] branch ::= piece*
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004120 8
Daniel Veillard4255d502002-04-16 15:50:10 +00004121 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004122static int
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004123xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004124 xmlRegStatePtr previous;
Daniel Veillard4255d502002-04-16 15:50:10 +00004125 int ret;
4126
4127 previous = ctxt->state;
4128 ret = xmlFAParsePiece(ctxt);
4129 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004130 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0)
4131 return(-1);
4132 previous = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00004133 ctxt->atom = NULL;
4134 }
4135 while ((ret != 0) && (ctxt->error == 0)) {
4136 ret = xmlFAParsePiece(ctxt);
4137 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004138 if (xmlFAGenerateTransitions(ctxt, previous, NULL,
4139 ctxt->atom) < 0)
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004140 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00004141 previous = ctxt->state;
4142 ctxt->atom = NULL;
4143 }
4144 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004145 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00004146}
4147
4148/**
4149 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00004150 * @ctxt: a regexp parser context
William M. Brackddf71d62004-05-06 04:17:26 +00004151 * @top: is this the top-level expression ?
Daniel Veillard4255d502002-04-16 15:50:10 +00004152 *
4153 * [1] regExp ::= branch ( '|' branch )*
4154 */
4155static void
4156xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
Daniel Veillardc7e3cc42004-09-28 12:33:52 +00004157 xmlRegStatePtr start, end;
Daniel Veillard4255d502002-04-16 15:50:10 +00004158
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004159 /* if not top start should have been generated by an epsilon trans */
Daniel Veillard4255d502002-04-16 15:50:10 +00004160 start = ctxt->state;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004161 ctxt->end = NULL;
4162 xmlFAParseBranch(ctxt);
4163 if (top) {
4164#ifdef DEBUG_REGEXP_GRAPH
4165 printf("State %d is final\n", ctxt->state->no);
4166#endif
4167 ctxt->state->type = XML_REGEXP_FINAL_STATE;
4168 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004169 if (CUR != '|') {
4170 ctxt->end = ctxt->state;
4171 return;
4172 }
4173 end = ctxt->state;
4174 while ((CUR == '|') && (ctxt->error == 0)) {
4175 NEXT;
4176 ctxt->state = start;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004177 ctxt->end = NULL;
4178 xmlFAParseBranch(ctxt);
4179 if (top) {
4180 ctxt->state->type = XML_REGEXP_FINAL_STATE;
4181#ifdef DEBUG_REGEXP_GRAPH
4182 printf("State %d is final\n", ctxt->state->no);
4183#endif
4184 } else {
4185 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end);
4186 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004187 }
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004188 if (!top) {
4189 ctxt->state = end;
4190 ctxt->end = end;
4191 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004192}
4193
4194/************************************************************************
4195 * *
4196 * The basic API *
4197 * *
4198 ************************************************************************/
4199
4200/**
4201 * xmlRegexpPrint:
4202 * @output: the file for the output debug
4203 * @regexp: the compiled regexp
4204 *
4205 * Print the content of the compiled regular expression
4206 */
4207void
4208xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
4209 int i;
4210
Daniel Veillarda82b1822004-11-08 16:24:57 +00004211 if (output == NULL)
4212 return;
Daniel Veillard4255d502002-04-16 15:50:10 +00004213 fprintf(output, " regexp: ");
4214 if (regexp == NULL) {
4215 fprintf(output, "NULL\n");
4216 return;
4217 }
4218 fprintf(output, "'%s' ", regexp->string);
4219 fprintf(output, "\n");
4220 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
4221 for (i = 0;i < regexp->nbAtoms; i++) {
4222 fprintf(output, " %02d ", i);
4223 xmlRegPrintAtom(output, regexp->atoms[i]);
4224 }
4225 fprintf(output, "%d states:", regexp->nbStates);
4226 fprintf(output, "\n");
4227 for (i = 0;i < regexp->nbStates; i++) {
4228 xmlRegPrintState(output, regexp->states[i]);
4229 }
4230 fprintf(output, "%d counters:\n", regexp->nbCounters);
4231 for (i = 0;i < regexp->nbCounters; i++) {
4232 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
4233 regexp->counters[i].max);
4234 }
4235}
4236
4237/**
4238 * xmlRegexpCompile:
4239 * @regexp: a regular expression string
4240 *
4241 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
William M. Brackddf71d62004-05-06 04:17:26 +00004242 * Appendix F and builds an automata suitable for testing strings against
Daniel Veillard4255d502002-04-16 15:50:10 +00004243 * that regular expression
4244 *
4245 * Returns the compiled expression or NULL in case of error
4246 */
4247xmlRegexpPtr
4248xmlRegexpCompile(const xmlChar *regexp) {
4249 xmlRegexpPtr ret;
4250 xmlRegParserCtxtPtr ctxt;
4251
4252 ctxt = xmlRegNewParserCtxt(regexp);
4253 if (ctxt == NULL)
4254 return(NULL);
4255
4256 /* initialize the parser */
4257 ctxt->end = NULL;
4258 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
4259 xmlRegStatePush(ctxt, ctxt->start);
4260
4261 /* parse the expression building an automata */
4262 xmlFAParseRegExp(ctxt, 1);
4263 if (CUR != 0) {
4264 ERROR("xmlFAParseRegExp: extra characters");
4265 }
4266 ctxt->end = ctxt->state;
4267 ctxt->start->type = XML_REGEXP_START_STATE;
4268 ctxt->end->type = XML_REGEXP_FINAL_STATE;
4269
4270 /* remove the Epsilon except for counted transitions */
4271 xmlFAEliminateEpsilonTransitions(ctxt);
4272
4273
4274 if (ctxt->error != 0) {
4275 xmlRegFreeParserCtxt(ctxt);
4276 return(NULL);
4277 }
4278 ret = xmlRegEpxFromParse(ctxt);
4279 xmlRegFreeParserCtxt(ctxt);
4280 return(ret);
4281}
4282
4283/**
4284 * xmlRegexpExec:
4285 * @comp: the compiled regular expression
4286 * @content: the value to check against the regular expression
4287 *
William M. Brackddf71d62004-05-06 04:17:26 +00004288 * Check if the regular expression generates the value
Daniel Veillard4255d502002-04-16 15:50:10 +00004289 *
William M. Brackddf71d62004-05-06 04:17:26 +00004290 * Returns 1 if it matches, 0 if not and a negative value in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00004291 */
4292int
4293xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
4294 if ((comp == NULL) || (content == NULL))
4295 return(-1);
4296 return(xmlFARegExec(comp, content));
4297}
4298
4299/**
Daniel Veillard23e73572002-09-19 19:56:43 +00004300 * xmlRegexpIsDeterminist:
4301 * @comp: the compiled regular expression
4302 *
4303 * Check if the regular expression is determinist
4304 *
William M. Brackddf71d62004-05-06 04:17:26 +00004305 * Returns 1 if it yes, 0 if not and a negative value in case of error
Daniel Veillard23e73572002-09-19 19:56:43 +00004306 */
4307int
4308xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
4309 xmlAutomataPtr am;
4310 int ret;
4311
4312 if (comp == NULL)
4313 return(-1);
4314 if (comp->determinist != -1)
4315 return(comp->determinist);
4316
4317 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00004318 if (am->states != NULL) {
4319 int i;
4320
4321 for (i = 0;i < am->nbStates;i++)
4322 xmlRegFreeState(am->states[i]);
4323 xmlFree(am->states);
4324 }
Daniel Veillard23e73572002-09-19 19:56:43 +00004325 am->nbAtoms = comp->nbAtoms;
4326 am->atoms = comp->atoms;
4327 am->nbStates = comp->nbStates;
4328 am->states = comp->states;
4329 am->determinist = -1;
4330 ret = xmlFAComputesDeterminism(am);
4331 am->atoms = NULL;
4332 am->states = NULL;
4333 xmlFreeAutomata(am);
4334 return(ret);
4335}
4336
4337/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004338 * xmlRegFreeRegexp:
4339 * @regexp: the regexp
4340 *
4341 * Free a regexp
4342 */
4343void
4344xmlRegFreeRegexp(xmlRegexpPtr regexp) {
4345 int i;
4346 if (regexp == NULL)
4347 return;
4348
4349 if (regexp->string != NULL)
4350 xmlFree(regexp->string);
4351 if (regexp->states != NULL) {
4352 for (i = 0;i < regexp->nbStates;i++)
4353 xmlRegFreeState(regexp->states[i]);
4354 xmlFree(regexp->states);
4355 }
4356 if (regexp->atoms != NULL) {
4357 for (i = 0;i < regexp->nbAtoms;i++)
4358 xmlRegFreeAtom(regexp->atoms[i]);
4359 xmlFree(regexp->atoms);
4360 }
4361 if (regexp->counters != NULL)
4362 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00004363 if (regexp->compact != NULL)
4364 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00004365 if (regexp->transdata != NULL)
4366 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00004367 if (regexp->stringMap != NULL) {
4368 for (i = 0; i < regexp->nbstrings;i++)
4369 xmlFree(regexp->stringMap[i]);
4370 xmlFree(regexp->stringMap);
4371 }
4372
Daniel Veillard4255d502002-04-16 15:50:10 +00004373 xmlFree(regexp);
4374}
4375
4376#ifdef LIBXML_AUTOMATA_ENABLED
4377/************************************************************************
4378 * *
4379 * The Automata interface *
4380 * *
4381 ************************************************************************/
4382
4383/**
4384 * xmlNewAutomata:
4385 *
4386 * Create a new automata
4387 *
4388 * Returns the new object or NULL in case of failure
4389 */
4390xmlAutomataPtr
4391xmlNewAutomata(void) {
4392 xmlAutomataPtr ctxt;
4393
4394 ctxt = xmlRegNewParserCtxt(NULL);
4395 if (ctxt == NULL)
4396 return(NULL);
4397
4398 /* initialize the parser */
4399 ctxt->end = NULL;
4400 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004401 if (ctxt->start == NULL) {
4402 xmlFreeAutomata(ctxt);
4403 return(NULL);
4404 }
4405 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
4406 xmlRegFreeState(ctxt->start);
4407 xmlFreeAutomata(ctxt);
4408 return(NULL);
4409 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004410
4411 return(ctxt);
4412}
4413
4414/**
4415 * xmlFreeAutomata:
4416 * @am: an automata
4417 *
4418 * Free an automata
4419 */
4420void
4421xmlFreeAutomata(xmlAutomataPtr am) {
4422 if (am == NULL)
4423 return;
4424 xmlRegFreeParserCtxt(am);
4425}
4426
4427/**
4428 * xmlAutomataGetInitState:
4429 * @am: an automata
4430 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004431 * Initial state lookup
4432 *
Daniel Veillard4255d502002-04-16 15:50:10 +00004433 * Returns the initial state of the automata
4434 */
4435xmlAutomataStatePtr
4436xmlAutomataGetInitState(xmlAutomataPtr am) {
4437 if (am == NULL)
4438 return(NULL);
4439 return(am->start);
4440}
4441
4442/**
4443 * xmlAutomataSetFinalState:
4444 * @am: an automata
4445 * @state: a state in this automata
4446 *
4447 * Makes that state a final state
4448 *
4449 * Returns 0 or -1 in case of error
4450 */
4451int
4452xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
4453 if ((am == NULL) || (state == NULL))
4454 return(-1);
4455 state->type = XML_REGEXP_FINAL_STATE;
4456 return(0);
4457}
4458
4459/**
4460 * xmlAutomataNewTransition:
4461 * @am: an automata
4462 * @from: the starting point of the transition
4463 * @to: the target point of the transition or NULL
4464 * @token: the input string associated to that transition
4465 * @data: data passed to the callback function if the transition is activated
4466 *
William M. Brackddf71d62004-05-06 04:17:26 +00004467 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00004468 * and then adds a transition from the @from state to the target state
4469 * activated by the value of @token
4470 *
4471 * Returns the target state or NULL in case of error
4472 */
4473xmlAutomataStatePtr
4474xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
4475 xmlAutomataStatePtr to, const xmlChar *token,
4476 void *data) {
4477 xmlRegAtomPtr atom;
4478
4479 if ((am == NULL) || (from == NULL) || (token == NULL))
4480 return(NULL);
4481 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004482 if (atom == NULL)
4483 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00004484 atom->data = data;
4485 if (atom == NULL)
4486 return(NULL);
4487 atom->valuep = xmlStrdup(token);
4488
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004489 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4490 xmlRegFreeAtom(atom);
4491 return(NULL);
4492 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004493 if (to == NULL)
4494 return(am->state);
4495 return(to);
4496}
4497
4498/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00004499 * xmlAutomataNewTransition2:
4500 * @am: an automata
4501 * @from: the starting point of the transition
4502 * @to: the target point of the transition or NULL
4503 * @token: the first input string associated to that transition
4504 * @token2: the second input string associated to that transition
4505 * @data: data passed to the callback function if the transition is activated
4506 *
William M. Brackddf71d62004-05-06 04:17:26 +00004507 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard52b48c72003-04-13 19:53:42 +00004508 * and then adds a transition from the @from state to the target state
4509 * activated by the value of @token
4510 *
4511 * Returns the target state or NULL in case of error
4512 */
4513xmlAutomataStatePtr
4514xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4515 xmlAutomataStatePtr to, const xmlChar *token,
4516 const xmlChar *token2, void *data) {
4517 xmlRegAtomPtr atom;
4518
4519 if ((am == NULL) || (from == NULL) || (token == NULL))
4520 return(NULL);
4521 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4522 atom->data = data;
4523 if (atom == NULL)
4524 return(NULL);
4525 if ((token2 == NULL) || (*token2 == 0)) {
4526 atom->valuep = xmlStrdup(token);
4527 } else {
4528 int lenn, lenp;
4529 xmlChar *str;
4530
4531 lenn = strlen((char *) token2);
4532 lenp = strlen((char *) token);
4533
Daniel Veillard3c908dc2003-04-19 00:07:51 +00004534 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004535 if (str == NULL) {
4536 xmlRegFreeAtom(atom);
4537 return(NULL);
4538 }
4539 memcpy(&str[0], token, lenp);
4540 str[lenp] = '|';
4541 memcpy(&str[lenp + 1], token2, lenn);
4542 str[lenn + lenp + 1] = 0;
4543
4544 atom->valuep = str;
4545 }
4546
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004547 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4548 xmlRegFreeAtom(atom);
4549 return(NULL);
4550 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00004551 if (to == NULL)
4552 return(am->state);
4553 return(to);
4554}
4555
4556/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00004557 * xmlAutomataNewCountTrans2:
4558 * @am: an automata
4559 * @from: the starting point of the transition
4560 * @to: the target point of the transition or NULL
4561 * @token: the input string associated to that transition
4562 * @token2: the second input string associated to that transition
4563 * @min: the minimum successive occurences of token
4564 * @max: the maximum successive occurences of token
4565 * @data: data associated to the transition
4566 *
4567 * If @to is NULL, this creates first a new target state in the automata
4568 * and then adds a transition from the @from state to the target state
4569 * activated by a succession of input of value @token and @token2 and
4570 * whose number is between @min and @max
4571 *
4572 * Returns the target state or NULL in case of error
4573 */
4574xmlAutomataStatePtr
4575xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4576 xmlAutomataStatePtr to, const xmlChar *token,
4577 const xmlChar *token2,
4578 int min, int max, void *data) {
4579 xmlRegAtomPtr atom;
4580 int counter;
4581
4582 if ((am == NULL) || (from == NULL) || (token == NULL))
4583 return(NULL);
4584 if (min < 0)
4585 return(NULL);
4586 if ((max < min) || (max < 1))
4587 return(NULL);
4588 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4589 if (atom == NULL)
4590 return(NULL);
4591 if ((token2 == NULL) || (*token2 == 0)) {
4592 atom->valuep = xmlStrdup(token);
4593 } else {
4594 int lenn, lenp;
4595 xmlChar *str;
4596
4597 lenn = strlen((char *) token2);
4598 lenp = strlen((char *) token);
4599
4600 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4601 if (str == NULL) {
4602 xmlRegFreeAtom(atom);
4603 return(NULL);
4604 }
4605 memcpy(&str[0], token, lenp);
4606 str[lenp] = '|';
4607 memcpy(&str[lenp + 1], token2, lenn);
4608 str[lenn + lenp + 1] = 0;
4609
4610 atom->valuep = str;
4611 }
4612 atom->data = data;
4613 if (min == 0)
4614 atom->min = 1;
4615 else
4616 atom->min = min;
4617 atom->max = max;
4618
4619 /*
4620 * associate a counter to the transition.
4621 */
4622 counter = xmlRegGetCounter(am);
4623 am->counters[counter].min = min;
4624 am->counters[counter].max = max;
4625
4626 /* xmlFAGenerateTransitions(am, from, to, atom); */
4627 if (to == NULL) {
4628 to = xmlRegNewState(am);
4629 xmlRegStatePush(am, to);
4630 }
4631 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4632 xmlRegAtomPush(am, atom);
4633 am->state = to;
4634
4635 if (to == NULL)
4636 to = am->state;
4637 if (to == NULL)
4638 return(NULL);
4639 if (min == 0)
4640 xmlFAGenerateEpsilonTransition(am, from, to);
4641 return(to);
4642}
4643
4644/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004645 * xmlAutomataNewCountTrans:
4646 * @am: an automata
4647 * @from: the starting point of the transition
4648 * @to: the target point of the transition or NULL
4649 * @token: the input string associated to that transition
4650 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004651 * @max: the maximum successive occurences of token
4652 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00004653 *
William M. Brackddf71d62004-05-06 04:17:26 +00004654 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00004655 * and then adds a transition from the @from state to the target state
4656 * activated by a succession of input of value @token and whose number
4657 * is between @min and @max
4658 *
4659 * Returns the target state or NULL in case of error
4660 */
4661xmlAutomataStatePtr
4662xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4663 xmlAutomataStatePtr to, const xmlChar *token,
4664 int min, int max, void *data) {
4665 xmlRegAtomPtr atom;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004666 int counter;
Daniel Veillard4255d502002-04-16 15:50:10 +00004667
4668 if ((am == NULL) || (from == NULL) || (token == NULL))
4669 return(NULL);
4670 if (min < 0)
4671 return(NULL);
4672 if ((max < min) || (max < 1))
4673 return(NULL);
4674 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4675 if (atom == NULL)
4676 return(NULL);
4677 atom->valuep = xmlStrdup(token);
4678 atom->data = data;
4679 if (min == 0)
4680 atom->min = 1;
4681 else
4682 atom->min = min;
4683 atom->max = max;
4684
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004685 /*
4686 * associate a counter to the transition.
4687 */
4688 counter = xmlRegGetCounter(am);
4689 am->counters[counter].min = min;
4690 am->counters[counter].max = max;
4691
4692 /* xmlFAGenerateTransitions(am, from, to, atom); */
4693 if (to == NULL) {
4694 to = xmlRegNewState(am);
4695 xmlRegStatePush(am, to);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004696 }
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004697 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4698 xmlRegAtomPush(am, atom);
4699 am->state = to;
4700
Daniel Veillard4255d502002-04-16 15:50:10 +00004701 if (to == NULL)
4702 to = am->state;
4703 if (to == NULL)
4704 return(NULL);
4705 if (min == 0)
4706 xmlFAGenerateEpsilonTransition(am, from, to);
4707 return(to);
4708}
4709
4710/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00004711 * xmlAutomataNewOnceTrans2:
4712 * @am: an automata
4713 * @from: the starting point of the transition
4714 * @to: the target point of the transition or NULL
4715 * @token: the input string associated to that transition
4716 * @token2: the second input string associated to that transition
4717 * @min: the minimum successive occurences of token
4718 * @max: the maximum successive occurences of token
4719 * @data: data associated to the transition
4720 *
4721 * If @to is NULL, this creates first a new target state in the automata
4722 * and then adds a transition from the @from state to the target state
4723 * activated by a succession of input of value @token and @token2 and whose
4724 * number is between @min and @max, moreover that transition can only be
4725 * crossed once.
4726 *
4727 * Returns the target state or NULL in case of error
4728 */
4729xmlAutomataStatePtr
4730xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4731 xmlAutomataStatePtr to, const xmlChar *token,
4732 const xmlChar *token2,
4733 int min, int max, void *data) {
4734 xmlRegAtomPtr atom;
4735 int counter;
4736
4737 if ((am == NULL) || (from == NULL) || (token == NULL))
4738 return(NULL);
4739 if (min < 1)
4740 return(NULL);
4741 if ((max < min) || (max < 1))
4742 return(NULL);
4743 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4744 if (atom == NULL)
4745 return(NULL);
4746 if ((token2 == NULL) || (*token2 == 0)) {
4747 atom->valuep = xmlStrdup(token);
4748 } else {
4749 int lenn, lenp;
4750 xmlChar *str;
4751
4752 lenn = strlen((char *) token2);
4753 lenp = strlen((char *) token);
4754
4755 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4756 if (str == NULL) {
4757 xmlRegFreeAtom(atom);
4758 return(NULL);
4759 }
4760 memcpy(&str[0], token, lenp);
4761 str[lenp] = '|';
4762 memcpy(&str[lenp + 1], token2, lenn);
4763 str[lenn + lenp + 1] = 0;
4764
4765 atom->valuep = str;
4766 }
4767 atom->data = data;
4768 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4769 if (min == 0)
4770 atom->min = 1;
4771 else
4772 atom->min = min;
4773 atom->max = max;
4774 /*
4775 * associate a counter to the transition.
4776 */
4777 counter = xmlRegGetCounter(am);
4778 am->counters[counter].min = 1;
4779 am->counters[counter].max = 1;
4780
4781 /* xmlFAGenerateTransitions(am, from, to, atom); */
4782 if (to == NULL) {
4783 to = xmlRegNewState(am);
4784 xmlRegStatePush(am, to);
4785 }
4786 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4787 xmlRegAtomPush(am, atom);
4788 am->state = to;
4789 return(to);
4790}
4791
4792
4793
4794/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004795 * xmlAutomataNewOnceTrans:
4796 * @am: an automata
4797 * @from: the starting point of the transition
4798 * @to: the target point of the transition or NULL
4799 * @token: the input string associated to that transition
4800 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004801 * @max: the maximum successive occurences of token
4802 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00004803 *
William M. Brackddf71d62004-05-06 04:17:26 +00004804 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00004805 * and then adds a transition from the @from state to the target state
4806 * activated by a succession of input of value @token and whose number
William M. Brackddf71d62004-05-06 04:17:26 +00004807 * is between @min and @max, moreover that transition can only be crossed
Daniel Veillard7646b182002-04-20 06:41:40 +00004808 * once.
4809 *
4810 * Returns the target state or NULL in case of error
4811 */
4812xmlAutomataStatePtr
4813xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4814 xmlAutomataStatePtr to, const xmlChar *token,
4815 int min, int max, void *data) {
4816 xmlRegAtomPtr atom;
4817 int counter;
4818
4819 if ((am == NULL) || (from == NULL) || (token == NULL))
4820 return(NULL);
4821 if (min < 1)
4822 return(NULL);
4823 if ((max < min) || (max < 1))
4824 return(NULL);
4825 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4826 if (atom == NULL)
4827 return(NULL);
4828 atom->valuep = xmlStrdup(token);
4829 atom->data = data;
4830 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4831 if (min == 0)
4832 atom->min = 1;
4833 else
4834 atom->min = min;
4835 atom->max = max;
4836 /*
4837 * associate a counter to the transition.
4838 */
4839 counter = xmlRegGetCounter(am);
4840 am->counters[counter].min = 1;
4841 am->counters[counter].max = 1;
4842
4843 /* xmlFAGenerateTransitions(am, from, to, atom); */
4844 if (to == NULL) {
4845 to = xmlRegNewState(am);
4846 xmlRegStatePush(am, to);
4847 }
4848 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4849 xmlRegAtomPush(am, atom);
4850 am->state = to;
Daniel Veillard7646b182002-04-20 06:41:40 +00004851 return(to);
4852}
4853
4854/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004855 * xmlAutomataNewState:
4856 * @am: an automata
4857 *
4858 * Create a new disconnected state in the automata
4859 *
4860 * Returns the new state or NULL in case of error
4861 */
4862xmlAutomataStatePtr
4863xmlAutomataNewState(xmlAutomataPtr am) {
4864 xmlAutomataStatePtr to;
4865
4866 if (am == NULL)
4867 return(NULL);
4868 to = xmlRegNewState(am);
4869 xmlRegStatePush(am, to);
4870 return(to);
4871}
4872
4873/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004874 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00004875 * @am: an automata
4876 * @from: the starting point of the transition
4877 * @to: the target point of the transition or NULL
4878 *
William M. Brackddf71d62004-05-06 04:17:26 +00004879 * If @to is NULL, this creates first a new target state in the automata
4880 * and then adds an epsilon transition from the @from state to the
Daniel Veillard4255d502002-04-16 15:50:10 +00004881 * target state
4882 *
4883 * Returns the target state or NULL in case of error
4884 */
4885xmlAutomataStatePtr
4886xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
4887 xmlAutomataStatePtr to) {
4888 if ((am == NULL) || (from == NULL))
4889 return(NULL);
4890 xmlFAGenerateEpsilonTransition(am, from, to);
4891 if (to == NULL)
4892 return(am->state);
4893 return(to);
4894}
4895
Daniel Veillardb509f152002-04-17 16:28:10 +00004896/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004897 * xmlAutomataNewAllTrans:
4898 * @am: an automata
4899 * @from: the starting point of the transition
4900 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004901 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00004902 *
William M. Brackddf71d62004-05-06 04:17:26 +00004903 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00004904 * and then adds a an ALL transition from the @from state to the
4905 * target state. That transition is an epsilon transition allowed only when
4906 * all transitions from the @from node have been activated.
4907 *
4908 * Returns the target state or NULL in case of error
4909 */
4910xmlAutomataStatePtr
4911xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00004912 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00004913 if ((am == NULL) || (from == NULL))
4914 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00004915 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00004916 if (to == NULL)
4917 return(am->state);
4918 return(to);
4919}
4920
4921/**
Daniel Veillardb509f152002-04-17 16:28:10 +00004922 * xmlAutomataNewCounter:
4923 * @am: an automata
4924 * @min: the minimal value on the counter
4925 * @max: the maximal value on the counter
4926 *
4927 * Create a new counter
4928 *
4929 * Returns the counter number or -1 in case of error
4930 */
4931int
4932xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
4933 int ret;
4934
4935 if (am == NULL)
4936 return(-1);
4937
4938 ret = xmlRegGetCounter(am);
4939 if (ret < 0)
4940 return(-1);
4941 am->counters[ret].min = min;
4942 am->counters[ret].max = max;
4943 return(ret);
4944}
4945
4946/**
4947 * xmlAutomataNewCountedTrans:
4948 * @am: an automata
4949 * @from: the starting point of the transition
4950 * @to: the target point of the transition or NULL
4951 * @counter: the counter associated to that transition
4952 *
William M. Brackddf71d62004-05-06 04:17:26 +00004953 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00004954 * and then adds an epsilon transition from the @from state to the target state
4955 * which will increment the counter provided
4956 *
4957 * Returns the target state or NULL in case of error
4958 */
4959xmlAutomataStatePtr
4960xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4961 xmlAutomataStatePtr to, int counter) {
4962 if ((am == NULL) || (from == NULL) || (counter < 0))
4963 return(NULL);
4964 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
4965 if (to == NULL)
4966 return(am->state);
4967 return(to);
4968}
4969
4970/**
4971 * xmlAutomataNewCounterTrans:
4972 * @am: an automata
4973 * @from: the starting point of the transition
4974 * @to: the target point of the transition or NULL
4975 * @counter: the counter associated to that transition
4976 *
William M. Brackddf71d62004-05-06 04:17:26 +00004977 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00004978 * and then adds an epsilon transition from the @from state to the target state
4979 * which will be allowed only if the counter is within the right range.
4980 *
4981 * Returns the target state or NULL in case of error
4982 */
4983xmlAutomataStatePtr
4984xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4985 xmlAutomataStatePtr to, int counter) {
4986 if ((am == NULL) || (from == NULL) || (counter < 0))
4987 return(NULL);
4988 xmlFAGenerateCountedTransition(am, from, to, counter);
4989 if (to == NULL)
4990 return(am->state);
4991 return(to);
4992}
Daniel Veillard4255d502002-04-16 15:50:10 +00004993
4994/**
4995 * xmlAutomataCompile:
4996 * @am: an automata
4997 *
4998 * Compile the automata into a Reg Exp ready for being executed.
4999 * The automata should be free after this point.
5000 *
5001 * Returns the compiled regexp or NULL in case of error
5002 */
5003xmlRegexpPtr
5004xmlAutomataCompile(xmlAutomataPtr am) {
5005 xmlRegexpPtr ret;
5006
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00005007 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00005008 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00005009 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00005010 ret = xmlRegEpxFromParse(am);
5011
5012 return(ret);
5013}
Daniel Veillarde19fc232002-04-22 16:01:24 +00005014
5015/**
5016 * xmlAutomataIsDeterminist:
5017 * @am: an automata
5018 *
5019 * Checks if an automata is determinist.
5020 *
5021 * Returns 1 if true, 0 if not, and -1 in case of error
5022 */
5023int
5024xmlAutomataIsDeterminist(xmlAutomataPtr am) {
5025 int ret;
5026
5027 if (am == NULL)
5028 return(-1);
5029
5030 ret = xmlFAComputesDeterminism(am);
5031 return(ret);
5032}
Daniel Veillard4255d502002-04-16 15:50:10 +00005033#endif /* LIBXML_AUTOMATA_ENABLED */
5034#endif /* LIBXML_REGEXP_ENABLED */