blob: 8755b4f659fe7f6f66d71573ce0db80261c2d9cb [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
William M. Brackddf71d62004-05-06 04:17:26 +00006 * XML related specifications these include:
Daniel Veillard4255d502002-04-16 15:50:10 +00007 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22#include <stdio.h>
23#include <string.h>
Daniel Veillardebe48c62003-12-03 12:12:27 +000024#ifdef HAVE_LIMITS_H
25#include <limits.h>
26#endif
27
Daniel Veillard4255d502002-04-16 15:50:10 +000028#include <libxml/tree.h>
29#include <libxml/parserInternals.h>
30#include <libxml/xmlregexp.h>
31#include <libxml/xmlautomata.h>
32#include <libxml/xmlunicode.h>
33
Daniel Veillardebe48c62003-12-03 12:12:27 +000034#ifndef INT_MAX
35#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
36#endif
37
Daniel Veillardc0826a72004-08-10 14:17:33 +000038/* #define DEBUG_REGEXP_GRAPH */
39/* #define DEBUG_REGEXP_EXEC */
Daniel Veillard4255d502002-04-16 15:50:10 +000040/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000041/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000042
Daniel Veillardff46a042003-10-08 08:53:17 +000043#define ERROR(str) \
44 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
45 xmlRegexpErrCompile(ctxt, str);
Daniel Veillard4255d502002-04-16 15:50:10 +000046#define NEXT ctxt->cur++
47#define CUR (*(ctxt->cur))
48#define NXT(index) (ctxt->cur[index])
49
50#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
51#define NEXTL(l) ctxt->cur += l;
Daniel Veillardc0826a72004-08-10 14:17:33 +000052#define XML_REG_STRING_SEPARATOR '|'
Daniel Veillard4255d502002-04-16 15:50:10 +000053
Daniel Veillarde19fc232002-04-22 16:01:24 +000054/**
55 * TODO:
56 *
57 * macro to flag unimplemented blocks
58 */
59#define TODO \
60 xmlGenericError(xmlGenericErrorContext, \
61 "Unimplemented block at %s:%d\n", \
62 __FILE__, __LINE__);
63
Daniel Veillard4255d502002-04-16 15:50:10 +000064/************************************************************************
65 * *
66 * Datatypes and structures *
67 * *
68 ************************************************************************/
69
70typedef enum {
71 XML_REGEXP_EPSILON = 1,
72 XML_REGEXP_CHARVAL,
73 XML_REGEXP_RANGES,
74 XML_REGEXP_SUBREG,
75 XML_REGEXP_STRING,
76 XML_REGEXP_ANYCHAR, /* . */
77 XML_REGEXP_ANYSPACE, /* \s */
78 XML_REGEXP_NOTSPACE, /* \S */
79 XML_REGEXP_INITNAME, /* \l */
80 XML_REGEXP_NOTINITNAME, /* \l */
81 XML_REGEXP_NAMECHAR, /* \c */
82 XML_REGEXP_NOTNAMECHAR, /* \C */
83 XML_REGEXP_DECIMAL, /* \d */
84 XML_REGEXP_NOTDECIMAL, /* \d */
85 XML_REGEXP_REALCHAR, /* \w */
86 XML_REGEXP_NOTREALCHAR, /* \w */
87 XML_REGEXP_LETTER,
88 XML_REGEXP_LETTER_UPPERCASE,
89 XML_REGEXP_LETTER_LOWERCASE,
90 XML_REGEXP_LETTER_TITLECASE,
91 XML_REGEXP_LETTER_MODIFIER,
92 XML_REGEXP_LETTER_OTHERS,
93 XML_REGEXP_MARK,
94 XML_REGEXP_MARK_NONSPACING,
95 XML_REGEXP_MARK_SPACECOMBINING,
96 XML_REGEXP_MARK_ENCLOSING,
97 XML_REGEXP_NUMBER,
98 XML_REGEXP_NUMBER_DECIMAL,
99 XML_REGEXP_NUMBER_LETTER,
100 XML_REGEXP_NUMBER_OTHERS,
101 XML_REGEXP_PUNCT,
102 XML_REGEXP_PUNCT_CONNECTOR,
103 XML_REGEXP_PUNCT_DASH,
104 XML_REGEXP_PUNCT_OPEN,
105 XML_REGEXP_PUNCT_CLOSE,
106 XML_REGEXP_PUNCT_INITQUOTE,
107 XML_REGEXP_PUNCT_FINQUOTE,
108 XML_REGEXP_PUNCT_OTHERS,
109 XML_REGEXP_SEPAR,
110 XML_REGEXP_SEPAR_SPACE,
111 XML_REGEXP_SEPAR_LINE,
112 XML_REGEXP_SEPAR_PARA,
113 XML_REGEXP_SYMBOL,
114 XML_REGEXP_SYMBOL_MATH,
115 XML_REGEXP_SYMBOL_CURRENCY,
116 XML_REGEXP_SYMBOL_MODIFIER,
117 XML_REGEXP_SYMBOL_OTHERS,
118 XML_REGEXP_OTHER,
119 XML_REGEXP_OTHER_CONTROL,
120 XML_REGEXP_OTHER_FORMAT,
121 XML_REGEXP_OTHER_PRIVATE,
122 XML_REGEXP_OTHER_NA,
123 XML_REGEXP_BLOCK_NAME
124} xmlRegAtomType;
125
126typedef enum {
127 XML_REGEXP_QUANT_EPSILON = 1,
128 XML_REGEXP_QUANT_ONCE,
129 XML_REGEXP_QUANT_OPT,
130 XML_REGEXP_QUANT_MULT,
131 XML_REGEXP_QUANT_PLUS,
Daniel Veillard7646b182002-04-20 06:41:40 +0000132 XML_REGEXP_QUANT_ONCEONLY,
133 XML_REGEXP_QUANT_ALL,
Daniel Veillard4255d502002-04-16 15:50:10 +0000134 XML_REGEXP_QUANT_RANGE
135} xmlRegQuantType;
136
137typedef enum {
138 XML_REGEXP_START_STATE = 1,
139 XML_REGEXP_FINAL_STATE,
140 XML_REGEXP_TRANS_STATE
141} xmlRegStateType;
142
143typedef enum {
144 XML_REGEXP_MARK_NORMAL = 0,
145 XML_REGEXP_MARK_START,
146 XML_REGEXP_MARK_VISITED
147} xmlRegMarkedType;
148
149typedef struct _xmlRegRange xmlRegRange;
150typedef xmlRegRange *xmlRegRangePtr;
151
152struct _xmlRegRange {
Daniel Veillardf8b9de32003-11-24 14:27:26 +0000153 int neg; /* 0 normal, 1 not, 2 exclude */
Daniel Veillard4255d502002-04-16 15:50:10 +0000154 xmlRegAtomType type;
155 int start;
156 int end;
157 xmlChar *blockName;
158};
159
160typedef struct _xmlRegAtom xmlRegAtom;
161typedef xmlRegAtom *xmlRegAtomPtr;
162
163typedef struct _xmlAutomataState xmlRegState;
164typedef xmlRegState *xmlRegStatePtr;
165
166struct _xmlRegAtom {
167 int no;
168 xmlRegAtomType type;
169 xmlRegQuantType quant;
170 int min;
171 int max;
172
173 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000174 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000175 int neg;
176 int codepoint;
177 xmlRegStatePtr start;
178 xmlRegStatePtr stop;
179 int maxRanges;
180 int nbRanges;
181 xmlRegRangePtr *ranges;
182 void *data;
183};
184
185typedef struct _xmlRegCounter xmlRegCounter;
186typedef xmlRegCounter *xmlRegCounterPtr;
187
188struct _xmlRegCounter {
189 int min;
190 int max;
191};
192
193typedef struct _xmlRegTrans xmlRegTrans;
194typedef xmlRegTrans *xmlRegTransPtr;
195
196struct _xmlRegTrans {
197 xmlRegAtomPtr atom;
198 int to;
199 int counter;
200 int count;
201};
202
203struct _xmlAutomataState {
204 xmlRegStateType type;
205 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000206 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000207 int no;
208
209 int maxTrans;
210 int nbTrans;
211 xmlRegTrans *trans;
212};
213
214typedef struct _xmlAutomata xmlRegParserCtxt;
215typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
216
217struct _xmlAutomata {
218 xmlChar *string;
219 xmlChar *cur;
220
221 int error;
222 int neg;
223
224 xmlRegStatePtr start;
225 xmlRegStatePtr end;
226 xmlRegStatePtr state;
227
228 xmlRegAtomPtr atom;
229
230 int maxAtoms;
231 int nbAtoms;
232 xmlRegAtomPtr *atoms;
233
234 int maxStates;
235 int nbStates;
236 xmlRegStatePtr *states;
237
238 int maxCounters;
239 int nbCounters;
240 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000241
242 int determinist;
Daniel Veillard4255d502002-04-16 15:50:10 +0000243};
244
245struct _xmlRegexp {
246 xmlChar *string;
247 int nbStates;
248 xmlRegStatePtr *states;
249 int nbAtoms;
250 xmlRegAtomPtr *atoms;
251 int nbCounters;
252 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000253 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000254 /*
255 * That's the compact form for determinists automatas
256 */
257 int nbstates;
258 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000259 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000260 int nbstrings;
261 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000262};
263
264typedef struct _xmlRegExecRollback xmlRegExecRollback;
265typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
266
267struct _xmlRegExecRollback {
268 xmlRegStatePtr state;/* the current state */
269 int index; /* the index in the input stack */
270 int nextbranch; /* the next transition to explore in that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000271 int *counts; /* save the automata state if it has some */
Daniel Veillard4255d502002-04-16 15:50:10 +0000272};
273
274typedef struct _xmlRegInputToken xmlRegInputToken;
275typedef xmlRegInputToken *xmlRegInputTokenPtr;
276
277struct _xmlRegInputToken {
278 xmlChar *value;
279 void *data;
280};
281
282struct _xmlRegExecCtxt {
283 int status; /* execution status != 0 indicate an error */
William M. Brackddf71d62004-05-06 04:17:26 +0000284 int determinist; /* did we find an indeterministic behaviour */
Daniel Veillard4255d502002-04-16 15:50:10 +0000285 xmlRegexpPtr comp; /* the compiled regexp */
286 xmlRegExecCallbacks callback;
287 void *data;
288
289 xmlRegStatePtr state;/* the current state */
290 int transno; /* the current transition on that state */
William M. Brackddf71d62004-05-06 04:17:26 +0000291 int transcount; /* the number of chars in char counted transitions */
Daniel Veillard4255d502002-04-16 15:50:10 +0000292
293 /*
294 * A stack of rollback states
295 */
296 int maxRollbacks;
297 int nbRollbacks;
298 xmlRegExecRollback *rollbacks;
299
300 /*
301 * The state of the automata if any
302 */
303 int *counts;
304
305 /*
306 * The input stack
307 */
308 int inputStackMax;
309 int inputStackNr;
310 int index;
311 int *charStack;
312 const xmlChar *inputString; /* when operating on characters */
313 xmlRegInputTokenPtr inputStack;/* when operating on strings */
314
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +0000315 /*
316 * error handling
317 */
318 int errStateNo; /* the error state number */
319 xmlRegStatePtr errState; /* the error state */
320 xmlChar *errString; /* the string raising the error */
321 int *errCounts; /* counters at the error state */
Daniel Veillard4255d502002-04-16 15:50:10 +0000322};
323
Daniel Veillard441bc322002-04-20 17:38:48 +0000324#define REGEXP_ALL_COUNTER 0x123456
325#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000326
Daniel Veillard4255d502002-04-16 15:50:10 +0000327static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000328static void xmlRegFreeState(xmlRegStatePtr state);
329static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard4255d502002-04-16 15:50:10 +0000330
331/************************************************************************
Daniel Veillardff46a042003-10-08 08:53:17 +0000332 * *
333 * Regexp memory error handler *
334 * *
335 ************************************************************************/
336/**
337 * xmlRegexpErrMemory:
William M. Brackddf71d62004-05-06 04:17:26 +0000338 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000339 *
340 * Handle an out of memory condition
341 */
342static void
343xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
344{
345 const char *regexp = NULL;
346 if (ctxt != NULL) {
347 regexp = (const char *) ctxt->string;
348 ctxt->error = XML_ERR_NO_MEMORY;
349 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000350 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000351 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
352 regexp, NULL, 0, 0,
353 "Memory allocation failed : %s\n", extra);
354}
355
356/**
357 * xmlRegexpErrCompile:
William M. Brackddf71d62004-05-06 04:17:26 +0000358 * @extra: extra information
Daniel Veillardff46a042003-10-08 08:53:17 +0000359 *
William M. Brackddf71d62004-05-06 04:17:26 +0000360 * Handle a compilation failure
Daniel Veillardff46a042003-10-08 08:53:17 +0000361 */
362static void
363xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
364{
365 const char *regexp = NULL;
366 int idx = 0;
367
368 if (ctxt != NULL) {
369 regexp = (const char *) ctxt->string;
370 idx = ctxt->cur - ctxt->string;
371 ctxt->error = XML_REGEXP_COMPILE_ERROR;
372 }
Daniel Veillard659e71e2003-10-10 14:10:40 +0000373 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
Daniel Veillardff46a042003-10-08 08:53:17 +0000374 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
375 regexp, NULL, idx, 0,
376 "failed to compile: %s\n", extra);
377}
378
379/************************************************************************
Daniel Veillard4255d502002-04-16 15:50:10 +0000380 * *
381 * Allocation/Deallocation *
382 * *
383 ************************************************************************/
384
Daniel Veillard23e73572002-09-19 19:56:43 +0000385static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000386/**
387 * xmlRegEpxFromParse:
388 * @ctxt: the parser context used to build it
389 *
William M. Brackddf71d62004-05-06 04:17:26 +0000390 * Allocate a new regexp and fill it with the result from the parser
Daniel Veillard4255d502002-04-16 15:50:10 +0000391 *
392 * Returns the new regexp or NULL in case of error
393 */
394static xmlRegexpPtr
395xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
396 xmlRegexpPtr ret;
397
398 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000399 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000400 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +0000401 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000402 }
Daniel Veillard4255d502002-04-16 15:50:10 +0000403 memset(ret, 0, sizeof(xmlRegexp));
404 ret->string = ctxt->string;
Daniel Veillard4255d502002-04-16 15:50:10 +0000405 ret->nbStates = ctxt->nbStates;
Daniel Veillard4255d502002-04-16 15:50:10 +0000406 ret->states = ctxt->states;
Daniel Veillard4255d502002-04-16 15:50:10 +0000407 ret->nbAtoms = ctxt->nbAtoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000408 ret->atoms = ctxt->atoms;
Daniel Veillard4255d502002-04-16 15:50:10 +0000409 ret->nbCounters = ctxt->nbCounters;
Daniel Veillard4255d502002-04-16 15:50:10 +0000410 ret->counters = ctxt->counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000411 ret->determinist = ctxt->determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000412
413 if ((ret->determinist != 0) &&
414 (ret->nbCounters == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000415 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000416 (ret->atoms[0] != NULL) &&
417 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
418 int i, j, nbstates = 0, nbatoms = 0;
419 int *stateRemap;
420 int *stringRemap;
421 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000422 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000423 xmlChar **stringMap;
424 xmlChar *value;
425
426 /*
427 * Switch to a compact representation
428 * 1/ counting the effective number of states left
William M. Brackddf71d62004-05-06 04:17:26 +0000429 * 2/ counting the unique number of atoms, and check that
Daniel Veillard23e73572002-09-19 19:56:43 +0000430 * they are all of the string type
431 * 3/ build a table state x atom for the transitions
432 */
433
434 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000435 if (stateRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000436 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000437 xmlFree(ret);
438 return(NULL);
439 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000440 for (i = 0;i < ret->nbStates;i++) {
441 if (ret->states[i] != NULL) {
442 stateRemap[i] = nbstates;
443 nbstates++;
444 } else {
445 stateRemap[i] = -1;
446 }
447 }
448#ifdef DEBUG_COMPACTION
449 printf("Final: %d states\n", nbstates);
450#endif
451 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000452 if (stringMap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000453 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000454 xmlFree(stateRemap);
455 xmlFree(ret);
456 return(NULL);
457 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000458 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000459 if (stringRemap == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000460 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000461 xmlFree(stringMap);
462 xmlFree(stateRemap);
463 xmlFree(ret);
464 return(NULL);
465 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000466 for (i = 0;i < ret->nbAtoms;i++) {
467 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
468 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
469 value = ret->atoms[i]->valuep;
470 for (j = 0;j < nbatoms;j++) {
471 if (xmlStrEqual(stringMap[j], value)) {
472 stringRemap[i] = j;
473 break;
474 }
475 }
476 if (j >= nbatoms) {
477 stringRemap[i] = nbatoms;
478 stringMap[nbatoms] = xmlStrdup(value);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000479 if (stringMap[nbatoms] == NULL) {
480 for (i = 0;i < nbatoms;i++)
481 xmlFree(stringMap[i]);
482 xmlFree(stringRemap);
483 xmlFree(stringMap);
484 xmlFree(stateRemap);
485 xmlFree(ret);
486 return(NULL);
487 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000488 nbatoms++;
489 }
490 } else {
491 xmlFree(stateRemap);
492 xmlFree(stringRemap);
493 for (i = 0;i < nbatoms;i++)
494 xmlFree(stringMap[i]);
495 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000496 xmlFree(ret);
497 return(NULL);
Daniel Veillard23e73572002-09-19 19:56:43 +0000498 }
499 }
500#ifdef DEBUG_COMPACTION
501 printf("Final: %d atoms\n", nbatoms);
502#endif
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000503 transitions = (int *) xmlMalloc((nbstates + 1) *
504 (nbatoms + 1) * sizeof(int));
505 if (transitions == NULL) {
506 xmlFree(stateRemap);
507 xmlFree(stringRemap);
508 xmlFree(stringMap);
509 xmlFree(ret);
510 return(NULL);
511 }
512 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
Daniel Veillard23e73572002-09-19 19:56:43 +0000513
514 /*
515 * Allocate the transition table. The first entry for each
William M. Brackddf71d62004-05-06 04:17:26 +0000516 * state corresponds to the state type.
Daniel Veillard23e73572002-09-19 19:56:43 +0000517 */
Daniel Veillard118aed72002-09-24 14:13:13 +0000518 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000519
520 for (i = 0;i < ret->nbStates;i++) {
521 int stateno, atomno, targetno, prev;
522 xmlRegStatePtr state;
523 xmlRegTransPtr trans;
524
525 stateno = stateRemap[i];
526 if (stateno == -1)
527 continue;
528 state = ret->states[i];
529
530 transitions[stateno * (nbatoms + 1)] = state->type;
531
532 for (j = 0;j < state->nbTrans;j++) {
533 trans = &(state->trans[j]);
534 if ((trans->to == -1) || (trans->atom == NULL))
535 continue;
536 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000537 if ((trans->atom->data != NULL) && (transdata == NULL)) {
538 transdata = (void **) xmlMalloc(nbstates * nbatoms *
539 sizeof(void *));
540 if (transdata != NULL)
541 memset(transdata, 0,
542 nbstates * nbatoms * sizeof(void *));
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000543 else {
Daniel Veillardff46a042003-10-08 08:53:17 +0000544 xmlRegexpErrMemory(ctxt, "compiling regexp");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000545 break;
546 }
Daniel Veillard118aed72002-09-24 14:13:13 +0000547 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000548 targetno = stateRemap[trans->to];
549 /*
William M. Brackddf71d62004-05-06 04:17:26 +0000550 * if the same atom can generate transitions to 2 different
Daniel Veillard23e73572002-09-19 19:56:43 +0000551 * states then it means the automata is not determinist and
552 * the compact form can't be used !
553 */
554 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
555 if (prev != 0) {
556 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000557 ret->determinist = 0;
558#ifdef DEBUG_COMPACTION
559 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
560 i, j, trans->atom->no, trans->to, atomno, targetno);
561 printf(" previous to is %d\n", prev);
562#endif
563 ret->determinist = 0;
Daniel Veillard118aed72002-09-24 14:13:13 +0000564 if (transdata != NULL)
565 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000566 xmlFree(transitions);
567 xmlFree(stateRemap);
568 xmlFree(stringRemap);
569 for (i = 0;i < nbatoms;i++)
570 xmlFree(stringMap[i]);
571 xmlFree(stringMap);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000572 goto not_determ;
Daniel Veillard23e73572002-09-19 19:56:43 +0000573 }
574 } else {
575#if 0
576 printf("State %d trans %d: atom %d to %d : %d to %d\n",
577 i, j, trans->atom->no, trans->to, atomno, targetno);
578#endif
579 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000580 targetno + 1; /* to avoid 0 */
581 if (transdata != NULL)
582 transdata[stateno * nbatoms + atomno] =
583 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000584 }
585 }
586 }
587 ret->determinist = 1;
588#ifdef DEBUG_COMPACTION
589 /*
590 * Debug
591 */
592 for (i = 0;i < nbstates;i++) {
593 for (j = 0;j < nbatoms + 1;j++) {
594 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
595 }
596 printf("\n");
597 }
598 printf("\n");
599#endif
600 /*
601 * Cleanup of the old data
602 */
603 if (ret->states != NULL) {
604 for (i = 0;i < ret->nbStates;i++)
605 xmlRegFreeState(ret->states[i]);
606 xmlFree(ret->states);
607 }
608 ret->states = NULL;
609 ret->nbStates = 0;
610 if (ret->atoms != NULL) {
611 for (i = 0;i < ret->nbAtoms;i++)
612 xmlRegFreeAtom(ret->atoms[i]);
613 xmlFree(ret->atoms);
614 }
615 ret->atoms = NULL;
616 ret->nbAtoms = 0;
617
618 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000619 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000620 ret->stringMap = stringMap;
621 ret->nbstrings = nbatoms;
622 ret->nbstates = nbstates;
623 xmlFree(stateRemap);
624 xmlFree(stringRemap);
625 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +0000626not_determ:
627 ctxt->string = NULL;
628 ctxt->nbStates = 0;
629 ctxt->states = NULL;
630 ctxt->nbAtoms = 0;
631 ctxt->atoms = NULL;
632 ctxt->nbCounters = 0;
633 ctxt->counters = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +0000634 return(ret);
635}
636
637/**
638 * xmlRegNewParserCtxt:
639 * @string: the string to parse
640 *
641 * Allocate a new regexp parser context
642 *
643 * Returns the new context or NULL in case of error
644 */
645static xmlRegParserCtxtPtr
646xmlRegNewParserCtxt(const xmlChar *string) {
647 xmlRegParserCtxtPtr ret;
648
649 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
650 if (ret == NULL)
651 return(NULL);
652 memset(ret, 0, sizeof(xmlRegParserCtxt));
653 if (string != NULL)
654 ret->string = xmlStrdup(string);
655 ret->cur = ret->string;
656 ret->neg = 0;
657 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000658 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000659 return(ret);
660}
661
662/**
663 * xmlRegNewRange:
664 * @ctxt: the regexp parser context
665 * @neg: is that negative
666 * @type: the type of range
667 * @start: the start codepoint
668 * @end: the end codepoint
669 *
670 * Allocate a new regexp range
671 *
672 * Returns the new range or NULL in case of error
673 */
674static xmlRegRangePtr
675xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
676 int neg, xmlRegAtomType type, int start, int end) {
677 xmlRegRangePtr ret;
678
679 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
680 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000681 xmlRegexpErrMemory(ctxt, "allocating range");
Daniel Veillard4255d502002-04-16 15:50:10 +0000682 return(NULL);
683 }
684 ret->neg = neg;
685 ret->type = type;
686 ret->start = start;
687 ret->end = end;
688 return(ret);
689}
690
691/**
692 * xmlRegFreeRange:
693 * @range: the regexp range
694 *
695 * Free a regexp range
696 */
697static void
698xmlRegFreeRange(xmlRegRangePtr range) {
699 if (range == NULL)
700 return;
701
702 if (range->blockName != NULL)
703 xmlFree(range->blockName);
704 xmlFree(range);
705}
706
707/**
708 * xmlRegNewAtom:
709 * @ctxt: the regexp parser context
710 * @type: the type of atom
711 *
712 * Allocate a new regexp range
713 *
714 * Returns the new atom or NULL in case of error
715 */
716static xmlRegAtomPtr
717xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
718 xmlRegAtomPtr ret;
719
720 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
721 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000722 xmlRegexpErrMemory(ctxt, "allocating atom");
Daniel Veillard4255d502002-04-16 15:50:10 +0000723 return(NULL);
724 }
725 memset(ret, 0, sizeof(xmlRegAtom));
726 ret->type = type;
727 ret->quant = XML_REGEXP_QUANT_ONCE;
728 ret->min = 0;
729 ret->max = 0;
730 return(ret);
731}
732
733/**
734 * xmlRegFreeAtom:
735 * @atom: the regexp atom
736 *
737 * Free a regexp atom
738 */
739static void
740xmlRegFreeAtom(xmlRegAtomPtr atom) {
741 int i;
742
743 if (atom == NULL)
744 return;
745
746 for (i = 0;i < atom->nbRanges;i++)
747 xmlRegFreeRange(atom->ranges[i]);
748 if (atom->ranges != NULL)
749 xmlFree(atom->ranges);
750 if (atom->type == XML_REGEXP_STRING)
751 xmlFree(atom->valuep);
752 xmlFree(atom);
753}
754
755static xmlRegStatePtr
756xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
757 xmlRegStatePtr ret;
758
759 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
760 if (ret == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +0000761 xmlRegexpErrMemory(ctxt, "allocating state");
Daniel Veillard4255d502002-04-16 15:50:10 +0000762 return(NULL);
763 }
764 memset(ret, 0, sizeof(xmlRegState));
765 ret->type = XML_REGEXP_TRANS_STATE;
766 ret->mark = XML_REGEXP_MARK_NORMAL;
767 return(ret);
768}
769
770/**
771 * xmlRegFreeState:
772 * @state: the regexp state
773 *
774 * Free a regexp state
775 */
776static void
777xmlRegFreeState(xmlRegStatePtr state) {
778 if (state == NULL)
779 return;
780
781 if (state->trans != NULL)
782 xmlFree(state->trans);
783 xmlFree(state);
784}
785
786/**
787 * xmlRegFreeParserCtxt:
788 * @ctxt: the regexp parser context
789 *
790 * Free a regexp parser context
791 */
792static void
793xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
794 int i;
795 if (ctxt == NULL)
796 return;
797
798 if (ctxt->string != NULL)
799 xmlFree(ctxt->string);
800 if (ctxt->states != NULL) {
801 for (i = 0;i < ctxt->nbStates;i++)
802 xmlRegFreeState(ctxt->states[i]);
803 xmlFree(ctxt->states);
804 }
805 if (ctxt->atoms != NULL) {
806 for (i = 0;i < ctxt->nbAtoms;i++)
807 xmlRegFreeAtom(ctxt->atoms[i]);
808 xmlFree(ctxt->atoms);
809 }
810 if (ctxt->counters != NULL)
811 xmlFree(ctxt->counters);
812 xmlFree(ctxt);
813}
814
815/************************************************************************
816 * *
817 * Display of Data structures *
818 * *
819 ************************************************************************/
820
821static void
822xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
823 switch (type) {
824 case XML_REGEXP_EPSILON:
825 fprintf(output, "epsilon "); break;
826 case XML_REGEXP_CHARVAL:
827 fprintf(output, "charval "); break;
828 case XML_REGEXP_RANGES:
829 fprintf(output, "ranges "); break;
830 case XML_REGEXP_SUBREG:
831 fprintf(output, "subexpr "); break;
832 case XML_REGEXP_STRING:
833 fprintf(output, "string "); break;
834 case XML_REGEXP_ANYCHAR:
835 fprintf(output, "anychar "); break;
836 case XML_REGEXP_ANYSPACE:
837 fprintf(output, "anyspace "); break;
838 case XML_REGEXP_NOTSPACE:
839 fprintf(output, "notspace "); break;
840 case XML_REGEXP_INITNAME:
841 fprintf(output, "initname "); break;
842 case XML_REGEXP_NOTINITNAME:
843 fprintf(output, "notinitname "); break;
844 case XML_REGEXP_NAMECHAR:
845 fprintf(output, "namechar "); break;
846 case XML_REGEXP_NOTNAMECHAR:
847 fprintf(output, "notnamechar "); break;
848 case XML_REGEXP_DECIMAL:
849 fprintf(output, "decimal "); break;
850 case XML_REGEXP_NOTDECIMAL:
851 fprintf(output, "notdecimal "); break;
852 case XML_REGEXP_REALCHAR:
853 fprintf(output, "realchar "); break;
854 case XML_REGEXP_NOTREALCHAR:
855 fprintf(output, "notrealchar "); break;
856 case XML_REGEXP_LETTER:
857 fprintf(output, "LETTER "); break;
858 case XML_REGEXP_LETTER_UPPERCASE:
859 fprintf(output, "LETTER_UPPERCASE "); break;
860 case XML_REGEXP_LETTER_LOWERCASE:
861 fprintf(output, "LETTER_LOWERCASE "); break;
862 case XML_REGEXP_LETTER_TITLECASE:
863 fprintf(output, "LETTER_TITLECASE "); break;
864 case XML_REGEXP_LETTER_MODIFIER:
865 fprintf(output, "LETTER_MODIFIER "); break;
866 case XML_REGEXP_LETTER_OTHERS:
867 fprintf(output, "LETTER_OTHERS "); break;
868 case XML_REGEXP_MARK:
869 fprintf(output, "MARK "); break;
870 case XML_REGEXP_MARK_NONSPACING:
871 fprintf(output, "MARK_NONSPACING "); break;
872 case XML_REGEXP_MARK_SPACECOMBINING:
873 fprintf(output, "MARK_SPACECOMBINING "); break;
874 case XML_REGEXP_MARK_ENCLOSING:
875 fprintf(output, "MARK_ENCLOSING "); break;
876 case XML_REGEXP_NUMBER:
877 fprintf(output, "NUMBER "); break;
878 case XML_REGEXP_NUMBER_DECIMAL:
879 fprintf(output, "NUMBER_DECIMAL "); break;
880 case XML_REGEXP_NUMBER_LETTER:
881 fprintf(output, "NUMBER_LETTER "); break;
882 case XML_REGEXP_NUMBER_OTHERS:
883 fprintf(output, "NUMBER_OTHERS "); break;
884 case XML_REGEXP_PUNCT:
885 fprintf(output, "PUNCT "); break;
886 case XML_REGEXP_PUNCT_CONNECTOR:
887 fprintf(output, "PUNCT_CONNECTOR "); break;
888 case XML_REGEXP_PUNCT_DASH:
889 fprintf(output, "PUNCT_DASH "); break;
890 case XML_REGEXP_PUNCT_OPEN:
891 fprintf(output, "PUNCT_OPEN "); break;
892 case XML_REGEXP_PUNCT_CLOSE:
893 fprintf(output, "PUNCT_CLOSE "); break;
894 case XML_REGEXP_PUNCT_INITQUOTE:
895 fprintf(output, "PUNCT_INITQUOTE "); break;
896 case XML_REGEXP_PUNCT_FINQUOTE:
897 fprintf(output, "PUNCT_FINQUOTE "); break;
898 case XML_REGEXP_PUNCT_OTHERS:
899 fprintf(output, "PUNCT_OTHERS "); break;
900 case XML_REGEXP_SEPAR:
901 fprintf(output, "SEPAR "); break;
902 case XML_REGEXP_SEPAR_SPACE:
903 fprintf(output, "SEPAR_SPACE "); break;
904 case XML_REGEXP_SEPAR_LINE:
905 fprintf(output, "SEPAR_LINE "); break;
906 case XML_REGEXP_SEPAR_PARA:
907 fprintf(output, "SEPAR_PARA "); break;
908 case XML_REGEXP_SYMBOL:
909 fprintf(output, "SYMBOL "); break;
910 case XML_REGEXP_SYMBOL_MATH:
911 fprintf(output, "SYMBOL_MATH "); break;
912 case XML_REGEXP_SYMBOL_CURRENCY:
913 fprintf(output, "SYMBOL_CURRENCY "); break;
914 case XML_REGEXP_SYMBOL_MODIFIER:
915 fprintf(output, "SYMBOL_MODIFIER "); break;
916 case XML_REGEXP_SYMBOL_OTHERS:
917 fprintf(output, "SYMBOL_OTHERS "); break;
918 case XML_REGEXP_OTHER:
919 fprintf(output, "OTHER "); break;
920 case XML_REGEXP_OTHER_CONTROL:
921 fprintf(output, "OTHER_CONTROL "); break;
922 case XML_REGEXP_OTHER_FORMAT:
923 fprintf(output, "OTHER_FORMAT "); break;
924 case XML_REGEXP_OTHER_PRIVATE:
925 fprintf(output, "OTHER_PRIVATE "); break;
926 case XML_REGEXP_OTHER_NA:
927 fprintf(output, "OTHER_NA "); break;
928 case XML_REGEXP_BLOCK_NAME:
929 fprintf(output, "BLOCK "); break;
930 }
931}
932
933static void
934xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
935 switch (type) {
936 case XML_REGEXP_QUANT_EPSILON:
937 fprintf(output, "epsilon "); break;
938 case XML_REGEXP_QUANT_ONCE:
939 fprintf(output, "once "); break;
940 case XML_REGEXP_QUANT_OPT:
941 fprintf(output, "? "); break;
942 case XML_REGEXP_QUANT_MULT:
943 fprintf(output, "* "); break;
944 case XML_REGEXP_QUANT_PLUS:
945 fprintf(output, "+ "); break;
946 case XML_REGEXP_QUANT_RANGE:
947 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000948 case XML_REGEXP_QUANT_ONCEONLY:
949 fprintf(output, "onceonly "); break;
950 case XML_REGEXP_QUANT_ALL:
951 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000952 }
953}
954static void
955xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
956 fprintf(output, " range: ");
957 if (range->neg)
958 fprintf(output, "negative ");
959 xmlRegPrintAtomType(output, range->type);
960 fprintf(output, "%c - %c\n", range->start, range->end);
961}
962
963static void
964xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
965 fprintf(output, " atom: ");
966 if (atom == NULL) {
967 fprintf(output, "NULL\n");
968 return;
969 }
970 xmlRegPrintAtomType(output, atom->type);
971 xmlRegPrintQuantType(output, atom->quant);
972 if (atom->quant == XML_REGEXP_QUANT_RANGE)
973 fprintf(output, "%d-%d ", atom->min, atom->max);
974 if (atom->type == XML_REGEXP_STRING)
975 fprintf(output, "'%s' ", (char *) atom->valuep);
976 if (atom->type == XML_REGEXP_CHARVAL)
977 fprintf(output, "char %c\n", atom->codepoint);
978 else if (atom->type == XML_REGEXP_RANGES) {
979 int i;
980 fprintf(output, "%d entries\n", atom->nbRanges);
981 for (i = 0; i < atom->nbRanges;i++)
982 xmlRegPrintRange(output, atom->ranges[i]);
983 } else if (atom->type == XML_REGEXP_SUBREG) {
984 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
985 } else {
986 fprintf(output, "\n");
987 }
988}
989
990static void
991xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
992 fprintf(output, " trans: ");
993 if (trans == NULL) {
994 fprintf(output, "NULL\n");
995 return;
996 }
997 if (trans->to < 0) {
998 fprintf(output, "removed\n");
999 return;
1000 }
1001 if (trans->counter >= 0) {
1002 fprintf(output, "counted %d, ", trans->counter);
1003 }
Daniel Veillard8a001f62002-04-20 07:24:11 +00001004 if (trans->count == REGEXP_ALL_COUNTER) {
1005 fprintf(output, "all transition, ");
1006 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00001007 fprintf(output, "count based %d, ", trans->count);
1008 }
1009 if (trans->atom == NULL) {
1010 fprintf(output, "epsilon to %d\n", trans->to);
1011 return;
1012 }
1013 if (trans->atom->type == XML_REGEXP_CHARVAL)
1014 fprintf(output, "char %c ", trans->atom->codepoint);
1015 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1016}
1017
1018static void
1019xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1020 int i;
1021
1022 fprintf(output, " state: ");
1023 if (state == NULL) {
1024 fprintf(output, "NULL\n");
1025 return;
1026 }
1027 if (state->type == XML_REGEXP_START_STATE)
1028 fprintf(output, "START ");
1029 if (state->type == XML_REGEXP_FINAL_STATE)
1030 fprintf(output, "FINAL ");
1031
1032 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1033 for (i = 0;i < state->nbTrans; i++) {
1034 xmlRegPrintTrans(output, &(state->trans[i]));
1035 }
1036}
1037
Daniel Veillard23e73572002-09-19 19:56:43 +00001038#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard4255d502002-04-16 15:50:10 +00001039static void
1040xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1041 int i;
1042
1043 fprintf(output, " ctxt: ");
1044 if (ctxt == NULL) {
1045 fprintf(output, "NULL\n");
1046 return;
1047 }
1048 fprintf(output, "'%s' ", ctxt->string);
1049 if (ctxt->error)
1050 fprintf(output, "error ");
1051 if (ctxt->neg)
1052 fprintf(output, "neg ");
1053 fprintf(output, "\n");
1054 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1055 for (i = 0;i < ctxt->nbAtoms; i++) {
1056 fprintf(output, " %02d ", i);
1057 xmlRegPrintAtom(output, ctxt->atoms[i]);
1058 }
1059 if (ctxt->atom != NULL) {
1060 fprintf(output, "current atom:\n");
1061 xmlRegPrintAtom(output, ctxt->atom);
1062 }
1063 fprintf(output, "%d states:", ctxt->nbStates);
1064 if (ctxt->start != NULL)
1065 fprintf(output, " start: %d", ctxt->start->no);
1066 if (ctxt->end != NULL)
1067 fprintf(output, " end: %d", ctxt->end->no);
1068 fprintf(output, "\n");
1069 for (i = 0;i < ctxt->nbStates; i++) {
1070 xmlRegPrintState(output, ctxt->states[i]);
1071 }
1072 fprintf(output, "%d counters:\n", ctxt->nbCounters);
1073 for (i = 0;i < ctxt->nbCounters; i++) {
1074 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1075 ctxt->counters[i].max);
1076 }
1077}
Daniel Veillard23e73572002-09-19 19:56:43 +00001078#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00001079
1080/************************************************************************
1081 * *
1082 * Finite Automata structures manipulations *
1083 * *
1084 ************************************************************************/
1085
1086static void
1087xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1088 int neg, xmlRegAtomType type, int start, int end,
1089 xmlChar *blockName) {
1090 xmlRegRangePtr range;
1091
1092 if (atom == NULL) {
1093 ERROR("add range: atom is NULL");
1094 return;
1095 }
1096 if (atom->type != XML_REGEXP_RANGES) {
1097 ERROR("add range: atom is not ranges");
1098 return;
1099 }
1100 if (atom->maxRanges == 0) {
1101 atom->maxRanges = 4;
1102 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1103 sizeof(xmlRegRangePtr));
1104 if (atom->ranges == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001105 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001106 atom->maxRanges = 0;
1107 return;
1108 }
1109 } else if (atom->nbRanges >= atom->maxRanges) {
1110 xmlRegRangePtr *tmp;
1111 atom->maxRanges *= 2;
1112 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1113 sizeof(xmlRegRangePtr));
1114 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001115 xmlRegexpErrMemory(ctxt, "adding ranges");
Daniel Veillard4255d502002-04-16 15:50:10 +00001116 atom->maxRanges /= 2;
1117 return;
1118 }
1119 atom->ranges = tmp;
1120 }
1121 range = xmlRegNewRange(ctxt, neg, type, start, end);
1122 if (range == NULL)
1123 return;
1124 range->blockName = blockName;
1125 atom->ranges[atom->nbRanges++] = range;
1126
1127}
1128
1129static int
1130xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1131 if (ctxt->maxCounters == 0) {
1132 ctxt->maxCounters = 4;
1133 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1134 sizeof(xmlRegCounter));
1135 if (ctxt->counters == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001136 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001137 ctxt->maxCounters = 0;
1138 return(-1);
1139 }
1140 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1141 xmlRegCounter *tmp;
1142 ctxt->maxCounters *= 2;
1143 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1144 sizeof(xmlRegCounter));
1145 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001146 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001147 ctxt->maxCounters /= 2;
1148 return(-1);
1149 }
1150 ctxt->counters = tmp;
1151 }
1152 ctxt->counters[ctxt->nbCounters].min = -1;
1153 ctxt->counters[ctxt->nbCounters].max = -1;
1154 return(ctxt->nbCounters++);
1155}
1156
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001157static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001158xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1159 if (atom == NULL) {
1160 ERROR("atom push: atom is NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001161 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001162 }
1163 if (ctxt->maxAtoms == 0) {
1164 ctxt->maxAtoms = 4;
1165 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1166 sizeof(xmlRegAtomPtr));
1167 if (ctxt->atoms == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001168 xmlRegexpErrMemory(ctxt, "pushing atom");
Daniel Veillard4255d502002-04-16 15:50:10 +00001169 ctxt->maxAtoms = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001170 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001171 }
1172 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1173 xmlRegAtomPtr *tmp;
1174 ctxt->maxAtoms *= 2;
1175 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1176 sizeof(xmlRegAtomPtr));
1177 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001178 xmlRegexpErrMemory(ctxt, "allocating counter");
Daniel Veillard4255d502002-04-16 15:50:10 +00001179 ctxt->maxAtoms /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001180 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001181 }
1182 ctxt->atoms = tmp;
1183 }
1184 atom->no = ctxt->nbAtoms;
1185 ctxt->atoms[ctxt->nbAtoms++] = atom;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001186 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001187}
1188
1189static void
1190xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1191 xmlRegAtomPtr atom, xmlRegStatePtr target,
1192 int counter, int count) {
William M. Brackf9b5fa22004-05-10 07:52:15 +00001193
1194 int nrtrans;
1195
Daniel Veillard4255d502002-04-16 15:50:10 +00001196 if (state == NULL) {
1197 ERROR("add state: state is NULL");
1198 return;
1199 }
1200 if (target == NULL) {
1201 ERROR("add state: target is NULL");
1202 return;
1203 }
William M. Brackf9b5fa22004-05-10 07:52:15 +00001204 /*
1205 * Other routines follow the philosophy 'When in doubt, add a transition'
1206 * so we check here whether such a transition is already present and, if
1207 * so, silently ignore this request.
1208 */
1209
1210 for (nrtrans=0; nrtrans<state->nbTrans; nrtrans++) {
1211 if ((state->trans[nrtrans].atom == atom) &&
1212 (state->trans[nrtrans].to == target->no) &&
1213 (state->trans[nrtrans].counter == counter) &&
1214 (state->trans[nrtrans].count == count)) {
1215#ifdef DEBUG_REGEXP_GRAPH
1216 printf("Ignoring duplicate transition from %d to %d\n",
1217 state->no, target->no);
1218#endif
1219 return;
1220 }
1221 }
1222
Daniel Veillard4255d502002-04-16 15:50:10 +00001223 if (state->maxTrans == 0) {
1224 state->maxTrans = 4;
1225 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1226 sizeof(xmlRegTrans));
1227 if (state->trans == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001228 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001229 state->maxTrans = 0;
1230 return;
1231 }
1232 } else if (state->nbTrans >= state->maxTrans) {
1233 xmlRegTrans *tmp;
1234 state->maxTrans *= 2;
1235 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1236 sizeof(xmlRegTrans));
1237 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001238 xmlRegexpErrMemory(ctxt, "adding transition");
Daniel Veillard4255d502002-04-16 15:50:10 +00001239 state->maxTrans /= 2;
1240 return;
1241 }
1242 state->trans = tmp;
1243 }
1244#ifdef DEBUG_REGEXP_GRAPH
1245 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001246 if (count == REGEXP_ALL_COUNTER)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001247 printf("all transition\n");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001248 else if (count >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001249 printf("count based %d\n", count);
Daniel Veillard4255d502002-04-16 15:50:10 +00001250 else if (counter >= 0)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001251 printf("counted %d\n", counter);
Daniel Veillard4255d502002-04-16 15:50:10 +00001252 else if (atom == NULL)
Daniel Veillard2cbf5962004-03-31 15:50:43 +00001253 printf("epsilon transition\n");
1254 else if (atom != NULL)
1255 xmlRegPrintAtom(stdout, atom);
Daniel Veillard4255d502002-04-16 15:50:10 +00001256#endif
1257
1258 state->trans[state->nbTrans].atom = atom;
1259 state->trans[state->nbTrans].to = target->no;
1260 state->trans[state->nbTrans].counter = counter;
1261 state->trans[state->nbTrans].count = count;
1262 state->nbTrans++;
1263}
1264
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001265static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001266xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001267 if (state == NULL) return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001268 if (ctxt->maxStates == 0) {
1269 ctxt->maxStates = 4;
1270 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1271 sizeof(xmlRegStatePtr));
1272 if (ctxt->states == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001273 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001274 ctxt->maxStates = 0;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001275 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001276 }
1277 } else if (ctxt->nbStates >= ctxt->maxStates) {
1278 xmlRegStatePtr *tmp;
1279 ctxt->maxStates *= 2;
1280 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1281 sizeof(xmlRegStatePtr));
1282 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00001283 xmlRegexpErrMemory(ctxt, "adding state");
Daniel Veillard4255d502002-04-16 15:50:10 +00001284 ctxt->maxStates /= 2;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001285 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001286 }
1287 ctxt->states = tmp;
1288 }
1289 state->no = ctxt->nbStates;
1290 ctxt->states[ctxt->nbStates++] = state;
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001291 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001292}
1293
1294/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001295 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001296 * @ctxt: a regexp parser context
1297 * @from: the from state
1298 * @to: the target state or NULL for building a new one
1299 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001300 *
1301 */
1302static void
1303xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001304 xmlRegStatePtr from, xmlRegStatePtr to,
1305 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001306 if (to == NULL) {
1307 to = xmlRegNewState(ctxt);
1308 xmlRegStatePush(ctxt, to);
1309 ctxt->state = to;
1310 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001311 if (lax)
1312 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1313 else
1314 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001315}
1316
1317/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001318 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001319 * @ctxt: a regexp parser context
1320 * @from: the from state
1321 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001322 *
1323 */
1324static void
1325xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1326 xmlRegStatePtr from, xmlRegStatePtr to) {
1327 if (to == NULL) {
1328 to = xmlRegNewState(ctxt);
1329 xmlRegStatePush(ctxt, to);
1330 ctxt->state = to;
1331 }
1332 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1333}
1334
1335/**
1336 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001337 * @ctxt: a regexp parser context
1338 * @from: the from state
1339 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001340 * counter: the counter for that transition
1341 *
1342 */
1343static void
1344xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1345 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1346 if (to == NULL) {
1347 to = xmlRegNewState(ctxt);
1348 xmlRegStatePush(ctxt, to);
1349 ctxt->state = to;
1350 }
1351 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1352}
1353
1354/**
1355 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001356 * @ctxt: a regexp parser context
1357 * @from: the from state
1358 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001359 * counter: the counter for that transition
1360 *
1361 */
1362static void
1363xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1364 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1365 if (to == NULL) {
1366 to = xmlRegNewState(ctxt);
1367 xmlRegStatePush(ctxt, to);
1368 ctxt->state = to;
1369 }
1370 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1371}
1372
1373/**
1374 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001375 * @ctxt: a regexp parser context
1376 * @from: the from state
1377 * @to: the target state or NULL for building a new one
1378 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001379 *
William M. Brackddf71d62004-05-06 04:17:26 +00001380 * Returns 0 if success and -1 in case of error.
Daniel Veillard4255d502002-04-16 15:50:10 +00001381 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001382static int
Daniel Veillard4255d502002-04-16 15:50:10 +00001383xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1384 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1385 if (atom == NULL) {
1386 ERROR("genrate transition: atom == NULL");
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001387 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001388 }
1389 if (atom->type == XML_REGEXP_SUBREG) {
1390 /*
1391 * this is a subexpression handling one should not need to
William M. Brackddf71d62004-05-06 04:17:26 +00001392 * create a new node except for XML_REGEXP_QUANT_RANGE.
Daniel Veillard4255d502002-04-16 15:50:10 +00001393 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001394 if (xmlRegAtomPush(ctxt, atom) < 0) {
1395 return(-1);
1396 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001397 if ((to != NULL) && (atom->stop != to) &&
1398 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1399 /*
1400 * Generate an epsilon transition to link to the target
1401 */
1402 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1403 }
1404 switch (atom->quant) {
1405 case XML_REGEXP_QUANT_OPT:
1406 atom->quant = XML_REGEXP_QUANT_ONCE;
1407 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1408 break;
1409 case XML_REGEXP_QUANT_MULT:
1410 atom->quant = XML_REGEXP_QUANT_ONCE;
1411 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1412 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1413 break;
1414 case XML_REGEXP_QUANT_PLUS:
1415 atom->quant = XML_REGEXP_QUANT_ONCE;
1416 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1417 break;
1418 case XML_REGEXP_QUANT_RANGE: {
1419 int counter;
1420 xmlRegStatePtr newstate;
1421
1422 /*
1423 * This one is nasty:
William M. Brackddf71d62004-05-06 04:17:26 +00001424 * 1/ if range has minOccurs == 0, create a new state
1425 * and create epsilon transitions from atom->start
1426 * to atom->stop, as well as atom->start to the new
1427 * state
1428 * 2/ register a new counter
1429 * 3/ register an epsilon transition associated to
Daniel Veillard4255d502002-04-16 15:50:10 +00001430 * this counter going from atom->stop to atom->start
William M. Brackddf71d62004-05-06 04:17:26 +00001431 * 4/ create a new state
1432 * 5/ generate a counted transition from atom->stop to
Daniel Veillard4255d502002-04-16 15:50:10 +00001433 * that state
1434 */
William M. Brackddf71d62004-05-06 04:17:26 +00001435 if (atom->min == 0) {
1436 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1437 atom->stop);
1438 newstate = xmlRegNewState(ctxt);
1439 xmlRegStatePush(ctxt, newstate);
1440 ctxt->state = newstate;
1441 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1442 newstate);
1443 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001444 counter = xmlRegGetCounter(ctxt);
1445 ctxt->counters[counter].min = atom->min - 1;
1446 ctxt->counters[counter].max = atom->max - 1;
1447 atom->min = 0;
1448 atom->max = 0;
1449 atom->quant = XML_REGEXP_QUANT_ONCE;
1450 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1451 atom->start, counter);
1452 if (to != NULL) {
1453 newstate = to;
1454 } else {
1455 newstate = xmlRegNewState(ctxt);
1456 xmlRegStatePush(ctxt, newstate);
1457 ctxt->state = newstate;
1458 }
1459 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1460 newstate, counter);
1461 }
1462 default:
1463 break;
1464 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001465 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001466 } else {
1467 if (to == NULL) {
1468 to = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001469 if (to != NULL)
1470 xmlRegStatePush(ctxt, to);
1471 else {
1472 return(-1);
1473 }
1474 }
1475 if (xmlRegAtomPush(ctxt, atom) < 0) {
1476 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001477 }
1478 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
Daniel Veillard4255d502002-04-16 15:50:10 +00001479 ctxt->state = to;
1480 }
1481 switch (atom->quant) {
1482 case XML_REGEXP_QUANT_OPT:
1483 atom->quant = XML_REGEXP_QUANT_ONCE;
1484 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1485 break;
1486 case XML_REGEXP_QUANT_MULT:
1487 atom->quant = XML_REGEXP_QUANT_ONCE;
1488 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1489 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1490 break;
1491 case XML_REGEXP_QUANT_PLUS:
1492 atom->quant = XML_REGEXP_QUANT_ONCE;
1493 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1494 break;
1495 default:
1496 break;
1497 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001498 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00001499}
1500
1501/**
1502 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001503 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001504 * @fromnr: the from state
1505 * @tonr: the to state
William M. Brackddf71d62004-05-06 04:17:26 +00001506 * @counter: should that transition be associated to a counted
Daniel Veillard4255d502002-04-16 15:50:10 +00001507 *
1508 */
1509static void
1510xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1511 int tonr, int counter) {
1512 int transnr;
1513 xmlRegStatePtr from;
1514 xmlRegStatePtr to;
1515
1516#ifdef DEBUG_REGEXP_GRAPH
1517 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1518#endif
1519 from = ctxt->states[fromnr];
1520 if (from == NULL)
1521 return;
1522 to = ctxt->states[tonr];
1523 if (to == NULL)
1524 return;
1525 if ((to->mark == XML_REGEXP_MARK_START) ||
1526 (to->mark == XML_REGEXP_MARK_VISITED))
1527 return;
1528
1529 to->mark = XML_REGEXP_MARK_VISITED;
1530 if (to->type == XML_REGEXP_FINAL_STATE) {
1531#ifdef DEBUG_REGEXP_GRAPH
1532 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1533#endif
1534 from->type = XML_REGEXP_FINAL_STATE;
1535 }
1536 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1537 if (to->trans[transnr].atom == NULL) {
1538 /*
1539 * Don't remove counted transitions
1540 * Don't loop either
1541 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001542 if (to->trans[transnr].to != fromnr) {
1543 if (to->trans[transnr].count >= 0) {
1544 int newto = to->trans[transnr].to;
1545
1546 xmlRegStateAddTrans(ctxt, from, NULL,
1547 ctxt->states[newto],
1548 -1, to->trans[transnr].count);
1549 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001550#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001551 printf("Found epsilon trans %d from %d to %d\n",
1552 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001553#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001554 if (to->trans[transnr].counter >= 0) {
1555 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1556 to->trans[transnr].to,
1557 to->trans[transnr].counter);
1558 } else {
1559 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1560 to->trans[transnr].to,
1561 counter);
1562 }
1563 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001564 }
1565 } else {
1566 int newto = to->trans[transnr].to;
1567
Daniel Veillardb509f152002-04-17 16:28:10 +00001568 if (to->trans[transnr].counter >= 0) {
1569 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1570 ctxt->states[newto],
1571 to->trans[transnr].counter, -1);
1572 } else {
1573 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1574 ctxt->states[newto], counter, -1);
1575 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001576 }
1577 }
1578 to->mark = XML_REGEXP_MARK_NORMAL;
1579}
1580
1581/**
1582 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001583 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001584 *
1585 */
1586static void
1587xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1588 int statenr, transnr;
1589 xmlRegStatePtr state;
1590
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00001591 if (ctxt->states == NULL) return;
1592
1593
Daniel Veillard4255d502002-04-16 15:50:10 +00001594 /*
1595 * build the completed transitions bypassing the epsilons
1596 * Use a marking algorithm to avoid loops
1597 */
1598 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1599 state = ctxt->states[statenr];
1600 if (state == NULL)
1601 continue;
1602 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1603 if ((state->trans[transnr].atom == NULL) &&
1604 (state->trans[transnr].to >= 0)) {
1605 if (state->trans[transnr].to == statenr) {
1606 state->trans[transnr].to = -1;
1607#ifdef DEBUG_REGEXP_GRAPH
1608 printf("Removed loopback epsilon trans %d on %d\n",
1609 transnr, statenr);
1610#endif
1611 } else if (state->trans[transnr].count < 0) {
1612 int newto = state->trans[transnr].to;
1613
1614#ifdef DEBUG_REGEXP_GRAPH
1615 printf("Found epsilon trans %d from %d to %d\n",
1616 transnr, statenr, newto);
1617#endif
1618 state->mark = XML_REGEXP_MARK_START;
1619 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1620 newto, state->trans[transnr].counter);
1621 state->mark = XML_REGEXP_MARK_NORMAL;
1622#ifdef DEBUG_REGEXP_GRAPH
1623 } else {
1624 printf("Found counted transition %d on %d\n",
1625 transnr, statenr);
1626#endif
1627 }
1628 }
1629 }
1630 }
1631 /*
1632 * Eliminate the epsilon transitions
1633 */
1634 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1635 state = ctxt->states[statenr];
1636 if (state == NULL)
1637 continue;
1638 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1639 if ((state->trans[transnr].atom == NULL) &&
1640 (state->trans[transnr].count < 0) &&
1641 (state->trans[transnr].to >= 0)) {
1642 state->trans[transnr].to = -1;
1643 }
1644 }
1645 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001646
1647 /*
1648 * Use this pass to detect unreachable states too
1649 */
1650 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1651 state = ctxt->states[statenr];
1652 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001653 state->reached = XML_REGEXP_MARK_NORMAL;
Daniel Veillard23e73572002-09-19 19:56:43 +00001654 }
1655 state = ctxt->states[0];
1656 if (state != NULL)
William M. Brack779af002003-08-01 15:55:39 +00001657 state->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001658 while (state != NULL) {
1659 xmlRegStatePtr target = NULL;
William M. Brack779af002003-08-01 15:55:39 +00001660 state->reached = XML_REGEXP_MARK_VISITED;
Daniel Veillard23e73572002-09-19 19:56:43 +00001661 /*
William M. Brackddf71d62004-05-06 04:17:26 +00001662 * Mark all states reachable from the current reachable state
Daniel Veillard23e73572002-09-19 19:56:43 +00001663 */
1664 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1665 if ((state->trans[transnr].to >= 0) &&
1666 ((state->trans[transnr].atom != NULL) ||
1667 (state->trans[transnr].count >= 0))) {
1668 int newto = state->trans[transnr].to;
1669
1670 if (ctxt->states[newto] == NULL)
1671 continue;
William M. Brack779af002003-08-01 15:55:39 +00001672 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1673 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
Daniel Veillard23e73572002-09-19 19:56:43 +00001674 target = ctxt->states[newto];
1675 }
1676 }
1677 }
1678 /*
1679 * find the next accessible state not explored
1680 */
1681 if (target == NULL) {
1682 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1683 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001684 if ((state != NULL) && (state->reached ==
1685 XML_REGEXP_MARK_START)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001686 target = state;
1687 break;
1688 }
1689 }
1690 }
1691 state = target;
1692 }
1693 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1694 state = ctxt->states[statenr];
William M. Brack779af002003-08-01 15:55:39 +00001695 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
Daniel Veillard23e73572002-09-19 19:56:43 +00001696#ifdef DEBUG_REGEXP_GRAPH
1697 printf("Removed unreachable state %d\n", statenr);
1698#endif
1699 xmlRegFreeState(state);
1700 ctxt->states[statenr] = NULL;
1701 }
1702 }
1703
Daniel Veillard4255d502002-04-16 15:50:10 +00001704}
1705
Daniel Veillarde19fc232002-04-22 16:01:24 +00001706/**
1707 * xmlFACompareAtoms:
1708 * @atom1: an atom
1709 * @atom2: an atom
1710 *
William M. Brackddf71d62004-05-06 04:17:26 +00001711 * Compares two atoms to check whether they are equivalents
Daniel Veillarde19fc232002-04-22 16:01:24 +00001712 *
1713 * Returns 1 if yes and 0 otherwise
1714 */
1715static int
1716xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
1717 if (atom1 == atom2)
1718 return(1);
1719 if ((atom1 == NULL) || (atom2 == NULL))
1720 return(0);
1721
1722 if (atom1->type != atom2->type)
1723 return(0);
1724 switch (atom1->type) {
1725 case XML_REGEXP_STRING:
1726 return(xmlStrEqual((xmlChar *)atom1->valuep,
1727 (xmlChar *)atom2->valuep));
1728 case XML_REGEXP_EPSILON:
1729 return(1);
1730 case XML_REGEXP_CHARVAL:
1731 return(atom1->codepoint == atom2->codepoint);
1732 case XML_REGEXP_RANGES:
1733 TODO;
1734 return(0);
1735 default:
1736 break;
1737 }
1738 return(1);
1739}
1740
1741/**
1742 * xmlFARecurseDeterminism:
1743 * @ctxt: a regexp parser context
1744 *
1745 * Check whether the associated regexp is determinist,
1746 * should be called after xmlFAEliminateEpsilonTransitions()
1747 *
1748 */
1749static int
1750xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1751 int to, xmlRegAtomPtr atom) {
1752 int ret = 1;
1753 int transnr;
1754 xmlRegTransPtr t1;
1755
1756 if (state == NULL)
1757 return(ret);
1758 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1759 t1 = &(state->trans[transnr]);
1760 /*
1761 * check transitions conflicting with the one looked at
1762 */
1763 if (t1->atom == NULL) {
1764 if (t1->to == -1)
1765 continue;
1766 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1767 to, atom);
1768 if (ret == 0)
1769 return(0);
1770 continue;
1771 }
1772 if (t1->to != to)
1773 continue;
1774 if (xmlFACompareAtoms(t1->atom, atom))
1775 return(0);
1776 }
1777 return(ret);
1778}
1779
1780/**
1781 * xmlFAComputesDeterminism:
1782 * @ctxt: a regexp parser context
1783 *
1784 * Check whether the associated regexp is determinist,
1785 * should be called after xmlFAEliminateEpsilonTransitions()
1786 *
1787 */
1788static int
1789xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
1790 int statenr, transnr;
1791 xmlRegStatePtr state;
1792 xmlRegTransPtr t1, t2;
1793 int i;
1794 int ret = 1;
1795
Daniel Veillard4402ab42002-09-12 16:02:56 +00001796#ifdef DEBUG_REGEXP_GRAPH
1797 printf("xmlFAComputesDeterminism\n");
1798 xmlRegPrintCtxt(stdout, ctxt);
1799#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00001800 if (ctxt->determinist != -1)
1801 return(ctxt->determinist);
1802
1803 /*
William M. Brackddf71d62004-05-06 04:17:26 +00001804 * Check for all states that there aren't 2 transitions
Daniel Veillarde19fc232002-04-22 16:01:24 +00001805 * with the same atom and a different target.
1806 */
1807 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1808 state = ctxt->states[statenr];
1809 if (state == NULL)
1810 continue;
1811 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1812 t1 = &(state->trans[transnr]);
1813 /*
1814 * Determinism checks in case of counted or all transitions
1815 * will have to be handled separately
1816 */
1817 if (t1->atom == NULL)
1818 continue;
1819 if (t1->to == -1) /* eliminated */
1820 continue;
1821 for (i = 0;i < transnr;i++) {
1822 t2 = &(state->trans[i]);
1823 if (t2->to == -1) /* eliminated */
1824 continue;
1825 if (t2->atom != NULL) {
1826 if (t1->to == t2->to) {
1827 if (xmlFACompareAtoms(t1->atom, t2->atom))
William M. Brackddf71d62004-05-06 04:17:26 +00001828 t2->to = -1; /* eliminated */
Daniel Veillarde19fc232002-04-22 16:01:24 +00001829 } else {
1830 /* not determinist ! */
1831 if (xmlFACompareAtoms(t1->atom, t2->atom))
1832 ret = 0;
1833 }
1834 } else if (t1->to != -1) {
1835 /*
1836 * do the closure in case of remaining specific
1837 * epsilon transitions like choices or all
1838 */
1839 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1840 t2->to, t2->atom);
1841 if (ret == 0)
1842 return(0);
1843 }
1844 }
1845 if (ret == 0)
1846 break;
1847 }
1848 if (ret == 0)
1849 break;
1850 }
1851 ctxt->determinist = ret;
1852 return(ret);
1853}
1854
Daniel Veillard4255d502002-04-16 15:50:10 +00001855/************************************************************************
1856 * *
1857 * Routines to check input against transition atoms *
1858 * *
1859 ************************************************************************/
1860
1861static int
1862xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
1863 int start, int end, const xmlChar *blockName) {
1864 int ret = 0;
1865
1866 switch (type) {
1867 case XML_REGEXP_STRING:
1868 case XML_REGEXP_SUBREG:
1869 case XML_REGEXP_RANGES:
1870 case XML_REGEXP_EPSILON:
1871 return(-1);
1872 case XML_REGEXP_ANYCHAR:
1873 ret = ((codepoint != '\n') && (codepoint != '\r'));
1874 break;
1875 case XML_REGEXP_CHARVAL:
1876 ret = ((codepoint >= start) && (codepoint <= end));
1877 break;
1878 case XML_REGEXP_NOTSPACE:
1879 neg = !neg;
1880 case XML_REGEXP_ANYSPACE:
1881 ret = ((codepoint == '\n') || (codepoint == '\r') ||
1882 (codepoint == '\t') || (codepoint == ' '));
1883 break;
1884 case XML_REGEXP_NOTINITNAME:
1885 neg = !neg;
1886 case XML_REGEXP_INITNAME:
William M. Brack871611b2003-10-18 04:53:14 +00001887 ret = (IS_LETTER(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00001888 (codepoint == '_') || (codepoint == ':'));
1889 break;
1890 case XML_REGEXP_NOTNAMECHAR:
1891 neg = !neg;
1892 case XML_REGEXP_NAMECHAR:
William M. Brack871611b2003-10-18 04:53:14 +00001893 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
Daniel Veillard4255d502002-04-16 15:50:10 +00001894 (codepoint == '.') || (codepoint == '-') ||
1895 (codepoint == '_') || (codepoint == ':') ||
William M. Brack871611b2003-10-18 04:53:14 +00001896 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
Daniel Veillard4255d502002-04-16 15:50:10 +00001897 break;
1898 case XML_REGEXP_NOTDECIMAL:
1899 neg = !neg;
1900 case XML_REGEXP_DECIMAL:
1901 ret = xmlUCSIsCatNd(codepoint);
1902 break;
1903 case XML_REGEXP_REALCHAR:
1904 neg = !neg;
1905 case XML_REGEXP_NOTREALCHAR:
1906 ret = xmlUCSIsCatP(codepoint);
1907 if (ret == 0)
1908 ret = xmlUCSIsCatZ(codepoint);
1909 if (ret == 0)
1910 ret = xmlUCSIsCatC(codepoint);
1911 break;
1912 case XML_REGEXP_LETTER:
1913 ret = xmlUCSIsCatL(codepoint);
1914 break;
1915 case XML_REGEXP_LETTER_UPPERCASE:
1916 ret = xmlUCSIsCatLu(codepoint);
1917 break;
1918 case XML_REGEXP_LETTER_LOWERCASE:
1919 ret = xmlUCSIsCatLl(codepoint);
1920 break;
1921 case XML_REGEXP_LETTER_TITLECASE:
1922 ret = xmlUCSIsCatLt(codepoint);
1923 break;
1924 case XML_REGEXP_LETTER_MODIFIER:
1925 ret = xmlUCSIsCatLm(codepoint);
1926 break;
1927 case XML_REGEXP_LETTER_OTHERS:
1928 ret = xmlUCSIsCatLo(codepoint);
1929 break;
1930 case XML_REGEXP_MARK:
1931 ret = xmlUCSIsCatM(codepoint);
1932 break;
1933 case XML_REGEXP_MARK_NONSPACING:
1934 ret = xmlUCSIsCatMn(codepoint);
1935 break;
1936 case XML_REGEXP_MARK_SPACECOMBINING:
1937 ret = xmlUCSIsCatMc(codepoint);
1938 break;
1939 case XML_REGEXP_MARK_ENCLOSING:
1940 ret = xmlUCSIsCatMe(codepoint);
1941 break;
1942 case XML_REGEXP_NUMBER:
1943 ret = xmlUCSIsCatN(codepoint);
1944 break;
1945 case XML_REGEXP_NUMBER_DECIMAL:
1946 ret = xmlUCSIsCatNd(codepoint);
1947 break;
1948 case XML_REGEXP_NUMBER_LETTER:
1949 ret = xmlUCSIsCatNl(codepoint);
1950 break;
1951 case XML_REGEXP_NUMBER_OTHERS:
1952 ret = xmlUCSIsCatNo(codepoint);
1953 break;
1954 case XML_REGEXP_PUNCT:
1955 ret = xmlUCSIsCatP(codepoint);
1956 break;
1957 case XML_REGEXP_PUNCT_CONNECTOR:
1958 ret = xmlUCSIsCatPc(codepoint);
1959 break;
1960 case XML_REGEXP_PUNCT_DASH:
1961 ret = xmlUCSIsCatPd(codepoint);
1962 break;
1963 case XML_REGEXP_PUNCT_OPEN:
1964 ret = xmlUCSIsCatPs(codepoint);
1965 break;
1966 case XML_REGEXP_PUNCT_CLOSE:
1967 ret = xmlUCSIsCatPe(codepoint);
1968 break;
1969 case XML_REGEXP_PUNCT_INITQUOTE:
1970 ret = xmlUCSIsCatPi(codepoint);
1971 break;
1972 case XML_REGEXP_PUNCT_FINQUOTE:
1973 ret = xmlUCSIsCatPf(codepoint);
1974 break;
1975 case XML_REGEXP_PUNCT_OTHERS:
1976 ret = xmlUCSIsCatPo(codepoint);
1977 break;
1978 case XML_REGEXP_SEPAR:
1979 ret = xmlUCSIsCatZ(codepoint);
1980 break;
1981 case XML_REGEXP_SEPAR_SPACE:
1982 ret = xmlUCSIsCatZs(codepoint);
1983 break;
1984 case XML_REGEXP_SEPAR_LINE:
1985 ret = xmlUCSIsCatZl(codepoint);
1986 break;
1987 case XML_REGEXP_SEPAR_PARA:
1988 ret = xmlUCSIsCatZp(codepoint);
1989 break;
1990 case XML_REGEXP_SYMBOL:
1991 ret = xmlUCSIsCatS(codepoint);
1992 break;
1993 case XML_REGEXP_SYMBOL_MATH:
1994 ret = xmlUCSIsCatSm(codepoint);
1995 break;
1996 case XML_REGEXP_SYMBOL_CURRENCY:
1997 ret = xmlUCSIsCatSc(codepoint);
1998 break;
1999 case XML_REGEXP_SYMBOL_MODIFIER:
2000 ret = xmlUCSIsCatSk(codepoint);
2001 break;
2002 case XML_REGEXP_SYMBOL_OTHERS:
2003 ret = xmlUCSIsCatSo(codepoint);
2004 break;
2005 case XML_REGEXP_OTHER:
2006 ret = xmlUCSIsCatC(codepoint);
2007 break;
2008 case XML_REGEXP_OTHER_CONTROL:
2009 ret = xmlUCSIsCatCc(codepoint);
2010 break;
2011 case XML_REGEXP_OTHER_FORMAT:
2012 ret = xmlUCSIsCatCf(codepoint);
2013 break;
2014 case XML_REGEXP_OTHER_PRIVATE:
2015 ret = xmlUCSIsCatCo(codepoint);
2016 break;
2017 case XML_REGEXP_OTHER_NA:
2018 /* ret = xmlUCSIsCatCn(codepoint); */
2019 /* Seems it doesn't exist anymore in recent Unicode releases */
2020 ret = 0;
2021 break;
2022 case XML_REGEXP_BLOCK_NAME:
2023 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2024 break;
2025 }
2026 if (neg)
2027 return(!ret);
2028 return(ret);
2029}
2030
2031static int
2032xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2033 int i, ret = 0;
2034 xmlRegRangePtr range;
2035
William M. Brack871611b2003-10-18 04:53:14 +00002036 if ((atom == NULL) || (!IS_CHAR(codepoint)))
Daniel Veillard4255d502002-04-16 15:50:10 +00002037 return(-1);
2038
2039 switch (atom->type) {
2040 case XML_REGEXP_SUBREG:
2041 case XML_REGEXP_EPSILON:
2042 return(-1);
2043 case XML_REGEXP_CHARVAL:
2044 return(codepoint == atom->codepoint);
2045 case XML_REGEXP_RANGES: {
2046 int accept = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002047
Daniel Veillard4255d502002-04-16 15:50:10 +00002048 for (i = 0;i < atom->nbRanges;i++) {
2049 range = atom->ranges[i];
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002050 if (range->neg == 2) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002051 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2052 0, range->start, range->end,
2053 range->blockName);
2054 if (ret != 0)
2055 return(0); /* excluded char */
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002056 } else if (range->neg) {
2057 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2058 0, range->start, range->end,
2059 range->blockName);
2060 if (ret == 0)
Daniel Veillardf2a12832003-11-24 13:04:35 +00002061 accept = 1;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00002062 else
2063 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00002064 } else {
2065 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2066 0, range->start, range->end,
2067 range->blockName);
2068 if (ret != 0)
2069 accept = 1; /* might still be excluded */
2070 }
2071 }
2072 return(accept);
2073 }
2074 case XML_REGEXP_STRING:
2075 printf("TODO: XML_REGEXP_STRING\n");
2076 return(-1);
2077 case XML_REGEXP_ANYCHAR:
2078 case XML_REGEXP_ANYSPACE:
2079 case XML_REGEXP_NOTSPACE:
2080 case XML_REGEXP_INITNAME:
2081 case XML_REGEXP_NOTINITNAME:
2082 case XML_REGEXP_NAMECHAR:
2083 case XML_REGEXP_NOTNAMECHAR:
2084 case XML_REGEXP_DECIMAL:
2085 case XML_REGEXP_NOTDECIMAL:
2086 case XML_REGEXP_REALCHAR:
2087 case XML_REGEXP_NOTREALCHAR:
2088 case XML_REGEXP_LETTER:
2089 case XML_REGEXP_LETTER_UPPERCASE:
2090 case XML_REGEXP_LETTER_LOWERCASE:
2091 case XML_REGEXP_LETTER_TITLECASE:
2092 case XML_REGEXP_LETTER_MODIFIER:
2093 case XML_REGEXP_LETTER_OTHERS:
2094 case XML_REGEXP_MARK:
2095 case XML_REGEXP_MARK_NONSPACING:
2096 case XML_REGEXP_MARK_SPACECOMBINING:
2097 case XML_REGEXP_MARK_ENCLOSING:
2098 case XML_REGEXP_NUMBER:
2099 case XML_REGEXP_NUMBER_DECIMAL:
2100 case XML_REGEXP_NUMBER_LETTER:
2101 case XML_REGEXP_NUMBER_OTHERS:
2102 case XML_REGEXP_PUNCT:
2103 case XML_REGEXP_PUNCT_CONNECTOR:
2104 case XML_REGEXP_PUNCT_DASH:
2105 case XML_REGEXP_PUNCT_OPEN:
2106 case XML_REGEXP_PUNCT_CLOSE:
2107 case XML_REGEXP_PUNCT_INITQUOTE:
2108 case XML_REGEXP_PUNCT_FINQUOTE:
2109 case XML_REGEXP_PUNCT_OTHERS:
2110 case XML_REGEXP_SEPAR:
2111 case XML_REGEXP_SEPAR_SPACE:
2112 case XML_REGEXP_SEPAR_LINE:
2113 case XML_REGEXP_SEPAR_PARA:
2114 case XML_REGEXP_SYMBOL:
2115 case XML_REGEXP_SYMBOL_MATH:
2116 case XML_REGEXP_SYMBOL_CURRENCY:
2117 case XML_REGEXP_SYMBOL_MODIFIER:
2118 case XML_REGEXP_SYMBOL_OTHERS:
2119 case XML_REGEXP_OTHER:
2120 case XML_REGEXP_OTHER_CONTROL:
2121 case XML_REGEXP_OTHER_FORMAT:
2122 case XML_REGEXP_OTHER_PRIVATE:
2123 case XML_REGEXP_OTHER_NA:
2124 case XML_REGEXP_BLOCK_NAME:
2125 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
2126 (const xmlChar *)atom->valuep);
2127 if (atom->neg)
2128 ret = !ret;
2129 break;
2130 }
2131 return(ret);
2132}
2133
2134/************************************************************************
2135 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002136 * Saving and restoring state of an execution context *
Daniel Veillard4255d502002-04-16 15:50:10 +00002137 * *
2138 ************************************************************************/
2139
2140#ifdef DEBUG_REGEXP_EXEC
2141static void
2142xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
2143 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
2144 if (exec->inputStack != NULL) {
2145 int i;
2146 printf(": ");
2147 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
2148 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
2149 } else {
2150 printf(": %s", &(exec->inputString[exec->index]));
2151 }
2152 printf("\n");
2153}
2154#endif
2155
2156static void
2157xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
2158#ifdef DEBUG_REGEXP_EXEC
2159 printf("saving ");
2160 exec->transno++;
2161 xmlFARegDebugExec(exec);
2162 exec->transno--;
2163#endif
2164
2165 if (exec->maxRollbacks == 0) {
2166 exec->maxRollbacks = 4;
2167 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2168 sizeof(xmlRegExecRollback));
2169 if (exec->rollbacks == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002170 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002171 exec->maxRollbacks = 0;
2172 return;
2173 }
2174 memset(exec->rollbacks, 0,
2175 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2176 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2177 xmlRegExecRollback *tmp;
2178 int len = exec->maxRollbacks;
2179
2180 exec->maxRollbacks *= 2;
2181 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2182 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2183 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002184 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002185 exec->maxRollbacks /= 2;
2186 return;
2187 }
2188 exec->rollbacks = tmp;
2189 tmp = &exec->rollbacks[len];
2190 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2191 }
2192 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2193 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2194 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2195 if (exec->comp->nbCounters > 0) {
2196 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2197 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2198 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2199 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002200 xmlRegexpErrMemory(NULL, "saving regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002201 exec->status = -5;
2202 return;
2203 }
2204 }
2205 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2206 exec->comp->nbCounters * sizeof(int));
2207 }
2208 exec->nbRollbacks++;
2209}
2210
2211static void
2212xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2213 if (exec->nbRollbacks <= 0) {
2214 exec->status = -1;
2215#ifdef DEBUG_REGEXP_EXEC
2216 printf("rollback failed on empty stack\n");
2217#endif
2218 return;
2219 }
2220 exec->nbRollbacks--;
2221 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2222 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2223 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2224 if (exec->comp->nbCounters > 0) {
2225 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2226 fprintf(stderr, "exec save: allocation failed");
2227 exec->status = -6;
2228 return;
2229 }
2230 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2231 exec->comp->nbCounters * sizeof(int));
2232 }
2233
2234#ifdef DEBUG_REGEXP_EXEC
2235 printf("restored ");
2236 xmlFARegDebugExec(exec);
2237#endif
2238}
2239
2240/************************************************************************
2241 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002242 * Verifier, running an input against a compiled regexp *
Daniel Veillard4255d502002-04-16 15:50:10 +00002243 * *
2244 ************************************************************************/
2245
2246static int
2247xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2248 xmlRegExecCtxt execval;
2249 xmlRegExecCtxtPtr exec = &execval;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002250 int ret, codepoint = 0, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00002251
2252 exec->inputString = content;
2253 exec->index = 0;
2254 exec->determinist = 1;
2255 exec->maxRollbacks = 0;
2256 exec->nbRollbacks = 0;
2257 exec->rollbacks = NULL;
2258 exec->status = 0;
2259 exec->comp = comp;
2260 exec->state = comp->states[0];
2261 exec->transno = 0;
2262 exec->transcount = 0;
Daniel Veillardf2a12832003-11-24 13:04:35 +00002263 exec->inputStack = NULL;
2264 exec->inputStackMax = 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00002265 if (comp->nbCounters > 0) {
2266 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
Daniel Veillardff46a042003-10-08 08:53:17 +00002267 if (exec->counts == NULL) {
2268 xmlRegexpErrMemory(NULL, "running regexp");
Daniel Veillard4255d502002-04-16 15:50:10 +00002269 return(-1);
Daniel Veillardff46a042003-10-08 08:53:17 +00002270 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002271 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2272 } else
2273 exec->counts = NULL;
2274 while ((exec->status == 0) &&
2275 ((exec->inputString[exec->index] != 0) ||
2276 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2277 xmlRegTransPtr trans;
2278 xmlRegAtomPtr atom;
2279
2280 /*
William M. Brack0e00b282004-04-26 15:40:47 +00002281 * If end of input on non-terminal state, rollback, however we may
Daniel Veillard4255d502002-04-16 15:50:10 +00002282 * still have epsilon like transition for counted transitions
William M. Brack0e00b282004-04-26 15:40:47 +00002283 * on counters, in that case don't break too early. Additionally,
2284 * if we are working on a range like "AB{0,2}", where B is not present,
2285 * we don't want to break.
Daniel Veillard4255d502002-04-16 15:50:10 +00002286 */
William M. Brack0e00b282004-04-26 15:40:47 +00002287 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
William M. Brackddf71d62004-05-06 04:17:26 +00002288 /*
2289 * if there is a transition, we must check if
2290 * atom allows minOccurs of 0
2291 */
2292 if (exec->transno < exec->state->nbTrans) {
William M. Brack0e00b282004-04-26 15:40:47 +00002293 trans = &exec->state->trans[exec->transno];
2294 if (trans->to >=0) {
2295 atom = trans->atom;
2296 if (!((atom->min == 0) && (atom->max > 0)))
2297 goto rollback;
2298 }
2299 } else
2300 goto rollback;
2301 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002302
2303 exec->transcount = 0;
2304 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2305 trans = &exec->state->trans[exec->transno];
2306 if (trans->to < 0)
2307 continue;
2308 atom = trans->atom;
2309 ret = 0;
2310 if (trans->count >= 0) {
2311 int count;
2312 xmlRegCounterPtr counter;
2313
2314 /*
2315 * A counted transition.
2316 */
2317
2318 count = exec->counts[trans->count];
2319 counter = &exec->comp->counters[trans->count];
2320#ifdef DEBUG_REGEXP_EXEC
2321 printf("testing count %d: val %d, min %d, max %d\n",
2322 trans->count, count, counter->min, counter->max);
2323#endif
2324 ret = ((count >= counter->min) && (count <= counter->max));
2325 } else if (atom == NULL) {
2326 fprintf(stderr, "epsilon transition left at runtime\n");
2327 exec->status = -2;
2328 break;
2329 } else if (exec->inputString[exec->index] != 0) {
2330 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2331 ret = xmlRegCheckCharacter(atom, codepoint);
William M. Brack0e00b282004-04-26 15:40:47 +00002332 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002333 xmlRegStatePtr to = comp->states[trans->to];
2334
2335 /*
2336 * this is a multiple input sequence
2337 */
2338 if (exec->state->nbTrans > exec->transno + 1) {
2339 xmlFARegExecSave(exec);
2340 }
2341 exec->transcount = 1;
2342 do {
2343 /*
2344 * Try to progress as much as possible on the input
2345 */
2346 if (exec->transcount == atom->max) {
2347 break;
2348 }
2349 exec->index += len;
2350 /*
2351 * End of input: stop here
2352 */
2353 if (exec->inputString[exec->index] == 0) {
2354 exec->index -= len;
2355 break;
2356 }
2357 if (exec->transcount >= atom->min) {
2358 int transno = exec->transno;
2359 xmlRegStatePtr state = exec->state;
2360
2361 /*
2362 * The transition is acceptable save it
2363 */
2364 exec->transno = -1; /* trick */
2365 exec->state = to;
2366 xmlFARegExecSave(exec);
2367 exec->transno = transno;
2368 exec->state = state;
2369 }
2370 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2371 len);
2372 ret = xmlRegCheckCharacter(atom, codepoint);
2373 exec->transcount++;
2374 } while (ret == 1);
2375 if (exec->transcount < atom->min)
2376 ret = 0;
2377
2378 /*
2379 * If the last check failed but one transition was found
2380 * possible, rollback
2381 */
2382 if (ret < 0)
2383 ret = 0;
2384 if (ret == 0) {
2385 goto rollback;
2386 }
William M. Brack0e00b282004-04-26 15:40:47 +00002387 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
2388 /*
2389 * we don't match on the codepoint, but minOccurs of 0
2390 * says that's ok. Setting len to 0 inhibits stepping
2391 * over the codepoint.
2392 */
2393 exec->transcount = 1;
2394 len = 0;
2395 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00002396 }
William M. Brack0e00b282004-04-26 15:40:47 +00002397 } else if ((atom->min == 0) && (atom->max > 0)) {
2398 /* another spot to match when minOccurs is 0 */
2399 exec->transcount = 1;
2400 len = 0;
2401 ret = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00002402 }
2403 if (ret == 1) {
2404 if (exec->state->nbTrans > exec->transno + 1) {
2405 xmlFARegExecSave(exec);
2406 }
2407 if (trans->counter >= 0) {
2408#ifdef DEBUG_REGEXP_EXEC
2409 printf("Increasing count %d\n", trans->counter);
2410#endif
2411 exec->counts[trans->counter]++;
2412 }
2413#ifdef DEBUG_REGEXP_EXEC
2414 printf("entering state %d\n", trans->to);
2415#endif
2416 exec->state = comp->states[trans->to];
2417 exec->transno = 0;
2418 if (trans->atom != NULL) {
2419 exec->index += len;
2420 }
2421 goto progress;
2422 } else if (ret < 0) {
2423 exec->status = -4;
2424 break;
2425 }
2426 }
2427 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2428rollback:
2429 /*
2430 * Failed to find a way out
2431 */
2432 exec->determinist = 0;
2433 xmlFARegExecRollBack(exec);
2434 }
2435progress:
2436 continue;
2437 }
2438 if (exec->rollbacks != NULL) {
2439 if (exec->counts != NULL) {
2440 int i;
2441
2442 for (i = 0;i < exec->maxRollbacks;i++)
2443 if (exec->rollbacks[i].counts != NULL)
2444 xmlFree(exec->rollbacks[i].counts);
2445 }
2446 xmlFree(exec->rollbacks);
2447 }
2448 if (exec->counts != NULL)
2449 xmlFree(exec->counts);
2450 if (exec->status == 0)
2451 return(1);
2452 if (exec->status == -1)
2453 return(0);
2454 return(exec->status);
2455}
2456
2457/************************************************************************
2458 * *
William M. Brackddf71d62004-05-06 04:17:26 +00002459 * Progressive interface to the verifier one atom at a time *
Daniel Veillard4255d502002-04-16 15:50:10 +00002460 * *
2461 ************************************************************************/
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002462#ifdef DEBUG_ERR
2463static void testerr(xmlRegExecCtxtPtr exec);
2464#endif
Daniel Veillard4255d502002-04-16 15:50:10 +00002465
2466/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002467 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00002468 * @comp: a precompiled regular expression
2469 * @callback: a callback function used for handling progresses in the
2470 * automata matching phase
2471 * @data: the context data associated to the callback in this context
2472 *
2473 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00002474 *
2475 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00002476 */
2477xmlRegExecCtxtPtr
2478xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
2479 xmlRegExecCtxtPtr exec;
2480
2481 if (comp == NULL)
2482 return(NULL);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00002483 if ((comp->compact == NULL) && (comp->states == NULL))
2484 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00002485 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
2486 if (exec == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002487 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00002488 return(NULL);
2489 }
2490 memset(exec, 0, sizeof(xmlRegExecCtxt));
2491 exec->inputString = NULL;
2492 exec->index = 0;
2493 exec->determinist = 1;
2494 exec->maxRollbacks = 0;
2495 exec->nbRollbacks = 0;
2496 exec->rollbacks = NULL;
2497 exec->status = 0;
2498 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00002499 if (comp->compact == NULL)
2500 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00002501 exec->transno = 0;
2502 exec->transcount = 0;
2503 exec->callback = callback;
2504 exec->data = data;
2505 if (comp->nbCounters > 0) {
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002506 /*
2507 * For error handling, exec->counts is allocated twice the size
2508 * the second half is used to store the data in case of rollback
2509 */
2510 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
2511 * 2);
Daniel Veillard4255d502002-04-16 15:50:10 +00002512 if (exec->counts == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002513 xmlRegexpErrMemory(NULL, "creating execution context");
Daniel Veillard4255d502002-04-16 15:50:10 +00002514 xmlFree(exec);
2515 return(NULL);
2516 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002517 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
2518 exec->errCounts = &exec->counts[comp->nbCounters];
2519 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00002520 exec->counts = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002521 exec->errCounts = NULL;
2522 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002523 exec->inputStackMax = 0;
2524 exec->inputStackNr = 0;
2525 exec->inputStack = NULL;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002526 exec->errStateNo = -1;
2527 exec->errString = NULL;
Daniel Veillard4255d502002-04-16 15:50:10 +00002528 return(exec);
2529}
2530
2531/**
2532 * xmlRegFreeExecCtxt:
2533 * @exec: a regular expression evaulation context
2534 *
2535 * Free the structures associated to a regular expression evaulation context.
2536 */
2537void
2538xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
2539 if (exec == NULL)
2540 return;
2541
2542 if (exec->rollbacks != NULL) {
2543 if (exec->counts != NULL) {
2544 int i;
2545
2546 for (i = 0;i < exec->maxRollbacks;i++)
2547 if (exec->rollbacks[i].counts != NULL)
2548 xmlFree(exec->rollbacks[i].counts);
2549 }
2550 xmlFree(exec->rollbacks);
2551 }
2552 if (exec->counts != NULL)
2553 xmlFree(exec->counts);
2554 if (exec->inputStack != NULL) {
2555 int i;
2556
Daniel Veillard32370232002-10-16 14:08:14 +00002557 for (i = 0;i < exec->inputStackNr;i++) {
2558 if (exec->inputStack[i].value != NULL)
2559 xmlFree(exec->inputStack[i].value);
2560 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002561 xmlFree(exec->inputStack);
2562 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002563 if (exec->errString != NULL)
2564 xmlFree(exec->errString);
Daniel Veillard4255d502002-04-16 15:50:10 +00002565 xmlFree(exec);
2566}
2567
2568static void
2569xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2570 void *data) {
2571#ifdef DEBUG_PUSH
2572 printf("saving value: %d:%s\n", exec->inputStackNr, value);
2573#endif
2574 if (exec->inputStackMax == 0) {
2575 exec->inputStackMax = 4;
2576 exec->inputStack = (xmlRegInputTokenPtr)
2577 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
2578 if (exec->inputStack == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002579 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00002580 exec->inputStackMax = 0;
2581 return;
2582 }
2583 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
2584 xmlRegInputTokenPtr tmp;
2585
2586 exec->inputStackMax *= 2;
2587 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
2588 exec->inputStackMax * sizeof(xmlRegInputToken));
2589 if (tmp == NULL) {
Daniel Veillardff46a042003-10-08 08:53:17 +00002590 xmlRegexpErrMemory(NULL, "pushing input string");
Daniel Veillard4255d502002-04-16 15:50:10 +00002591 exec->inputStackMax /= 2;
2592 return;
2593 }
2594 exec->inputStack = tmp;
2595 }
2596 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
2597 exec->inputStack[exec->inputStackNr].data = data;
2598 exec->inputStackNr++;
2599 exec->inputStack[exec->inputStackNr].value = NULL;
2600 exec->inputStack[exec->inputStackNr].data = NULL;
2601}
2602
Daniel Veillardc0826a72004-08-10 14:17:33 +00002603/**
2604 * xmlRegStrEqualWildcard:
2605 * @expStr: the string to be evaluated
2606 * @valStr: the validation string
2607 *
2608 * Checks if both strings are equal or have the same content. "*"
2609 * can be used as a wildcard in @valStr; "|" is used as a seperator of
2610 * substrings in both @expStr and @valStr.
2611 *
2612 * Returns 1 if the comparison is satisfied and the number of substrings
2613 * is equal, 0 otherwise.
2614 */
2615
2616static int
2617xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
2618 if (expStr == valStr) return(1);
2619 if (expStr == NULL) return(0);
2620 if (valStr == NULL) return(0);
2621 do {
2622 /*
2623 * Eval if we have a wildcard for the current item.
2624 */
2625 if (*expStr != *valStr) {
2626 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
2627 do {
2628 if (*valStr == XML_REG_STRING_SEPARATOR)
2629 break;
2630 *valStr++;
2631 } while (*valStr != 0);
2632 continue;
2633 } else
2634 return(0);
2635 }
2636 *expStr++;
2637 *valStr++;
2638 } while (*valStr != 0);
2639 if (*expStr != 0)
2640 return (0);
2641 else
2642 return (1);
2643}
Daniel Veillard4255d502002-04-16 15:50:10 +00002644
2645/**
Daniel Veillard23e73572002-09-19 19:56:43 +00002646 * xmlRegCompactPushString:
2647 * @exec: a regexp execution context
2648 * @comp: the precompiled exec with a compact table
2649 * @value: a string token input
2650 * @data: data associated to the token to reuse in callbacks
2651 *
2652 * Push one input token in the execution context
2653 *
2654 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2655 * a negative value in case of error.
2656 */
2657static int
2658xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
2659 xmlRegexpPtr comp,
2660 const xmlChar *value,
2661 void *data) {
2662 int state = exec->index;
2663 int i, target;
2664
2665 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
2666 return(-1);
2667
2668 if (value == NULL) {
2669 /*
2670 * are we at a final state ?
2671 */
2672 if (comp->compact[state * (comp->nbstrings + 1)] ==
2673 XML_REGEXP_FINAL_STATE)
2674 return(1);
2675 return(0);
2676 }
2677
2678#ifdef DEBUG_PUSH
2679 printf("value pushed: %s\n", value);
2680#endif
2681
2682 /*
William M. Brackddf71d62004-05-06 04:17:26 +00002683 * Examine all outside transitions from current state
Daniel Veillard23e73572002-09-19 19:56:43 +00002684 */
2685 for (i = 0;i < comp->nbstrings;i++) {
2686 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
2687 if ((target > 0) && (target <= comp->nbstates)) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00002688 target--; /* to avoid 0 */
2689 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
2690 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00002691 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
2692 exec->callback(exec->data, value,
2693 comp->transdata[state * comp->nbstrings + i], data);
2694 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002695#ifdef DEBUG_PUSH
2696 printf("entering state %d\n", target);
2697#endif
2698 if (comp->compact[target * (comp->nbstrings + 1)] ==
2699 XML_REGEXP_FINAL_STATE)
2700 return(1);
2701 return(0);
2702 }
2703 }
2704 }
2705 /*
2706 * Failed to find an exit transition out from current state for the
2707 * current token
2708 */
2709#ifdef DEBUG_PUSH
2710 printf("failed to find a transition for %s on state %d\n", value, state);
2711#endif
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002712 if (exec->errString != NULL)
2713 xmlFree(exec->errString);
2714 exec->errString = xmlStrdup(value);
2715 exec->errStateNo = state;
Daniel Veillard23e73572002-09-19 19:56:43 +00002716 exec->status = -1;
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00002717#ifdef DEBUG_ERR
2718 testerr(exec);
2719#endif
Daniel Veillard23e73572002-09-19 19:56:43 +00002720 return(-1);
2721}
2722
2723/**
Daniel Veillard4255d502002-04-16 15:50:10 +00002724 * xmlRegExecPushString:
Daniel Veillardea7751d2002-12-20 00:16:24 +00002725 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00002726 * @value: a string token input
2727 * @data: data associated to the token to reuse in callbacks
2728 *
2729 * Push one input token in the execution context
2730 *
2731 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2732 * a negative value in case of error.
2733 */
2734int
2735xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2736 void *data) {
2737 xmlRegTransPtr trans;
2738 xmlRegAtomPtr atom;
2739 int ret;
2740 int final = 0;
2741
2742 if (exec == NULL)
2743 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00002744 if (exec->comp == NULL)
2745 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00002746 if (exec->status != 0)
2747 return(exec->status);
2748
Daniel Veillard23e73572002-09-19 19:56:43 +00002749 if (exec->comp->compact != NULL)
2750 return(xmlRegCompactPushString(exec, exec->comp, value, data));
2751
Daniel Veillard4255d502002-04-16 15:50:10 +00002752 if (value == NULL) {
2753 if (exec->state->type == XML_REGEXP_FINAL_STATE)
2754 return(1);
2755 final = 1;
2756 }
2757
2758#ifdef DEBUG_PUSH
2759 printf("value pushed: %s\n", value);
2760#endif
2761 /*
2762 * If we have an active rollback stack push the new value there
2763 * and get back to where we were left
2764 */
2765 if ((value != NULL) && (exec->inputStackNr > 0)) {
2766 xmlFARegExecSaveInputString(exec, value, data);
2767 value = exec->inputStack[exec->index].value;
2768 data = exec->inputStack[exec->index].data;
2769#ifdef DEBUG_PUSH
2770 printf("value loaded: %s\n", value);
2771#endif
2772 }
2773
2774 while ((exec->status == 0) &&
2775 ((value != NULL) ||
2776 ((final == 1) &&
2777 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
2778
2779 /*
2780 * End of input on non-terminal state, rollback, however we may
2781 * still have epsilon like transition for counted transitions
2782 * on counters, in that case don't break too early.
2783 */
Daniel Veillardb509f152002-04-17 16:28:10 +00002784 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00002785 goto rollback;
2786
2787 exec->transcount = 0;
2788 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2789 trans = &exec->state->trans[exec->transno];
2790 if (trans->to < 0)
2791 continue;
2792 atom = trans->atom;
2793 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00002794 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
2795 int i;
2796 int count;
2797 xmlRegTransPtr t;
2798 xmlRegCounterPtr counter;
2799
2800 ret = 0;
2801
2802#ifdef DEBUG_PUSH
2803 printf("testing all lax %d\n", trans->count);
2804#endif
2805 /*
2806 * Check all counted transitions from the current state
2807 */
2808 if ((value == NULL) && (final)) {
2809 ret = 1;
2810 } else if (value != NULL) {
2811 for (i = 0;i < exec->state->nbTrans;i++) {
2812 t = &exec->state->trans[i];
2813 if ((t->counter < 0) || (t == trans))
2814 continue;
2815 counter = &exec->comp->counters[t->counter];
2816 count = exec->counts[t->counter];
2817 if ((count < counter->max) &&
2818 (t->atom != NULL) &&
2819 (xmlStrEqual(value, t->atom->valuep))) {
2820 ret = 0;
2821 break;
2822 }
2823 if ((count >= counter->min) &&
2824 (count < counter->max) &&
2825 (xmlStrEqual(value, t->atom->valuep))) {
2826 ret = 1;
2827 break;
2828 }
2829 }
2830 }
2831 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00002832 int i;
2833 int count;
2834 xmlRegTransPtr t;
2835 xmlRegCounterPtr counter;
2836
2837 ret = 1;
2838
2839#ifdef DEBUG_PUSH
2840 printf("testing all %d\n", trans->count);
2841#endif
2842 /*
2843 * Check all counted transitions from the current state
2844 */
2845 for (i = 0;i < exec->state->nbTrans;i++) {
2846 t = &exec->state->trans[i];
2847 if ((t->counter < 0) || (t == trans))
2848 continue;
2849 counter = &exec->comp->counters[t->counter];
2850 count = exec->counts[t->counter];
2851 if ((count < counter->min) || (count > counter->max)) {
2852 ret = 0;
2853 break;
2854 }
2855 }
2856 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002857 int count;
2858 xmlRegCounterPtr counter;
2859
2860 /*
2861 * A counted transition.
2862 */
2863
2864 count = exec->counts[trans->count];
2865 counter = &exec->comp->counters[trans->count];
2866#ifdef DEBUG_PUSH
2867 printf("testing count %d: val %d, min %d, max %d\n",
2868 trans->count, count, counter->min, counter->max);
2869#endif
2870 ret = ((count >= counter->min) && (count <= counter->max));
2871 } else if (atom == NULL) {
2872 fprintf(stderr, "epsilon transition left at runtime\n");
2873 exec->status = -2;
2874 break;
2875 } else if (value != NULL) {
Daniel Veillardc0826a72004-08-10 14:17:33 +00002876 ret = xmlRegStrEqualWildcard(atom->valuep, value);
Daniel Veillard441bc322002-04-20 17:38:48 +00002877 if ((ret == 1) && (trans->counter >= 0)) {
2878 xmlRegCounterPtr counter;
2879 int count;
2880
2881 count = exec->counts[trans->counter];
2882 counter = &exec->comp->counters[trans->counter];
2883 if (count >= counter->max)
2884 ret = 0;
2885 }
2886
Daniel Veillard4255d502002-04-16 15:50:10 +00002887 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2888 xmlRegStatePtr to = exec->comp->states[trans->to];
2889
2890 /*
2891 * this is a multiple input sequence
2892 */
2893 if (exec->state->nbTrans > exec->transno + 1) {
2894 if (exec->inputStackNr <= 0) {
2895 xmlFARegExecSaveInputString(exec, value, data);
2896 }
2897 xmlFARegExecSave(exec);
2898 }
2899 exec->transcount = 1;
2900 do {
2901 /*
2902 * Try to progress as much as possible on the input
2903 */
2904 if (exec->transcount == atom->max) {
2905 break;
2906 }
2907 exec->index++;
2908 value = exec->inputStack[exec->index].value;
2909 data = exec->inputStack[exec->index].data;
2910#ifdef DEBUG_PUSH
2911 printf("value loaded: %s\n", value);
2912#endif
2913
2914 /*
2915 * End of input: stop here
2916 */
2917 if (value == NULL) {
2918 exec->index --;
2919 break;
2920 }
2921 if (exec->transcount >= atom->min) {
2922 int transno = exec->transno;
2923 xmlRegStatePtr state = exec->state;
2924
2925 /*
2926 * The transition is acceptable save it
2927 */
2928 exec->transno = -1; /* trick */
2929 exec->state = to;
2930 if (exec->inputStackNr <= 0) {
2931 xmlFARegExecSaveInputString(exec, value, data);
2932 }
2933 xmlFARegExecSave(exec);
2934 exec->transno = transno;
2935 exec->state = state;
2936 }
2937 ret = xmlStrEqual(value, atom->valuep);
2938 exec->transcount++;
2939 } while (ret == 1);
2940 if (exec->transcount < atom->min)
2941 ret = 0;
2942
2943 /*
2944 * If the last check failed but one transition was found
2945 * possible, rollback
2946 */
2947 if (ret < 0)
2948 ret = 0;
2949 if (ret == 0) {
2950 goto rollback;
2951 }
2952 }
2953 }
2954 if (ret == 1) {
William M. Brack98873952003-12-26 06:03:14 +00002955 if ((exec->callback != NULL) && (atom != NULL) &&
2956 (data != NULL)) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002957 exec->callback(exec->data, atom->valuep,
2958 atom->data, data);
2959 }
2960 if (exec->state->nbTrans > exec->transno + 1) {
2961 if (exec->inputStackNr <= 0) {
2962 xmlFARegExecSaveInputString(exec, value, data);
2963 }
2964 xmlFARegExecSave(exec);
2965 }
2966 if (trans->counter >= 0) {
2967#ifdef DEBUG_PUSH
2968 printf("Increasing count %d\n", trans->counter);
2969#endif
2970 exec->counts[trans->counter]++;
2971 }
2972#ifdef DEBUG_PUSH
2973 printf("entering state %d\n", trans->to);
2974#endif
2975 exec->state = exec->comp->states[trans->to];
2976 exec->transno = 0;
2977 if (trans->atom != NULL) {
2978 if (exec->inputStack != NULL) {
2979 exec->index++;
2980 if (exec->index < exec->inputStackNr) {
2981 value = exec->inputStack[exec->index].value;
2982 data = exec->inputStack[exec->index].data;
2983#ifdef DEBUG_PUSH
2984 printf("value loaded: %s\n", value);
2985#endif
2986 } else {
2987 value = NULL;
2988 data = NULL;
2989#ifdef DEBUG_PUSH
2990 printf("end of input\n");
2991#endif
2992 }
2993 } else {
2994 value = NULL;
2995 data = NULL;
2996#ifdef DEBUG_PUSH
2997 printf("end of input\n");
2998#endif
2999 }
3000 }
3001 goto progress;
3002 } else if (ret < 0) {
3003 exec->status = -4;
3004 break;
3005 }
3006 }
3007 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3008rollback:
3009 /*
3010 * Failed to find a way out
3011 */
3012 exec->determinist = 0;
3013 xmlFARegExecRollBack(exec);
3014 if (exec->status == 0) {
3015 value = exec->inputStack[exec->index].value;
3016 data = exec->inputStack[exec->index].data;
3017#ifdef DEBUG_PUSH
3018 printf("value loaded: %s\n", value);
3019#endif
3020 }
3021 }
3022progress:
3023 continue;
3024 }
3025 if (exec->status == 0) {
3026 return(exec->state->type == XML_REGEXP_FINAL_STATE);
3027 }
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003028 if (exec->status < 0) {
3029 if (exec->errString != NULL)
3030 xmlFree(exec->errString);
3031 exec->errString = xmlStrdup(value);
3032 exec->errState = exec->state;
3033#ifdef DEBUG_ERR
3034 testerr(exec);
3035#endif
3036 }
Daniel Veillard4255d502002-04-16 15:50:10 +00003037 return(exec->status);
3038}
3039
Daniel Veillard52b48c72003-04-13 19:53:42 +00003040/**
3041 * xmlRegExecPushString2:
3042 * @exec: a regexp execution context or NULL to indicate the end
3043 * @value: the first string token input
3044 * @value2: the second string token input
3045 * @data: data associated to the token to reuse in callbacks
3046 *
3047 * Push one input token in the execution context
3048 *
3049 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3050 * a negative value in case of error.
3051 */
3052int
3053xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
3054 const xmlChar *value2, void *data) {
3055 xmlChar buf[150];
3056 int lenn, lenp, ret;
3057 xmlChar *str;
3058
3059 if (exec == NULL)
3060 return(-1);
3061 if (exec->comp == NULL)
3062 return(-1);
3063 if (exec->status != 0)
3064 return(exec->status);
3065
3066 if (value2 == NULL)
3067 return(xmlRegExecPushString(exec, value, data));
3068
3069 lenn = strlen((char *) value2);
3070 lenp = strlen((char *) value);
3071
3072 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00003073 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00003074 if (str == NULL) {
3075 exec->status = -1;
3076 return(-1);
3077 }
3078 } else {
3079 str = buf;
3080 }
3081 memcpy(&str[0], value, lenp);
Daniel Veillardc0826a72004-08-10 14:17:33 +00003082 str[lenp] = XML_REG_STRING_SEPARATOR;
Daniel Veillard52b48c72003-04-13 19:53:42 +00003083 memcpy(&str[lenp + 1], value2, lenn);
3084 str[lenn + lenp + 1] = 0;
3085
3086 if (exec->comp->compact != NULL)
3087 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
3088 else
3089 ret = xmlRegExecPushString(exec, str, data);
3090
3091 if (str != buf)
3092 xmlFree(buf);
3093 return(ret);
3094}
3095
Daniel Veillard7bd8b4b2005-01-07 13:56:19 +00003096/**
3097 * xmlRegExecErrInfo:
3098 * @exec: a regexp execution context generating an error
3099 * @string: return value for the error string
3100 * @nbval: pointer to the number of accepted values IN/OUT
3101 * @values: pointer to the array of acceptable values
3102 *
3103 * Extract error informations from the regexp execution, the parameter
3104 * @string will be updated with the value pushed and not accepted,
3105 * the parameter @values must point to an array of @nbval string pointers
3106 * on return nbval will contain the number of possible strings in that
3107 * state and the @values array will be updated with them. The string values
3108 * returned will be freed with the @exec context.
3109 *
3110 * Returns: 0 in case of success or -1 in case of error.
3111 */
3112int
3113xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
3114 int *nbval, xmlChar **values) {
3115 int maxval;
3116
3117 if (exec == NULL)
3118 return(-1);
3119 if (string != NULL) {
3120 if (exec->status != 0)
3121 *string = exec->errString;
3122 else
3123 *string = NULL;
3124 }
3125 if ((nbval == NULL) || (values == NULL) || (*nbval <= 0))
3126 return(-1);
3127 maxval = *nbval;
3128 *nbval = 0;
3129 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
3130 xmlRegexpPtr comp;
3131 int target, i, state;
3132
3133 comp = exec->comp;
3134 if (exec->errStateNo == -1) return(-1);
3135 state = exec->errStateNo;
3136 for (i = 0;(i < comp->nbstrings) && (*nbval < maxval);i++) {
3137 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3138 if ((target > 0) && (target <= comp->nbstates)) {
3139 values[*nbval] = comp->stringMap[i];
3140 (*nbval)++;
3141 }
3142 }
3143 } else {
3144 int transno;
3145 xmlRegTransPtr trans;
3146 xmlRegAtomPtr atom;
3147
3148 if (exec->errState == NULL) return(-1);
3149 for (transno = 0;
3150 (transno < exec->errState->nbTrans) && (*nbval < maxval);
3151 transno++) {
3152 trans = &exec->errState->trans[transno];
3153 if (trans->to < 0)
3154 continue;
3155 atom = trans->atom;
3156 if ((atom == NULL) || (atom->valuep == NULL))
3157 continue;
3158 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3159 TODO;
3160 } else if (trans->count == REGEXP_ALL_COUNTER) {
3161 TODO;
3162 } else if (trans->counter >= 0) {
3163 xmlRegCounterPtr counter;
3164 int count;
3165
3166 count = exec->counts[trans->counter];
3167 counter = &exec->comp->counters[trans->counter];
3168 if (count < counter->max) {
3169 values[*nbval] = (const xmlChar *) atom->valuep;
3170 (*nbval)++;
3171 }
3172 } else {
3173 values[*nbval] = (const xmlChar *) atom->valuep;
3174 (*nbval)++;
3175 }
3176 }
3177 }
3178 return(0);
3179}
3180
3181#ifdef DEBUG_ERR
3182static void testerr(xmlRegExecCtxtPtr exec) {
3183 const xmlChar *string;
3184 const xmlChar *values[5];
3185 int nb = 5;
3186 xmlRegExecErrInfo(exec, &string, &nb, &values[0]);
3187}
3188#endif
3189
Daniel Veillard4255d502002-04-16 15:50:10 +00003190#if 0
3191static int
3192xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
3193 xmlRegTransPtr trans;
3194 xmlRegAtomPtr atom;
3195 int ret;
3196 int codepoint, len;
3197
3198 if (exec == NULL)
3199 return(-1);
3200 if (exec->status != 0)
3201 return(exec->status);
3202
3203 while ((exec->status == 0) &&
3204 ((exec->inputString[exec->index] != 0) ||
3205 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
3206
3207 /*
3208 * End of input on non-terminal state, rollback, however we may
3209 * still have epsilon like transition for counted transitions
3210 * on counters, in that case don't break too early.
3211 */
3212 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
3213 goto rollback;
3214
3215 exec->transcount = 0;
3216 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3217 trans = &exec->state->trans[exec->transno];
3218 if (trans->to < 0)
3219 continue;
3220 atom = trans->atom;
3221 ret = 0;
3222 if (trans->count >= 0) {
3223 int count;
3224 xmlRegCounterPtr counter;
3225
3226 /*
3227 * A counted transition.
3228 */
3229
3230 count = exec->counts[trans->count];
3231 counter = &exec->comp->counters[trans->count];
3232#ifdef DEBUG_REGEXP_EXEC
3233 printf("testing count %d: val %d, min %d, max %d\n",
3234 trans->count, count, counter->min, counter->max);
3235#endif
3236 ret = ((count >= counter->min) && (count <= counter->max));
3237 } else if (atom == NULL) {
3238 fprintf(stderr, "epsilon transition left at runtime\n");
3239 exec->status = -2;
3240 break;
3241 } else if (exec->inputString[exec->index] != 0) {
3242 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3243 ret = xmlRegCheckCharacter(atom, codepoint);
3244 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3245 xmlRegStatePtr to = exec->comp->states[trans->to];
3246
3247 /*
3248 * this is a multiple input sequence
3249 */
3250 if (exec->state->nbTrans > exec->transno + 1) {
3251 xmlFARegExecSave(exec);
3252 }
3253 exec->transcount = 1;
3254 do {
3255 /*
3256 * Try to progress as much as possible on the input
3257 */
3258 if (exec->transcount == atom->max) {
3259 break;
3260 }
3261 exec->index += len;
3262 /*
3263 * End of input: stop here
3264 */
3265 if (exec->inputString[exec->index] == 0) {
3266 exec->index -= len;
3267 break;
3268 }
3269 if (exec->transcount >= atom->min) {
3270 int transno = exec->transno;
3271 xmlRegStatePtr state = exec->state;
3272
3273 /*
3274 * The transition is acceptable save it
3275 */
3276 exec->transno = -1; /* trick */
3277 exec->state = to;
3278 xmlFARegExecSave(exec);
3279 exec->transno = transno;
3280 exec->state = state;
3281 }
3282 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3283 len);
3284 ret = xmlRegCheckCharacter(atom, codepoint);
3285 exec->transcount++;
3286 } while (ret == 1);
3287 if (exec->transcount < atom->min)
3288 ret = 0;
3289
3290 /*
3291 * If the last check failed but one transition was found
3292 * possible, rollback
3293 */
3294 if (ret < 0)
3295 ret = 0;
3296 if (ret == 0) {
3297 goto rollback;
3298 }
3299 }
3300 }
3301 if (ret == 1) {
3302 if (exec->state->nbTrans > exec->transno + 1) {
3303 xmlFARegExecSave(exec);
3304 }
3305 if (trans->counter >= 0) {
3306#ifdef DEBUG_REGEXP_EXEC
3307 printf("Increasing count %d\n", trans->counter);
3308#endif
3309 exec->counts[trans->counter]++;
3310 }
3311#ifdef DEBUG_REGEXP_EXEC
3312 printf("entering state %d\n", trans->to);
3313#endif
3314 exec->state = exec->comp->states[trans->to];
3315 exec->transno = 0;
3316 if (trans->atom != NULL) {
3317 exec->index += len;
3318 }
3319 goto progress;
3320 } else if (ret < 0) {
3321 exec->status = -4;
3322 break;
3323 }
3324 }
3325 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3326rollback:
3327 /*
3328 * Failed to find a way out
3329 */
3330 exec->determinist = 0;
3331 xmlFARegExecRollBack(exec);
3332 }
3333progress:
3334 continue;
3335 }
3336}
3337#endif
3338/************************************************************************
3339 * *
William M. Brackddf71d62004-05-06 04:17:26 +00003340 * Parser for the Schemas Datatype Regular Expressions *
Daniel Veillard4255d502002-04-16 15:50:10 +00003341 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
3342 * *
3343 ************************************************************************/
3344
3345/**
3346 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00003347 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003348 *
3349 * [10] Char ::= [^.\?*+()|#x5B#x5D]
3350 */
3351static int
3352xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
3353 int cur;
3354 int len;
3355
3356 cur = CUR_SCHAR(ctxt->cur, len);
3357 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
3358 (cur == '*') || (cur == '+') || (cur == '(') ||
3359 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
3360 (cur == 0x5D) || (cur == 0))
3361 return(-1);
3362 return(cur);
3363}
3364
3365/**
3366 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00003367 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003368 *
3369 * [27] charProp ::= IsCategory | IsBlock
3370 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
3371 * Separators | Symbols | Others
3372 * [29] Letters ::= 'L' [ultmo]?
3373 * [30] Marks ::= 'M' [nce]?
3374 * [31] Numbers ::= 'N' [dlo]?
3375 * [32] Punctuation ::= 'P' [cdseifo]?
3376 * [33] Separators ::= 'Z' [slp]?
3377 * [34] Symbols ::= 'S' [mcko]?
3378 * [35] Others ::= 'C' [cfon]?
3379 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
3380 */
3381static void
3382xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
3383 int cur;
William M. Brack779af002003-08-01 15:55:39 +00003384 xmlRegAtomType type = (xmlRegAtomType) 0;
Daniel Veillard4255d502002-04-16 15:50:10 +00003385 xmlChar *blockName = NULL;
3386
3387 cur = CUR;
3388 if (cur == 'L') {
3389 NEXT;
3390 cur = CUR;
3391 if (cur == 'u') {
3392 NEXT;
3393 type = XML_REGEXP_LETTER_UPPERCASE;
3394 } else if (cur == 'l') {
3395 NEXT;
3396 type = XML_REGEXP_LETTER_LOWERCASE;
3397 } else if (cur == 't') {
3398 NEXT;
3399 type = XML_REGEXP_LETTER_TITLECASE;
3400 } else if (cur == 'm') {
3401 NEXT;
3402 type = XML_REGEXP_LETTER_MODIFIER;
3403 } else if (cur == 'o') {
3404 NEXT;
3405 type = XML_REGEXP_LETTER_OTHERS;
3406 } else {
3407 type = XML_REGEXP_LETTER;
3408 }
3409 } else if (cur == 'M') {
3410 NEXT;
3411 cur = CUR;
3412 if (cur == 'n') {
3413 NEXT;
3414 /* nonspacing */
3415 type = XML_REGEXP_MARK_NONSPACING;
3416 } else if (cur == 'c') {
3417 NEXT;
3418 /* spacing combining */
3419 type = XML_REGEXP_MARK_SPACECOMBINING;
3420 } else if (cur == 'e') {
3421 NEXT;
3422 /* enclosing */
3423 type = XML_REGEXP_MARK_ENCLOSING;
3424 } else {
3425 /* all marks */
3426 type = XML_REGEXP_MARK;
3427 }
3428 } else if (cur == 'N') {
3429 NEXT;
3430 cur = CUR;
3431 if (cur == 'd') {
3432 NEXT;
3433 /* digital */
3434 type = XML_REGEXP_NUMBER_DECIMAL;
3435 } else if (cur == 'l') {
3436 NEXT;
3437 /* letter */
3438 type = XML_REGEXP_NUMBER_LETTER;
3439 } else if (cur == 'o') {
3440 NEXT;
3441 /* other */
3442 type = XML_REGEXP_NUMBER_OTHERS;
3443 } else {
3444 /* all numbers */
3445 type = XML_REGEXP_NUMBER;
3446 }
3447 } else if (cur == 'P') {
3448 NEXT;
3449 cur = CUR;
3450 if (cur == 'c') {
3451 NEXT;
3452 /* connector */
3453 type = XML_REGEXP_PUNCT_CONNECTOR;
3454 } else if (cur == 'd') {
3455 NEXT;
3456 /* dash */
3457 type = XML_REGEXP_PUNCT_DASH;
3458 } else if (cur == 's') {
3459 NEXT;
3460 /* open */
3461 type = XML_REGEXP_PUNCT_OPEN;
3462 } else if (cur == 'e') {
3463 NEXT;
3464 /* close */
3465 type = XML_REGEXP_PUNCT_CLOSE;
3466 } else if (cur == 'i') {
3467 NEXT;
3468 /* initial quote */
3469 type = XML_REGEXP_PUNCT_INITQUOTE;
3470 } else if (cur == 'f') {
3471 NEXT;
3472 /* final quote */
3473 type = XML_REGEXP_PUNCT_FINQUOTE;
3474 } else if (cur == 'o') {
3475 NEXT;
3476 /* other */
3477 type = XML_REGEXP_PUNCT_OTHERS;
3478 } else {
3479 /* all punctuation */
3480 type = XML_REGEXP_PUNCT;
3481 }
3482 } else if (cur == 'Z') {
3483 NEXT;
3484 cur = CUR;
3485 if (cur == 's') {
3486 NEXT;
3487 /* space */
3488 type = XML_REGEXP_SEPAR_SPACE;
3489 } else if (cur == 'l') {
3490 NEXT;
3491 /* line */
3492 type = XML_REGEXP_SEPAR_LINE;
3493 } else if (cur == 'p') {
3494 NEXT;
3495 /* paragraph */
3496 type = XML_REGEXP_SEPAR_PARA;
3497 } else {
3498 /* all separators */
3499 type = XML_REGEXP_SEPAR;
3500 }
3501 } else if (cur == 'S') {
3502 NEXT;
3503 cur = CUR;
3504 if (cur == 'm') {
3505 NEXT;
3506 type = XML_REGEXP_SYMBOL_MATH;
3507 /* math */
3508 } else if (cur == 'c') {
3509 NEXT;
3510 type = XML_REGEXP_SYMBOL_CURRENCY;
3511 /* currency */
3512 } else if (cur == 'k') {
3513 NEXT;
3514 type = XML_REGEXP_SYMBOL_MODIFIER;
3515 /* modifiers */
3516 } else if (cur == 'o') {
3517 NEXT;
3518 type = XML_REGEXP_SYMBOL_OTHERS;
3519 /* other */
3520 } else {
3521 /* all symbols */
3522 type = XML_REGEXP_SYMBOL;
3523 }
3524 } else if (cur == 'C') {
3525 NEXT;
3526 cur = CUR;
3527 if (cur == 'c') {
3528 NEXT;
3529 /* control */
3530 type = XML_REGEXP_OTHER_CONTROL;
3531 } else if (cur == 'f') {
3532 NEXT;
3533 /* format */
3534 type = XML_REGEXP_OTHER_FORMAT;
3535 } else if (cur == 'o') {
3536 NEXT;
3537 /* private use */
3538 type = XML_REGEXP_OTHER_PRIVATE;
3539 } else if (cur == 'n') {
3540 NEXT;
3541 /* not assigned */
3542 type = XML_REGEXP_OTHER_NA;
3543 } else {
3544 /* all others */
3545 type = XML_REGEXP_OTHER;
3546 }
3547 } else if (cur == 'I') {
3548 const xmlChar *start;
3549 NEXT;
3550 cur = CUR;
3551 if (cur != 's') {
3552 ERROR("IsXXXX expected");
3553 return;
3554 }
3555 NEXT;
3556 start = ctxt->cur;
3557 cur = CUR;
3558 if (((cur >= 'a') && (cur <= 'z')) ||
3559 ((cur >= 'A') && (cur <= 'Z')) ||
3560 ((cur >= '0') && (cur <= '9')) ||
3561 (cur == 0x2D)) {
3562 NEXT;
3563 cur = CUR;
3564 while (((cur >= 'a') && (cur <= 'z')) ||
3565 ((cur >= 'A') && (cur <= 'Z')) ||
3566 ((cur >= '0') && (cur <= '9')) ||
3567 (cur == 0x2D)) {
3568 NEXT;
3569 cur = CUR;
3570 }
3571 }
3572 type = XML_REGEXP_BLOCK_NAME;
3573 blockName = xmlStrndup(start, ctxt->cur - start);
3574 } else {
3575 ERROR("Unknown char property");
3576 return;
3577 }
3578 if (ctxt->atom == NULL) {
3579 ctxt->atom = xmlRegNewAtom(ctxt, type);
3580 if (ctxt->atom != NULL)
3581 ctxt->atom->valuep = blockName;
3582 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3583 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3584 type, 0, 0, blockName);
3585 }
3586}
3587
3588/**
3589 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00003590 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003591 *
3592 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3593 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3594 * [25] catEsc ::= '\p{' charProp '}'
3595 * [26] complEsc ::= '\P{' charProp '}'
3596 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3597 */
3598static void
3599xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
3600 int cur;
3601
3602 if (CUR == '.') {
3603 if (ctxt->atom == NULL) {
3604 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
3605 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3606 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3607 XML_REGEXP_ANYCHAR, 0, 0, NULL);
3608 }
3609 NEXT;
3610 return;
3611 }
3612 if (CUR != '\\') {
3613 ERROR("Escaped sequence: expecting \\");
3614 return;
3615 }
3616 NEXT;
3617 cur = CUR;
3618 if (cur == 'p') {
3619 NEXT;
3620 if (CUR != '{') {
3621 ERROR("Expecting '{'");
3622 return;
3623 }
3624 NEXT;
3625 xmlFAParseCharProp(ctxt);
3626 if (CUR != '}') {
3627 ERROR("Expecting '}'");
3628 return;
3629 }
3630 NEXT;
3631 } else if (cur == 'P') {
3632 NEXT;
3633 if (CUR != '{') {
3634 ERROR("Expecting '{'");
3635 return;
3636 }
3637 NEXT;
3638 xmlFAParseCharProp(ctxt);
3639 ctxt->atom->neg = 1;
3640 if (CUR != '}') {
3641 ERROR("Expecting '}'");
3642 return;
3643 }
3644 NEXT;
3645 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
3646 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
3647 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
3648 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
3649 (cur == 0x5E)) {
3650 if (ctxt->atom == NULL) {
3651 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3652 if (ctxt->atom != NULL)
3653 ctxt->atom->codepoint = cur;
3654 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3655 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3656 XML_REGEXP_CHARVAL, cur, cur, NULL);
3657 }
3658 NEXT;
3659 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
3660 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
3661 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00003662 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00003663
3664 switch (cur) {
3665 case 's':
3666 type = XML_REGEXP_ANYSPACE;
3667 break;
3668 case 'S':
3669 type = XML_REGEXP_NOTSPACE;
3670 break;
3671 case 'i':
3672 type = XML_REGEXP_INITNAME;
3673 break;
3674 case 'I':
3675 type = XML_REGEXP_NOTINITNAME;
3676 break;
3677 case 'c':
3678 type = XML_REGEXP_NAMECHAR;
3679 break;
3680 case 'C':
3681 type = XML_REGEXP_NOTNAMECHAR;
3682 break;
3683 case 'd':
3684 type = XML_REGEXP_DECIMAL;
3685 break;
3686 case 'D':
3687 type = XML_REGEXP_NOTDECIMAL;
3688 break;
3689 case 'w':
3690 type = XML_REGEXP_REALCHAR;
3691 break;
3692 case 'W':
3693 type = XML_REGEXP_NOTREALCHAR;
3694 break;
3695 }
3696 NEXT;
3697 if (ctxt->atom == NULL) {
3698 ctxt->atom = xmlRegNewAtom(ctxt, type);
3699 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3700 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3701 type, 0, 0, NULL);
3702 }
3703 }
3704}
3705
3706/**
3707 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00003708 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003709 *
3710 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3711 */
3712static int
3713xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
3714 int ret = 0, cur;
3715
3716 if ((CUR != '&') || (NXT(1) != '#'))
3717 return(-1);
3718 NEXT;
3719 NEXT;
3720 cur = CUR;
3721 if (cur == 'x') {
3722 NEXT;
3723 cur = CUR;
3724 if (((cur >= '0') && (cur <= '9')) ||
3725 ((cur >= 'a') && (cur <= 'f')) ||
3726 ((cur >= 'A') && (cur <= 'F'))) {
3727 while (((cur >= '0') && (cur <= '9')) ||
3728 ((cur >= 'A') && (cur <= 'F'))) {
3729 if ((cur >= '0') && (cur <= '9'))
3730 ret = ret * 16 + cur - '0';
3731 else if ((cur >= 'a') && (cur <= 'f'))
3732 ret = ret * 16 + 10 + (cur - 'a');
3733 else
3734 ret = ret * 16 + 10 + (cur - 'A');
3735 NEXT;
3736 cur = CUR;
3737 }
3738 } else {
3739 ERROR("Char ref: expecting [0-9A-F]");
3740 return(-1);
3741 }
3742 } else {
3743 if ((cur >= '0') && (cur <= '9')) {
3744 while ((cur >= '0') && (cur <= '9')) {
3745 ret = ret * 10 + cur - '0';
3746 NEXT;
3747 cur = CUR;
3748 }
3749 } else {
3750 ERROR("Char ref: expecting [0-9]");
3751 return(-1);
3752 }
3753 }
3754 if (cur != ';') {
3755 ERROR("Char ref: expecting ';'");
3756 return(-1);
3757 } else {
3758 NEXT;
3759 }
3760 return(ret);
3761}
3762
3763/**
3764 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00003765 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003766 *
3767 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
3768 * [18] seRange ::= charOrEsc '-' charOrEsc
3769 * [20] charOrEsc ::= XmlChar | SingleCharEsc
3770 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
3771 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
3772 */
3773static void
3774xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
William M. Brackdc99df92003-12-27 01:54:25 +00003775 int cur, len;
Daniel Veillard4255d502002-04-16 15:50:10 +00003776 int start = -1;
3777 int end = -1;
3778
3779 if ((CUR == '&') && (NXT(1) == '#')) {
3780 end = start = xmlFAParseCharRef(ctxt);
3781 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3782 XML_REGEXP_CHARVAL, start, end, NULL);
3783 return;
3784 }
3785 cur = CUR;
3786 if (cur == '\\') {
3787 NEXT;
3788 cur = CUR;
3789 switch (cur) {
3790 case 'n': start = 0xA; break;
3791 case 'r': start = 0xD; break;
3792 case 't': start = 0x9; break;
3793 case '\\': case '|': case '.': case '-': case '^': case '?':
3794 case '*': case '+': case '{': case '}': case '(': case ')':
3795 case '[': case ']':
3796 start = cur; break;
3797 default:
3798 ERROR("Invalid escape value");
3799 return;
3800 }
3801 end = start;
William M. Brackdc99df92003-12-27 01:54:25 +00003802 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003803 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00003804 end = start = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00003805 } else {
3806 ERROR("Expecting a char range");
3807 return;
3808 }
William M. Brackdc99df92003-12-27 01:54:25 +00003809 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00003810 if (start == '-') {
3811 return;
3812 }
3813 cur = CUR;
William M. Brack10f1ef42004-03-20 14:51:25 +00003814 if ((cur != '-') || (NXT(1) == ']')) {
Daniel Veillard4255d502002-04-16 15:50:10 +00003815 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3816 XML_REGEXP_CHARVAL, start, end, NULL);
3817 return;
3818 }
3819 NEXT;
3820 cur = CUR;
3821 if (cur == '\\') {
3822 NEXT;
3823 cur = CUR;
3824 switch (cur) {
3825 case 'n': end = 0xA; break;
3826 case 'r': end = 0xD; break;
3827 case 't': end = 0x9; break;
3828 case '\\': case '|': case '.': case '-': case '^': case '?':
3829 case '*': case '+': case '{': case '}': case '(': case ')':
3830 case '[': case ']':
3831 end = cur; break;
3832 default:
3833 ERROR("Invalid escape value");
3834 return;
3835 }
William M. Brackdc99df92003-12-27 01:54:25 +00003836 len = 1;
Daniel Veillard4255d502002-04-16 15:50:10 +00003837 } else if ((cur != 0x5B) && (cur != 0x5D)) {
William M. Brackdc99df92003-12-27 01:54:25 +00003838 end = CUR_SCHAR(ctxt->cur, len);
Daniel Veillard4255d502002-04-16 15:50:10 +00003839 } else {
3840 ERROR("Expecting the end of a char range");
3841 return;
3842 }
William M. Brackdc99df92003-12-27 01:54:25 +00003843 NEXTL(len);
Daniel Veillard4255d502002-04-16 15:50:10 +00003844 /* TODO check that the values are acceptable character ranges for XML */
3845 if (end < start) {
3846 ERROR("End of range is before start of range");
3847 } else {
3848 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3849 XML_REGEXP_CHARVAL, start, end, NULL);
3850 }
3851 return;
3852}
3853
3854/**
3855 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003856 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003857 *
3858 * [14] posCharGroup ::= ( charRange | charClassEsc )+
3859 */
3860static void
3861xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
3862 do {
3863 if ((CUR == '\\') || (CUR == '.')) {
3864 xmlFAParseCharClassEsc(ctxt);
3865 } else {
3866 xmlFAParseCharRange(ctxt);
3867 }
3868 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
3869 (ctxt->error == 0));
3870}
3871
3872/**
3873 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003874 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003875 *
3876 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
3877 * [15] negCharGroup ::= '^' posCharGroup
3878 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
3879 * [12] charClassExpr ::= '[' charGroup ']'
3880 */
3881static void
3882xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
3883 int n = ctxt->neg;
3884 while ((CUR != ']') && (ctxt->error == 0)) {
3885 if (CUR == '^') {
3886 int neg = ctxt->neg;
3887
3888 NEXT;
3889 ctxt->neg = !ctxt->neg;
3890 xmlFAParsePosCharGroup(ctxt);
3891 ctxt->neg = neg;
William M. Brack10f1ef42004-03-20 14:51:25 +00003892 } else if ((CUR == '-') && (NXT(1) == '[')) {
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003893 int neg = ctxt->neg;
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003894 ctxt->neg = 2;
William M. Brack10f1ef42004-03-20 14:51:25 +00003895 NEXT; /* eat the '-' */
3896 NEXT; /* eat the '[' */
Daniel Veillard4255d502002-04-16 15:50:10 +00003897 xmlFAParseCharGroup(ctxt);
3898 if (CUR == ']') {
3899 NEXT;
3900 } else {
3901 ERROR("charClassExpr: ']' expected");
3902 break;
3903 }
Daniel Veillardf8b9de32003-11-24 14:27:26 +00003904 ctxt->neg = neg;
Daniel Veillard4255d502002-04-16 15:50:10 +00003905 break;
3906 } else if (CUR != ']') {
3907 xmlFAParsePosCharGroup(ctxt);
3908 }
3909 }
3910 ctxt->neg = n;
3911}
3912
3913/**
3914 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00003915 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003916 *
3917 * [11] charClass ::= charClassEsc | charClassExpr
3918 * [12] charClassExpr ::= '[' charGroup ']'
3919 */
3920static void
3921xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
3922 if (CUR == '[') {
3923 NEXT;
3924 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
3925 if (ctxt->atom == NULL)
3926 return;
3927 xmlFAParseCharGroup(ctxt);
3928 if (CUR == ']') {
3929 NEXT;
3930 } else {
3931 ERROR("xmlFAParseCharClass: ']' expected");
3932 }
3933 } else {
3934 xmlFAParseCharClassEsc(ctxt);
3935 }
3936}
3937
3938/**
3939 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00003940 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003941 *
3942 * [8] QuantExact ::= [0-9]+
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00003943 *
3944 * Returns 0 if success or -1 in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00003945 */
3946static int
3947xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
3948 int ret = 0;
3949 int ok = 0;
3950
3951 while ((CUR >= '0') && (CUR <= '9')) {
3952 ret = ret * 10 + (CUR - '0');
3953 ok = 1;
3954 NEXT;
3955 }
3956 if (ok != 1) {
3957 return(-1);
3958 }
3959 return(ret);
3960}
3961
3962/**
3963 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00003964 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003965 *
3966 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
3967 * [5] quantity ::= quantRange | quantMin | QuantExact
3968 * [6] quantRange ::= QuantExact ',' QuantExact
3969 * [7] quantMin ::= QuantExact ','
3970 * [8] QuantExact ::= [0-9]+
3971 */
3972static int
3973xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
3974 int cur;
3975
3976 cur = CUR;
3977 if ((cur == '?') || (cur == '*') || (cur == '+')) {
3978 if (ctxt->atom != NULL) {
3979 if (cur == '?')
3980 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
3981 else if (cur == '*')
3982 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
3983 else if (cur == '+')
3984 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
3985 }
3986 NEXT;
3987 return(1);
3988 }
3989 if (cur == '{') {
3990 int min = 0, max = 0;
3991
3992 NEXT;
3993 cur = xmlFAParseQuantExact(ctxt);
3994 if (cur >= 0)
3995 min = cur;
3996 if (CUR == ',') {
3997 NEXT;
Daniel Veillardebe48c62003-12-03 12:12:27 +00003998 if (CUR == '}')
3999 max = INT_MAX;
4000 else {
4001 cur = xmlFAParseQuantExact(ctxt);
4002 if (cur >= 0)
4003 max = cur;
4004 else {
4005 ERROR("Improper quantifier");
4006 }
4007 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004008 }
4009 if (CUR == '}') {
4010 NEXT;
4011 } else {
4012 ERROR("Unterminated quantifier");
4013 }
4014 if (max == 0)
4015 max = min;
4016 if (ctxt->atom != NULL) {
4017 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
4018 ctxt->atom->min = min;
4019 ctxt->atom->max = max;
4020 }
4021 return(1);
4022 }
4023 return(0);
4024}
4025
4026/**
4027 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00004028 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004029 *
4030 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
4031 */
4032static int
4033xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
4034 int codepoint, len;
4035
4036 codepoint = xmlFAIsChar(ctxt);
4037 if (codepoint > 0) {
4038 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4039 if (ctxt->atom == NULL)
4040 return(-1);
4041 codepoint = CUR_SCHAR(ctxt->cur, len);
4042 ctxt->atom->codepoint = codepoint;
4043 NEXTL(len);
4044 return(1);
4045 } else if (CUR == '|') {
4046 return(0);
4047 } else if (CUR == 0) {
4048 return(0);
4049 } else if (CUR == ')') {
4050 return(0);
4051 } else if (CUR == '(') {
4052 xmlRegStatePtr start, oldend;
4053
4054 NEXT;
4055 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
4056 start = ctxt->state;
4057 oldend = ctxt->end;
4058 ctxt->end = NULL;
4059 ctxt->atom = NULL;
4060 xmlFAParseRegExp(ctxt, 0);
4061 if (CUR == ')') {
4062 NEXT;
4063 } else {
4064 ERROR("xmlFAParseAtom: expecting ')'");
4065 }
4066 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
4067 if (ctxt->atom == NULL)
4068 return(-1);
4069 ctxt->atom->start = start;
4070 ctxt->atom->stop = ctxt->state;
4071 ctxt->end = oldend;
4072 return(1);
4073 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
4074 xmlFAParseCharClass(ctxt);
4075 return(1);
4076 }
4077 return(0);
4078}
4079
4080/**
4081 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00004082 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004083 *
4084 * [3] piece ::= atom quantifier?
4085 */
4086static int
4087xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
4088 int ret;
4089
4090 ctxt->atom = NULL;
4091 ret = xmlFAParseAtom(ctxt);
4092 if (ret == 0)
4093 return(0);
4094 if (ctxt->atom == NULL) {
4095 ERROR("internal: no atom generated");
4096 }
4097 xmlFAParseQuantifier(ctxt);
4098 return(1);
4099}
4100
4101/**
4102 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00004103 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00004104 *
4105 * [2] branch ::= piece*
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004106 8
Daniel Veillard4255d502002-04-16 15:50:10 +00004107 */
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004108static int
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004109xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) {
Daniel Veillard4255d502002-04-16 15:50:10 +00004110 xmlRegStatePtr previous;
Daniel Veillard4255d502002-04-16 15:50:10 +00004111 int ret;
4112
4113 previous = ctxt->state;
4114 ret = xmlFAParsePiece(ctxt);
4115 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004116 if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0)
4117 return(-1);
4118 previous = ctxt->state;
Daniel Veillard4255d502002-04-16 15:50:10 +00004119 ctxt->atom = NULL;
4120 }
4121 while ((ret != 0) && (ctxt->error == 0)) {
4122 ret = xmlFAParsePiece(ctxt);
4123 if (ret != 0) {
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004124 if (xmlFAGenerateTransitions(ctxt, previous, NULL,
4125 ctxt->atom) < 0)
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004126 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00004127 previous = ctxt->state;
4128 ctxt->atom = NULL;
4129 }
4130 }
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004131 return(0);
Daniel Veillard4255d502002-04-16 15:50:10 +00004132}
4133
4134/**
4135 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00004136 * @ctxt: a regexp parser context
William M. Brackddf71d62004-05-06 04:17:26 +00004137 * @top: is this the top-level expression ?
Daniel Veillard4255d502002-04-16 15:50:10 +00004138 *
4139 * [1] regExp ::= branch ( '|' branch )*
4140 */
4141static void
4142xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
Daniel Veillardc7e3cc42004-09-28 12:33:52 +00004143 xmlRegStatePtr start, end;
Daniel Veillard4255d502002-04-16 15:50:10 +00004144
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004145 /* if not top start should have been generated by an epsilon trans */
Daniel Veillard4255d502002-04-16 15:50:10 +00004146 start = ctxt->state;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004147 ctxt->end = NULL;
4148 xmlFAParseBranch(ctxt);
4149 if (top) {
4150#ifdef DEBUG_REGEXP_GRAPH
4151 printf("State %d is final\n", ctxt->state->no);
4152#endif
4153 ctxt->state->type = XML_REGEXP_FINAL_STATE;
4154 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004155 if (CUR != '|') {
4156 ctxt->end = ctxt->state;
4157 return;
4158 }
4159 end = ctxt->state;
4160 while ((CUR == '|') && (ctxt->error == 0)) {
4161 NEXT;
4162 ctxt->state = start;
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004163 ctxt->end = NULL;
4164 xmlFAParseBranch(ctxt);
4165 if (top) {
4166 ctxt->state->type = XML_REGEXP_FINAL_STATE;
4167#ifdef DEBUG_REGEXP_GRAPH
4168 printf("State %d is final\n", ctxt->state->no);
4169#endif
4170 } else {
4171 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end);
4172 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004173 }
Daniel Veillard2cbf5962004-03-31 15:50:43 +00004174 if (!top) {
4175 ctxt->state = end;
4176 ctxt->end = end;
4177 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004178}
4179
4180/************************************************************************
4181 * *
4182 * The basic API *
4183 * *
4184 ************************************************************************/
4185
4186/**
4187 * xmlRegexpPrint:
4188 * @output: the file for the output debug
4189 * @regexp: the compiled regexp
4190 *
4191 * Print the content of the compiled regular expression
4192 */
4193void
4194xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
4195 int i;
4196
Daniel Veillarda82b1822004-11-08 16:24:57 +00004197 if (output == NULL)
4198 return;
Daniel Veillard4255d502002-04-16 15:50:10 +00004199 fprintf(output, " regexp: ");
4200 if (regexp == NULL) {
4201 fprintf(output, "NULL\n");
4202 return;
4203 }
4204 fprintf(output, "'%s' ", regexp->string);
4205 fprintf(output, "\n");
4206 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
4207 for (i = 0;i < regexp->nbAtoms; i++) {
4208 fprintf(output, " %02d ", i);
4209 xmlRegPrintAtom(output, regexp->atoms[i]);
4210 }
4211 fprintf(output, "%d states:", regexp->nbStates);
4212 fprintf(output, "\n");
4213 for (i = 0;i < regexp->nbStates; i++) {
4214 xmlRegPrintState(output, regexp->states[i]);
4215 }
4216 fprintf(output, "%d counters:\n", regexp->nbCounters);
4217 for (i = 0;i < regexp->nbCounters; i++) {
4218 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
4219 regexp->counters[i].max);
4220 }
4221}
4222
4223/**
4224 * xmlRegexpCompile:
4225 * @regexp: a regular expression string
4226 *
4227 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
William M. Brackddf71d62004-05-06 04:17:26 +00004228 * Appendix F and builds an automata suitable for testing strings against
Daniel Veillard4255d502002-04-16 15:50:10 +00004229 * that regular expression
4230 *
4231 * Returns the compiled expression or NULL in case of error
4232 */
4233xmlRegexpPtr
4234xmlRegexpCompile(const xmlChar *regexp) {
4235 xmlRegexpPtr ret;
4236 xmlRegParserCtxtPtr ctxt;
4237
4238 ctxt = xmlRegNewParserCtxt(regexp);
4239 if (ctxt == NULL)
4240 return(NULL);
4241
4242 /* initialize the parser */
4243 ctxt->end = NULL;
4244 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
4245 xmlRegStatePush(ctxt, ctxt->start);
4246
4247 /* parse the expression building an automata */
4248 xmlFAParseRegExp(ctxt, 1);
4249 if (CUR != 0) {
4250 ERROR("xmlFAParseRegExp: extra characters");
4251 }
4252 ctxt->end = ctxt->state;
4253 ctxt->start->type = XML_REGEXP_START_STATE;
4254 ctxt->end->type = XML_REGEXP_FINAL_STATE;
4255
4256 /* remove the Epsilon except for counted transitions */
4257 xmlFAEliminateEpsilonTransitions(ctxt);
4258
4259
4260 if (ctxt->error != 0) {
4261 xmlRegFreeParserCtxt(ctxt);
4262 return(NULL);
4263 }
4264 ret = xmlRegEpxFromParse(ctxt);
4265 xmlRegFreeParserCtxt(ctxt);
4266 return(ret);
4267}
4268
4269/**
4270 * xmlRegexpExec:
4271 * @comp: the compiled regular expression
4272 * @content: the value to check against the regular expression
4273 *
William M. Brackddf71d62004-05-06 04:17:26 +00004274 * Check if the regular expression generates the value
Daniel Veillard4255d502002-04-16 15:50:10 +00004275 *
William M. Brackddf71d62004-05-06 04:17:26 +00004276 * Returns 1 if it matches, 0 if not and a negative value in case of error
Daniel Veillard4255d502002-04-16 15:50:10 +00004277 */
4278int
4279xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
4280 if ((comp == NULL) || (content == NULL))
4281 return(-1);
4282 return(xmlFARegExec(comp, content));
4283}
4284
4285/**
Daniel Veillard23e73572002-09-19 19:56:43 +00004286 * xmlRegexpIsDeterminist:
4287 * @comp: the compiled regular expression
4288 *
4289 * Check if the regular expression is determinist
4290 *
William M. Brackddf71d62004-05-06 04:17:26 +00004291 * Returns 1 if it yes, 0 if not and a negative value in case of error
Daniel Veillard23e73572002-09-19 19:56:43 +00004292 */
4293int
4294xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
4295 xmlAutomataPtr am;
4296 int ret;
4297
4298 if (comp == NULL)
4299 return(-1);
4300 if (comp->determinist != -1)
4301 return(comp->determinist);
4302
4303 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00004304 if (am->states != NULL) {
4305 int i;
4306
4307 for (i = 0;i < am->nbStates;i++)
4308 xmlRegFreeState(am->states[i]);
4309 xmlFree(am->states);
4310 }
Daniel Veillard23e73572002-09-19 19:56:43 +00004311 am->nbAtoms = comp->nbAtoms;
4312 am->atoms = comp->atoms;
4313 am->nbStates = comp->nbStates;
4314 am->states = comp->states;
4315 am->determinist = -1;
4316 ret = xmlFAComputesDeterminism(am);
4317 am->atoms = NULL;
4318 am->states = NULL;
4319 xmlFreeAutomata(am);
4320 return(ret);
4321}
4322
4323/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004324 * xmlRegFreeRegexp:
4325 * @regexp: the regexp
4326 *
4327 * Free a regexp
4328 */
4329void
4330xmlRegFreeRegexp(xmlRegexpPtr regexp) {
4331 int i;
4332 if (regexp == NULL)
4333 return;
4334
4335 if (regexp->string != NULL)
4336 xmlFree(regexp->string);
4337 if (regexp->states != NULL) {
4338 for (i = 0;i < regexp->nbStates;i++)
4339 xmlRegFreeState(regexp->states[i]);
4340 xmlFree(regexp->states);
4341 }
4342 if (regexp->atoms != NULL) {
4343 for (i = 0;i < regexp->nbAtoms;i++)
4344 xmlRegFreeAtom(regexp->atoms[i]);
4345 xmlFree(regexp->atoms);
4346 }
4347 if (regexp->counters != NULL)
4348 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00004349 if (regexp->compact != NULL)
4350 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00004351 if (regexp->transdata != NULL)
4352 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00004353 if (regexp->stringMap != NULL) {
4354 for (i = 0; i < regexp->nbstrings;i++)
4355 xmlFree(regexp->stringMap[i]);
4356 xmlFree(regexp->stringMap);
4357 }
4358
Daniel Veillard4255d502002-04-16 15:50:10 +00004359 xmlFree(regexp);
4360}
4361
4362#ifdef LIBXML_AUTOMATA_ENABLED
4363/************************************************************************
4364 * *
4365 * The Automata interface *
4366 * *
4367 ************************************************************************/
4368
4369/**
4370 * xmlNewAutomata:
4371 *
4372 * Create a new automata
4373 *
4374 * Returns the new object or NULL in case of failure
4375 */
4376xmlAutomataPtr
4377xmlNewAutomata(void) {
4378 xmlAutomataPtr ctxt;
4379
4380 ctxt = xmlRegNewParserCtxt(NULL);
4381 if (ctxt == NULL)
4382 return(NULL);
4383
4384 /* initialize the parser */
4385 ctxt->end = NULL;
4386 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004387 if (ctxt->start == NULL) {
4388 xmlFreeAutomata(ctxt);
4389 return(NULL);
4390 }
4391 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
4392 xmlRegFreeState(ctxt->start);
4393 xmlFreeAutomata(ctxt);
4394 return(NULL);
4395 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004396
4397 return(ctxt);
4398}
4399
4400/**
4401 * xmlFreeAutomata:
4402 * @am: an automata
4403 *
4404 * Free an automata
4405 */
4406void
4407xmlFreeAutomata(xmlAutomataPtr am) {
4408 if (am == NULL)
4409 return;
4410 xmlRegFreeParserCtxt(am);
4411}
4412
4413/**
4414 * xmlAutomataGetInitState:
4415 * @am: an automata
4416 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004417 * Initial state lookup
4418 *
Daniel Veillard4255d502002-04-16 15:50:10 +00004419 * Returns the initial state of the automata
4420 */
4421xmlAutomataStatePtr
4422xmlAutomataGetInitState(xmlAutomataPtr am) {
4423 if (am == NULL)
4424 return(NULL);
4425 return(am->start);
4426}
4427
4428/**
4429 * xmlAutomataSetFinalState:
4430 * @am: an automata
4431 * @state: a state in this automata
4432 *
4433 * Makes that state a final state
4434 *
4435 * Returns 0 or -1 in case of error
4436 */
4437int
4438xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
4439 if ((am == NULL) || (state == NULL))
4440 return(-1);
4441 state->type = XML_REGEXP_FINAL_STATE;
4442 return(0);
4443}
4444
4445/**
4446 * xmlAutomataNewTransition:
4447 * @am: an automata
4448 * @from: the starting point of the transition
4449 * @to: the target point of the transition or NULL
4450 * @token: the input string associated to that transition
4451 * @data: data passed to the callback function if the transition is activated
4452 *
William M. Brackddf71d62004-05-06 04:17:26 +00004453 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00004454 * and then adds a transition from the @from state to the target state
4455 * activated by the value of @token
4456 *
4457 * Returns the target state or NULL in case of error
4458 */
4459xmlAutomataStatePtr
4460xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
4461 xmlAutomataStatePtr to, const xmlChar *token,
4462 void *data) {
4463 xmlRegAtomPtr atom;
4464
4465 if ((am == NULL) || (from == NULL) || (token == NULL))
4466 return(NULL);
4467 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004468 if (atom == NULL)
4469 return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00004470 atom->data = data;
4471 if (atom == NULL)
4472 return(NULL);
4473 atom->valuep = xmlStrdup(token);
4474
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004475 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4476 xmlRegFreeAtom(atom);
4477 return(NULL);
4478 }
Daniel Veillard4255d502002-04-16 15:50:10 +00004479 if (to == NULL)
4480 return(am->state);
4481 return(to);
4482}
4483
4484/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00004485 * xmlAutomataNewTransition2:
4486 * @am: an automata
4487 * @from: the starting point of the transition
4488 * @to: the target point of the transition or NULL
4489 * @token: the first input string associated to that transition
4490 * @token2: the second input string associated to that transition
4491 * @data: data passed to the callback function if the transition is activated
4492 *
William M. Brackddf71d62004-05-06 04:17:26 +00004493 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard52b48c72003-04-13 19:53:42 +00004494 * and then adds a transition from the @from state to the target state
4495 * activated by the value of @token
4496 *
4497 * Returns the target state or NULL in case of error
4498 */
4499xmlAutomataStatePtr
4500xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4501 xmlAutomataStatePtr to, const xmlChar *token,
4502 const xmlChar *token2, void *data) {
4503 xmlRegAtomPtr atom;
4504
4505 if ((am == NULL) || (from == NULL) || (token == NULL))
4506 return(NULL);
4507 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4508 atom->data = data;
4509 if (atom == NULL)
4510 return(NULL);
4511 if ((token2 == NULL) || (*token2 == 0)) {
4512 atom->valuep = xmlStrdup(token);
4513 } else {
4514 int lenn, lenp;
4515 xmlChar *str;
4516
4517 lenn = strlen((char *) token2);
4518 lenp = strlen((char *) token);
4519
Daniel Veillard3c908dc2003-04-19 00:07:51 +00004520 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004521 if (str == NULL) {
4522 xmlRegFreeAtom(atom);
4523 return(NULL);
4524 }
4525 memcpy(&str[0], token, lenp);
4526 str[lenp] = '|';
4527 memcpy(&str[lenp + 1], token2, lenn);
4528 str[lenn + lenp + 1] = 0;
4529
4530 atom->valuep = str;
4531 }
4532
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004533 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4534 xmlRegFreeAtom(atom);
4535 return(NULL);
4536 }
Daniel Veillard52b48c72003-04-13 19:53:42 +00004537 if (to == NULL)
4538 return(am->state);
4539 return(to);
4540}
4541
4542/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00004543 * xmlAutomataNewCountTrans2:
4544 * @am: an automata
4545 * @from: the starting point of the transition
4546 * @to: the target point of the transition or NULL
4547 * @token: the input string associated to that transition
4548 * @token2: the second input string associated to that transition
4549 * @min: the minimum successive occurences of token
4550 * @max: the maximum successive occurences of token
4551 * @data: data associated to the transition
4552 *
4553 * If @to is NULL, this creates first a new target state in the automata
4554 * and then adds a transition from the @from state to the target state
4555 * activated by a succession of input of value @token and @token2 and
4556 * whose number is between @min and @max
4557 *
4558 * Returns the target state or NULL in case of error
4559 */
4560xmlAutomataStatePtr
4561xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4562 xmlAutomataStatePtr to, const xmlChar *token,
4563 const xmlChar *token2,
4564 int min, int max, void *data) {
4565 xmlRegAtomPtr atom;
4566 int counter;
4567
4568 if ((am == NULL) || (from == NULL) || (token == NULL))
4569 return(NULL);
4570 if (min < 0)
4571 return(NULL);
4572 if ((max < min) || (max < 1))
4573 return(NULL);
4574 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4575 if (atom == NULL)
4576 return(NULL);
4577 if ((token2 == NULL) || (*token2 == 0)) {
4578 atom->valuep = xmlStrdup(token);
4579 } else {
4580 int lenn, lenp;
4581 xmlChar *str;
4582
4583 lenn = strlen((char *) token2);
4584 lenp = strlen((char *) token);
4585
4586 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4587 if (str == NULL) {
4588 xmlRegFreeAtom(atom);
4589 return(NULL);
4590 }
4591 memcpy(&str[0], token, lenp);
4592 str[lenp] = '|';
4593 memcpy(&str[lenp + 1], token2, lenn);
4594 str[lenn + lenp + 1] = 0;
4595
4596 atom->valuep = str;
4597 }
4598 atom->data = data;
4599 if (min == 0)
4600 atom->min = 1;
4601 else
4602 atom->min = min;
4603 atom->max = max;
4604
4605 /*
4606 * associate a counter to the transition.
4607 */
4608 counter = xmlRegGetCounter(am);
4609 am->counters[counter].min = min;
4610 am->counters[counter].max = max;
4611
4612 /* xmlFAGenerateTransitions(am, from, to, atom); */
4613 if (to == NULL) {
4614 to = xmlRegNewState(am);
4615 xmlRegStatePush(am, to);
4616 }
4617 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4618 xmlRegAtomPush(am, atom);
4619 am->state = to;
4620
4621 if (to == NULL)
4622 to = am->state;
4623 if (to == NULL)
4624 return(NULL);
4625 if (min == 0)
4626 xmlFAGenerateEpsilonTransition(am, from, to);
4627 return(to);
4628}
4629
4630/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004631 * xmlAutomataNewCountTrans:
4632 * @am: an automata
4633 * @from: the starting point of the transition
4634 * @to: the target point of the transition or NULL
4635 * @token: the input string associated to that transition
4636 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004637 * @max: the maximum successive occurences of token
4638 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00004639 *
William M. Brackddf71d62004-05-06 04:17:26 +00004640 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard4255d502002-04-16 15:50:10 +00004641 * and then adds a transition from the @from state to the target state
4642 * activated by a succession of input of value @token and whose number
4643 * is between @min and @max
4644 *
4645 * Returns the target state or NULL in case of error
4646 */
4647xmlAutomataStatePtr
4648xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4649 xmlAutomataStatePtr to, const xmlChar *token,
4650 int min, int max, void *data) {
4651 xmlRegAtomPtr atom;
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004652 int counter;
Daniel Veillard4255d502002-04-16 15:50:10 +00004653
4654 if ((am == NULL) || (from == NULL) || (token == NULL))
4655 return(NULL);
4656 if (min < 0)
4657 return(NULL);
4658 if ((max < min) || (max < 1))
4659 return(NULL);
4660 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4661 if (atom == NULL)
4662 return(NULL);
4663 atom->valuep = xmlStrdup(token);
4664 atom->data = data;
4665 if (min == 0)
4666 atom->min = 1;
4667 else
4668 atom->min = min;
4669 atom->max = max;
4670
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004671 /*
4672 * associate a counter to the transition.
4673 */
4674 counter = xmlRegGetCounter(am);
4675 am->counters[counter].min = min;
4676 am->counters[counter].max = max;
4677
4678 /* xmlFAGenerateTransitions(am, from, to, atom); */
4679 if (to == NULL) {
4680 to = xmlRegNewState(am);
4681 xmlRegStatePush(am, to);
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004682 }
Daniel Veillard0ddb21c2004-02-12 12:43:49 +00004683 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4684 xmlRegAtomPush(am, atom);
4685 am->state = to;
4686
Daniel Veillard4255d502002-04-16 15:50:10 +00004687 if (to == NULL)
4688 to = am->state;
4689 if (to == NULL)
4690 return(NULL);
4691 if (min == 0)
4692 xmlFAGenerateEpsilonTransition(am, from, to);
4693 return(to);
4694}
4695
4696/**
Kasimier T. Buchcik87876402004-09-29 13:29:03 +00004697 * xmlAutomataNewOnceTrans2:
4698 * @am: an automata
4699 * @from: the starting point of the transition
4700 * @to: the target point of the transition or NULL
4701 * @token: the input string associated to that transition
4702 * @token2: the second input string associated to that transition
4703 * @min: the minimum successive occurences of token
4704 * @max: the maximum successive occurences of token
4705 * @data: data associated to the transition
4706 *
4707 * If @to is NULL, this creates first a new target state in the automata
4708 * and then adds a transition from the @from state to the target state
4709 * activated by a succession of input of value @token and @token2 and whose
4710 * number is between @min and @max, moreover that transition can only be
4711 * crossed once.
4712 *
4713 * Returns the target state or NULL in case of error
4714 */
4715xmlAutomataStatePtr
4716xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4717 xmlAutomataStatePtr to, const xmlChar *token,
4718 const xmlChar *token2,
4719 int min, int max, void *data) {
4720 xmlRegAtomPtr atom;
4721 int counter;
4722
4723 if ((am == NULL) || (from == NULL) || (token == NULL))
4724 return(NULL);
4725 if (min < 1)
4726 return(NULL);
4727 if ((max < min) || (max < 1))
4728 return(NULL);
4729 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4730 if (atom == NULL)
4731 return(NULL);
4732 if ((token2 == NULL) || (*token2 == 0)) {
4733 atom->valuep = xmlStrdup(token);
4734 } else {
4735 int lenn, lenp;
4736 xmlChar *str;
4737
4738 lenn = strlen((char *) token2);
4739 lenp = strlen((char *) token);
4740
4741 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4742 if (str == NULL) {
4743 xmlRegFreeAtom(atom);
4744 return(NULL);
4745 }
4746 memcpy(&str[0], token, lenp);
4747 str[lenp] = '|';
4748 memcpy(&str[lenp + 1], token2, lenn);
4749 str[lenn + lenp + 1] = 0;
4750
4751 atom->valuep = str;
4752 }
4753 atom->data = data;
4754 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4755 if (min == 0)
4756 atom->min = 1;
4757 else
4758 atom->min = min;
4759 atom->max = max;
4760 /*
4761 * associate a counter to the transition.
4762 */
4763 counter = xmlRegGetCounter(am);
4764 am->counters[counter].min = 1;
4765 am->counters[counter].max = 1;
4766
4767 /* xmlFAGenerateTransitions(am, from, to, atom); */
4768 if (to == NULL) {
4769 to = xmlRegNewState(am);
4770 xmlRegStatePush(am, to);
4771 }
4772 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4773 xmlRegAtomPush(am, atom);
4774 am->state = to;
4775 return(to);
4776}
4777
4778
4779
4780/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004781 * xmlAutomataNewOnceTrans:
4782 * @am: an automata
4783 * @from: the starting point of the transition
4784 * @to: the target point of the transition or NULL
4785 * @token: the input string associated to that transition
4786 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004787 * @max: the maximum successive occurences of token
4788 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00004789 *
William M. Brackddf71d62004-05-06 04:17:26 +00004790 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00004791 * and then adds a transition from the @from state to the target state
4792 * activated by a succession of input of value @token and whose number
William M. Brackddf71d62004-05-06 04:17:26 +00004793 * is between @min and @max, moreover that transition can only be crossed
Daniel Veillard7646b182002-04-20 06:41:40 +00004794 * once.
4795 *
4796 * Returns the target state or NULL in case of error
4797 */
4798xmlAutomataStatePtr
4799xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4800 xmlAutomataStatePtr to, const xmlChar *token,
4801 int min, int max, void *data) {
4802 xmlRegAtomPtr atom;
4803 int counter;
4804
4805 if ((am == NULL) || (from == NULL) || (token == NULL))
4806 return(NULL);
4807 if (min < 1)
4808 return(NULL);
4809 if ((max < min) || (max < 1))
4810 return(NULL);
4811 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4812 if (atom == NULL)
4813 return(NULL);
4814 atom->valuep = xmlStrdup(token);
4815 atom->data = data;
4816 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4817 if (min == 0)
4818 atom->min = 1;
4819 else
4820 atom->min = min;
4821 atom->max = max;
4822 /*
4823 * associate a counter to the transition.
4824 */
4825 counter = xmlRegGetCounter(am);
4826 am->counters[counter].min = 1;
4827 am->counters[counter].max = 1;
4828
4829 /* xmlFAGenerateTransitions(am, from, to, atom); */
4830 if (to == NULL) {
4831 to = xmlRegNewState(am);
4832 xmlRegStatePush(am, to);
4833 }
4834 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4835 xmlRegAtomPush(am, atom);
4836 am->state = to;
Daniel Veillard7646b182002-04-20 06:41:40 +00004837 return(to);
4838}
4839
4840/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004841 * xmlAutomataNewState:
4842 * @am: an automata
4843 *
4844 * Create a new disconnected state in the automata
4845 *
4846 * Returns the new state or NULL in case of error
4847 */
4848xmlAutomataStatePtr
4849xmlAutomataNewState(xmlAutomataPtr am) {
4850 xmlAutomataStatePtr to;
4851
4852 if (am == NULL)
4853 return(NULL);
4854 to = xmlRegNewState(am);
4855 xmlRegStatePush(am, to);
4856 return(to);
4857}
4858
4859/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004860 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00004861 * @am: an automata
4862 * @from: the starting point of the transition
4863 * @to: the target point of the transition or NULL
4864 *
William M. Brackddf71d62004-05-06 04:17:26 +00004865 * If @to is NULL, this creates first a new target state in the automata
4866 * and then adds an epsilon transition from the @from state to the
Daniel Veillard4255d502002-04-16 15:50:10 +00004867 * target state
4868 *
4869 * Returns the target state or NULL in case of error
4870 */
4871xmlAutomataStatePtr
4872xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
4873 xmlAutomataStatePtr to) {
4874 if ((am == NULL) || (from == NULL))
4875 return(NULL);
4876 xmlFAGenerateEpsilonTransition(am, from, to);
4877 if (to == NULL)
4878 return(am->state);
4879 return(to);
4880}
4881
Daniel Veillardb509f152002-04-17 16:28:10 +00004882/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004883 * xmlAutomataNewAllTrans:
4884 * @am: an automata
4885 * @from: the starting point of the transition
4886 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004887 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00004888 *
William M. Brackddf71d62004-05-06 04:17:26 +00004889 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillard7646b182002-04-20 06:41:40 +00004890 * and then adds a an ALL transition from the @from state to the
4891 * target state. That transition is an epsilon transition allowed only when
4892 * all transitions from the @from node have been activated.
4893 *
4894 * Returns the target state or NULL in case of error
4895 */
4896xmlAutomataStatePtr
4897xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00004898 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00004899 if ((am == NULL) || (from == NULL))
4900 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00004901 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00004902 if (to == NULL)
4903 return(am->state);
4904 return(to);
4905}
4906
4907/**
Daniel Veillardb509f152002-04-17 16:28:10 +00004908 * xmlAutomataNewCounter:
4909 * @am: an automata
4910 * @min: the minimal value on the counter
4911 * @max: the maximal value on the counter
4912 *
4913 * Create a new counter
4914 *
4915 * Returns the counter number or -1 in case of error
4916 */
4917int
4918xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
4919 int ret;
4920
4921 if (am == NULL)
4922 return(-1);
4923
4924 ret = xmlRegGetCounter(am);
4925 if (ret < 0)
4926 return(-1);
4927 am->counters[ret].min = min;
4928 am->counters[ret].max = max;
4929 return(ret);
4930}
4931
4932/**
4933 * xmlAutomataNewCountedTrans:
4934 * @am: an automata
4935 * @from: the starting point of the transition
4936 * @to: the target point of the transition or NULL
4937 * @counter: the counter associated to that transition
4938 *
William M. Brackddf71d62004-05-06 04:17:26 +00004939 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00004940 * and then adds an epsilon transition from the @from state to the target state
4941 * which will increment the counter provided
4942 *
4943 * Returns the target state or NULL in case of error
4944 */
4945xmlAutomataStatePtr
4946xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4947 xmlAutomataStatePtr to, int counter) {
4948 if ((am == NULL) || (from == NULL) || (counter < 0))
4949 return(NULL);
4950 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
4951 if (to == NULL)
4952 return(am->state);
4953 return(to);
4954}
4955
4956/**
4957 * xmlAutomataNewCounterTrans:
4958 * @am: an automata
4959 * @from: the starting point of the transition
4960 * @to: the target point of the transition or NULL
4961 * @counter: the counter associated to that transition
4962 *
William M. Brackddf71d62004-05-06 04:17:26 +00004963 * If @to is NULL, this creates first a new target state in the automata
Daniel Veillardb509f152002-04-17 16:28:10 +00004964 * and then adds an epsilon transition from the @from state to the target state
4965 * which will be allowed only if the counter is within the right range.
4966 *
4967 * Returns the target state or NULL in case of error
4968 */
4969xmlAutomataStatePtr
4970xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4971 xmlAutomataStatePtr to, int counter) {
4972 if ((am == NULL) || (from == NULL) || (counter < 0))
4973 return(NULL);
4974 xmlFAGenerateCountedTransition(am, from, to, counter);
4975 if (to == NULL)
4976 return(am->state);
4977 return(to);
4978}
Daniel Veillard4255d502002-04-16 15:50:10 +00004979
4980/**
4981 * xmlAutomataCompile:
4982 * @am: an automata
4983 *
4984 * Compile the automata into a Reg Exp ready for being executed.
4985 * The automata should be free after this point.
4986 *
4987 * Returns the compiled regexp or NULL in case of error
4988 */
4989xmlRegexpPtr
4990xmlAutomataCompile(xmlAutomataPtr am) {
4991 xmlRegexpPtr ret;
4992
Daniel Veillarda76fe5c2003-04-24 16:06:47 +00004993 if ((am == NULL) || (am->error != 0)) return(NULL);
Daniel Veillard4255d502002-04-16 15:50:10 +00004994 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00004995 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00004996 ret = xmlRegEpxFromParse(am);
4997
4998 return(ret);
4999}
Daniel Veillarde19fc232002-04-22 16:01:24 +00005000
5001/**
5002 * xmlAutomataIsDeterminist:
5003 * @am: an automata
5004 *
5005 * Checks if an automata is determinist.
5006 *
5007 * Returns 1 if true, 0 if not, and -1 in case of error
5008 */
5009int
5010xmlAutomataIsDeterminist(xmlAutomataPtr am) {
5011 int ret;
5012
5013 if (am == NULL)
5014 return(-1);
5015
5016 ret = xmlFAComputesDeterminism(am);
5017 return(ret);
5018}
Daniel Veillard4255d502002-04-16 15:50:10 +00005019#endif /* LIBXML_AUTOMATA_ENABLED */
5020#endif /* LIBXML_REGEXP_ENABLED */