blob: c4cf6812d3c5d2d1947246a96cb4bb53d5297014 [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
6 * XML related specifications thise includes:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22#include <stdio.h>
23#include <string.h>
24#include <libxml/tree.h>
25#include <libxml/parserInternals.h>
26#include <libxml/xmlregexp.h>
27#include <libxml/xmlautomata.h>
28#include <libxml/xmlunicode.h>
29
30/* #define DEBUG_REGEXP_GRAPH */
31/* #define DEBUG_REGEXP_EXEC */
32/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000033/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000034
35#define ERROR(str) ctxt->error = 1; \
36 xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur)
37#define NEXT ctxt->cur++
38#define CUR (*(ctxt->cur))
39#define NXT(index) (ctxt->cur[index])
40
41#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
42#define NEXTL(l) ctxt->cur += l;
43
Daniel Veillarde19fc232002-04-22 16:01:24 +000044/**
45 * TODO:
46 *
47 * macro to flag unimplemented blocks
48 */
49#define TODO \
50 xmlGenericError(xmlGenericErrorContext, \
51 "Unimplemented block at %s:%d\n", \
52 __FILE__, __LINE__);
53
Daniel Veillard4255d502002-04-16 15:50:10 +000054
55/************************************************************************
56 * *
57 * Datatypes and structures *
58 * *
59 ************************************************************************/
60
61typedef enum {
62 XML_REGEXP_EPSILON = 1,
63 XML_REGEXP_CHARVAL,
64 XML_REGEXP_RANGES,
65 XML_REGEXP_SUBREG,
66 XML_REGEXP_STRING,
67 XML_REGEXP_ANYCHAR, /* . */
68 XML_REGEXP_ANYSPACE, /* \s */
69 XML_REGEXP_NOTSPACE, /* \S */
70 XML_REGEXP_INITNAME, /* \l */
71 XML_REGEXP_NOTINITNAME, /* \l */
72 XML_REGEXP_NAMECHAR, /* \c */
73 XML_REGEXP_NOTNAMECHAR, /* \C */
74 XML_REGEXP_DECIMAL, /* \d */
75 XML_REGEXP_NOTDECIMAL, /* \d */
76 XML_REGEXP_REALCHAR, /* \w */
77 XML_REGEXP_NOTREALCHAR, /* \w */
78 XML_REGEXP_LETTER,
79 XML_REGEXP_LETTER_UPPERCASE,
80 XML_REGEXP_LETTER_LOWERCASE,
81 XML_REGEXP_LETTER_TITLECASE,
82 XML_REGEXP_LETTER_MODIFIER,
83 XML_REGEXP_LETTER_OTHERS,
84 XML_REGEXP_MARK,
85 XML_REGEXP_MARK_NONSPACING,
86 XML_REGEXP_MARK_SPACECOMBINING,
87 XML_REGEXP_MARK_ENCLOSING,
88 XML_REGEXP_NUMBER,
89 XML_REGEXP_NUMBER_DECIMAL,
90 XML_REGEXP_NUMBER_LETTER,
91 XML_REGEXP_NUMBER_OTHERS,
92 XML_REGEXP_PUNCT,
93 XML_REGEXP_PUNCT_CONNECTOR,
94 XML_REGEXP_PUNCT_DASH,
95 XML_REGEXP_PUNCT_OPEN,
96 XML_REGEXP_PUNCT_CLOSE,
97 XML_REGEXP_PUNCT_INITQUOTE,
98 XML_REGEXP_PUNCT_FINQUOTE,
99 XML_REGEXP_PUNCT_OTHERS,
100 XML_REGEXP_SEPAR,
101 XML_REGEXP_SEPAR_SPACE,
102 XML_REGEXP_SEPAR_LINE,
103 XML_REGEXP_SEPAR_PARA,
104 XML_REGEXP_SYMBOL,
105 XML_REGEXP_SYMBOL_MATH,
106 XML_REGEXP_SYMBOL_CURRENCY,
107 XML_REGEXP_SYMBOL_MODIFIER,
108 XML_REGEXP_SYMBOL_OTHERS,
109 XML_REGEXP_OTHER,
110 XML_REGEXP_OTHER_CONTROL,
111 XML_REGEXP_OTHER_FORMAT,
112 XML_REGEXP_OTHER_PRIVATE,
113 XML_REGEXP_OTHER_NA,
114 XML_REGEXP_BLOCK_NAME
115} xmlRegAtomType;
116
117typedef enum {
118 XML_REGEXP_QUANT_EPSILON = 1,
119 XML_REGEXP_QUANT_ONCE,
120 XML_REGEXP_QUANT_OPT,
121 XML_REGEXP_QUANT_MULT,
122 XML_REGEXP_QUANT_PLUS,
Daniel Veillard7646b182002-04-20 06:41:40 +0000123 XML_REGEXP_QUANT_ONCEONLY,
124 XML_REGEXP_QUANT_ALL,
Daniel Veillard4255d502002-04-16 15:50:10 +0000125 XML_REGEXP_QUANT_RANGE
126} xmlRegQuantType;
127
128typedef enum {
129 XML_REGEXP_START_STATE = 1,
130 XML_REGEXP_FINAL_STATE,
131 XML_REGEXP_TRANS_STATE
132} xmlRegStateType;
133
134typedef enum {
135 XML_REGEXP_MARK_NORMAL = 0,
136 XML_REGEXP_MARK_START,
137 XML_REGEXP_MARK_VISITED
138} xmlRegMarkedType;
139
140typedef struct _xmlRegRange xmlRegRange;
141typedef xmlRegRange *xmlRegRangePtr;
142
143struct _xmlRegRange {
144 int neg;
145 xmlRegAtomType type;
146 int start;
147 int end;
148 xmlChar *blockName;
149};
150
151typedef struct _xmlRegAtom xmlRegAtom;
152typedef xmlRegAtom *xmlRegAtomPtr;
153
154typedef struct _xmlAutomataState xmlRegState;
155typedef xmlRegState *xmlRegStatePtr;
156
157struct _xmlRegAtom {
158 int no;
159 xmlRegAtomType type;
160 xmlRegQuantType quant;
161 int min;
162 int max;
163
164 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000165 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000166 int neg;
167 int codepoint;
168 xmlRegStatePtr start;
169 xmlRegStatePtr stop;
170 int maxRanges;
171 int nbRanges;
172 xmlRegRangePtr *ranges;
173 void *data;
174};
175
176typedef struct _xmlRegCounter xmlRegCounter;
177typedef xmlRegCounter *xmlRegCounterPtr;
178
179struct _xmlRegCounter {
180 int min;
181 int max;
182};
183
184typedef struct _xmlRegTrans xmlRegTrans;
185typedef xmlRegTrans *xmlRegTransPtr;
186
187struct _xmlRegTrans {
188 xmlRegAtomPtr atom;
189 int to;
190 int counter;
191 int count;
192};
193
194struct _xmlAutomataState {
195 xmlRegStateType type;
196 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000197 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000198 int no;
199
200 int maxTrans;
201 int nbTrans;
202 xmlRegTrans *trans;
203};
204
205typedef struct _xmlAutomata xmlRegParserCtxt;
206typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
207
208struct _xmlAutomata {
209 xmlChar *string;
210 xmlChar *cur;
211
212 int error;
213 int neg;
214
215 xmlRegStatePtr start;
216 xmlRegStatePtr end;
217 xmlRegStatePtr state;
218
219 xmlRegAtomPtr atom;
220
221 int maxAtoms;
222 int nbAtoms;
223 xmlRegAtomPtr *atoms;
224
225 int maxStates;
226 int nbStates;
227 xmlRegStatePtr *states;
228
229 int maxCounters;
230 int nbCounters;
231 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000232
233 int determinist;
Daniel Veillard4255d502002-04-16 15:50:10 +0000234};
235
236struct _xmlRegexp {
237 xmlChar *string;
238 int nbStates;
239 xmlRegStatePtr *states;
240 int nbAtoms;
241 xmlRegAtomPtr *atoms;
242 int nbCounters;
243 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000244 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000245 /*
246 * That's the compact form for determinists automatas
247 */
248 int nbstates;
249 int *compact;
250 int nbstrings;
251 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000252};
253
254typedef struct _xmlRegExecRollback xmlRegExecRollback;
255typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
256
257struct _xmlRegExecRollback {
258 xmlRegStatePtr state;/* the current state */
259 int index; /* the index in the input stack */
260 int nextbranch; /* the next transition to explore in that state */
261 int *counts; /* save the automate state if it has some */
262};
263
264typedef struct _xmlRegInputToken xmlRegInputToken;
265typedef xmlRegInputToken *xmlRegInputTokenPtr;
266
267struct _xmlRegInputToken {
268 xmlChar *value;
269 void *data;
270};
271
272struct _xmlRegExecCtxt {
273 int status; /* execution status != 0 indicate an error */
274 int determinist; /* did we found an inderterministic behaviour */
275 xmlRegexpPtr comp; /* the compiled regexp */
276 xmlRegExecCallbacks callback;
277 void *data;
278
279 xmlRegStatePtr state;/* the current state */
280 int transno; /* the current transition on that state */
281 int transcount; /* the number of char in char counted transitions */
282
283 /*
284 * A stack of rollback states
285 */
286 int maxRollbacks;
287 int nbRollbacks;
288 xmlRegExecRollback *rollbacks;
289
290 /*
291 * The state of the automata if any
292 */
293 int *counts;
294
295 /*
296 * The input stack
297 */
298 int inputStackMax;
299 int inputStackNr;
300 int index;
301 int *charStack;
302 const xmlChar *inputString; /* when operating on characters */
303 xmlRegInputTokenPtr inputStack;/* when operating on strings */
304
305};
306
Daniel Veillard441bc322002-04-20 17:38:48 +0000307#define REGEXP_ALL_COUNTER 0x123456
308#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000309
Daniel Veillard4255d502002-04-16 15:50:10 +0000310static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000311static void xmlRegFreeState(xmlRegStatePtr state);
312static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard4255d502002-04-16 15:50:10 +0000313
314/************************************************************************
315 * *
316 * Allocation/Deallocation *
317 * *
318 ************************************************************************/
319
Daniel Veillard23e73572002-09-19 19:56:43 +0000320static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000321/**
322 * xmlRegEpxFromParse:
323 * @ctxt: the parser context used to build it
324 *
325 * Allocate a new regexp and fill it with the reult from the parser
326 *
327 * Returns the new regexp or NULL in case of error
328 */
329static xmlRegexpPtr
330xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
331 xmlRegexpPtr ret;
332
333 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
334 if (ret == NULL)
335 return(NULL);
336 memset(ret, 0, sizeof(xmlRegexp));
337 ret->string = ctxt->string;
338 ctxt->string = NULL;
339 ret->nbStates = ctxt->nbStates;
340 ctxt->nbStates = 0;
341 ret->states = ctxt->states;
342 ctxt->states = NULL;
343 ret->nbAtoms = ctxt->nbAtoms;
344 ctxt->nbAtoms = 0;
345 ret->atoms = ctxt->atoms;
346 ctxt->atoms = NULL;
347 ret->nbCounters = ctxt->nbCounters;
348 ctxt->nbCounters = 0;
349 ret->counters = ctxt->counters;
350 ctxt->counters = NULL;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000351 ret->determinist = ctxt->determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000352
353 if ((ret->determinist != 0) &&
354 (ret->nbCounters == 0) &&
355 (ret->atoms[0] != NULL) &&
356 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
357 int i, j, nbstates = 0, nbatoms = 0;
358 int *stateRemap;
359 int *stringRemap;
360 int *transitions;
361 xmlChar **stringMap;
362 xmlChar *value;
363
364 /*
365 * Switch to a compact representation
366 * 1/ counting the effective number of states left
367 * 2/ conting the unique number of atoms, and check that
368 * they are all of the string type
369 * 3/ build a table state x atom for the transitions
370 */
371
372 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
373 for (i = 0;i < ret->nbStates;i++) {
374 if (ret->states[i] != NULL) {
375 stateRemap[i] = nbstates;
376 nbstates++;
377 } else {
378 stateRemap[i] = -1;
379 }
380 }
381#ifdef DEBUG_COMPACTION
382 printf("Final: %d states\n", nbstates);
383#endif
384 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
385 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
386 for (i = 0;i < ret->nbAtoms;i++) {
387 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
388 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
389 value = ret->atoms[i]->valuep;
390 for (j = 0;j < nbatoms;j++) {
391 if (xmlStrEqual(stringMap[j], value)) {
392 stringRemap[i] = j;
393 break;
394 }
395 }
396 if (j >= nbatoms) {
397 stringRemap[i] = nbatoms;
398 stringMap[nbatoms] = xmlStrdup(value);
399 nbatoms++;
400 }
401 } else {
402 xmlFree(stateRemap);
403 xmlFree(stringRemap);
404 for (i = 0;i < nbatoms;i++)
405 xmlFree(stringMap[i]);
406 xmlFree(stringMap);
407 goto fail_compact;
408 }
409 }
410#ifdef DEBUG_COMPACTION
411 printf("Final: %d atoms\n", nbatoms);
412#endif
413
414 /*
415 * Allocate the transition table. The first entry for each
416 * state correspond to the state type.
417 */
418 transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int));
419 memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int));
420
421 for (i = 0;i < ret->nbStates;i++) {
422 int stateno, atomno, targetno, prev;
423 xmlRegStatePtr state;
424 xmlRegTransPtr trans;
425
426 stateno = stateRemap[i];
427 if (stateno == -1)
428 continue;
429 state = ret->states[i];
430
431 transitions[stateno * (nbatoms + 1)] = state->type;
432
433 for (j = 0;j < state->nbTrans;j++) {
434 trans = &(state->trans[j]);
435 if ((trans->to == -1) || (trans->atom == NULL))
436 continue;
437 atomno = stringRemap[trans->atom->no];
438 targetno = stateRemap[trans->to];
439 /*
440 * if the same atome can generate transition to 2 different
441 * states then it means the automata is not determinist and
442 * the compact form can't be used !
443 */
444 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
445 if (prev != 0) {
446 if (prev != targetno + 1) {
447 printf("not determinist\n");
448 ret->determinist = 0;
449#ifdef DEBUG_COMPACTION
450 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
451 i, j, trans->atom->no, trans->to, atomno, targetno);
452 printf(" previous to is %d\n", prev);
453#endif
454 ret->determinist = 0;
455 xmlFree(transitions);
456 xmlFree(stateRemap);
457 xmlFree(stringRemap);
458 for (i = 0;i < nbatoms;i++)
459 xmlFree(stringMap[i]);
460 xmlFree(stringMap);
461 goto fail_compact;
462 }
463 } else {
464#if 0
465 printf("State %d trans %d: atom %d to %d : %d to %d\n",
466 i, j, trans->atom->no, trans->to, atomno, targetno);
467#endif
468 transitions[stateno * (nbatoms + 1) + atomno + 1] =
469 targetno + 1;; /* to avoid 0 */
470 }
471 }
472 }
473 ret->determinist = 1;
474#ifdef DEBUG_COMPACTION
475 /*
476 * Debug
477 */
478 for (i = 0;i < nbstates;i++) {
479 for (j = 0;j < nbatoms + 1;j++) {
480 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
481 }
482 printf("\n");
483 }
484 printf("\n");
485#endif
486 /*
487 * Cleanup of the old data
488 */
489 if (ret->states != NULL) {
490 for (i = 0;i < ret->nbStates;i++)
491 xmlRegFreeState(ret->states[i]);
492 xmlFree(ret->states);
493 }
494 ret->states = NULL;
495 ret->nbStates = 0;
496 if (ret->atoms != NULL) {
497 for (i = 0;i < ret->nbAtoms;i++)
498 xmlRegFreeAtom(ret->atoms[i]);
499 xmlFree(ret->atoms);
500 }
501 ret->atoms = NULL;
502 ret->nbAtoms = 0;
503
504 ret->compact = transitions;
505 ret->stringMap = stringMap;
506 ret->nbstrings = nbatoms;
507 ret->nbstates = nbstates;
508 xmlFree(stateRemap);
509 xmlFree(stringRemap);
510 }
511fail_compact:
Daniel Veillard4255d502002-04-16 15:50:10 +0000512 return(ret);
513}
514
515/**
516 * xmlRegNewParserCtxt:
517 * @string: the string to parse
518 *
519 * Allocate a new regexp parser context
520 *
521 * Returns the new context or NULL in case of error
522 */
523static xmlRegParserCtxtPtr
524xmlRegNewParserCtxt(const xmlChar *string) {
525 xmlRegParserCtxtPtr ret;
526
527 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
528 if (ret == NULL)
529 return(NULL);
530 memset(ret, 0, sizeof(xmlRegParserCtxt));
531 if (string != NULL)
532 ret->string = xmlStrdup(string);
533 ret->cur = ret->string;
534 ret->neg = 0;
535 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000536 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000537 return(ret);
538}
539
540/**
541 * xmlRegNewRange:
542 * @ctxt: the regexp parser context
543 * @neg: is that negative
544 * @type: the type of range
545 * @start: the start codepoint
546 * @end: the end codepoint
547 *
548 * Allocate a new regexp range
549 *
550 * Returns the new range or NULL in case of error
551 */
552static xmlRegRangePtr
553xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
554 int neg, xmlRegAtomType type, int start, int end) {
555 xmlRegRangePtr ret;
556
557 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
558 if (ret == NULL) {
559 ERROR("failed to allocate regexp range");
560 return(NULL);
561 }
562 ret->neg = neg;
563 ret->type = type;
564 ret->start = start;
565 ret->end = end;
566 return(ret);
567}
568
569/**
570 * xmlRegFreeRange:
571 * @range: the regexp range
572 *
573 * Free a regexp range
574 */
575static void
576xmlRegFreeRange(xmlRegRangePtr range) {
577 if (range == NULL)
578 return;
579
580 if (range->blockName != NULL)
581 xmlFree(range->blockName);
582 xmlFree(range);
583}
584
585/**
586 * xmlRegNewAtom:
587 * @ctxt: the regexp parser context
588 * @type: the type of atom
589 *
590 * Allocate a new regexp range
591 *
592 * Returns the new atom or NULL in case of error
593 */
594static xmlRegAtomPtr
595xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
596 xmlRegAtomPtr ret;
597
598 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
599 if (ret == NULL) {
600 ERROR("failed to allocate regexp atom");
601 return(NULL);
602 }
603 memset(ret, 0, sizeof(xmlRegAtom));
604 ret->type = type;
605 ret->quant = XML_REGEXP_QUANT_ONCE;
606 ret->min = 0;
607 ret->max = 0;
608 return(ret);
609}
610
611/**
612 * xmlRegFreeAtom:
613 * @atom: the regexp atom
614 *
615 * Free a regexp atom
616 */
617static void
618xmlRegFreeAtom(xmlRegAtomPtr atom) {
619 int i;
620
621 if (atom == NULL)
622 return;
623
624 for (i = 0;i < atom->nbRanges;i++)
625 xmlRegFreeRange(atom->ranges[i]);
626 if (atom->ranges != NULL)
627 xmlFree(atom->ranges);
628 if (atom->type == XML_REGEXP_STRING)
629 xmlFree(atom->valuep);
630 xmlFree(atom);
631}
632
633static xmlRegStatePtr
634xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
635 xmlRegStatePtr ret;
636
637 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
638 if (ret == NULL) {
639 ERROR("failed to allocate regexp state");
640 return(NULL);
641 }
642 memset(ret, 0, sizeof(xmlRegState));
643 ret->type = XML_REGEXP_TRANS_STATE;
644 ret->mark = XML_REGEXP_MARK_NORMAL;
645 return(ret);
646}
647
648/**
649 * xmlRegFreeState:
650 * @state: the regexp state
651 *
652 * Free a regexp state
653 */
654static void
655xmlRegFreeState(xmlRegStatePtr state) {
656 if (state == NULL)
657 return;
658
659 if (state->trans != NULL)
660 xmlFree(state->trans);
661 xmlFree(state);
662}
663
664/**
665 * xmlRegFreeParserCtxt:
666 * @ctxt: the regexp parser context
667 *
668 * Free a regexp parser context
669 */
670static void
671xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
672 int i;
673 if (ctxt == NULL)
674 return;
675
676 if (ctxt->string != NULL)
677 xmlFree(ctxt->string);
678 if (ctxt->states != NULL) {
679 for (i = 0;i < ctxt->nbStates;i++)
680 xmlRegFreeState(ctxt->states[i]);
681 xmlFree(ctxt->states);
682 }
683 if (ctxt->atoms != NULL) {
684 for (i = 0;i < ctxt->nbAtoms;i++)
685 xmlRegFreeAtom(ctxt->atoms[i]);
686 xmlFree(ctxt->atoms);
687 }
688 if (ctxt->counters != NULL)
689 xmlFree(ctxt->counters);
690 xmlFree(ctxt);
691}
692
693/************************************************************************
694 * *
695 * Display of Data structures *
696 * *
697 ************************************************************************/
698
699static void
700xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
701 switch (type) {
702 case XML_REGEXP_EPSILON:
703 fprintf(output, "epsilon "); break;
704 case XML_REGEXP_CHARVAL:
705 fprintf(output, "charval "); break;
706 case XML_REGEXP_RANGES:
707 fprintf(output, "ranges "); break;
708 case XML_REGEXP_SUBREG:
709 fprintf(output, "subexpr "); break;
710 case XML_REGEXP_STRING:
711 fprintf(output, "string "); break;
712 case XML_REGEXP_ANYCHAR:
713 fprintf(output, "anychar "); break;
714 case XML_REGEXP_ANYSPACE:
715 fprintf(output, "anyspace "); break;
716 case XML_REGEXP_NOTSPACE:
717 fprintf(output, "notspace "); break;
718 case XML_REGEXP_INITNAME:
719 fprintf(output, "initname "); break;
720 case XML_REGEXP_NOTINITNAME:
721 fprintf(output, "notinitname "); break;
722 case XML_REGEXP_NAMECHAR:
723 fprintf(output, "namechar "); break;
724 case XML_REGEXP_NOTNAMECHAR:
725 fprintf(output, "notnamechar "); break;
726 case XML_REGEXP_DECIMAL:
727 fprintf(output, "decimal "); break;
728 case XML_REGEXP_NOTDECIMAL:
729 fprintf(output, "notdecimal "); break;
730 case XML_REGEXP_REALCHAR:
731 fprintf(output, "realchar "); break;
732 case XML_REGEXP_NOTREALCHAR:
733 fprintf(output, "notrealchar "); break;
734 case XML_REGEXP_LETTER:
735 fprintf(output, "LETTER "); break;
736 case XML_REGEXP_LETTER_UPPERCASE:
737 fprintf(output, "LETTER_UPPERCASE "); break;
738 case XML_REGEXP_LETTER_LOWERCASE:
739 fprintf(output, "LETTER_LOWERCASE "); break;
740 case XML_REGEXP_LETTER_TITLECASE:
741 fprintf(output, "LETTER_TITLECASE "); break;
742 case XML_REGEXP_LETTER_MODIFIER:
743 fprintf(output, "LETTER_MODIFIER "); break;
744 case XML_REGEXP_LETTER_OTHERS:
745 fprintf(output, "LETTER_OTHERS "); break;
746 case XML_REGEXP_MARK:
747 fprintf(output, "MARK "); break;
748 case XML_REGEXP_MARK_NONSPACING:
749 fprintf(output, "MARK_NONSPACING "); break;
750 case XML_REGEXP_MARK_SPACECOMBINING:
751 fprintf(output, "MARK_SPACECOMBINING "); break;
752 case XML_REGEXP_MARK_ENCLOSING:
753 fprintf(output, "MARK_ENCLOSING "); break;
754 case XML_REGEXP_NUMBER:
755 fprintf(output, "NUMBER "); break;
756 case XML_REGEXP_NUMBER_DECIMAL:
757 fprintf(output, "NUMBER_DECIMAL "); break;
758 case XML_REGEXP_NUMBER_LETTER:
759 fprintf(output, "NUMBER_LETTER "); break;
760 case XML_REGEXP_NUMBER_OTHERS:
761 fprintf(output, "NUMBER_OTHERS "); break;
762 case XML_REGEXP_PUNCT:
763 fprintf(output, "PUNCT "); break;
764 case XML_REGEXP_PUNCT_CONNECTOR:
765 fprintf(output, "PUNCT_CONNECTOR "); break;
766 case XML_REGEXP_PUNCT_DASH:
767 fprintf(output, "PUNCT_DASH "); break;
768 case XML_REGEXP_PUNCT_OPEN:
769 fprintf(output, "PUNCT_OPEN "); break;
770 case XML_REGEXP_PUNCT_CLOSE:
771 fprintf(output, "PUNCT_CLOSE "); break;
772 case XML_REGEXP_PUNCT_INITQUOTE:
773 fprintf(output, "PUNCT_INITQUOTE "); break;
774 case XML_REGEXP_PUNCT_FINQUOTE:
775 fprintf(output, "PUNCT_FINQUOTE "); break;
776 case XML_REGEXP_PUNCT_OTHERS:
777 fprintf(output, "PUNCT_OTHERS "); break;
778 case XML_REGEXP_SEPAR:
779 fprintf(output, "SEPAR "); break;
780 case XML_REGEXP_SEPAR_SPACE:
781 fprintf(output, "SEPAR_SPACE "); break;
782 case XML_REGEXP_SEPAR_LINE:
783 fprintf(output, "SEPAR_LINE "); break;
784 case XML_REGEXP_SEPAR_PARA:
785 fprintf(output, "SEPAR_PARA "); break;
786 case XML_REGEXP_SYMBOL:
787 fprintf(output, "SYMBOL "); break;
788 case XML_REGEXP_SYMBOL_MATH:
789 fprintf(output, "SYMBOL_MATH "); break;
790 case XML_REGEXP_SYMBOL_CURRENCY:
791 fprintf(output, "SYMBOL_CURRENCY "); break;
792 case XML_REGEXP_SYMBOL_MODIFIER:
793 fprintf(output, "SYMBOL_MODIFIER "); break;
794 case XML_REGEXP_SYMBOL_OTHERS:
795 fprintf(output, "SYMBOL_OTHERS "); break;
796 case XML_REGEXP_OTHER:
797 fprintf(output, "OTHER "); break;
798 case XML_REGEXP_OTHER_CONTROL:
799 fprintf(output, "OTHER_CONTROL "); break;
800 case XML_REGEXP_OTHER_FORMAT:
801 fprintf(output, "OTHER_FORMAT "); break;
802 case XML_REGEXP_OTHER_PRIVATE:
803 fprintf(output, "OTHER_PRIVATE "); break;
804 case XML_REGEXP_OTHER_NA:
805 fprintf(output, "OTHER_NA "); break;
806 case XML_REGEXP_BLOCK_NAME:
807 fprintf(output, "BLOCK "); break;
808 }
809}
810
811static void
812xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
813 switch (type) {
814 case XML_REGEXP_QUANT_EPSILON:
815 fprintf(output, "epsilon "); break;
816 case XML_REGEXP_QUANT_ONCE:
817 fprintf(output, "once "); break;
818 case XML_REGEXP_QUANT_OPT:
819 fprintf(output, "? "); break;
820 case XML_REGEXP_QUANT_MULT:
821 fprintf(output, "* "); break;
822 case XML_REGEXP_QUANT_PLUS:
823 fprintf(output, "+ "); break;
824 case XML_REGEXP_QUANT_RANGE:
825 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000826 case XML_REGEXP_QUANT_ONCEONLY:
827 fprintf(output, "onceonly "); break;
828 case XML_REGEXP_QUANT_ALL:
829 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000830 }
831}
832static void
833xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
834 fprintf(output, " range: ");
835 if (range->neg)
836 fprintf(output, "negative ");
837 xmlRegPrintAtomType(output, range->type);
838 fprintf(output, "%c - %c\n", range->start, range->end);
839}
840
841static void
842xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
843 fprintf(output, " atom: ");
844 if (atom == NULL) {
845 fprintf(output, "NULL\n");
846 return;
847 }
848 xmlRegPrintAtomType(output, atom->type);
849 xmlRegPrintQuantType(output, atom->quant);
850 if (atom->quant == XML_REGEXP_QUANT_RANGE)
851 fprintf(output, "%d-%d ", atom->min, atom->max);
852 if (atom->type == XML_REGEXP_STRING)
853 fprintf(output, "'%s' ", (char *) atom->valuep);
854 if (atom->type == XML_REGEXP_CHARVAL)
855 fprintf(output, "char %c\n", atom->codepoint);
856 else if (atom->type == XML_REGEXP_RANGES) {
857 int i;
858 fprintf(output, "%d entries\n", atom->nbRanges);
859 for (i = 0; i < atom->nbRanges;i++)
860 xmlRegPrintRange(output, atom->ranges[i]);
861 } else if (atom->type == XML_REGEXP_SUBREG) {
862 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
863 } else {
864 fprintf(output, "\n");
865 }
866}
867
868static void
869xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
870 fprintf(output, " trans: ");
871 if (trans == NULL) {
872 fprintf(output, "NULL\n");
873 return;
874 }
875 if (trans->to < 0) {
876 fprintf(output, "removed\n");
877 return;
878 }
879 if (trans->counter >= 0) {
880 fprintf(output, "counted %d, ", trans->counter);
881 }
Daniel Veillard8a001f62002-04-20 07:24:11 +0000882 if (trans->count == REGEXP_ALL_COUNTER) {
883 fprintf(output, "all transition, ");
884 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +0000885 fprintf(output, "count based %d, ", trans->count);
886 }
887 if (trans->atom == NULL) {
888 fprintf(output, "epsilon to %d\n", trans->to);
889 return;
890 }
891 if (trans->atom->type == XML_REGEXP_CHARVAL)
892 fprintf(output, "char %c ", trans->atom->codepoint);
893 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
894}
895
896static void
897xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
898 int i;
899
900 fprintf(output, " state: ");
901 if (state == NULL) {
902 fprintf(output, "NULL\n");
903 return;
904 }
905 if (state->type == XML_REGEXP_START_STATE)
906 fprintf(output, "START ");
907 if (state->type == XML_REGEXP_FINAL_STATE)
908 fprintf(output, "FINAL ");
909
910 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
911 for (i = 0;i < state->nbTrans; i++) {
912 xmlRegPrintTrans(output, &(state->trans[i]));
913 }
914}
915
Daniel Veillard23e73572002-09-19 19:56:43 +0000916#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard4255d502002-04-16 15:50:10 +0000917static void
918xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
919 int i;
920
921 fprintf(output, " ctxt: ");
922 if (ctxt == NULL) {
923 fprintf(output, "NULL\n");
924 return;
925 }
926 fprintf(output, "'%s' ", ctxt->string);
927 if (ctxt->error)
928 fprintf(output, "error ");
929 if (ctxt->neg)
930 fprintf(output, "neg ");
931 fprintf(output, "\n");
932 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
933 for (i = 0;i < ctxt->nbAtoms; i++) {
934 fprintf(output, " %02d ", i);
935 xmlRegPrintAtom(output, ctxt->atoms[i]);
936 }
937 if (ctxt->atom != NULL) {
938 fprintf(output, "current atom:\n");
939 xmlRegPrintAtom(output, ctxt->atom);
940 }
941 fprintf(output, "%d states:", ctxt->nbStates);
942 if (ctxt->start != NULL)
943 fprintf(output, " start: %d", ctxt->start->no);
944 if (ctxt->end != NULL)
945 fprintf(output, " end: %d", ctxt->end->no);
946 fprintf(output, "\n");
947 for (i = 0;i < ctxt->nbStates; i++) {
948 xmlRegPrintState(output, ctxt->states[i]);
949 }
950 fprintf(output, "%d counters:\n", ctxt->nbCounters);
951 for (i = 0;i < ctxt->nbCounters; i++) {
952 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
953 ctxt->counters[i].max);
954 }
955}
Daniel Veillard23e73572002-09-19 19:56:43 +0000956#endif
Daniel Veillard4255d502002-04-16 15:50:10 +0000957
958/************************************************************************
959 * *
960 * Finite Automata structures manipulations *
961 * *
962 ************************************************************************/
963
964static void
965xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
966 int neg, xmlRegAtomType type, int start, int end,
967 xmlChar *blockName) {
968 xmlRegRangePtr range;
969
970 if (atom == NULL) {
971 ERROR("add range: atom is NULL");
972 return;
973 }
974 if (atom->type != XML_REGEXP_RANGES) {
975 ERROR("add range: atom is not ranges");
976 return;
977 }
978 if (atom->maxRanges == 0) {
979 atom->maxRanges = 4;
980 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
981 sizeof(xmlRegRangePtr));
982 if (atom->ranges == NULL) {
983 ERROR("add range: allocation failed");
984 atom->maxRanges = 0;
985 return;
986 }
987 } else if (atom->nbRanges >= atom->maxRanges) {
988 xmlRegRangePtr *tmp;
989 atom->maxRanges *= 2;
990 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
991 sizeof(xmlRegRangePtr));
992 if (tmp == NULL) {
993 ERROR("add range: allocation failed");
994 atom->maxRanges /= 2;
995 return;
996 }
997 atom->ranges = tmp;
998 }
999 range = xmlRegNewRange(ctxt, neg, type, start, end);
1000 if (range == NULL)
1001 return;
1002 range->blockName = blockName;
1003 atom->ranges[atom->nbRanges++] = range;
1004
1005}
1006
1007static int
1008xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1009 if (ctxt->maxCounters == 0) {
1010 ctxt->maxCounters = 4;
1011 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1012 sizeof(xmlRegCounter));
1013 if (ctxt->counters == NULL) {
1014 ERROR("reg counter: allocation failed");
1015 ctxt->maxCounters = 0;
1016 return(-1);
1017 }
1018 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1019 xmlRegCounter *tmp;
1020 ctxt->maxCounters *= 2;
1021 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1022 sizeof(xmlRegCounter));
1023 if (tmp == NULL) {
1024 ERROR("reg counter: allocation failed");
1025 ctxt->maxCounters /= 2;
1026 return(-1);
1027 }
1028 ctxt->counters = tmp;
1029 }
1030 ctxt->counters[ctxt->nbCounters].min = -1;
1031 ctxt->counters[ctxt->nbCounters].max = -1;
1032 return(ctxt->nbCounters++);
1033}
1034
1035static void
1036xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1037 if (atom == NULL) {
1038 ERROR("atom push: atom is NULL");
1039 return;
1040 }
1041 if (ctxt->maxAtoms == 0) {
1042 ctxt->maxAtoms = 4;
1043 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1044 sizeof(xmlRegAtomPtr));
1045 if (ctxt->atoms == NULL) {
1046 ERROR("atom push: allocation failed");
1047 ctxt->maxAtoms = 0;
1048 return;
1049 }
1050 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1051 xmlRegAtomPtr *tmp;
1052 ctxt->maxAtoms *= 2;
1053 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1054 sizeof(xmlRegAtomPtr));
1055 if (tmp == NULL) {
1056 ERROR("atom push: allocation failed");
1057 ctxt->maxAtoms /= 2;
1058 return;
1059 }
1060 ctxt->atoms = tmp;
1061 }
1062 atom->no = ctxt->nbAtoms;
1063 ctxt->atoms[ctxt->nbAtoms++] = atom;
1064}
1065
1066static void
1067xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1068 xmlRegAtomPtr atom, xmlRegStatePtr target,
1069 int counter, int count) {
1070 if (state == NULL) {
1071 ERROR("add state: state is NULL");
1072 return;
1073 }
1074 if (target == NULL) {
1075 ERROR("add state: target is NULL");
1076 return;
1077 }
1078 if (state->maxTrans == 0) {
1079 state->maxTrans = 4;
1080 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1081 sizeof(xmlRegTrans));
1082 if (state->trans == NULL) {
1083 ERROR("add range: allocation failed");
1084 state->maxTrans = 0;
1085 return;
1086 }
1087 } else if (state->nbTrans >= state->maxTrans) {
1088 xmlRegTrans *tmp;
1089 state->maxTrans *= 2;
1090 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1091 sizeof(xmlRegTrans));
1092 if (tmp == NULL) {
1093 ERROR("add range: allocation failed");
1094 state->maxTrans /= 2;
1095 return;
1096 }
1097 state->trans = tmp;
1098 }
1099#ifdef DEBUG_REGEXP_GRAPH
1100 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001101 if (count == REGEXP_ALL_COUNTER)
1102 printf("all transition");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001103 else if (count >= 0)
Daniel Veillard4255d502002-04-16 15:50:10 +00001104 printf("count based %d", count);
1105 else if (counter >= 0)
1106 printf("counted %d", counter);
1107 else if (atom == NULL)
1108 printf("epsilon transition");
1109 printf("\n");
1110#endif
1111
1112 state->trans[state->nbTrans].atom = atom;
1113 state->trans[state->nbTrans].to = target->no;
1114 state->trans[state->nbTrans].counter = counter;
1115 state->trans[state->nbTrans].count = count;
1116 state->nbTrans++;
1117}
1118
1119static void
1120xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1121 if (ctxt->maxStates == 0) {
1122 ctxt->maxStates = 4;
1123 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1124 sizeof(xmlRegStatePtr));
1125 if (ctxt->states == NULL) {
1126 ERROR("add range: allocation failed");
1127 ctxt->maxStates = 0;
1128 return;
1129 }
1130 } else if (ctxt->nbStates >= ctxt->maxStates) {
1131 xmlRegStatePtr *tmp;
1132 ctxt->maxStates *= 2;
1133 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1134 sizeof(xmlRegStatePtr));
1135 if (tmp == NULL) {
1136 ERROR("add range: allocation failed");
1137 ctxt->maxStates /= 2;
1138 return;
1139 }
1140 ctxt->states = tmp;
1141 }
1142 state->no = ctxt->nbStates;
1143 ctxt->states[ctxt->nbStates++] = state;
1144}
1145
1146/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001147 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001148 * @ctxt: a regexp parser context
1149 * @from: the from state
1150 * @to: the target state or NULL for building a new one
1151 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001152 *
1153 */
1154static void
1155xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001156 xmlRegStatePtr from, xmlRegStatePtr to,
1157 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001158 if (to == NULL) {
1159 to = xmlRegNewState(ctxt);
1160 xmlRegStatePush(ctxt, to);
1161 ctxt->state = to;
1162 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001163 if (lax)
1164 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1165 else
1166 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001167}
1168
1169/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001170 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001171 * @ctxt: a regexp parser context
1172 * @from: the from state
1173 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001174 *
1175 */
1176static void
1177xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1178 xmlRegStatePtr from, xmlRegStatePtr to) {
1179 if (to == NULL) {
1180 to = xmlRegNewState(ctxt);
1181 xmlRegStatePush(ctxt, to);
1182 ctxt->state = to;
1183 }
1184 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1185}
1186
1187/**
1188 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001189 * @ctxt: a regexp parser context
1190 * @from: the from state
1191 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001192 * counter: the counter for that transition
1193 *
1194 */
1195static void
1196xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1197 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1198 if (to == NULL) {
1199 to = xmlRegNewState(ctxt);
1200 xmlRegStatePush(ctxt, to);
1201 ctxt->state = to;
1202 }
1203 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1204}
1205
1206/**
1207 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001208 * @ctxt: a regexp parser context
1209 * @from: the from state
1210 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001211 * counter: the counter for that transition
1212 *
1213 */
1214static void
1215xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1216 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1217 if (to == NULL) {
1218 to = xmlRegNewState(ctxt);
1219 xmlRegStatePush(ctxt, to);
1220 ctxt->state = to;
1221 }
1222 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1223}
1224
1225/**
1226 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001227 * @ctxt: a regexp parser context
1228 * @from: the from state
1229 * @to: the target state or NULL for building a new one
1230 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001231 *
1232 */
1233static void
1234xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1235 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1236 if (atom == NULL) {
1237 ERROR("genrate transition: atom == NULL");
1238 return;
1239 }
1240 if (atom->type == XML_REGEXP_SUBREG) {
1241 /*
1242 * this is a subexpression handling one should not need to
1243 * create a new node excep for XML_REGEXP_QUANT_RANGE.
1244 */
1245 xmlRegAtomPush(ctxt, atom);
1246 if ((to != NULL) && (atom->stop != to) &&
1247 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1248 /*
1249 * Generate an epsilon transition to link to the target
1250 */
1251 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1252 }
1253 switch (atom->quant) {
1254 case XML_REGEXP_QUANT_OPT:
1255 atom->quant = XML_REGEXP_QUANT_ONCE;
1256 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1257 break;
1258 case XML_REGEXP_QUANT_MULT:
1259 atom->quant = XML_REGEXP_QUANT_ONCE;
1260 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1261 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1262 break;
1263 case XML_REGEXP_QUANT_PLUS:
1264 atom->quant = XML_REGEXP_QUANT_ONCE;
1265 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1266 break;
1267 case XML_REGEXP_QUANT_RANGE: {
1268 int counter;
1269 xmlRegStatePtr newstate;
1270
1271 /*
1272 * This one is nasty:
1273 * 1/ register a new counter
1274 * 2/ register an epsilon transition associated to
1275 * this counter going from atom->stop to atom->start
1276 * 3/ create a new state
1277 * 4/ generate a counted transition from atom->stop to
1278 * that state
1279 */
1280 counter = xmlRegGetCounter(ctxt);
1281 ctxt->counters[counter].min = atom->min - 1;
1282 ctxt->counters[counter].max = atom->max - 1;
1283 atom->min = 0;
1284 atom->max = 0;
1285 atom->quant = XML_REGEXP_QUANT_ONCE;
1286 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1287 atom->start, counter);
1288 if (to != NULL) {
1289 newstate = to;
1290 } else {
1291 newstate = xmlRegNewState(ctxt);
1292 xmlRegStatePush(ctxt, newstate);
1293 ctxt->state = newstate;
1294 }
1295 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1296 newstate, counter);
1297 }
1298 default:
1299 break;
1300 }
1301 return;
1302 } else {
1303 if (to == NULL) {
1304 to = xmlRegNewState(ctxt);
1305 xmlRegStatePush(ctxt, to);
1306 }
1307 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1308 xmlRegAtomPush(ctxt, atom);
1309 ctxt->state = to;
1310 }
1311 switch (atom->quant) {
1312 case XML_REGEXP_QUANT_OPT:
1313 atom->quant = XML_REGEXP_QUANT_ONCE;
1314 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1315 break;
1316 case XML_REGEXP_QUANT_MULT:
1317 atom->quant = XML_REGEXP_QUANT_ONCE;
1318 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1319 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1320 break;
1321 case XML_REGEXP_QUANT_PLUS:
1322 atom->quant = XML_REGEXP_QUANT_ONCE;
1323 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1324 break;
1325 default:
1326 break;
1327 }
1328}
1329
1330/**
1331 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001332 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001333 * @fromnr: the from state
1334 * @tonr: the to state
1335 * @cpunter: should that transition be associted to a counted
1336 *
1337 */
1338static void
1339xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1340 int tonr, int counter) {
1341 int transnr;
1342 xmlRegStatePtr from;
1343 xmlRegStatePtr to;
1344
1345#ifdef DEBUG_REGEXP_GRAPH
1346 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1347#endif
1348 from = ctxt->states[fromnr];
1349 if (from == NULL)
1350 return;
1351 to = ctxt->states[tonr];
1352 if (to == NULL)
1353 return;
1354 if ((to->mark == XML_REGEXP_MARK_START) ||
1355 (to->mark == XML_REGEXP_MARK_VISITED))
1356 return;
1357
1358 to->mark = XML_REGEXP_MARK_VISITED;
1359 if (to->type == XML_REGEXP_FINAL_STATE) {
1360#ifdef DEBUG_REGEXP_GRAPH
1361 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1362#endif
1363 from->type = XML_REGEXP_FINAL_STATE;
1364 }
1365 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1366 if (to->trans[transnr].atom == NULL) {
1367 /*
1368 * Don't remove counted transitions
1369 * Don't loop either
1370 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001371 if (to->trans[transnr].to != fromnr) {
1372 if (to->trans[transnr].count >= 0) {
1373 int newto = to->trans[transnr].to;
1374
1375 xmlRegStateAddTrans(ctxt, from, NULL,
1376 ctxt->states[newto],
1377 -1, to->trans[transnr].count);
1378 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001379#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001380 printf("Found epsilon trans %d from %d to %d\n",
1381 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001382#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001383 if (to->trans[transnr].counter >= 0) {
1384 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1385 to->trans[transnr].to,
1386 to->trans[transnr].counter);
1387 } else {
1388 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1389 to->trans[transnr].to,
1390 counter);
1391 }
1392 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001393 }
1394 } else {
1395 int newto = to->trans[transnr].to;
1396
Daniel Veillardb509f152002-04-17 16:28:10 +00001397 if (to->trans[transnr].counter >= 0) {
1398 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1399 ctxt->states[newto],
1400 to->trans[transnr].counter, -1);
1401 } else {
1402 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1403 ctxt->states[newto], counter, -1);
1404 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001405 }
1406 }
1407 to->mark = XML_REGEXP_MARK_NORMAL;
1408}
1409
1410/**
1411 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001412 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001413 *
1414 */
1415static void
1416xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1417 int statenr, transnr;
1418 xmlRegStatePtr state;
1419
1420 /*
1421 * build the completed transitions bypassing the epsilons
1422 * Use a marking algorithm to avoid loops
1423 */
1424 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1425 state = ctxt->states[statenr];
1426 if (state == NULL)
1427 continue;
1428 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1429 if ((state->trans[transnr].atom == NULL) &&
1430 (state->trans[transnr].to >= 0)) {
1431 if (state->trans[transnr].to == statenr) {
1432 state->trans[transnr].to = -1;
1433#ifdef DEBUG_REGEXP_GRAPH
1434 printf("Removed loopback epsilon trans %d on %d\n",
1435 transnr, statenr);
1436#endif
1437 } else if (state->trans[transnr].count < 0) {
1438 int newto = state->trans[transnr].to;
1439
1440#ifdef DEBUG_REGEXP_GRAPH
1441 printf("Found epsilon trans %d from %d to %d\n",
1442 transnr, statenr, newto);
1443#endif
1444 state->mark = XML_REGEXP_MARK_START;
1445 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1446 newto, state->trans[transnr].counter);
1447 state->mark = XML_REGEXP_MARK_NORMAL;
1448#ifdef DEBUG_REGEXP_GRAPH
1449 } else {
1450 printf("Found counted transition %d on %d\n",
1451 transnr, statenr);
1452#endif
1453 }
1454 }
1455 }
1456 }
1457 /*
1458 * Eliminate the epsilon transitions
1459 */
1460 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1461 state = ctxt->states[statenr];
1462 if (state == NULL)
1463 continue;
1464 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1465 if ((state->trans[transnr].atom == NULL) &&
1466 (state->trans[transnr].count < 0) &&
1467 (state->trans[transnr].to >= 0)) {
1468 state->trans[transnr].to = -1;
1469 }
1470 }
1471 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001472
1473 /*
1474 * Use this pass to detect unreachable states too
1475 */
1476 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1477 state = ctxt->states[statenr];
1478 if (state != NULL)
1479 state->reached = 0;
1480 }
1481 state = ctxt->states[0];
1482 if (state != NULL)
1483 state->reached = 1;
1484 while (state != NULL) {
1485 xmlRegStatePtr target = NULL;
1486 state->reached = 2;
1487 /*
1488 * Mark all state reachable from the current reachable state
1489 */
1490 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1491 if ((state->trans[transnr].to >= 0) &&
1492 ((state->trans[transnr].atom != NULL) ||
1493 (state->trans[transnr].count >= 0))) {
1494 int newto = state->trans[transnr].to;
1495
1496 if (ctxt->states[newto] == NULL)
1497 continue;
1498 if (ctxt->states[newto]->reached == 0) {
1499 ctxt->states[newto]->reached = 1;
1500 target = ctxt->states[newto];
1501 }
1502 }
1503 }
1504 /*
1505 * find the next accessible state not explored
1506 */
1507 if (target == NULL) {
1508 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1509 state = ctxt->states[statenr];
1510 if ((state != NULL) && (state->reached == 1)) {
1511 target = state;
1512 break;
1513 }
1514 }
1515 }
1516 state = target;
1517 }
1518 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1519 state = ctxt->states[statenr];
1520 if ((state != NULL) && (state->reached == 0)) {
1521#ifdef DEBUG_REGEXP_GRAPH
1522 printf("Removed unreachable state %d\n", statenr);
1523#endif
1524 xmlRegFreeState(state);
1525 ctxt->states[statenr] = NULL;
1526 }
1527 }
1528
Daniel Veillard4255d502002-04-16 15:50:10 +00001529}
1530
Daniel Veillarde19fc232002-04-22 16:01:24 +00001531/**
1532 * xmlFACompareAtoms:
1533 * @atom1: an atom
1534 * @atom2: an atom
1535 *
1536 * Compares two atoms to check whether they are equivatents
1537 *
1538 * Returns 1 if yes and 0 otherwise
1539 */
1540static int
1541xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
1542 if (atom1 == atom2)
1543 return(1);
1544 if ((atom1 == NULL) || (atom2 == NULL))
1545 return(0);
1546
1547 if (atom1->type != atom2->type)
1548 return(0);
1549 switch (atom1->type) {
1550 case XML_REGEXP_STRING:
1551 return(xmlStrEqual((xmlChar *)atom1->valuep,
1552 (xmlChar *)atom2->valuep));
1553 case XML_REGEXP_EPSILON:
1554 return(1);
1555 case XML_REGEXP_CHARVAL:
1556 return(atom1->codepoint == atom2->codepoint);
1557 case XML_REGEXP_RANGES:
1558 TODO;
1559 return(0);
1560 default:
1561 break;
1562 }
1563 return(1);
1564}
1565
1566/**
1567 * xmlFARecurseDeterminism:
1568 * @ctxt: a regexp parser context
1569 *
1570 * Check whether the associated regexp is determinist,
1571 * should be called after xmlFAEliminateEpsilonTransitions()
1572 *
1573 */
1574static int
1575xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1576 int to, xmlRegAtomPtr atom) {
1577 int ret = 1;
1578 int transnr;
1579 xmlRegTransPtr t1;
1580
1581 if (state == NULL)
1582 return(ret);
1583 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1584 t1 = &(state->trans[transnr]);
1585 /*
1586 * check transitions conflicting with the one looked at
1587 */
1588 if (t1->atom == NULL) {
1589 if (t1->to == -1)
1590 continue;
1591 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1592 to, atom);
1593 if (ret == 0)
1594 return(0);
1595 continue;
1596 }
1597 if (t1->to != to)
1598 continue;
1599 if (xmlFACompareAtoms(t1->atom, atom))
1600 return(0);
1601 }
1602 return(ret);
1603}
1604
1605/**
1606 * xmlFAComputesDeterminism:
1607 * @ctxt: a regexp parser context
1608 *
1609 * Check whether the associated regexp is determinist,
1610 * should be called after xmlFAEliminateEpsilonTransitions()
1611 *
1612 */
1613static int
1614xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
1615 int statenr, transnr;
1616 xmlRegStatePtr state;
1617 xmlRegTransPtr t1, t2;
1618 int i;
1619 int ret = 1;
1620
Daniel Veillard4402ab42002-09-12 16:02:56 +00001621#ifdef DEBUG_REGEXP_GRAPH
1622 printf("xmlFAComputesDeterminism\n");
1623 xmlRegPrintCtxt(stdout, ctxt);
1624#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00001625 if (ctxt->determinist != -1)
1626 return(ctxt->determinist);
1627
1628 /*
1629 * Check for all states that there isn't 2 transitions
1630 * with the same atom and a different target.
1631 */
1632 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1633 state = ctxt->states[statenr];
1634 if (state == NULL)
1635 continue;
1636 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1637 t1 = &(state->trans[transnr]);
1638 /*
1639 * Determinism checks in case of counted or all transitions
1640 * will have to be handled separately
1641 */
1642 if (t1->atom == NULL)
1643 continue;
1644 if (t1->to == -1) /* eliminated */
1645 continue;
1646 for (i = 0;i < transnr;i++) {
1647 t2 = &(state->trans[i]);
1648 if (t2->to == -1) /* eliminated */
1649 continue;
1650 if (t2->atom != NULL) {
1651 if (t1->to == t2->to) {
1652 if (xmlFACompareAtoms(t1->atom, t2->atom))
1653 t2->to = -1; /* eliminate */
1654 } else {
1655 /* not determinist ! */
1656 if (xmlFACompareAtoms(t1->atom, t2->atom))
1657 ret = 0;
1658 }
1659 } else if (t1->to != -1) {
1660 /*
1661 * do the closure in case of remaining specific
1662 * epsilon transitions like choices or all
1663 */
1664 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1665 t2->to, t2->atom);
1666 if (ret == 0)
1667 return(0);
1668 }
1669 }
1670 if (ret == 0)
1671 break;
1672 }
1673 if (ret == 0)
1674 break;
1675 }
1676 ctxt->determinist = ret;
1677 return(ret);
1678}
1679
Daniel Veillard4255d502002-04-16 15:50:10 +00001680/************************************************************************
1681 * *
1682 * Routines to check input against transition atoms *
1683 * *
1684 ************************************************************************/
1685
1686static int
1687xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
1688 int start, int end, const xmlChar *blockName) {
1689 int ret = 0;
1690
1691 switch (type) {
1692 case XML_REGEXP_STRING:
1693 case XML_REGEXP_SUBREG:
1694 case XML_REGEXP_RANGES:
1695 case XML_REGEXP_EPSILON:
1696 return(-1);
1697 case XML_REGEXP_ANYCHAR:
1698 ret = ((codepoint != '\n') && (codepoint != '\r'));
1699 break;
1700 case XML_REGEXP_CHARVAL:
1701 ret = ((codepoint >= start) && (codepoint <= end));
1702 break;
1703 case XML_REGEXP_NOTSPACE:
1704 neg = !neg;
1705 case XML_REGEXP_ANYSPACE:
1706 ret = ((codepoint == '\n') || (codepoint == '\r') ||
1707 (codepoint == '\t') || (codepoint == ' '));
1708 break;
1709 case XML_REGEXP_NOTINITNAME:
1710 neg = !neg;
1711 case XML_REGEXP_INITNAME:
1712 ret = (xmlIsLetter(codepoint) ||
1713 (codepoint == '_') || (codepoint == ':'));
1714 break;
1715 case XML_REGEXP_NOTNAMECHAR:
1716 neg = !neg;
1717 case XML_REGEXP_NAMECHAR:
1718 ret = (xmlIsLetter(codepoint) || xmlIsDigit(codepoint) ||
1719 (codepoint == '.') || (codepoint == '-') ||
1720 (codepoint == '_') || (codepoint == ':') ||
1721 xmlIsCombining(codepoint) || xmlIsExtender(codepoint));
1722 break;
1723 case XML_REGEXP_NOTDECIMAL:
1724 neg = !neg;
1725 case XML_REGEXP_DECIMAL:
1726 ret = xmlUCSIsCatNd(codepoint);
1727 break;
1728 case XML_REGEXP_REALCHAR:
1729 neg = !neg;
1730 case XML_REGEXP_NOTREALCHAR:
1731 ret = xmlUCSIsCatP(codepoint);
1732 if (ret == 0)
1733 ret = xmlUCSIsCatZ(codepoint);
1734 if (ret == 0)
1735 ret = xmlUCSIsCatC(codepoint);
1736 break;
1737 case XML_REGEXP_LETTER:
1738 ret = xmlUCSIsCatL(codepoint);
1739 break;
1740 case XML_REGEXP_LETTER_UPPERCASE:
1741 ret = xmlUCSIsCatLu(codepoint);
1742 break;
1743 case XML_REGEXP_LETTER_LOWERCASE:
1744 ret = xmlUCSIsCatLl(codepoint);
1745 break;
1746 case XML_REGEXP_LETTER_TITLECASE:
1747 ret = xmlUCSIsCatLt(codepoint);
1748 break;
1749 case XML_REGEXP_LETTER_MODIFIER:
1750 ret = xmlUCSIsCatLm(codepoint);
1751 break;
1752 case XML_REGEXP_LETTER_OTHERS:
1753 ret = xmlUCSIsCatLo(codepoint);
1754 break;
1755 case XML_REGEXP_MARK:
1756 ret = xmlUCSIsCatM(codepoint);
1757 break;
1758 case XML_REGEXP_MARK_NONSPACING:
1759 ret = xmlUCSIsCatMn(codepoint);
1760 break;
1761 case XML_REGEXP_MARK_SPACECOMBINING:
1762 ret = xmlUCSIsCatMc(codepoint);
1763 break;
1764 case XML_REGEXP_MARK_ENCLOSING:
1765 ret = xmlUCSIsCatMe(codepoint);
1766 break;
1767 case XML_REGEXP_NUMBER:
1768 ret = xmlUCSIsCatN(codepoint);
1769 break;
1770 case XML_REGEXP_NUMBER_DECIMAL:
1771 ret = xmlUCSIsCatNd(codepoint);
1772 break;
1773 case XML_REGEXP_NUMBER_LETTER:
1774 ret = xmlUCSIsCatNl(codepoint);
1775 break;
1776 case XML_REGEXP_NUMBER_OTHERS:
1777 ret = xmlUCSIsCatNo(codepoint);
1778 break;
1779 case XML_REGEXP_PUNCT:
1780 ret = xmlUCSIsCatP(codepoint);
1781 break;
1782 case XML_REGEXP_PUNCT_CONNECTOR:
1783 ret = xmlUCSIsCatPc(codepoint);
1784 break;
1785 case XML_REGEXP_PUNCT_DASH:
1786 ret = xmlUCSIsCatPd(codepoint);
1787 break;
1788 case XML_REGEXP_PUNCT_OPEN:
1789 ret = xmlUCSIsCatPs(codepoint);
1790 break;
1791 case XML_REGEXP_PUNCT_CLOSE:
1792 ret = xmlUCSIsCatPe(codepoint);
1793 break;
1794 case XML_REGEXP_PUNCT_INITQUOTE:
1795 ret = xmlUCSIsCatPi(codepoint);
1796 break;
1797 case XML_REGEXP_PUNCT_FINQUOTE:
1798 ret = xmlUCSIsCatPf(codepoint);
1799 break;
1800 case XML_REGEXP_PUNCT_OTHERS:
1801 ret = xmlUCSIsCatPo(codepoint);
1802 break;
1803 case XML_REGEXP_SEPAR:
1804 ret = xmlUCSIsCatZ(codepoint);
1805 break;
1806 case XML_REGEXP_SEPAR_SPACE:
1807 ret = xmlUCSIsCatZs(codepoint);
1808 break;
1809 case XML_REGEXP_SEPAR_LINE:
1810 ret = xmlUCSIsCatZl(codepoint);
1811 break;
1812 case XML_REGEXP_SEPAR_PARA:
1813 ret = xmlUCSIsCatZp(codepoint);
1814 break;
1815 case XML_REGEXP_SYMBOL:
1816 ret = xmlUCSIsCatS(codepoint);
1817 break;
1818 case XML_REGEXP_SYMBOL_MATH:
1819 ret = xmlUCSIsCatSm(codepoint);
1820 break;
1821 case XML_REGEXP_SYMBOL_CURRENCY:
1822 ret = xmlUCSIsCatSc(codepoint);
1823 break;
1824 case XML_REGEXP_SYMBOL_MODIFIER:
1825 ret = xmlUCSIsCatSk(codepoint);
1826 break;
1827 case XML_REGEXP_SYMBOL_OTHERS:
1828 ret = xmlUCSIsCatSo(codepoint);
1829 break;
1830 case XML_REGEXP_OTHER:
1831 ret = xmlUCSIsCatC(codepoint);
1832 break;
1833 case XML_REGEXP_OTHER_CONTROL:
1834 ret = xmlUCSIsCatCc(codepoint);
1835 break;
1836 case XML_REGEXP_OTHER_FORMAT:
1837 ret = xmlUCSIsCatCf(codepoint);
1838 break;
1839 case XML_REGEXP_OTHER_PRIVATE:
1840 ret = xmlUCSIsCatCo(codepoint);
1841 break;
1842 case XML_REGEXP_OTHER_NA:
1843 /* ret = xmlUCSIsCatCn(codepoint); */
1844 /* Seems it doesn't exist anymore in recent Unicode releases */
1845 ret = 0;
1846 break;
1847 case XML_REGEXP_BLOCK_NAME:
1848 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
1849 break;
1850 }
1851 if (neg)
1852 return(!ret);
1853 return(ret);
1854}
1855
1856static int
1857xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
1858 int i, ret = 0;
1859 xmlRegRangePtr range;
1860
1861 if ((atom == NULL) || (!xmlIsChar(codepoint)))
1862 return(-1);
1863
1864 switch (atom->type) {
1865 case XML_REGEXP_SUBREG:
1866 case XML_REGEXP_EPSILON:
1867 return(-1);
1868 case XML_REGEXP_CHARVAL:
1869 return(codepoint == atom->codepoint);
1870 case XML_REGEXP_RANGES: {
1871 int accept = 0;
1872 for (i = 0;i < atom->nbRanges;i++) {
1873 range = atom->ranges[i];
1874 if (range->neg) {
1875 ret = xmlRegCheckCharacterRange(range->type, codepoint,
1876 0, range->start, range->end,
1877 range->blockName);
1878 if (ret != 0)
1879 return(0); /* excluded char */
1880 } else {
1881 ret = xmlRegCheckCharacterRange(range->type, codepoint,
1882 0, range->start, range->end,
1883 range->blockName);
1884 if (ret != 0)
1885 accept = 1; /* might still be excluded */
1886 }
1887 }
1888 return(accept);
1889 }
1890 case XML_REGEXP_STRING:
1891 printf("TODO: XML_REGEXP_STRING\n");
1892 return(-1);
1893 case XML_REGEXP_ANYCHAR:
1894 case XML_REGEXP_ANYSPACE:
1895 case XML_REGEXP_NOTSPACE:
1896 case XML_REGEXP_INITNAME:
1897 case XML_REGEXP_NOTINITNAME:
1898 case XML_REGEXP_NAMECHAR:
1899 case XML_REGEXP_NOTNAMECHAR:
1900 case XML_REGEXP_DECIMAL:
1901 case XML_REGEXP_NOTDECIMAL:
1902 case XML_REGEXP_REALCHAR:
1903 case XML_REGEXP_NOTREALCHAR:
1904 case XML_REGEXP_LETTER:
1905 case XML_REGEXP_LETTER_UPPERCASE:
1906 case XML_REGEXP_LETTER_LOWERCASE:
1907 case XML_REGEXP_LETTER_TITLECASE:
1908 case XML_REGEXP_LETTER_MODIFIER:
1909 case XML_REGEXP_LETTER_OTHERS:
1910 case XML_REGEXP_MARK:
1911 case XML_REGEXP_MARK_NONSPACING:
1912 case XML_REGEXP_MARK_SPACECOMBINING:
1913 case XML_REGEXP_MARK_ENCLOSING:
1914 case XML_REGEXP_NUMBER:
1915 case XML_REGEXP_NUMBER_DECIMAL:
1916 case XML_REGEXP_NUMBER_LETTER:
1917 case XML_REGEXP_NUMBER_OTHERS:
1918 case XML_REGEXP_PUNCT:
1919 case XML_REGEXP_PUNCT_CONNECTOR:
1920 case XML_REGEXP_PUNCT_DASH:
1921 case XML_REGEXP_PUNCT_OPEN:
1922 case XML_REGEXP_PUNCT_CLOSE:
1923 case XML_REGEXP_PUNCT_INITQUOTE:
1924 case XML_REGEXP_PUNCT_FINQUOTE:
1925 case XML_REGEXP_PUNCT_OTHERS:
1926 case XML_REGEXP_SEPAR:
1927 case XML_REGEXP_SEPAR_SPACE:
1928 case XML_REGEXP_SEPAR_LINE:
1929 case XML_REGEXP_SEPAR_PARA:
1930 case XML_REGEXP_SYMBOL:
1931 case XML_REGEXP_SYMBOL_MATH:
1932 case XML_REGEXP_SYMBOL_CURRENCY:
1933 case XML_REGEXP_SYMBOL_MODIFIER:
1934 case XML_REGEXP_SYMBOL_OTHERS:
1935 case XML_REGEXP_OTHER:
1936 case XML_REGEXP_OTHER_CONTROL:
1937 case XML_REGEXP_OTHER_FORMAT:
1938 case XML_REGEXP_OTHER_PRIVATE:
1939 case XML_REGEXP_OTHER_NA:
1940 case XML_REGEXP_BLOCK_NAME:
1941 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
1942 (const xmlChar *)atom->valuep);
1943 if (atom->neg)
1944 ret = !ret;
1945 break;
1946 }
1947 return(ret);
1948}
1949
1950/************************************************************************
1951 * *
1952 * Saving an restoring state of an execution context *
1953 * *
1954 ************************************************************************/
1955
1956#ifdef DEBUG_REGEXP_EXEC
1957static void
1958xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
1959 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
1960 if (exec->inputStack != NULL) {
1961 int i;
1962 printf(": ");
1963 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
1964 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
1965 } else {
1966 printf(": %s", &(exec->inputString[exec->index]));
1967 }
1968 printf("\n");
1969}
1970#endif
1971
1972static void
1973xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
1974#ifdef DEBUG_REGEXP_EXEC
1975 printf("saving ");
1976 exec->transno++;
1977 xmlFARegDebugExec(exec);
1978 exec->transno--;
1979#endif
1980
1981 if (exec->maxRollbacks == 0) {
1982 exec->maxRollbacks = 4;
1983 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
1984 sizeof(xmlRegExecRollback));
1985 if (exec->rollbacks == NULL) {
1986 fprintf(stderr, "exec save: allocation failed");
1987 exec->maxRollbacks = 0;
1988 return;
1989 }
1990 memset(exec->rollbacks, 0,
1991 exec->maxRollbacks * sizeof(xmlRegExecRollback));
1992 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
1993 xmlRegExecRollback *tmp;
1994 int len = exec->maxRollbacks;
1995
1996 exec->maxRollbacks *= 2;
1997 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
1998 exec->maxRollbacks * sizeof(xmlRegExecRollback));
1999 if (tmp == NULL) {
2000 fprintf(stderr, "exec save: allocation failed");
2001 exec->maxRollbacks /= 2;
2002 return;
2003 }
2004 exec->rollbacks = tmp;
2005 tmp = &exec->rollbacks[len];
2006 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2007 }
2008 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2009 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2010 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2011 if (exec->comp->nbCounters > 0) {
2012 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2013 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2014 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2015 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2016 fprintf(stderr, "exec save: allocation failed");
2017 exec->status = -5;
2018 return;
2019 }
2020 }
2021 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2022 exec->comp->nbCounters * sizeof(int));
2023 }
2024 exec->nbRollbacks++;
2025}
2026
2027static void
2028xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2029 if (exec->nbRollbacks <= 0) {
2030 exec->status = -1;
2031#ifdef DEBUG_REGEXP_EXEC
2032 printf("rollback failed on empty stack\n");
2033#endif
2034 return;
2035 }
2036 exec->nbRollbacks--;
2037 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2038 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2039 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2040 if (exec->comp->nbCounters > 0) {
2041 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2042 fprintf(stderr, "exec save: allocation failed");
2043 exec->status = -6;
2044 return;
2045 }
2046 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2047 exec->comp->nbCounters * sizeof(int));
2048 }
2049
2050#ifdef DEBUG_REGEXP_EXEC
2051 printf("restored ");
2052 xmlFARegDebugExec(exec);
2053#endif
2054}
2055
2056/************************************************************************
2057 * *
2058 * Verifyer, running an input against a compiled regexp *
2059 * *
2060 ************************************************************************/
2061
2062static int
2063xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2064 xmlRegExecCtxt execval;
2065 xmlRegExecCtxtPtr exec = &execval;
2066 int ret, codepoint, len;
2067
2068 exec->inputString = content;
2069 exec->index = 0;
2070 exec->determinist = 1;
2071 exec->maxRollbacks = 0;
2072 exec->nbRollbacks = 0;
2073 exec->rollbacks = NULL;
2074 exec->status = 0;
2075 exec->comp = comp;
2076 exec->state = comp->states[0];
2077 exec->transno = 0;
2078 exec->transcount = 0;
2079 if (comp->nbCounters > 0) {
2080 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2081 if (exec->counts == NULL)
2082 return(-1);
2083 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2084 } else
2085 exec->counts = NULL;
2086 while ((exec->status == 0) &&
2087 ((exec->inputString[exec->index] != 0) ||
2088 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2089 xmlRegTransPtr trans;
2090 xmlRegAtomPtr atom;
2091
2092 /*
2093 * End of input on non-terminal state, rollback, however we may
2094 * still have epsilon like transition for counted transitions
2095 * on counters, in that case don't break too early.
2096 */
2097 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2098 goto rollback;
2099
2100 exec->transcount = 0;
2101 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2102 trans = &exec->state->trans[exec->transno];
2103 if (trans->to < 0)
2104 continue;
2105 atom = trans->atom;
2106 ret = 0;
2107 if (trans->count >= 0) {
2108 int count;
2109 xmlRegCounterPtr counter;
2110
2111 /*
2112 * A counted transition.
2113 */
2114
2115 count = exec->counts[trans->count];
2116 counter = &exec->comp->counters[trans->count];
2117#ifdef DEBUG_REGEXP_EXEC
2118 printf("testing count %d: val %d, min %d, max %d\n",
2119 trans->count, count, counter->min, counter->max);
2120#endif
2121 ret = ((count >= counter->min) && (count <= counter->max));
2122 } else if (atom == NULL) {
2123 fprintf(stderr, "epsilon transition left at runtime\n");
2124 exec->status = -2;
2125 break;
2126 } else if (exec->inputString[exec->index] != 0) {
2127 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2128 ret = xmlRegCheckCharacter(atom, codepoint);
2129 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2130 xmlRegStatePtr to = comp->states[trans->to];
2131
2132 /*
2133 * this is a multiple input sequence
2134 */
2135 if (exec->state->nbTrans > exec->transno + 1) {
2136 xmlFARegExecSave(exec);
2137 }
2138 exec->transcount = 1;
2139 do {
2140 /*
2141 * Try to progress as much as possible on the input
2142 */
2143 if (exec->transcount == atom->max) {
2144 break;
2145 }
2146 exec->index += len;
2147 /*
2148 * End of input: stop here
2149 */
2150 if (exec->inputString[exec->index] == 0) {
2151 exec->index -= len;
2152 break;
2153 }
2154 if (exec->transcount >= atom->min) {
2155 int transno = exec->transno;
2156 xmlRegStatePtr state = exec->state;
2157
2158 /*
2159 * The transition is acceptable save it
2160 */
2161 exec->transno = -1; /* trick */
2162 exec->state = to;
2163 xmlFARegExecSave(exec);
2164 exec->transno = transno;
2165 exec->state = state;
2166 }
2167 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2168 len);
2169 ret = xmlRegCheckCharacter(atom, codepoint);
2170 exec->transcount++;
2171 } while (ret == 1);
2172 if (exec->transcount < atom->min)
2173 ret = 0;
2174
2175 /*
2176 * If the last check failed but one transition was found
2177 * possible, rollback
2178 */
2179 if (ret < 0)
2180 ret = 0;
2181 if (ret == 0) {
2182 goto rollback;
2183 }
2184 }
2185 }
2186 if (ret == 1) {
2187 if (exec->state->nbTrans > exec->transno + 1) {
2188 xmlFARegExecSave(exec);
2189 }
2190 if (trans->counter >= 0) {
2191#ifdef DEBUG_REGEXP_EXEC
2192 printf("Increasing count %d\n", trans->counter);
2193#endif
2194 exec->counts[trans->counter]++;
2195 }
2196#ifdef DEBUG_REGEXP_EXEC
2197 printf("entering state %d\n", trans->to);
2198#endif
2199 exec->state = comp->states[trans->to];
2200 exec->transno = 0;
2201 if (trans->atom != NULL) {
2202 exec->index += len;
2203 }
2204 goto progress;
2205 } else if (ret < 0) {
2206 exec->status = -4;
2207 break;
2208 }
2209 }
2210 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2211rollback:
2212 /*
2213 * Failed to find a way out
2214 */
2215 exec->determinist = 0;
2216 xmlFARegExecRollBack(exec);
2217 }
2218progress:
2219 continue;
2220 }
2221 if (exec->rollbacks != NULL) {
2222 if (exec->counts != NULL) {
2223 int i;
2224
2225 for (i = 0;i < exec->maxRollbacks;i++)
2226 if (exec->rollbacks[i].counts != NULL)
2227 xmlFree(exec->rollbacks[i].counts);
2228 }
2229 xmlFree(exec->rollbacks);
2230 }
2231 if (exec->counts != NULL)
2232 xmlFree(exec->counts);
2233 if (exec->status == 0)
2234 return(1);
2235 if (exec->status == -1)
2236 return(0);
2237 return(exec->status);
2238}
2239
2240/************************************************************************
2241 * *
2242 * Progressive interface to the verifyer one atom at a time *
2243 * *
2244 ************************************************************************/
2245
2246/**
2247 * xmlRegExecCtxtPtr:
2248 * @comp: a precompiled regular expression
2249 * @callback: a callback function used for handling progresses in the
2250 * automata matching phase
2251 * @data: the context data associated to the callback in this context
2252 *
2253 * Build a context used for progressive evaluation of a regexp.
2254 */
2255xmlRegExecCtxtPtr
2256xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
2257 xmlRegExecCtxtPtr exec;
2258
2259 if (comp == NULL)
2260 return(NULL);
2261 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
2262 if (exec == NULL) {
2263 return(NULL);
2264 }
2265 memset(exec, 0, sizeof(xmlRegExecCtxt));
2266 exec->inputString = NULL;
2267 exec->index = 0;
2268 exec->determinist = 1;
2269 exec->maxRollbacks = 0;
2270 exec->nbRollbacks = 0;
2271 exec->rollbacks = NULL;
2272 exec->status = 0;
2273 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00002274 if (comp->compact == NULL)
2275 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00002276 exec->transno = 0;
2277 exec->transcount = 0;
2278 exec->callback = callback;
2279 exec->data = data;
2280 if (comp->nbCounters > 0) {
2281 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2282 if (exec->counts == NULL) {
2283 xmlFree(exec);
2284 return(NULL);
2285 }
2286 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2287 } else
2288 exec->counts = NULL;
2289 exec->inputStackMax = 0;
2290 exec->inputStackNr = 0;
2291 exec->inputStack = NULL;
2292 return(exec);
2293}
2294
2295/**
2296 * xmlRegFreeExecCtxt:
2297 * @exec: a regular expression evaulation context
2298 *
2299 * Free the structures associated to a regular expression evaulation context.
2300 */
2301void
2302xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
2303 if (exec == NULL)
2304 return;
2305
2306 if (exec->rollbacks != NULL) {
2307 if (exec->counts != NULL) {
2308 int i;
2309
2310 for (i = 0;i < exec->maxRollbacks;i++)
2311 if (exec->rollbacks[i].counts != NULL)
2312 xmlFree(exec->rollbacks[i].counts);
2313 }
2314 xmlFree(exec->rollbacks);
2315 }
2316 if (exec->counts != NULL)
2317 xmlFree(exec->counts);
2318 if (exec->inputStack != NULL) {
2319 int i;
2320
2321 for (i = 0;i < exec->inputStackNr;i++)
2322 xmlFree(exec->inputStack[i].value);
2323 xmlFree(exec->inputStack);
2324 }
2325 xmlFree(exec);
2326}
2327
2328static void
2329xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2330 void *data) {
2331#ifdef DEBUG_PUSH
2332 printf("saving value: %d:%s\n", exec->inputStackNr, value);
2333#endif
2334 if (exec->inputStackMax == 0) {
2335 exec->inputStackMax = 4;
2336 exec->inputStack = (xmlRegInputTokenPtr)
2337 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
2338 if (exec->inputStack == NULL) {
2339 fprintf(stderr, "push input: allocation failed");
2340 exec->inputStackMax = 0;
2341 return;
2342 }
2343 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
2344 xmlRegInputTokenPtr tmp;
2345
2346 exec->inputStackMax *= 2;
2347 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
2348 exec->inputStackMax * sizeof(xmlRegInputToken));
2349 if (tmp == NULL) {
2350 fprintf(stderr, "push input: allocation failed");
2351 exec->inputStackMax /= 2;
2352 return;
2353 }
2354 exec->inputStack = tmp;
2355 }
2356 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
2357 exec->inputStack[exec->inputStackNr].data = data;
2358 exec->inputStackNr++;
2359 exec->inputStack[exec->inputStackNr].value = NULL;
2360 exec->inputStack[exec->inputStackNr].data = NULL;
2361}
2362
2363
2364/**
Daniel Veillard23e73572002-09-19 19:56:43 +00002365 * xmlRegCompactPushString:
2366 * @exec: a regexp execution context
2367 * @comp: the precompiled exec with a compact table
2368 * @value: a string token input
2369 * @data: data associated to the token to reuse in callbacks
2370 *
2371 * Push one input token in the execution context
2372 *
2373 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2374 * a negative value in case of error.
2375 */
2376static int
2377xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
2378 xmlRegexpPtr comp,
2379 const xmlChar *value,
2380 void *data) {
2381 int state = exec->index;
2382 int i, target;
2383
2384 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
2385 return(-1);
2386
2387 if (value == NULL) {
2388 /*
2389 * are we at a final state ?
2390 */
2391 if (comp->compact[state * (comp->nbstrings + 1)] ==
2392 XML_REGEXP_FINAL_STATE)
2393 return(1);
2394 return(0);
2395 }
2396
2397#ifdef DEBUG_PUSH
2398 printf("value pushed: %s\n", value);
2399#endif
2400
2401 /*
2402 * Examine all outside transition from current state
2403 */
2404 for (i = 0;i < comp->nbstrings;i++) {
2405 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
2406 if ((target > 0) && (target <= comp->nbstates)) {
2407 target--; /* to avoid 0 */
2408 if (xmlStrEqual(comp->stringMap[i], value)) {
2409 exec->index = target;
2410#ifdef DEBUG_PUSH
2411 printf("entering state %d\n", target);
2412#endif
2413 if (comp->compact[target * (comp->nbstrings + 1)] ==
2414 XML_REGEXP_FINAL_STATE)
2415 return(1);
2416 return(0);
2417 }
2418 }
2419 }
2420 /*
2421 * Failed to find an exit transition out from current state for the
2422 * current token
2423 */
2424#ifdef DEBUG_PUSH
2425 printf("failed to find a transition for %s on state %d\n", value, state);
2426#endif
2427 exec->status = -1;
2428 return(-1);
2429}
2430
2431/**
Daniel Veillard4255d502002-04-16 15:50:10 +00002432 * xmlRegExecPushString:
2433 * @exec: a regexp execution context
2434 * @value: a string token input
2435 * @data: data associated to the token to reuse in callbacks
2436 *
2437 * Push one input token in the execution context
2438 *
2439 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2440 * a negative value in case of error.
2441 */
2442int
2443xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2444 void *data) {
2445 xmlRegTransPtr trans;
2446 xmlRegAtomPtr atom;
2447 int ret;
2448 int final = 0;
2449
2450 if (exec == NULL)
2451 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00002452 if (exec->comp == NULL)
2453 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00002454 if (exec->status != 0)
2455 return(exec->status);
2456
Daniel Veillard23e73572002-09-19 19:56:43 +00002457 if (exec->comp->compact != NULL)
2458 return(xmlRegCompactPushString(exec, exec->comp, value, data));
2459
Daniel Veillard4255d502002-04-16 15:50:10 +00002460 if (value == NULL) {
2461 if (exec->state->type == XML_REGEXP_FINAL_STATE)
2462 return(1);
2463 final = 1;
2464 }
2465
2466#ifdef DEBUG_PUSH
2467 printf("value pushed: %s\n", value);
2468#endif
2469 /*
2470 * If we have an active rollback stack push the new value there
2471 * and get back to where we were left
2472 */
2473 if ((value != NULL) && (exec->inputStackNr > 0)) {
2474 xmlFARegExecSaveInputString(exec, value, data);
2475 value = exec->inputStack[exec->index].value;
2476 data = exec->inputStack[exec->index].data;
2477#ifdef DEBUG_PUSH
2478 printf("value loaded: %s\n", value);
2479#endif
2480 }
2481
2482 while ((exec->status == 0) &&
2483 ((value != NULL) ||
2484 ((final == 1) &&
2485 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
2486
2487 /*
2488 * End of input on non-terminal state, rollback, however we may
2489 * still have epsilon like transition for counted transitions
2490 * on counters, in that case don't break too early.
2491 */
Daniel Veillardb509f152002-04-17 16:28:10 +00002492 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00002493 goto rollback;
2494
2495 exec->transcount = 0;
2496 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2497 trans = &exec->state->trans[exec->transno];
2498 if (trans->to < 0)
2499 continue;
2500 atom = trans->atom;
2501 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00002502 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
2503 int i;
2504 int count;
2505 xmlRegTransPtr t;
2506 xmlRegCounterPtr counter;
2507
2508 ret = 0;
2509
2510#ifdef DEBUG_PUSH
2511 printf("testing all lax %d\n", trans->count);
2512#endif
2513 /*
2514 * Check all counted transitions from the current state
2515 */
2516 if ((value == NULL) && (final)) {
2517 ret = 1;
2518 } else if (value != NULL) {
2519 for (i = 0;i < exec->state->nbTrans;i++) {
2520 t = &exec->state->trans[i];
2521 if ((t->counter < 0) || (t == trans))
2522 continue;
2523 counter = &exec->comp->counters[t->counter];
2524 count = exec->counts[t->counter];
2525 if ((count < counter->max) &&
2526 (t->atom != NULL) &&
2527 (xmlStrEqual(value, t->atom->valuep))) {
2528 ret = 0;
2529 break;
2530 }
2531 if ((count >= counter->min) &&
2532 (count < counter->max) &&
2533 (xmlStrEqual(value, t->atom->valuep))) {
2534 ret = 1;
2535 break;
2536 }
2537 }
2538 }
2539 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00002540 int i;
2541 int count;
2542 xmlRegTransPtr t;
2543 xmlRegCounterPtr counter;
2544
2545 ret = 1;
2546
2547#ifdef DEBUG_PUSH
2548 printf("testing all %d\n", trans->count);
2549#endif
2550 /*
2551 * Check all counted transitions from the current state
2552 */
2553 for (i = 0;i < exec->state->nbTrans;i++) {
2554 t = &exec->state->trans[i];
2555 if ((t->counter < 0) || (t == trans))
2556 continue;
2557 counter = &exec->comp->counters[t->counter];
2558 count = exec->counts[t->counter];
2559 if ((count < counter->min) || (count > counter->max)) {
2560 ret = 0;
2561 break;
2562 }
2563 }
2564 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002565 int count;
2566 xmlRegCounterPtr counter;
2567
2568 /*
2569 * A counted transition.
2570 */
2571
2572 count = exec->counts[trans->count];
2573 counter = &exec->comp->counters[trans->count];
2574#ifdef DEBUG_PUSH
2575 printf("testing count %d: val %d, min %d, max %d\n",
2576 trans->count, count, counter->min, counter->max);
2577#endif
2578 ret = ((count >= counter->min) && (count <= counter->max));
2579 } else if (atom == NULL) {
2580 fprintf(stderr, "epsilon transition left at runtime\n");
2581 exec->status = -2;
2582 break;
2583 } else if (value != NULL) {
2584 ret = xmlStrEqual(value, atom->valuep);
Daniel Veillard441bc322002-04-20 17:38:48 +00002585 if ((ret == 1) && (trans->counter >= 0)) {
2586 xmlRegCounterPtr counter;
2587 int count;
2588
2589 count = exec->counts[trans->counter];
2590 counter = &exec->comp->counters[trans->counter];
2591 if (count >= counter->max)
2592 ret = 0;
2593 }
2594
Daniel Veillard4255d502002-04-16 15:50:10 +00002595 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2596 xmlRegStatePtr to = exec->comp->states[trans->to];
2597
2598 /*
2599 * this is a multiple input sequence
2600 */
2601 if (exec->state->nbTrans > exec->transno + 1) {
2602 if (exec->inputStackNr <= 0) {
2603 xmlFARegExecSaveInputString(exec, value, data);
2604 }
2605 xmlFARegExecSave(exec);
2606 }
2607 exec->transcount = 1;
2608 do {
2609 /*
2610 * Try to progress as much as possible on the input
2611 */
2612 if (exec->transcount == atom->max) {
2613 break;
2614 }
2615 exec->index++;
2616 value = exec->inputStack[exec->index].value;
2617 data = exec->inputStack[exec->index].data;
2618#ifdef DEBUG_PUSH
2619 printf("value loaded: %s\n", value);
2620#endif
2621
2622 /*
2623 * End of input: stop here
2624 */
2625 if (value == NULL) {
2626 exec->index --;
2627 break;
2628 }
2629 if (exec->transcount >= atom->min) {
2630 int transno = exec->transno;
2631 xmlRegStatePtr state = exec->state;
2632
2633 /*
2634 * The transition is acceptable save it
2635 */
2636 exec->transno = -1; /* trick */
2637 exec->state = to;
2638 if (exec->inputStackNr <= 0) {
2639 xmlFARegExecSaveInputString(exec, value, data);
2640 }
2641 xmlFARegExecSave(exec);
2642 exec->transno = transno;
2643 exec->state = state;
2644 }
2645 ret = xmlStrEqual(value, atom->valuep);
2646 exec->transcount++;
2647 } while (ret == 1);
2648 if (exec->transcount < atom->min)
2649 ret = 0;
2650
2651 /*
2652 * If the last check failed but one transition was found
2653 * possible, rollback
2654 */
2655 if (ret < 0)
2656 ret = 0;
2657 if (ret == 0) {
2658 goto rollback;
2659 }
2660 }
2661 }
2662 if (ret == 1) {
2663 if ((exec->callback != NULL) && (atom != NULL)) {
2664 exec->callback(exec->data, atom->valuep,
2665 atom->data, data);
2666 }
2667 if (exec->state->nbTrans > exec->transno + 1) {
2668 if (exec->inputStackNr <= 0) {
2669 xmlFARegExecSaveInputString(exec, value, data);
2670 }
2671 xmlFARegExecSave(exec);
2672 }
2673 if (trans->counter >= 0) {
2674#ifdef DEBUG_PUSH
2675 printf("Increasing count %d\n", trans->counter);
2676#endif
2677 exec->counts[trans->counter]++;
2678 }
2679#ifdef DEBUG_PUSH
2680 printf("entering state %d\n", trans->to);
2681#endif
2682 exec->state = exec->comp->states[trans->to];
2683 exec->transno = 0;
2684 if (trans->atom != NULL) {
2685 if (exec->inputStack != NULL) {
2686 exec->index++;
2687 if (exec->index < exec->inputStackNr) {
2688 value = exec->inputStack[exec->index].value;
2689 data = exec->inputStack[exec->index].data;
2690#ifdef DEBUG_PUSH
2691 printf("value loaded: %s\n", value);
2692#endif
2693 } else {
2694 value = NULL;
2695 data = NULL;
2696#ifdef DEBUG_PUSH
2697 printf("end of input\n");
2698#endif
2699 }
2700 } else {
2701 value = NULL;
2702 data = NULL;
2703#ifdef DEBUG_PUSH
2704 printf("end of input\n");
2705#endif
2706 }
2707 }
2708 goto progress;
2709 } else if (ret < 0) {
2710 exec->status = -4;
2711 break;
2712 }
2713 }
2714 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2715rollback:
2716 /*
2717 * Failed to find a way out
2718 */
2719 exec->determinist = 0;
2720 xmlFARegExecRollBack(exec);
2721 if (exec->status == 0) {
2722 value = exec->inputStack[exec->index].value;
2723 data = exec->inputStack[exec->index].data;
2724#ifdef DEBUG_PUSH
2725 printf("value loaded: %s\n", value);
2726#endif
2727 }
2728 }
2729progress:
2730 continue;
2731 }
2732 if (exec->status == 0) {
2733 return(exec->state->type == XML_REGEXP_FINAL_STATE);
2734 }
2735 return(exec->status);
2736}
2737
2738#if 0
2739static int
2740xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
2741 xmlRegTransPtr trans;
2742 xmlRegAtomPtr atom;
2743 int ret;
2744 int codepoint, len;
2745
2746 if (exec == NULL)
2747 return(-1);
2748 if (exec->status != 0)
2749 return(exec->status);
2750
2751 while ((exec->status == 0) &&
2752 ((exec->inputString[exec->index] != 0) ||
2753 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2754
2755 /*
2756 * End of input on non-terminal state, rollback, however we may
2757 * still have epsilon like transition for counted transitions
2758 * on counters, in that case don't break too early.
2759 */
2760 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2761 goto rollback;
2762
2763 exec->transcount = 0;
2764 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2765 trans = &exec->state->trans[exec->transno];
2766 if (trans->to < 0)
2767 continue;
2768 atom = trans->atom;
2769 ret = 0;
2770 if (trans->count >= 0) {
2771 int count;
2772 xmlRegCounterPtr counter;
2773
2774 /*
2775 * A counted transition.
2776 */
2777
2778 count = exec->counts[trans->count];
2779 counter = &exec->comp->counters[trans->count];
2780#ifdef DEBUG_REGEXP_EXEC
2781 printf("testing count %d: val %d, min %d, max %d\n",
2782 trans->count, count, counter->min, counter->max);
2783#endif
2784 ret = ((count >= counter->min) && (count <= counter->max));
2785 } else if (atom == NULL) {
2786 fprintf(stderr, "epsilon transition left at runtime\n");
2787 exec->status = -2;
2788 break;
2789 } else if (exec->inputString[exec->index] != 0) {
2790 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2791 ret = xmlRegCheckCharacter(atom, codepoint);
2792 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2793 xmlRegStatePtr to = exec->comp->states[trans->to];
2794
2795 /*
2796 * this is a multiple input sequence
2797 */
2798 if (exec->state->nbTrans > exec->transno + 1) {
2799 xmlFARegExecSave(exec);
2800 }
2801 exec->transcount = 1;
2802 do {
2803 /*
2804 * Try to progress as much as possible on the input
2805 */
2806 if (exec->transcount == atom->max) {
2807 break;
2808 }
2809 exec->index += len;
2810 /*
2811 * End of input: stop here
2812 */
2813 if (exec->inputString[exec->index] == 0) {
2814 exec->index -= len;
2815 break;
2816 }
2817 if (exec->transcount >= atom->min) {
2818 int transno = exec->transno;
2819 xmlRegStatePtr state = exec->state;
2820
2821 /*
2822 * The transition is acceptable save it
2823 */
2824 exec->transno = -1; /* trick */
2825 exec->state = to;
2826 xmlFARegExecSave(exec);
2827 exec->transno = transno;
2828 exec->state = state;
2829 }
2830 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2831 len);
2832 ret = xmlRegCheckCharacter(atom, codepoint);
2833 exec->transcount++;
2834 } while (ret == 1);
2835 if (exec->transcount < atom->min)
2836 ret = 0;
2837
2838 /*
2839 * If the last check failed but one transition was found
2840 * possible, rollback
2841 */
2842 if (ret < 0)
2843 ret = 0;
2844 if (ret == 0) {
2845 goto rollback;
2846 }
2847 }
2848 }
2849 if (ret == 1) {
2850 if (exec->state->nbTrans > exec->transno + 1) {
2851 xmlFARegExecSave(exec);
2852 }
2853 if (trans->counter >= 0) {
2854#ifdef DEBUG_REGEXP_EXEC
2855 printf("Increasing count %d\n", trans->counter);
2856#endif
2857 exec->counts[trans->counter]++;
2858 }
2859#ifdef DEBUG_REGEXP_EXEC
2860 printf("entering state %d\n", trans->to);
2861#endif
2862 exec->state = exec->comp->states[trans->to];
2863 exec->transno = 0;
2864 if (trans->atom != NULL) {
2865 exec->index += len;
2866 }
2867 goto progress;
2868 } else if (ret < 0) {
2869 exec->status = -4;
2870 break;
2871 }
2872 }
2873 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2874rollback:
2875 /*
2876 * Failed to find a way out
2877 */
2878 exec->determinist = 0;
2879 xmlFARegExecRollBack(exec);
2880 }
2881progress:
2882 continue;
2883 }
2884}
2885#endif
2886/************************************************************************
2887 * *
2888 * Parser for the Shemas Datatype Regular Expressions *
2889 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
2890 * *
2891 ************************************************************************/
2892
2893/**
2894 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00002895 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00002896 *
2897 * [10] Char ::= [^.\?*+()|#x5B#x5D]
2898 */
2899static int
2900xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
2901 int cur;
2902 int len;
2903
2904 cur = CUR_SCHAR(ctxt->cur, len);
2905 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
2906 (cur == '*') || (cur == '+') || (cur == '(') ||
2907 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
2908 (cur == 0x5D) || (cur == 0))
2909 return(-1);
2910 return(cur);
2911}
2912
2913/**
2914 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00002915 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00002916 *
2917 * [27] charProp ::= IsCategory | IsBlock
2918 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
2919 * Separators | Symbols | Others
2920 * [29] Letters ::= 'L' [ultmo]?
2921 * [30] Marks ::= 'M' [nce]?
2922 * [31] Numbers ::= 'N' [dlo]?
2923 * [32] Punctuation ::= 'P' [cdseifo]?
2924 * [33] Separators ::= 'Z' [slp]?
2925 * [34] Symbols ::= 'S' [mcko]?
2926 * [35] Others ::= 'C' [cfon]?
2927 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
2928 */
2929static void
2930xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
2931 int cur;
2932 xmlRegAtomType type = 0;
2933 xmlChar *blockName = NULL;
2934
2935 cur = CUR;
2936 if (cur == 'L') {
2937 NEXT;
2938 cur = CUR;
2939 if (cur == 'u') {
2940 NEXT;
2941 type = XML_REGEXP_LETTER_UPPERCASE;
2942 } else if (cur == 'l') {
2943 NEXT;
2944 type = XML_REGEXP_LETTER_LOWERCASE;
2945 } else if (cur == 't') {
2946 NEXT;
2947 type = XML_REGEXP_LETTER_TITLECASE;
2948 } else if (cur == 'm') {
2949 NEXT;
2950 type = XML_REGEXP_LETTER_MODIFIER;
2951 } else if (cur == 'o') {
2952 NEXT;
2953 type = XML_REGEXP_LETTER_OTHERS;
2954 } else {
2955 type = XML_REGEXP_LETTER;
2956 }
2957 } else if (cur == 'M') {
2958 NEXT;
2959 cur = CUR;
2960 if (cur == 'n') {
2961 NEXT;
2962 /* nonspacing */
2963 type = XML_REGEXP_MARK_NONSPACING;
2964 } else if (cur == 'c') {
2965 NEXT;
2966 /* spacing combining */
2967 type = XML_REGEXP_MARK_SPACECOMBINING;
2968 } else if (cur == 'e') {
2969 NEXT;
2970 /* enclosing */
2971 type = XML_REGEXP_MARK_ENCLOSING;
2972 } else {
2973 /* all marks */
2974 type = XML_REGEXP_MARK;
2975 }
2976 } else if (cur == 'N') {
2977 NEXT;
2978 cur = CUR;
2979 if (cur == 'd') {
2980 NEXT;
2981 /* digital */
2982 type = XML_REGEXP_NUMBER_DECIMAL;
2983 } else if (cur == 'l') {
2984 NEXT;
2985 /* letter */
2986 type = XML_REGEXP_NUMBER_LETTER;
2987 } else if (cur == 'o') {
2988 NEXT;
2989 /* other */
2990 type = XML_REGEXP_NUMBER_OTHERS;
2991 } else {
2992 /* all numbers */
2993 type = XML_REGEXP_NUMBER;
2994 }
2995 } else if (cur == 'P') {
2996 NEXT;
2997 cur = CUR;
2998 if (cur == 'c') {
2999 NEXT;
3000 /* connector */
3001 type = XML_REGEXP_PUNCT_CONNECTOR;
3002 } else if (cur == 'd') {
3003 NEXT;
3004 /* dash */
3005 type = XML_REGEXP_PUNCT_DASH;
3006 } else if (cur == 's') {
3007 NEXT;
3008 /* open */
3009 type = XML_REGEXP_PUNCT_OPEN;
3010 } else if (cur == 'e') {
3011 NEXT;
3012 /* close */
3013 type = XML_REGEXP_PUNCT_CLOSE;
3014 } else if (cur == 'i') {
3015 NEXT;
3016 /* initial quote */
3017 type = XML_REGEXP_PUNCT_INITQUOTE;
3018 } else if (cur == 'f') {
3019 NEXT;
3020 /* final quote */
3021 type = XML_REGEXP_PUNCT_FINQUOTE;
3022 } else if (cur == 'o') {
3023 NEXT;
3024 /* other */
3025 type = XML_REGEXP_PUNCT_OTHERS;
3026 } else {
3027 /* all punctuation */
3028 type = XML_REGEXP_PUNCT;
3029 }
3030 } else if (cur == 'Z') {
3031 NEXT;
3032 cur = CUR;
3033 if (cur == 's') {
3034 NEXT;
3035 /* space */
3036 type = XML_REGEXP_SEPAR_SPACE;
3037 } else if (cur == 'l') {
3038 NEXT;
3039 /* line */
3040 type = XML_REGEXP_SEPAR_LINE;
3041 } else if (cur == 'p') {
3042 NEXT;
3043 /* paragraph */
3044 type = XML_REGEXP_SEPAR_PARA;
3045 } else {
3046 /* all separators */
3047 type = XML_REGEXP_SEPAR;
3048 }
3049 } else if (cur == 'S') {
3050 NEXT;
3051 cur = CUR;
3052 if (cur == 'm') {
3053 NEXT;
3054 type = XML_REGEXP_SYMBOL_MATH;
3055 /* math */
3056 } else if (cur == 'c') {
3057 NEXT;
3058 type = XML_REGEXP_SYMBOL_CURRENCY;
3059 /* currency */
3060 } else if (cur == 'k') {
3061 NEXT;
3062 type = XML_REGEXP_SYMBOL_MODIFIER;
3063 /* modifiers */
3064 } else if (cur == 'o') {
3065 NEXT;
3066 type = XML_REGEXP_SYMBOL_OTHERS;
3067 /* other */
3068 } else {
3069 /* all symbols */
3070 type = XML_REGEXP_SYMBOL;
3071 }
3072 } else if (cur == 'C') {
3073 NEXT;
3074 cur = CUR;
3075 if (cur == 'c') {
3076 NEXT;
3077 /* control */
3078 type = XML_REGEXP_OTHER_CONTROL;
3079 } else if (cur == 'f') {
3080 NEXT;
3081 /* format */
3082 type = XML_REGEXP_OTHER_FORMAT;
3083 } else if (cur == 'o') {
3084 NEXT;
3085 /* private use */
3086 type = XML_REGEXP_OTHER_PRIVATE;
3087 } else if (cur == 'n') {
3088 NEXT;
3089 /* not assigned */
3090 type = XML_REGEXP_OTHER_NA;
3091 } else {
3092 /* all others */
3093 type = XML_REGEXP_OTHER;
3094 }
3095 } else if (cur == 'I') {
3096 const xmlChar *start;
3097 NEXT;
3098 cur = CUR;
3099 if (cur != 's') {
3100 ERROR("IsXXXX expected");
3101 return;
3102 }
3103 NEXT;
3104 start = ctxt->cur;
3105 cur = CUR;
3106 if (((cur >= 'a') && (cur <= 'z')) ||
3107 ((cur >= 'A') && (cur <= 'Z')) ||
3108 ((cur >= '0') && (cur <= '9')) ||
3109 (cur == 0x2D)) {
3110 NEXT;
3111 cur = CUR;
3112 while (((cur >= 'a') && (cur <= 'z')) ||
3113 ((cur >= 'A') && (cur <= 'Z')) ||
3114 ((cur >= '0') && (cur <= '9')) ||
3115 (cur == 0x2D)) {
3116 NEXT;
3117 cur = CUR;
3118 }
3119 }
3120 type = XML_REGEXP_BLOCK_NAME;
3121 blockName = xmlStrndup(start, ctxt->cur - start);
3122 } else {
3123 ERROR("Unknown char property");
3124 return;
3125 }
3126 if (ctxt->atom == NULL) {
3127 ctxt->atom = xmlRegNewAtom(ctxt, type);
3128 if (ctxt->atom != NULL)
3129 ctxt->atom->valuep = blockName;
3130 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3131 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3132 type, 0, 0, blockName);
3133 }
3134}
3135
3136/**
3137 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00003138 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003139 *
3140 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3141 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3142 * [25] catEsc ::= '\p{' charProp '}'
3143 * [26] complEsc ::= '\P{' charProp '}'
3144 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3145 */
3146static void
3147xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
3148 int cur;
3149
3150 if (CUR == '.') {
3151 if (ctxt->atom == NULL) {
3152 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
3153 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3154 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3155 XML_REGEXP_ANYCHAR, 0, 0, NULL);
3156 }
3157 NEXT;
3158 return;
3159 }
3160 if (CUR != '\\') {
3161 ERROR("Escaped sequence: expecting \\");
3162 return;
3163 }
3164 NEXT;
3165 cur = CUR;
3166 if (cur == 'p') {
3167 NEXT;
3168 if (CUR != '{') {
3169 ERROR("Expecting '{'");
3170 return;
3171 }
3172 NEXT;
3173 xmlFAParseCharProp(ctxt);
3174 if (CUR != '}') {
3175 ERROR("Expecting '}'");
3176 return;
3177 }
3178 NEXT;
3179 } else if (cur == 'P') {
3180 NEXT;
3181 if (CUR != '{') {
3182 ERROR("Expecting '{'");
3183 return;
3184 }
3185 NEXT;
3186 xmlFAParseCharProp(ctxt);
3187 ctxt->atom->neg = 1;
3188 if (CUR != '}') {
3189 ERROR("Expecting '}'");
3190 return;
3191 }
3192 NEXT;
3193 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
3194 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
3195 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
3196 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
3197 (cur == 0x5E)) {
3198 if (ctxt->atom == NULL) {
3199 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3200 if (ctxt->atom != NULL)
3201 ctxt->atom->codepoint = cur;
3202 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3203 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3204 XML_REGEXP_CHARVAL, cur, cur, NULL);
3205 }
3206 NEXT;
3207 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
3208 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
3209 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00003210 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00003211
3212 switch (cur) {
3213 case 's':
3214 type = XML_REGEXP_ANYSPACE;
3215 break;
3216 case 'S':
3217 type = XML_REGEXP_NOTSPACE;
3218 break;
3219 case 'i':
3220 type = XML_REGEXP_INITNAME;
3221 break;
3222 case 'I':
3223 type = XML_REGEXP_NOTINITNAME;
3224 break;
3225 case 'c':
3226 type = XML_REGEXP_NAMECHAR;
3227 break;
3228 case 'C':
3229 type = XML_REGEXP_NOTNAMECHAR;
3230 break;
3231 case 'd':
3232 type = XML_REGEXP_DECIMAL;
3233 break;
3234 case 'D':
3235 type = XML_REGEXP_NOTDECIMAL;
3236 break;
3237 case 'w':
3238 type = XML_REGEXP_REALCHAR;
3239 break;
3240 case 'W':
3241 type = XML_REGEXP_NOTREALCHAR;
3242 break;
3243 }
3244 NEXT;
3245 if (ctxt->atom == NULL) {
3246 ctxt->atom = xmlRegNewAtom(ctxt, type);
3247 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3248 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3249 type, 0, 0, NULL);
3250 }
3251 }
3252}
3253
3254/**
3255 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00003256 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003257 *
3258 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3259 */
3260static int
3261xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
3262 int ret = 0, cur;
3263
3264 if ((CUR != '&') || (NXT(1) != '#'))
3265 return(-1);
3266 NEXT;
3267 NEXT;
3268 cur = CUR;
3269 if (cur == 'x') {
3270 NEXT;
3271 cur = CUR;
3272 if (((cur >= '0') && (cur <= '9')) ||
3273 ((cur >= 'a') && (cur <= 'f')) ||
3274 ((cur >= 'A') && (cur <= 'F'))) {
3275 while (((cur >= '0') && (cur <= '9')) ||
3276 ((cur >= 'A') && (cur <= 'F'))) {
3277 if ((cur >= '0') && (cur <= '9'))
3278 ret = ret * 16 + cur - '0';
3279 else if ((cur >= 'a') && (cur <= 'f'))
3280 ret = ret * 16 + 10 + (cur - 'a');
3281 else
3282 ret = ret * 16 + 10 + (cur - 'A');
3283 NEXT;
3284 cur = CUR;
3285 }
3286 } else {
3287 ERROR("Char ref: expecting [0-9A-F]");
3288 return(-1);
3289 }
3290 } else {
3291 if ((cur >= '0') && (cur <= '9')) {
3292 while ((cur >= '0') && (cur <= '9')) {
3293 ret = ret * 10 + cur - '0';
3294 NEXT;
3295 cur = CUR;
3296 }
3297 } else {
3298 ERROR("Char ref: expecting [0-9]");
3299 return(-1);
3300 }
3301 }
3302 if (cur != ';') {
3303 ERROR("Char ref: expecting ';'");
3304 return(-1);
3305 } else {
3306 NEXT;
3307 }
3308 return(ret);
3309}
3310
3311/**
3312 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00003313 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003314 *
3315 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
3316 * [18] seRange ::= charOrEsc '-' charOrEsc
3317 * [20] charOrEsc ::= XmlChar | SingleCharEsc
3318 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
3319 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
3320 */
3321static void
3322xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
3323 int cur;
3324 int start = -1;
3325 int end = -1;
3326
3327 if ((CUR == '&') && (NXT(1) == '#')) {
3328 end = start = xmlFAParseCharRef(ctxt);
3329 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3330 XML_REGEXP_CHARVAL, start, end, NULL);
3331 return;
3332 }
3333 cur = CUR;
3334 if (cur == '\\') {
3335 NEXT;
3336 cur = CUR;
3337 switch (cur) {
3338 case 'n': start = 0xA; break;
3339 case 'r': start = 0xD; break;
3340 case 't': start = 0x9; break;
3341 case '\\': case '|': case '.': case '-': case '^': case '?':
3342 case '*': case '+': case '{': case '}': case '(': case ')':
3343 case '[': case ']':
3344 start = cur; break;
3345 default:
3346 ERROR("Invalid escape value");
3347 return;
3348 }
3349 end = start;
3350 } else if ((cur != 0x5B) && (cur != 0x5D)) {
3351 end = start = cur;
3352 } else {
3353 ERROR("Expecting a char range");
3354 return;
3355 }
3356 NEXT;
3357 if (start == '-') {
3358 return;
3359 }
3360 cur = CUR;
3361 if (cur != '-') {
3362 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3363 XML_REGEXP_CHARVAL, start, end, NULL);
3364 return;
3365 }
3366 NEXT;
3367 cur = CUR;
3368 if (cur == '\\') {
3369 NEXT;
3370 cur = CUR;
3371 switch (cur) {
3372 case 'n': end = 0xA; break;
3373 case 'r': end = 0xD; break;
3374 case 't': end = 0x9; break;
3375 case '\\': case '|': case '.': case '-': case '^': case '?':
3376 case '*': case '+': case '{': case '}': case '(': case ')':
3377 case '[': case ']':
3378 end = cur; break;
3379 default:
3380 ERROR("Invalid escape value");
3381 return;
3382 }
3383 } else if ((cur != 0x5B) && (cur != 0x5D)) {
3384 end = cur;
3385 } else {
3386 ERROR("Expecting the end of a char range");
3387 return;
3388 }
3389 NEXT;
3390 /* TODO check that the values are acceptable character ranges for XML */
3391 if (end < start) {
3392 ERROR("End of range is before start of range");
3393 } else {
3394 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3395 XML_REGEXP_CHARVAL, start, end, NULL);
3396 }
3397 return;
3398}
3399
3400/**
3401 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003402 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003403 *
3404 * [14] posCharGroup ::= ( charRange | charClassEsc )+
3405 */
3406static void
3407xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
3408 do {
3409 if ((CUR == '\\') || (CUR == '.')) {
3410 xmlFAParseCharClassEsc(ctxt);
3411 } else {
3412 xmlFAParseCharRange(ctxt);
3413 }
3414 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
3415 (ctxt->error == 0));
3416}
3417
3418/**
3419 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003420 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003421 *
3422 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
3423 * [15] negCharGroup ::= '^' posCharGroup
3424 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
3425 * [12] charClassExpr ::= '[' charGroup ']'
3426 */
3427static void
3428xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
3429 int n = ctxt->neg;
3430 while ((CUR != ']') && (ctxt->error == 0)) {
3431 if (CUR == '^') {
3432 int neg = ctxt->neg;
3433
3434 NEXT;
3435 ctxt->neg = !ctxt->neg;
3436 xmlFAParsePosCharGroup(ctxt);
3437 ctxt->neg = neg;
3438 } else if (CUR == '-') {
3439 NEXT;
3440 ctxt->neg = !ctxt->neg;
3441 if (CUR != '[') {
3442 ERROR("charClassExpr: '[' expected");
3443 break;
3444 }
3445 NEXT;
3446 xmlFAParseCharGroup(ctxt);
3447 if (CUR == ']') {
3448 NEXT;
3449 } else {
3450 ERROR("charClassExpr: ']' expected");
3451 break;
3452 }
3453 break;
3454 } else if (CUR != ']') {
3455 xmlFAParsePosCharGroup(ctxt);
3456 }
3457 }
3458 ctxt->neg = n;
3459}
3460
3461/**
3462 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00003463 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003464 *
3465 * [11] charClass ::= charClassEsc | charClassExpr
3466 * [12] charClassExpr ::= '[' charGroup ']'
3467 */
3468static void
3469xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
3470 if (CUR == '[') {
3471 NEXT;
3472 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
3473 if (ctxt->atom == NULL)
3474 return;
3475 xmlFAParseCharGroup(ctxt);
3476 if (CUR == ']') {
3477 NEXT;
3478 } else {
3479 ERROR("xmlFAParseCharClass: ']' expected");
3480 }
3481 } else {
3482 xmlFAParseCharClassEsc(ctxt);
3483 }
3484}
3485
3486/**
3487 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00003488 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003489 *
3490 * [8] QuantExact ::= [0-9]+
3491 */
3492static int
3493xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
3494 int ret = 0;
3495 int ok = 0;
3496
3497 while ((CUR >= '0') && (CUR <= '9')) {
3498 ret = ret * 10 + (CUR - '0');
3499 ok = 1;
3500 NEXT;
3501 }
3502 if (ok != 1) {
3503 return(-1);
3504 }
3505 return(ret);
3506}
3507
3508/**
3509 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00003510 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003511 *
3512 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
3513 * [5] quantity ::= quantRange | quantMin | QuantExact
3514 * [6] quantRange ::= QuantExact ',' QuantExact
3515 * [7] quantMin ::= QuantExact ','
3516 * [8] QuantExact ::= [0-9]+
3517 */
3518static int
3519xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
3520 int cur;
3521
3522 cur = CUR;
3523 if ((cur == '?') || (cur == '*') || (cur == '+')) {
3524 if (ctxt->atom != NULL) {
3525 if (cur == '?')
3526 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
3527 else if (cur == '*')
3528 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
3529 else if (cur == '+')
3530 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
3531 }
3532 NEXT;
3533 return(1);
3534 }
3535 if (cur == '{') {
3536 int min = 0, max = 0;
3537
3538 NEXT;
3539 cur = xmlFAParseQuantExact(ctxt);
3540 if (cur >= 0)
3541 min = cur;
3542 if (CUR == ',') {
3543 NEXT;
3544 cur = xmlFAParseQuantExact(ctxt);
3545 if (cur >= 0)
3546 max = cur;
3547 }
3548 if (CUR == '}') {
3549 NEXT;
3550 } else {
3551 ERROR("Unterminated quantifier");
3552 }
3553 if (max == 0)
3554 max = min;
3555 if (ctxt->atom != NULL) {
3556 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
3557 ctxt->atom->min = min;
3558 ctxt->atom->max = max;
3559 }
3560 return(1);
3561 }
3562 return(0);
3563}
3564
3565/**
3566 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00003567 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003568 *
3569 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
3570 */
3571static int
3572xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
3573 int codepoint, len;
3574
3575 codepoint = xmlFAIsChar(ctxt);
3576 if (codepoint > 0) {
3577 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3578 if (ctxt->atom == NULL)
3579 return(-1);
3580 codepoint = CUR_SCHAR(ctxt->cur, len);
3581 ctxt->atom->codepoint = codepoint;
3582 NEXTL(len);
3583 return(1);
3584 } else if (CUR == '|') {
3585 return(0);
3586 } else if (CUR == 0) {
3587 return(0);
3588 } else if (CUR == ')') {
3589 return(0);
3590 } else if (CUR == '(') {
3591 xmlRegStatePtr start, oldend;
3592
3593 NEXT;
3594 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
3595 start = ctxt->state;
3596 oldend = ctxt->end;
3597 ctxt->end = NULL;
3598 ctxt->atom = NULL;
3599 xmlFAParseRegExp(ctxt, 0);
3600 if (CUR == ')') {
3601 NEXT;
3602 } else {
3603 ERROR("xmlFAParseAtom: expecting ')'");
3604 }
3605 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
3606 if (ctxt->atom == NULL)
3607 return(-1);
3608 ctxt->atom->start = start;
3609 ctxt->atom->stop = ctxt->state;
3610 ctxt->end = oldend;
3611 return(1);
3612 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
3613 xmlFAParseCharClass(ctxt);
3614 return(1);
3615 }
3616 return(0);
3617}
3618
3619/**
3620 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00003621 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003622 *
3623 * [3] piece ::= atom quantifier?
3624 */
3625static int
3626xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
3627 int ret;
3628
3629 ctxt->atom = NULL;
3630 ret = xmlFAParseAtom(ctxt);
3631 if (ret == 0)
3632 return(0);
3633 if (ctxt->atom == NULL) {
3634 ERROR("internal: no atom generated");
3635 }
3636 xmlFAParseQuantifier(ctxt);
3637 return(1);
3638}
3639
3640/**
3641 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00003642 * @ctxt: a regexp parser context
3643 * @first: is taht the first
Daniel Veillard4255d502002-04-16 15:50:10 +00003644 *
3645 * [2] branch ::= piece*
3646 */
3647static void
3648xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) {
3649 xmlRegStatePtr previous;
3650 xmlRegAtomPtr prevatom = NULL;
3651 int ret;
3652
3653 previous = ctxt->state;
3654 ret = xmlFAParsePiece(ctxt);
3655 if (ret != 0) {
3656 if (first) {
3657 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom);
3658 previous = ctxt->state;
3659 } else {
3660 prevatom = ctxt->atom;
3661 }
3662 ctxt->atom = NULL;
3663 }
3664 while ((ret != 0) && (ctxt->error == 0)) {
3665 ret = xmlFAParsePiece(ctxt);
3666 if (ret != 0) {
3667 if (first) {
3668 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom);
3669 } else {
3670 xmlFAGenerateTransitions(ctxt, previous, NULL, prevatom);
3671 prevatom = ctxt->atom;
3672 }
3673 previous = ctxt->state;
3674 ctxt->atom = NULL;
3675 }
3676 }
3677 if (!first) {
3678 xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom);
3679 }
3680}
3681
3682/**
3683 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00003684 * @ctxt: a regexp parser context
3685 * @top: is that the top-level expressions ?
Daniel Veillard4255d502002-04-16 15:50:10 +00003686 *
3687 * [1] regExp ::= branch ( '|' branch )*
3688 */
3689static void
3690xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
3691 xmlRegStatePtr start, end, oldend;
3692
3693 oldend = ctxt->end;
3694
3695 start = ctxt->state;
3696 xmlFAParseBranch(ctxt, (ctxt->end == NULL));
3697 if (CUR != '|') {
3698 ctxt->end = ctxt->state;
3699 return;
3700 }
3701 end = ctxt->state;
3702 while ((CUR == '|') && (ctxt->error == 0)) {
3703 NEXT;
3704 ctxt->state = start;
3705 ctxt->end = end;
3706 xmlFAParseBranch(ctxt, 0);
3707 }
3708 if (!top)
3709 ctxt->end = oldend;
3710}
3711
3712/************************************************************************
3713 * *
3714 * The basic API *
3715 * *
3716 ************************************************************************/
3717
3718/**
3719 * xmlRegexpPrint:
3720 * @output: the file for the output debug
3721 * @regexp: the compiled regexp
3722 *
3723 * Print the content of the compiled regular expression
3724 */
3725void
3726xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
3727 int i;
3728
3729 fprintf(output, " regexp: ");
3730 if (regexp == NULL) {
3731 fprintf(output, "NULL\n");
3732 return;
3733 }
3734 fprintf(output, "'%s' ", regexp->string);
3735 fprintf(output, "\n");
3736 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
3737 for (i = 0;i < regexp->nbAtoms; i++) {
3738 fprintf(output, " %02d ", i);
3739 xmlRegPrintAtom(output, regexp->atoms[i]);
3740 }
3741 fprintf(output, "%d states:", regexp->nbStates);
3742 fprintf(output, "\n");
3743 for (i = 0;i < regexp->nbStates; i++) {
3744 xmlRegPrintState(output, regexp->states[i]);
3745 }
3746 fprintf(output, "%d counters:\n", regexp->nbCounters);
3747 for (i = 0;i < regexp->nbCounters; i++) {
3748 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
3749 regexp->counters[i].max);
3750 }
3751}
3752
3753/**
3754 * xmlRegexpCompile:
3755 * @regexp: a regular expression string
3756 *
3757 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
3758 * Appendix F and build an automata suitable for testing strings against
3759 * that regular expression
3760 *
3761 * Returns the compiled expression or NULL in case of error
3762 */
3763xmlRegexpPtr
3764xmlRegexpCompile(const xmlChar *regexp) {
3765 xmlRegexpPtr ret;
3766 xmlRegParserCtxtPtr ctxt;
3767
3768 ctxt = xmlRegNewParserCtxt(regexp);
3769 if (ctxt == NULL)
3770 return(NULL);
3771
3772 /* initialize the parser */
3773 ctxt->end = NULL;
3774 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
3775 xmlRegStatePush(ctxt, ctxt->start);
3776
3777 /* parse the expression building an automata */
3778 xmlFAParseRegExp(ctxt, 1);
3779 if (CUR != 0) {
3780 ERROR("xmlFAParseRegExp: extra characters");
3781 }
3782 ctxt->end = ctxt->state;
3783 ctxt->start->type = XML_REGEXP_START_STATE;
3784 ctxt->end->type = XML_REGEXP_FINAL_STATE;
3785
3786 /* remove the Epsilon except for counted transitions */
3787 xmlFAEliminateEpsilonTransitions(ctxt);
3788
3789
3790 if (ctxt->error != 0) {
3791 xmlRegFreeParserCtxt(ctxt);
3792 return(NULL);
3793 }
3794 ret = xmlRegEpxFromParse(ctxt);
3795 xmlRegFreeParserCtxt(ctxt);
3796 return(ret);
3797}
3798
3799/**
3800 * xmlRegexpExec:
3801 * @comp: the compiled regular expression
3802 * @content: the value to check against the regular expression
3803 *
3804 * Check if the regular expression generate the value
3805 *
3806 * Returns 1 if it matches, 0 if not and a negativa value in case of error
3807 */
3808int
3809xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
3810 if ((comp == NULL) || (content == NULL))
3811 return(-1);
3812 return(xmlFARegExec(comp, content));
3813}
3814
3815/**
Daniel Veillard23e73572002-09-19 19:56:43 +00003816 * xmlRegexpIsDeterminist:
3817 * @comp: the compiled regular expression
3818 *
3819 * Check if the regular expression is determinist
3820 *
3821 * Returns 1 if it yes, 0 if not and a negativa value in case of error
3822 */
3823int
3824xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
3825 xmlAutomataPtr am;
3826 int ret;
3827
3828 if (comp == NULL)
3829 return(-1);
3830 if (comp->determinist != -1)
3831 return(comp->determinist);
3832
3833 am = xmlNewAutomata();
3834 am->nbAtoms = comp->nbAtoms;
3835 am->atoms = comp->atoms;
3836 am->nbStates = comp->nbStates;
3837 am->states = comp->states;
3838 am->determinist = -1;
3839 ret = xmlFAComputesDeterminism(am);
3840 am->atoms = NULL;
3841 am->states = NULL;
3842 xmlFreeAutomata(am);
3843 return(ret);
3844}
3845
3846/**
Daniel Veillard4255d502002-04-16 15:50:10 +00003847 * xmlRegFreeRegexp:
3848 * @regexp: the regexp
3849 *
3850 * Free a regexp
3851 */
3852void
3853xmlRegFreeRegexp(xmlRegexpPtr regexp) {
3854 int i;
3855 if (regexp == NULL)
3856 return;
3857
3858 if (regexp->string != NULL)
3859 xmlFree(regexp->string);
3860 if (regexp->states != NULL) {
3861 for (i = 0;i < regexp->nbStates;i++)
3862 xmlRegFreeState(regexp->states[i]);
3863 xmlFree(regexp->states);
3864 }
3865 if (regexp->atoms != NULL) {
3866 for (i = 0;i < regexp->nbAtoms;i++)
3867 xmlRegFreeAtom(regexp->atoms[i]);
3868 xmlFree(regexp->atoms);
3869 }
3870 if (regexp->counters != NULL)
3871 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00003872 if (regexp->compact != NULL)
3873 xmlFree(regexp->compact);
3874 if (regexp->stringMap != NULL) {
3875 for (i = 0; i < regexp->nbstrings;i++)
3876 xmlFree(regexp->stringMap[i]);
3877 xmlFree(regexp->stringMap);
3878 }
3879
Daniel Veillard4255d502002-04-16 15:50:10 +00003880 xmlFree(regexp);
3881}
3882
3883#ifdef LIBXML_AUTOMATA_ENABLED
3884/************************************************************************
3885 * *
3886 * The Automata interface *
3887 * *
3888 ************************************************************************/
3889
3890/**
3891 * xmlNewAutomata:
3892 *
3893 * Create a new automata
3894 *
3895 * Returns the new object or NULL in case of failure
3896 */
3897xmlAutomataPtr
3898xmlNewAutomata(void) {
3899 xmlAutomataPtr ctxt;
3900
3901 ctxt = xmlRegNewParserCtxt(NULL);
3902 if (ctxt == NULL)
3903 return(NULL);
3904
3905 /* initialize the parser */
3906 ctxt->end = NULL;
3907 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
3908 xmlRegStatePush(ctxt, ctxt->start);
3909
3910 return(ctxt);
3911}
3912
3913/**
3914 * xmlFreeAutomata:
3915 * @am: an automata
3916 *
3917 * Free an automata
3918 */
3919void
3920xmlFreeAutomata(xmlAutomataPtr am) {
3921 if (am == NULL)
3922 return;
3923 xmlRegFreeParserCtxt(am);
3924}
3925
3926/**
3927 * xmlAutomataGetInitState:
3928 * @am: an automata
3929 *
3930 * Returns the initial state of the automata
3931 */
3932xmlAutomataStatePtr
3933xmlAutomataGetInitState(xmlAutomataPtr am) {
3934 if (am == NULL)
3935 return(NULL);
3936 return(am->start);
3937}
3938
3939/**
3940 * xmlAutomataSetFinalState:
3941 * @am: an automata
3942 * @state: a state in this automata
3943 *
3944 * Makes that state a final state
3945 *
3946 * Returns 0 or -1 in case of error
3947 */
3948int
3949xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
3950 if ((am == NULL) || (state == NULL))
3951 return(-1);
3952 state->type = XML_REGEXP_FINAL_STATE;
3953 return(0);
3954}
3955
3956/**
3957 * xmlAutomataNewTransition:
3958 * @am: an automata
3959 * @from: the starting point of the transition
3960 * @to: the target point of the transition or NULL
3961 * @token: the input string associated to that transition
3962 * @data: data passed to the callback function if the transition is activated
3963 *
3964 * If @to is NULL, this create first a new target state in the automata
3965 * and then adds a transition from the @from state to the target state
3966 * activated by the value of @token
3967 *
3968 * Returns the target state or NULL in case of error
3969 */
3970xmlAutomataStatePtr
3971xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
3972 xmlAutomataStatePtr to, const xmlChar *token,
3973 void *data) {
3974 xmlRegAtomPtr atom;
3975
3976 if ((am == NULL) || (from == NULL) || (token == NULL))
3977 return(NULL);
3978 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
3979 atom->data = data;
3980 if (atom == NULL)
3981 return(NULL);
3982 atom->valuep = xmlStrdup(token);
3983
3984 xmlFAGenerateTransitions(am, from, to, atom);
3985 if (to == NULL)
3986 return(am->state);
3987 return(to);
3988}
3989
3990/**
3991 * xmlAutomataNewCountTrans:
3992 * @am: an automata
3993 * @from: the starting point of the transition
3994 * @to: the target point of the transition or NULL
3995 * @token: the input string associated to that transition
3996 * @min: the minimum successive occurences of token
3997 * @min: the maximum successive occurences of token
3998 *
3999 * If @to is NULL, this create first a new target state in the automata
4000 * and then adds a transition from the @from state to the target state
4001 * activated by a succession of input of value @token and whose number
4002 * is between @min and @max
4003 *
4004 * Returns the target state or NULL in case of error
4005 */
4006xmlAutomataStatePtr
4007xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4008 xmlAutomataStatePtr to, const xmlChar *token,
4009 int min, int max, void *data) {
4010 xmlRegAtomPtr atom;
4011
4012 if ((am == NULL) || (from == NULL) || (token == NULL))
4013 return(NULL);
4014 if (min < 0)
4015 return(NULL);
4016 if ((max < min) || (max < 1))
4017 return(NULL);
4018 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4019 if (atom == NULL)
4020 return(NULL);
4021 atom->valuep = xmlStrdup(token);
4022 atom->data = data;
4023 if (min == 0)
4024 atom->min = 1;
4025 else
4026 atom->min = min;
4027 atom->max = max;
4028
4029 xmlFAGenerateTransitions(am, from, to, atom);
4030 if (to == NULL)
4031 to = am->state;
4032 if (to == NULL)
4033 return(NULL);
4034 if (min == 0)
4035 xmlFAGenerateEpsilonTransition(am, from, to);
4036 return(to);
4037}
4038
4039/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004040 * xmlAutomataNewOnceTrans:
4041 * @am: an automata
4042 * @from: the starting point of the transition
4043 * @to: the target point of the transition or NULL
4044 * @token: the input string associated to that transition
4045 * @min: the minimum successive occurences of token
4046 * @min: the maximum successive occurences of token
4047 *
4048 * If @to is NULL, this create first a new target state in the automata
4049 * and then adds a transition from the @from state to the target state
4050 * activated by a succession of input of value @token and whose number
4051 * is between @min and @max, moreover that transistion can only be crossed
4052 * once.
4053 *
4054 * Returns the target state or NULL in case of error
4055 */
4056xmlAutomataStatePtr
4057xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4058 xmlAutomataStatePtr to, const xmlChar *token,
4059 int min, int max, void *data) {
4060 xmlRegAtomPtr atom;
4061 int counter;
4062
4063 if ((am == NULL) || (from == NULL) || (token == NULL))
4064 return(NULL);
4065 if (min < 1)
4066 return(NULL);
4067 if ((max < min) || (max < 1))
4068 return(NULL);
4069 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4070 if (atom == NULL)
4071 return(NULL);
4072 atom->valuep = xmlStrdup(token);
4073 atom->data = data;
4074 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4075 if (min == 0)
4076 atom->min = 1;
4077 else
4078 atom->min = min;
4079 atom->max = max;
4080 /*
4081 * associate a counter to the transition.
4082 */
4083 counter = xmlRegGetCounter(am);
4084 am->counters[counter].min = 1;
4085 am->counters[counter].max = 1;
4086
4087 /* xmlFAGenerateTransitions(am, from, to, atom); */
4088 if (to == NULL) {
4089 to = xmlRegNewState(am);
4090 xmlRegStatePush(am, to);
4091 }
4092 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4093 xmlRegAtomPush(am, atom);
4094 am->state = to;
4095 if (to == NULL)
4096 to = am->state;
4097 if (to == NULL)
4098 return(NULL);
4099 return(to);
4100}
4101
4102/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004103 * xmlAutomataNewState:
4104 * @am: an automata
4105 *
4106 * Create a new disconnected state in the automata
4107 *
4108 * Returns the new state or NULL in case of error
4109 */
4110xmlAutomataStatePtr
4111xmlAutomataNewState(xmlAutomataPtr am) {
4112 xmlAutomataStatePtr to;
4113
4114 if (am == NULL)
4115 return(NULL);
4116 to = xmlRegNewState(am);
4117 xmlRegStatePush(am, to);
4118 return(to);
4119}
4120
4121/**
4122 * xmlAutomataNewTransition:
4123 * @am: an automata
4124 * @from: the starting point of the transition
4125 * @to: the target point of the transition or NULL
4126 *
4127 * If @to is NULL, this create first a new target state in the automata
4128 * and then adds a an epsilon transition from the @from state to the
4129 * target state
4130 *
4131 * Returns the target state or NULL in case of error
4132 */
4133xmlAutomataStatePtr
4134xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
4135 xmlAutomataStatePtr to) {
4136 if ((am == NULL) || (from == NULL))
4137 return(NULL);
4138 xmlFAGenerateEpsilonTransition(am, from, to);
4139 if (to == NULL)
4140 return(am->state);
4141 return(to);
4142}
4143
Daniel Veillardb509f152002-04-17 16:28:10 +00004144/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004145 * xmlAutomataNewAllTrans:
4146 * @am: an automata
4147 * @from: the starting point of the transition
4148 * @to: the target point of the transition or NULL
4149 *
4150 * If @to is NULL, this create first a new target state in the automata
4151 * and then adds a an ALL transition from the @from state to the
4152 * target state. That transition is an epsilon transition allowed only when
4153 * all transitions from the @from node have been activated.
4154 *
4155 * Returns the target state or NULL in case of error
4156 */
4157xmlAutomataStatePtr
4158xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00004159 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00004160 if ((am == NULL) || (from == NULL))
4161 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00004162 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00004163 if (to == NULL)
4164 return(am->state);
4165 return(to);
4166}
4167
4168/**
Daniel Veillardb509f152002-04-17 16:28:10 +00004169 * xmlAutomataNewCounter:
4170 * @am: an automata
4171 * @min: the minimal value on the counter
4172 * @max: the maximal value on the counter
4173 *
4174 * Create a new counter
4175 *
4176 * Returns the counter number or -1 in case of error
4177 */
4178int
4179xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
4180 int ret;
4181
4182 if (am == NULL)
4183 return(-1);
4184
4185 ret = xmlRegGetCounter(am);
4186 if (ret < 0)
4187 return(-1);
4188 am->counters[ret].min = min;
4189 am->counters[ret].max = max;
4190 return(ret);
4191}
4192
4193/**
4194 * xmlAutomataNewCountedTrans:
4195 * @am: an automata
4196 * @from: the starting point of the transition
4197 * @to: the target point of the transition or NULL
4198 * @counter: the counter associated to that transition
4199 *
4200 * If @to is NULL, this create first a new target state in the automata
4201 * and then adds an epsilon transition from the @from state to the target state
4202 * which will increment the counter provided
4203 *
4204 * Returns the target state or NULL in case of error
4205 */
4206xmlAutomataStatePtr
4207xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4208 xmlAutomataStatePtr to, int counter) {
4209 if ((am == NULL) || (from == NULL) || (counter < 0))
4210 return(NULL);
4211 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
4212 if (to == NULL)
4213 return(am->state);
4214 return(to);
4215}
4216
4217/**
4218 * xmlAutomataNewCounterTrans:
4219 * @am: an automata
4220 * @from: the starting point of the transition
4221 * @to: the target point of the transition or NULL
4222 * @counter: the counter associated to that transition
4223 *
4224 * If @to is NULL, this create first a new target state in the automata
4225 * and then adds an epsilon transition from the @from state to the target state
4226 * which will be allowed only if the counter is within the right range.
4227 *
4228 * Returns the target state or NULL in case of error
4229 */
4230xmlAutomataStatePtr
4231xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4232 xmlAutomataStatePtr to, int counter) {
4233 if ((am == NULL) || (from == NULL) || (counter < 0))
4234 return(NULL);
4235 xmlFAGenerateCountedTransition(am, from, to, counter);
4236 if (to == NULL)
4237 return(am->state);
4238 return(to);
4239}
Daniel Veillard4255d502002-04-16 15:50:10 +00004240
4241/**
4242 * xmlAutomataCompile:
4243 * @am: an automata
4244 *
4245 * Compile the automata into a Reg Exp ready for being executed.
4246 * The automata should be free after this point.
4247 *
4248 * Returns the compiled regexp or NULL in case of error
4249 */
4250xmlRegexpPtr
4251xmlAutomataCompile(xmlAutomataPtr am) {
4252 xmlRegexpPtr ret;
4253
4254 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00004255 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00004256 ret = xmlRegEpxFromParse(am);
4257
4258 return(ret);
4259}
Daniel Veillarde19fc232002-04-22 16:01:24 +00004260
4261/**
4262 * xmlAutomataIsDeterminist:
4263 * @am: an automata
4264 *
4265 * Checks if an automata is determinist.
4266 *
4267 * Returns 1 if true, 0 if not, and -1 in case of error
4268 */
4269int
4270xmlAutomataIsDeterminist(xmlAutomataPtr am) {
4271 int ret;
4272
4273 if (am == NULL)
4274 return(-1);
4275
4276 ret = xmlFAComputesDeterminism(am);
4277 return(ret);
4278}
Daniel Veillard4255d502002-04-16 15:50:10 +00004279#endif /* LIBXML_AUTOMATA_ENABLED */
4280#endif /* LIBXML_REGEXP_ENABLED */