blob: c49f2a61b2f65ce7b8047b64bbd8c5577ca6591e [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
6 * XML related specifications thise includes:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22#include <stdio.h>
23#include <string.h>
24#include <libxml/tree.h>
25#include <libxml/parserInternals.h>
26#include <libxml/xmlregexp.h>
27#include <libxml/xmlautomata.h>
28#include <libxml/xmlunicode.h>
29
30/* #define DEBUG_REGEXP_GRAPH */
31/* #define DEBUG_REGEXP_EXEC */
32/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000033/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000034
35#define ERROR(str) ctxt->error = 1; \
36 xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur)
37#define NEXT ctxt->cur++
38#define CUR (*(ctxt->cur))
39#define NXT(index) (ctxt->cur[index])
40
41#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
42#define NEXTL(l) ctxt->cur += l;
43
Daniel Veillarde19fc232002-04-22 16:01:24 +000044/**
45 * TODO:
46 *
47 * macro to flag unimplemented blocks
48 */
49#define TODO \
50 xmlGenericError(xmlGenericErrorContext, \
51 "Unimplemented block at %s:%d\n", \
52 __FILE__, __LINE__);
53
Daniel Veillard4255d502002-04-16 15:50:10 +000054
55/************************************************************************
56 * *
57 * Datatypes and structures *
58 * *
59 ************************************************************************/
60
61typedef enum {
62 XML_REGEXP_EPSILON = 1,
63 XML_REGEXP_CHARVAL,
64 XML_REGEXP_RANGES,
65 XML_REGEXP_SUBREG,
66 XML_REGEXP_STRING,
67 XML_REGEXP_ANYCHAR, /* . */
68 XML_REGEXP_ANYSPACE, /* \s */
69 XML_REGEXP_NOTSPACE, /* \S */
70 XML_REGEXP_INITNAME, /* \l */
71 XML_REGEXP_NOTINITNAME, /* \l */
72 XML_REGEXP_NAMECHAR, /* \c */
73 XML_REGEXP_NOTNAMECHAR, /* \C */
74 XML_REGEXP_DECIMAL, /* \d */
75 XML_REGEXP_NOTDECIMAL, /* \d */
76 XML_REGEXP_REALCHAR, /* \w */
77 XML_REGEXP_NOTREALCHAR, /* \w */
78 XML_REGEXP_LETTER,
79 XML_REGEXP_LETTER_UPPERCASE,
80 XML_REGEXP_LETTER_LOWERCASE,
81 XML_REGEXP_LETTER_TITLECASE,
82 XML_REGEXP_LETTER_MODIFIER,
83 XML_REGEXP_LETTER_OTHERS,
84 XML_REGEXP_MARK,
85 XML_REGEXP_MARK_NONSPACING,
86 XML_REGEXP_MARK_SPACECOMBINING,
87 XML_REGEXP_MARK_ENCLOSING,
88 XML_REGEXP_NUMBER,
89 XML_REGEXP_NUMBER_DECIMAL,
90 XML_REGEXP_NUMBER_LETTER,
91 XML_REGEXP_NUMBER_OTHERS,
92 XML_REGEXP_PUNCT,
93 XML_REGEXP_PUNCT_CONNECTOR,
94 XML_REGEXP_PUNCT_DASH,
95 XML_REGEXP_PUNCT_OPEN,
96 XML_REGEXP_PUNCT_CLOSE,
97 XML_REGEXP_PUNCT_INITQUOTE,
98 XML_REGEXP_PUNCT_FINQUOTE,
99 XML_REGEXP_PUNCT_OTHERS,
100 XML_REGEXP_SEPAR,
101 XML_REGEXP_SEPAR_SPACE,
102 XML_REGEXP_SEPAR_LINE,
103 XML_REGEXP_SEPAR_PARA,
104 XML_REGEXP_SYMBOL,
105 XML_REGEXP_SYMBOL_MATH,
106 XML_REGEXP_SYMBOL_CURRENCY,
107 XML_REGEXP_SYMBOL_MODIFIER,
108 XML_REGEXP_SYMBOL_OTHERS,
109 XML_REGEXP_OTHER,
110 XML_REGEXP_OTHER_CONTROL,
111 XML_REGEXP_OTHER_FORMAT,
112 XML_REGEXP_OTHER_PRIVATE,
113 XML_REGEXP_OTHER_NA,
114 XML_REGEXP_BLOCK_NAME
115} xmlRegAtomType;
116
117typedef enum {
118 XML_REGEXP_QUANT_EPSILON = 1,
119 XML_REGEXP_QUANT_ONCE,
120 XML_REGEXP_QUANT_OPT,
121 XML_REGEXP_QUANT_MULT,
122 XML_REGEXP_QUANT_PLUS,
Daniel Veillard7646b182002-04-20 06:41:40 +0000123 XML_REGEXP_QUANT_ONCEONLY,
124 XML_REGEXP_QUANT_ALL,
Daniel Veillard4255d502002-04-16 15:50:10 +0000125 XML_REGEXP_QUANT_RANGE
126} xmlRegQuantType;
127
128typedef enum {
129 XML_REGEXP_START_STATE = 1,
130 XML_REGEXP_FINAL_STATE,
131 XML_REGEXP_TRANS_STATE
132} xmlRegStateType;
133
134typedef enum {
135 XML_REGEXP_MARK_NORMAL = 0,
136 XML_REGEXP_MARK_START,
137 XML_REGEXP_MARK_VISITED
138} xmlRegMarkedType;
139
140typedef struct _xmlRegRange xmlRegRange;
141typedef xmlRegRange *xmlRegRangePtr;
142
143struct _xmlRegRange {
144 int neg;
145 xmlRegAtomType type;
146 int start;
147 int end;
148 xmlChar *blockName;
149};
150
151typedef struct _xmlRegAtom xmlRegAtom;
152typedef xmlRegAtom *xmlRegAtomPtr;
153
154typedef struct _xmlAutomataState xmlRegState;
155typedef xmlRegState *xmlRegStatePtr;
156
157struct _xmlRegAtom {
158 int no;
159 xmlRegAtomType type;
160 xmlRegQuantType quant;
161 int min;
162 int max;
163
164 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000165 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000166 int neg;
167 int codepoint;
168 xmlRegStatePtr start;
169 xmlRegStatePtr stop;
170 int maxRanges;
171 int nbRanges;
172 xmlRegRangePtr *ranges;
173 void *data;
174};
175
176typedef struct _xmlRegCounter xmlRegCounter;
177typedef xmlRegCounter *xmlRegCounterPtr;
178
179struct _xmlRegCounter {
180 int min;
181 int max;
182};
183
184typedef struct _xmlRegTrans xmlRegTrans;
185typedef xmlRegTrans *xmlRegTransPtr;
186
187struct _xmlRegTrans {
188 xmlRegAtomPtr atom;
189 int to;
190 int counter;
191 int count;
192};
193
194struct _xmlAutomataState {
195 xmlRegStateType type;
196 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000197 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000198 int no;
199
200 int maxTrans;
201 int nbTrans;
202 xmlRegTrans *trans;
203};
204
205typedef struct _xmlAutomata xmlRegParserCtxt;
206typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
207
208struct _xmlAutomata {
209 xmlChar *string;
210 xmlChar *cur;
211
212 int error;
213 int neg;
214
215 xmlRegStatePtr start;
216 xmlRegStatePtr end;
217 xmlRegStatePtr state;
218
219 xmlRegAtomPtr atom;
220
221 int maxAtoms;
222 int nbAtoms;
223 xmlRegAtomPtr *atoms;
224
225 int maxStates;
226 int nbStates;
227 xmlRegStatePtr *states;
228
229 int maxCounters;
230 int nbCounters;
231 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000232
233 int determinist;
Daniel Veillard4255d502002-04-16 15:50:10 +0000234};
235
236struct _xmlRegexp {
237 xmlChar *string;
238 int nbStates;
239 xmlRegStatePtr *states;
240 int nbAtoms;
241 xmlRegAtomPtr *atoms;
242 int nbCounters;
243 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000244 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000245 /*
246 * That's the compact form for determinists automatas
247 */
248 int nbstates;
249 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000250 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000251 int nbstrings;
252 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000253};
254
255typedef struct _xmlRegExecRollback xmlRegExecRollback;
256typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
257
258struct _xmlRegExecRollback {
259 xmlRegStatePtr state;/* the current state */
260 int index; /* the index in the input stack */
261 int nextbranch; /* the next transition to explore in that state */
262 int *counts; /* save the automate state if it has some */
263};
264
265typedef struct _xmlRegInputToken xmlRegInputToken;
266typedef xmlRegInputToken *xmlRegInputTokenPtr;
267
268struct _xmlRegInputToken {
269 xmlChar *value;
270 void *data;
271};
272
273struct _xmlRegExecCtxt {
274 int status; /* execution status != 0 indicate an error */
275 int determinist; /* did we found an inderterministic behaviour */
276 xmlRegexpPtr comp; /* the compiled regexp */
277 xmlRegExecCallbacks callback;
278 void *data;
279
280 xmlRegStatePtr state;/* the current state */
281 int transno; /* the current transition on that state */
282 int transcount; /* the number of char in char counted transitions */
283
284 /*
285 * A stack of rollback states
286 */
287 int maxRollbacks;
288 int nbRollbacks;
289 xmlRegExecRollback *rollbacks;
290
291 /*
292 * The state of the automata if any
293 */
294 int *counts;
295
296 /*
297 * The input stack
298 */
299 int inputStackMax;
300 int inputStackNr;
301 int index;
302 int *charStack;
303 const xmlChar *inputString; /* when operating on characters */
304 xmlRegInputTokenPtr inputStack;/* when operating on strings */
305
306};
307
Daniel Veillard441bc322002-04-20 17:38:48 +0000308#define REGEXP_ALL_COUNTER 0x123456
309#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000310
Daniel Veillard4255d502002-04-16 15:50:10 +0000311static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000312static void xmlRegFreeState(xmlRegStatePtr state);
313static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard4255d502002-04-16 15:50:10 +0000314
315/************************************************************************
316 * *
317 * Allocation/Deallocation *
318 * *
319 ************************************************************************/
320
Daniel Veillard23e73572002-09-19 19:56:43 +0000321static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000322/**
323 * xmlRegEpxFromParse:
324 * @ctxt: the parser context used to build it
325 *
326 * Allocate a new regexp and fill it with the reult from the parser
327 *
328 * Returns the new regexp or NULL in case of error
329 */
330static xmlRegexpPtr
331xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
332 xmlRegexpPtr ret;
333
334 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
335 if (ret == NULL)
336 return(NULL);
337 memset(ret, 0, sizeof(xmlRegexp));
338 ret->string = ctxt->string;
339 ctxt->string = NULL;
340 ret->nbStates = ctxt->nbStates;
341 ctxt->nbStates = 0;
342 ret->states = ctxt->states;
343 ctxt->states = NULL;
344 ret->nbAtoms = ctxt->nbAtoms;
345 ctxt->nbAtoms = 0;
346 ret->atoms = ctxt->atoms;
347 ctxt->atoms = NULL;
348 ret->nbCounters = ctxt->nbCounters;
349 ctxt->nbCounters = 0;
350 ret->counters = ctxt->counters;
351 ctxt->counters = NULL;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000352 ret->determinist = ctxt->determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000353
354 if ((ret->determinist != 0) &&
355 (ret->nbCounters == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000356 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000357 (ret->atoms[0] != NULL) &&
358 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
359 int i, j, nbstates = 0, nbatoms = 0;
360 int *stateRemap;
361 int *stringRemap;
362 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000363 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000364 xmlChar **stringMap;
365 xmlChar *value;
366
367 /*
368 * Switch to a compact representation
369 * 1/ counting the effective number of states left
370 * 2/ conting the unique number of atoms, and check that
371 * they are all of the string type
372 * 3/ build a table state x atom for the transitions
373 */
374
375 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
376 for (i = 0;i < ret->nbStates;i++) {
377 if (ret->states[i] != NULL) {
378 stateRemap[i] = nbstates;
379 nbstates++;
380 } else {
381 stateRemap[i] = -1;
382 }
383 }
384#ifdef DEBUG_COMPACTION
385 printf("Final: %d states\n", nbstates);
386#endif
387 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
388 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
389 for (i = 0;i < ret->nbAtoms;i++) {
390 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
391 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
392 value = ret->atoms[i]->valuep;
393 for (j = 0;j < nbatoms;j++) {
394 if (xmlStrEqual(stringMap[j], value)) {
395 stringRemap[i] = j;
396 break;
397 }
398 }
399 if (j >= nbatoms) {
400 stringRemap[i] = nbatoms;
401 stringMap[nbatoms] = xmlStrdup(value);
402 nbatoms++;
403 }
404 } else {
405 xmlFree(stateRemap);
406 xmlFree(stringRemap);
407 for (i = 0;i < nbatoms;i++)
408 xmlFree(stringMap[i]);
409 xmlFree(stringMap);
410 goto fail_compact;
411 }
412 }
413#ifdef DEBUG_COMPACTION
414 printf("Final: %d atoms\n", nbatoms);
415#endif
416
417 /*
418 * Allocate the transition table. The first entry for each
419 * state correspond to the state type.
420 */
421 transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int));
Daniel Veillard118aed72002-09-24 14:13:13 +0000422 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000423 memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int));
424
425 for (i = 0;i < ret->nbStates;i++) {
426 int stateno, atomno, targetno, prev;
427 xmlRegStatePtr state;
428 xmlRegTransPtr trans;
429
430 stateno = stateRemap[i];
431 if (stateno == -1)
432 continue;
433 state = ret->states[i];
434
435 transitions[stateno * (nbatoms + 1)] = state->type;
436
437 for (j = 0;j < state->nbTrans;j++) {
438 trans = &(state->trans[j]);
439 if ((trans->to == -1) || (trans->atom == NULL))
440 continue;
441 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000442 if ((trans->atom->data != NULL) && (transdata == NULL)) {
443 transdata = (void **) xmlMalloc(nbstates * nbatoms *
444 sizeof(void *));
445 if (transdata != NULL)
446 memset(transdata, 0,
447 nbstates * nbatoms * sizeof(void *));
448 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000449 targetno = stateRemap[trans->to];
450 /*
451 * if the same atome can generate transition to 2 different
452 * states then it means the automata is not determinist and
453 * the compact form can't be used !
454 */
455 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
456 if (prev != 0) {
457 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000458 ret->determinist = 0;
459#ifdef DEBUG_COMPACTION
460 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
461 i, j, trans->atom->no, trans->to, atomno, targetno);
462 printf(" previous to is %d\n", prev);
463#endif
464 ret->determinist = 0;
Daniel Veillard118aed72002-09-24 14:13:13 +0000465 if (transdata != NULL)
466 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000467 xmlFree(transitions);
468 xmlFree(stateRemap);
469 xmlFree(stringRemap);
470 for (i = 0;i < nbatoms;i++)
471 xmlFree(stringMap[i]);
472 xmlFree(stringMap);
473 goto fail_compact;
474 }
475 } else {
476#if 0
477 printf("State %d trans %d: atom %d to %d : %d to %d\n",
478 i, j, trans->atom->no, trans->to, atomno, targetno);
479#endif
480 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000481 targetno + 1; /* to avoid 0 */
482 if (transdata != NULL)
483 transdata[stateno * nbatoms + atomno] =
484 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000485 }
486 }
487 }
488 ret->determinist = 1;
489#ifdef DEBUG_COMPACTION
490 /*
491 * Debug
492 */
493 for (i = 0;i < nbstates;i++) {
494 for (j = 0;j < nbatoms + 1;j++) {
495 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
496 }
497 printf("\n");
498 }
499 printf("\n");
500#endif
501 /*
502 * Cleanup of the old data
503 */
504 if (ret->states != NULL) {
505 for (i = 0;i < ret->nbStates;i++)
506 xmlRegFreeState(ret->states[i]);
507 xmlFree(ret->states);
508 }
509 ret->states = NULL;
510 ret->nbStates = 0;
511 if (ret->atoms != NULL) {
512 for (i = 0;i < ret->nbAtoms;i++)
513 xmlRegFreeAtom(ret->atoms[i]);
514 xmlFree(ret->atoms);
515 }
516 ret->atoms = NULL;
517 ret->nbAtoms = 0;
518
519 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000520 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000521 ret->stringMap = stringMap;
522 ret->nbstrings = nbatoms;
523 ret->nbstates = nbstates;
524 xmlFree(stateRemap);
525 xmlFree(stringRemap);
526 }
527fail_compact:
Daniel Veillard4255d502002-04-16 15:50:10 +0000528 return(ret);
529}
530
531/**
532 * xmlRegNewParserCtxt:
533 * @string: the string to parse
534 *
535 * Allocate a new regexp parser context
536 *
537 * Returns the new context or NULL in case of error
538 */
539static xmlRegParserCtxtPtr
540xmlRegNewParserCtxt(const xmlChar *string) {
541 xmlRegParserCtxtPtr ret;
542
543 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
544 if (ret == NULL)
545 return(NULL);
546 memset(ret, 0, sizeof(xmlRegParserCtxt));
547 if (string != NULL)
548 ret->string = xmlStrdup(string);
549 ret->cur = ret->string;
550 ret->neg = 0;
551 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000552 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000553 return(ret);
554}
555
556/**
557 * xmlRegNewRange:
558 * @ctxt: the regexp parser context
559 * @neg: is that negative
560 * @type: the type of range
561 * @start: the start codepoint
562 * @end: the end codepoint
563 *
564 * Allocate a new regexp range
565 *
566 * Returns the new range or NULL in case of error
567 */
568static xmlRegRangePtr
569xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
570 int neg, xmlRegAtomType type, int start, int end) {
571 xmlRegRangePtr ret;
572
573 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
574 if (ret == NULL) {
575 ERROR("failed to allocate regexp range");
576 return(NULL);
577 }
578 ret->neg = neg;
579 ret->type = type;
580 ret->start = start;
581 ret->end = end;
582 return(ret);
583}
584
585/**
586 * xmlRegFreeRange:
587 * @range: the regexp range
588 *
589 * Free a regexp range
590 */
591static void
592xmlRegFreeRange(xmlRegRangePtr range) {
593 if (range == NULL)
594 return;
595
596 if (range->blockName != NULL)
597 xmlFree(range->blockName);
598 xmlFree(range);
599}
600
601/**
602 * xmlRegNewAtom:
603 * @ctxt: the regexp parser context
604 * @type: the type of atom
605 *
606 * Allocate a new regexp range
607 *
608 * Returns the new atom or NULL in case of error
609 */
610static xmlRegAtomPtr
611xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
612 xmlRegAtomPtr ret;
613
614 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
615 if (ret == NULL) {
616 ERROR("failed to allocate regexp atom");
617 return(NULL);
618 }
619 memset(ret, 0, sizeof(xmlRegAtom));
620 ret->type = type;
621 ret->quant = XML_REGEXP_QUANT_ONCE;
622 ret->min = 0;
623 ret->max = 0;
624 return(ret);
625}
626
627/**
628 * xmlRegFreeAtom:
629 * @atom: the regexp atom
630 *
631 * Free a regexp atom
632 */
633static void
634xmlRegFreeAtom(xmlRegAtomPtr atom) {
635 int i;
636
637 if (atom == NULL)
638 return;
639
640 for (i = 0;i < atom->nbRanges;i++)
641 xmlRegFreeRange(atom->ranges[i]);
642 if (atom->ranges != NULL)
643 xmlFree(atom->ranges);
644 if (atom->type == XML_REGEXP_STRING)
645 xmlFree(atom->valuep);
646 xmlFree(atom);
647}
648
649static xmlRegStatePtr
650xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
651 xmlRegStatePtr ret;
652
653 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
654 if (ret == NULL) {
655 ERROR("failed to allocate regexp state");
656 return(NULL);
657 }
658 memset(ret, 0, sizeof(xmlRegState));
659 ret->type = XML_REGEXP_TRANS_STATE;
660 ret->mark = XML_REGEXP_MARK_NORMAL;
661 return(ret);
662}
663
664/**
665 * xmlRegFreeState:
666 * @state: the regexp state
667 *
668 * Free a regexp state
669 */
670static void
671xmlRegFreeState(xmlRegStatePtr state) {
672 if (state == NULL)
673 return;
674
675 if (state->trans != NULL)
676 xmlFree(state->trans);
677 xmlFree(state);
678}
679
680/**
681 * xmlRegFreeParserCtxt:
682 * @ctxt: the regexp parser context
683 *
684 * Free a regexp parser context
685 */
686static void
687xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
688 int i;
689 if (ctxt == NULL)
690 return;
691
692 if (ctxt->string != NULL)
693 xmlFree(ctxt->string);
694 if (ctxt->states != NULL) {
695 for (i = 0;i < ctxt->nbStates;i++)
696 xmlRegFreeState(ctxt->states[i]);
697 xmlFree(ctxt->states);
698 }
699 if (ctxt->atoms != NULL) {
700 for (i = 0;i < ctxt->nbAtoms;i++)
701 xmlRegFreeAtom(ctxt->atoms[i]);
702 xmlFree(ctxt->atoms);
703 }
704 if (ctxt->counters != NULL)
705 xmlFree(ctxt->counters);
706 xmlFree(ctxt);
707}
708
709/************************************************************************
710 * *
711 * Display of Data structures *
712 * *
713 ************************************************************************/
714
715static void
716xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
717 switch (type) {
718 case XML_REGEXP_EPSILON:
719 fprintf(output, "epsilon "); break;
720 case XML_REGEXP_CHARVAL:
721 fprintf(output, "charval "); break;
722 case XML_REGEXP_RANGES:
723 fprintf(output, "ranges "); break;
724 case XML_REGEXP_SUBREG:
725 fprintf(output, "subexpr "); break;
726 case XML_REGEXP_STRING:
727 fprintf(output, "string "); break;
728 case XML_REGEXP_ANYCHAR:
729 fprintf(output, "anychar "); break;
730 case XML_REGEXP_ANYSPACE:
731 fprintf(output, "anyspace "); break;
732 case XML_REGEXP_NOTSPACE:
733 fprintf(output, "notspace "); break;
734 case XML_REGEXP_INITNAME:
735 fprintf(output, "initname "); break;
736 case XML_REGEXP_NOTINITNAME:
737 fprintf(output, "notinitname "); break;
738 case XML_REGEXP_NAMECHAR:
739 fprintf(output, "namechar "); break;
740 case XML_REGEXP_NOTNAMECHAR:
741 fprintf(output, "notnamechar "); break;
742 case XML_REGEXP_DECIMAL:
743 fprintf(output, "decimal "); break;
744 case XML_REGEXP_NOTDECIMAL:
745 fprintf(output, "notdecimal "); break;
746 case XML_REGEXP_REALCHAR:
747 fprintf(output, "realchar "); break;
748 case XML_REGEXP_NOTREALCHAR:
749 fprintf(output, "notrealchar "); break;
750 case XML_REGEXP_LETTER:
751 fprintf(output, "LETTER "); break;
752 case XML_REGEXP_LETTER_UPPERCASE:
753 fprintf(output, "LETTER_UPPERCASE "); break;
754 case XML_REGEXP_LETTER_LOWERCASE:
755 fprintf(output, "LETTER_LOWERCASE "); break;
756 case XML_REGEXP_LETTER_TITLECASE:
757 fprintf(output, "LETTER_TITLECASE "); break;
758 case XML_REGEXP_LETTER_MODIFIER:
759 fprintf(output, "LETTER_MODIFIER "); break;
760 case XML_REGEXP_LETTER_OTHERS:
761 fprintf(output, "LETTER_OTHERS "); break;
762 case XML_REGEXP_MARK:
763 fprintf(output, "MARK "); break;
764 case XML_REGEXP_MARK_NONSPACING:
765 fprintf(output, "MARK_NONSPACING "); break;
766 case XML_REGEXP_MARK_SPACECOMBINING:
767 fprintf(output, "MARK_SPACECOMBINING "); break;
768 case XML_REGEXP_MARK_ENCLOSING:
769 fprintf(output, "MARK_ENCLOSING "); break;
770 case XML_REGEXP_NUMBER:
771 fprintf(output, "NUMBER "); break;
772 case XML_REGEXP_NUMBER_DECIMAL:
773 fprintf(output, "NUMBER_DECIMAL "); break;
774 case XML_REGEXP_NUMBER_LETTER:
775 fprintf(output, "NUMBER_LETTER "); break;
776 case XML_REGEXP_NUMBER_OTHERS:
777 fprintf(output, "NUMBER_OTHERS "); break;
778 case XML_REGEXP_PUNCT:
779 fprintf(output, "PUNCT "); break;
780 case XML_REGEXP_PUNCT_CONNECTOR:
781 fprintf(output, "PUNCT_CONNECTOR "); break;
782 case XML_REGEXP_PUNCT_DASH:
783 fprintf(output, "PUNCT_DASH "); break;
784 case XML_REGEXP_PUNCT_OPEN:
785 fprintf(output, "PUNCT_OPEN "); break;
786 case XML_REGEXP_PUNCT_CLOSE:
787 fprintf(output, "PUNCT_CLOSE "); break;
788 case XML_REGEXP_PUNCT_INITQUOTE:
789 fprintf(output, "PUNCT_INITQUOTE "); break;
790 case XML_REGEXP_PUNCT_FINQUOTE:
791 fprintf(output, "PUNCT_FINQUOTE "); break;
792 case XML_REGEXP_PUNCT_OTHERS:
793 fprintf(output, "PUNCT_OTHERS "); break;
794 case XML_REGEXP_SEPAR:
795 fprintf(output, "SEPAR "); break;
796 case XML_REGEXP_SEPAR_SPACE:
797 fprintf(output, "SEPAR_SPACE "); break;
798 case XML_REGEXP_SEPAR_LINE:
799 fprintf(output, "SEPAR_LINE "); break;
800 case XML_REGEXP_SEPAR_PARA:
801 fprintf(output, "SEPAR_PARA "); break;
802 case XML_REGEXP_SYMBOL:
803 fprintf(output, "SYMBOL "); break;
804 case XML_REGEXP_SYMBOL_MATH:
805 fprintf(output, "SYMBOL_MATH "); break;
806 case XML_REGEXP_SYMBOL_CURRENCY:
807 fprintf(output, "SYMBOL_CURRENCY "); break;
808 case XML_REGEXP_SYMBOL_MODIFIER:
809 fprintf(output, "SYMBOL_MODIFIER "); break;
810 case XML_REGEXP_SYMBOL_OTHERS:
811 fprintf(output, "SYMBOL_OTHERS "); break;
812 case XML_REGEXP_OTHER:
813 fprintf(output, "OTHER "); break;
814 case XML_REGEXP_OTHER_CONTROL:
815 fprintf(output, "OTHER_CONTROL "); break;
816 case XML_REGEXP_OTHER_FORMAT:
817 fprintf(output, "OTHER_FORMAT "); break;
818 case XML_REGEXP_OTHER_PRIVATE:
819 fprintf(output, "OTHER_PRIVATE "); break;
820 case XML_REGEXP_OTHER_NA:
821 fprintf(output, "OTHER_NA "); break;
822 case XML_REGEXP_BLOCK_NAME:
823 fprintf(output, "BLOCK "); break;
824 }
825}
826
827static void
828xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
829 switch (type) {
830 case XML_REGEXP_QUANT_EPSILON:
831 fprintf(output, "epsilon "); break;
832 case XML_REGEXP_QUANT_ONCE:
833 fprintf(output, "once "); break;
834 case XML_REGEXP_QUANT_OPT:
835 fprintf(output, "? "); break;
836 case XML_REGEXP_QUANT_MULT:
837 fprintf(output, "* "); break;
838 case XML_REGEXP_QUANT_PLUS:
839 fprintf(output, "+ "); break;
840 case XML_REGEXP_QUANT_RANGE:
841 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000842 case XML_REGEXP_QUANT_ONCEONLY:
843 fprintf(output, "onceonly "); break;
844 case XML_REGEXP_QUANT_ALL:
845 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000846 }
847}
848static void
849xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
850 fprintf(output, " range: ");
851 if (range->neg)
852 fprintf(output, "negative ");
853 xmlRegPrintAtomType(output, range->type);
854 fprintf(output, "%c - %c\n", range->start, range->end);
855}
856
857static void
858xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
859 fprintf(output, " atom: ");
860 if (atom == NULL) {
861 fprintf(output, "NULL\n");
862 return;
863 }
864 xmlRegPrintAtomType(output, atom->type);
865 xmlRegPrintQuantType(output, atom->quant);
866 if (atom->quant == XML_REGEXP_QUANT_RANGE)
867 fprintf(output, "%d-%d ", atom->min, atom->max);
868 if (atom->type == XML_REGEXP_STRING)
869 fprintf(output, "'%s' ", (char *) atom->valuep);
870 if (atom->type == XML_REGEXP_CHARVAL)
871 fprintf(output, "char %c\n", atom->codepoint);
872 else if (atom->type == XML_REGEXP_RANGES) {
873 int i;
874 fprintf(output, "%d entries\n", atom->nbRanges);
875 for (i = 0; i < atom->nbRanges;i++)
876 xmlRegPrintRange(output, atom->ranges[i]);
877 } else if (atom->type == XML_REGEXP_SUBREG) {
878 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
879 } else {
880 fprintf(output, "\n");
881 }
882}
883
884static void
885xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
886 fprintf(output, " trans: ");
887 if (trans == NULL) {
888 fprintf(output, "NULL\n");
889 return;
890 }
891 if (trans->to < 0) {
892 fprintf(output, "removed\n");
893 return;
894 }
895 if (trans->counter >= 0) {
896 fprintf(output, "counted %d, ", trans->counter);
897 }
Daniel Veillard8a001f62002-04-20 07:24:11 +0000898 if (trans->count == REGEXP_ALL_COUNTER) {
899 fprintf(output, "all transition, ");
900 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +0000901 fprintf(output, "count based %d, ", trans->count);
902 }
903 if (trans->atom == NULL) {
904 fprintf(output, "epsilon to %d\n", trans->to);
905 return;
906 }
907 if (trans->atom->type == XML_REGEXP_CHARVAL)
908 fprintf(output, "char %c ", trans->atom->codepoint);
909 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
910}
911
912static void
913xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
914 int i;
915
916 fprintf(output, " state: ");
917 if (state == NULL) {
918 fprintf(output, "NULL\n");
919 return;
920 }
921 if (state->type == XML_REGEXP_START_STATE)
922 fprintf(output, "START ");
923 if (state->type == XML_REGEXP_FINAL_STATE)
924 fprintf(output, "FINAL ");
925
926 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
927 for (i = 0;i < state->nbTrans; i++) {
928 xmlRegPrintTrans(output, &(state->trans[i]));
929 }
930}
931
Daniel Veillard23e73572002-09-19 19:56:43 +0000932#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard4255d502002-04-16 15:50:10 +0000933static void
934xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
935 int i;
936
937 fprintf(output, " ctxt: ");
938 if (ctxt == NULL) {
939 fprintf(output, "NULL\n");
940 return;
941 }
942 fprintf(output, "'%s' ", ctxt->string);
943 if (ctxt->error)
944 fprintf(output, "error ");
945 if (ctxt->neg)
946 fprintf(output, "neg ");
947 fprintf(output, "\n");
948 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
949 for (i = 0;i < ctxt->nbAtoms; i++) {
950 fprintf(output, " %02d ", i);
951 xmlRegPrintAtom(output, ctxt->atoms[i]);
952 }
953 if (ctxt->atom != NULL) {
954 fprintf(output, "current atom:\n");
955 xmlRegPrintAtom(output, ctxt->atom);
956 }
957 fprintf(output, "%d states:", ctxt->nbStates);
958 if (ctxt->start != NULL)
959 fprintf(output, " start: %d", ctxt->start->no);
960 if (ctxt->end != NULL)
961 fprintf(output, " end: %d", ctxt->end->no);
962 fprintf(output, "\n");
963 for (i = 0;i < ctxt->nbStates; i++) {
964 xmlRegPrintState(output, ctxt->states[i]);
965 }
966 fprintf(output, "%d counters:\n", ctxt->nbCounters);
967 for (i = 0;i < ctxt->nbCounters; i++) {
968 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
969 ctxt->counters[i].max);
970 }
971}
Daniel Veillard23e73572002-09-19 19:56:43 +0000972#endif
Daniel Veillard4255d502002-04-16 15:50:10 +0000973
974/************************************************************************
975 * *
976 * Finite Automata structures manipulations *
977 * *
978 ************************************************************************/
979
980static void
981xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
982 int neg, xmlRegAtomType type, int start, int end,
983 xmlChar *blockName) {
984 xmlRegRangePtr range;
985
986 if (atom == NULL) {
987 ERROR("add range: atom is NULL");
988 return;
989 }
990 if (atom->type != XML_REGEXP_RANGES) {
991 ERROR("add range: atom is not ranges");
992 return;
993 }
994 if (atom->maxRanges == 0) {
995 atom->maxRanges = 4;
996 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
997 sizeof(xmlRegRangePtr));
998 if (atom->ranges == NULL) {
999 ERROR("add range: allocation failed");
1000 atom->maxRanges = 0;
1001 return;
1002 }
1003 } else if (atom->nbRanges >= atom->maxRanges) {
1004 xmlRegRangePtr *tmp;
1005 atom->maxRanges *= 2;
1006 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1007 sizeof(xmlRegRangePtr));
1008 if (tmp == NULL) {
1009 ERROR("add range: allocation failed");
1010 atom->maxRanges /= 2;
1011 return;
1012 }
1013 atom->ranges = tmp;
1014 }
1015 range = xmlRegNewRange(ctxt, neg, type, start, end);
1016 if (range == NULL)
1017 return;
1018 range->blockName = blockName;
1019 atom->ranges[atom->nbRanges++] = range;
1020
1021}
1022
1023static int
1024xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1025 if (ctxt->maxCounters == 0) {
1026 ctxt->maxCounters = 4;
1027 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1028 sizeof(xmlRegCounter));
1029 if (ctxt->counters == NULL) {
1030 ERROR("reg counter: allocation failed");
1031 ctxt->maxCounters = 0;
1032 return(-1);
1033 }
1034 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1035 xmlRegCounter *tmp;
1036 ctxt->maxCounters *= 2;
1037 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1038 sizeof(xmlRegCounter));
1039 if (tmp == NULL) {
1040 ERROR("reg counter: allocation failed");
1041 ctxt->maxCounters /= 2;
1042 return(-1);
1043 }
1044 ctxt->counters = tmp;
1045 }
1046 ctxt->counters[ctxt->nbCounters].min = -1;
1047 ctxt->counters[ctxt->nbCounters].max = -1;
1048 return(ctxt->nbCounters++);
1049}
1050
1051static void
1052xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1053 if (atom == NULL) {
1054 ERROR("atom push: atom is NULL");
1055 return;
1056 }
1057 if (ctxt->maxAtoms == 0) {
1058 ctxt->maxAtoms = 4;
1059 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1060 sizeof(xmlRegAtomPtr));
1061 if (ctxt->atoms == NULL) {
1062 ERROR("atom push: allocation failed");
1063 ctxt->maxAtoms = 0;
1064 return;
1065 }
1066 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1067 xmlRegAtomPtr *tmp;
1068 ctxt->maxAtoms *= 2;
1069 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1070 sizeof(xmlRegAtomPtr));
1071 if (tmp == NULL) {
1072 ERROR("atom push: allocation failed");
1073 ctxt->maxAtoms /= 2;
1074 return;
1075 }
1076 ctxt->atoms = tmp;
1077 }
1078 atom->no = ctxt->nbAtoms;
1079 ctxt->atoms[ctxt->nbAtoms++] = atom;
1080}
1081
1082static void
1083xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1084 xmlRegAtomPtr atom, xmlRegStatePtr target,
1085 int counter, int count) {
1086 if (state == NULL) {
1087 ERROR("add state: state is NULL");
1088 return;
1089 }
1090 if (target == NULL) {
1091 ERROR("add state: target is NULL");
1092 return;
1093 }
1094 if (state->maxTrans == 0) {
1095 state->maxTrans = 4;
1096 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1097 sizeof(xmlRegTrans));
1098 if (state->trans == NULL) {
1099 ERROR("add range: allocation failed");
1100 state->maxTrans = 0;
1101 return;
1102 }
1103 } else if (state->nbTrans >= state->maxTrans) {
1104 xmlRegTrans *tmp;
1105 state->maxTrans *= 2;
1106 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1107 sizeof(xmlRegTrans));
1108 if (tmp == NULL) {
1109 ERROR("add range: allocation failed");
1110 state->maxTrans /= 2;
1111 return;
1112 }
1113 state->trans = tmp;
1114 }
1115#ifdef DEBUG_REGEXP_GRAPH
1116 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001117 if (count == REGEXP_ALL_COUNTER)
1118 printf("all transition");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001119 else if (count >= 0)
Daniel Veillard4255d502002-04-16 15:50:10 +00001120 printf("count based %d", count);
1121 else if (counter >= 0)
1122 printf("counted %d", counter);
1123 else if (atom == NULL)
1124 printf("epsilon transition");
1125 printf("\n");
1126#endif
1127
1128 state->trans[state->nbTrans].atom = atom;
1129 state->trans[state->nbTrans].to = target->no;
1130 state->trans[state->nbTrans].counter = counter;
1131 state->trans[state->nbTrans].count = count;
1132 state->nbTrans++;
1133}
1134
1135static void
1136xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1137 if (ctxt->maxStates == 0) {
1138 ctxt->maxStates = 4;
1139 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1140 sizeof(xmlRegStatePtr));
1141 if (ctxt->states == NULL) {
1142 ERROR("add range: allocation failed");
1143 ctxt->maxStates = 0;
1144 return;
1145 }
1146 } else if (ctxt->nbStates >= ctxt->maxStates) {
1147 xmlRegStatePtr *tmp;
1148 ctxt->maxStates *= 2;
1149 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1150 sizeof(xmlRegStatePtr));
1151 if (tmp == NULL) {
1152 ERROR("add range: allocation failed");
1153 ctxt->maxStates /= 2;
1154 return;
1155 }
1156 ctxt->states = tmp;
1157 }
1158 state->no = ctxt->nbStates;
1159 ctxt->states[ctxt->nbStates++] = state;
1160}
1161
1162/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001163 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001164 * @ctxt: a regexp parser context
1165 * @from: the from state
1166 * @to: the target state or NULL for building a new one
1167 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001168 *
1169 */
1170static void
1171xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001172 xmlRegStatePtr from, xmlRegStatePtr to,
1173 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001174 if (to == NULL) {
1175 to = xmlRegNewState(ctxt);
1176 xmlRegStatePush(ctxt, to);
1177 ctxt->state = to;
1178 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001179 if (lax)
1180 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1181 else
1182 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001183}
1184
1185/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001186 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001187 * @ctxt: a regexp parser context
1188 * @from: the from state
1189 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001190 *
1191 */
1192static void
1193xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1194 xmlRegStatePtr from, xmlRegStatePtr to) {
1195 if (to == NULL) {
1196 to = xmlRegNewState(ctxt);
1197 xmlRegStatePush(ctxt, to);
1198 ctxt->state = to;
1199 }
1200 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1201}
1202
1203/**
1204 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001205 * @ctxt: a regexp parser context
1206 * @from: the from state
1207 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001208 * counter: the counter for that transition
1209 *
1210 */
1211static void
1212xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1213 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1214 if (to == NULL) {
1215 to = xmlRegNewState(ctxt);
1216 xmlRegStatePush(ctxt, to);
1217 ctxt->state = to;
1218 }
1219 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1220}
1221
1222/**
1223 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001224 * @ctxt: a regexp parser context
1225 * @from: the from state
1226 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001227 * counter: the counter for that transition
1228 *
1229 */
1230static void
1231xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1232 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1233 if (to == NULL) {
1234 to = xmlRegNewState(ctxt);
1235 xmlRegStatePush(ctxt, to);
1236 ctxt->state = to;
1237 }
1238 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1239}
1240
1241/**
1242 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001243 * @ctxt: a regexp parser context
1244 * @from: the from state
1245 * @to: the target state or NULL for building a new one
1246 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001247 *
1248 */
1249static void
1250xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1251 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1252 if (atom == NULL) {
1253 ERROR("genrate transition: atom == NULL");
1254 return;
1255 }
1256 if (atom->type == XML_REGEXP_SUBREG) {
1257 /*
1258 * this is a subexpression handling one should not need to
1259 * create a new node excep for XML_REGEXP_QUANT_RANGE.
1260 */
1261 xmlRegAtomPush(ctxt, atom);
1262 if ((to != NULL) && (atom->stop != to) &&
1263 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1264 /*
1265 * Generate an epsilon transition to link to the target
1266 */
1267 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1268 }
1269 switch (atom->quant) {
1270 case XML_REGEXP_QUANT_OPT:
1271 atom->quant = XML_REGEXP_QUANT_ONCE;
1272 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1273 break;
1274 case XML_REGEXP_QUANT_MULT:
1275 atom->quant = XML_REGEXP_QUANT_ONCE;
1276 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1277 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1278 break;
1279 case XML_REGEXP_QUANT_PLUS:
1280 atom->quant = XML_REGEXP_QUANT_ONCE;
1281 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1282 break;
1283 case XML_REGEXP_QUANT_RANGE: {
1284 int counter;
1285 xmlRegStatePtr newstate;
1286
1287 /*
1288 * This one is nasty:
1289 * 1/ register a new counter
1290 * 2/ register an epsilon transition associated to
1291 * this counter going from atom->stop to atom->start
1292 * 3/ create a new state
1293 * 4/ generate a counted transition from atom->stop to
1294 * that state
1295 */
1296 counter = xmlRegGetCounter(ctxt);
1297 ctxt->counters[counter].min = atom->min - 1;
1298 ctxt->counters[counter].max = atom->max - 1;
1299 atom->min = 0;
1300 atom->max = 0;
1301 atom->quant = XML_REGEXP_QUANT_ONCE;
1302 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1303 atom->start, counter);
1304 if (to != NULL) {
1305 newstate = to;
1306 } else {
1307 newstate = xmlRegNewState(ctxt);
1308 xmlRegStatePush(ctxt, newstate);
1309 ctxt->state = newstate;
1310 }
1311 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1312 newstate, counter);
1313 }
1314 default:
1315 break;
1316 }
1317 return;
1318 } else {
1319 if (to == NULL) {
1320 to = xmlRegNewState(ctxt);
1321 xmlRegStatePush(ctxt, to);
1322 }
1323 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1324 xmlRegAtomPush(ctxt, atom);
1325 ctxt->state = to;
1326 }
1327 switch (atom->quant) {
1328 case XML_REGEXP_QUANT_OPT:
1329 atom->quant = XML_REGEXP_QUANT_ONCE;
1330 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1331 break;
1332 case XML_REGEXP_QUANT_MULT:
1333 atom->quant = XML_REGEXP_QUANT_ONCE;
1334 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1335 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1336 break;
1337 case XML_REGEXP_QUANT_PLUS:
1338 atom->quant = XML_REGEXP_QUANT_ONCE;
1339 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1340 break;
1341 default:
1342 break;
1343 }
1344}
1345
1346/**
1347 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001348 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001349 * @fromnr: the from state
1350 * @tonr: the to state
1351 * @cpunter: should that transition be associted to a counted
1352 *
1353 */
1354static void
1355xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1356 int tonr, int counter) {
1357 int transnr;
1358 xmlRegStatePtr from;
1359 xmlRegStatePtr to;
1360
1361#ifdef DEBUG_REGEXP_GRAPH
1362 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1363#endif
1364 from = ctxt->states[fromnr];
1365 if (from == NULL)
1366 return;
1367 to = ctxt->states[tonr];
1368 if (to == NULL)
1369 return;
1370 if ((to->mark == XML_REGEXP_MARK_START) ||
1371 (to->mark == XML_REGEXP_MARK_VISITED))
1372 return;
1373
1374 to->mark = XML_REGEXP_MARK_VISITED;
1375 if (to->type == XML_REGEXP_FINAL_STATE) {
1376#ifdef DEBUG_REGEXP_GRAPH
1377 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1378#endif
1379 from->type = XML_REGEXP_FINAL_STATE;
1380 }
1381 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1382 if (to->trans[transnr].atom == NULL) {
1383 /*
1384 * Don't remove counted transitions
1385 * Don't loop either
1386 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001387 if (to->trans[transnr].to != fromnr) {
1388 if (to->trans[transnr].count >= 0) {
1389 int newto = to->trans[transnr].to;
1390
1391 xmlRegStateAddTrans(ctxt, from, NULL,
1392 ctxt->states[newto],
1393 -1, to->trans[transnr].count);
1394 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001395#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001396 printf("Found epsilon trans %d from %d to %d\n",
1397 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001398#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001399 if (to->trans[transnr].counter >= 0) {
1400 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1401 to->trans[transnr].to,
1402 to->trans[transnr].counter);
1403 } else {
1404 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1405 to->trans[transnr].to,
1406 counter);
1407 }
1408 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001409 }
1410 } else {
1411 int newto = to->trans[transnr].to;
1412
Daniel Veillardb509f152002-04-17 16:28:10 +00001413 if (to->trans[transnr].counter >= 0) {
1414 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1415 ctxt->states[newto],
1416 to->trans[transnr].counter, -1);
1417 } else {
1418 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1419 ctxt->states[newto], counter, -1);
1420 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001421 }
1422 }
1423 to->mark = XML_REGEXP_MARK_NORMAL;
1424}
1425
1426/**
1427 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001428 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001429 *
1430 */
1431static void
1432xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1433 int statenr, transnr;
1434 xmlRegStatePtr state;
1435
1436 /*
1437 * build the completed transitions bypassing the epsilons
1438 * Use a marking algorithm to avoid loops
1439 */
1440 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1441 state = ctxt->states[statenr];
1442 if (state == NULL)
1443 continue;
1444 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1445 if ((state->trans[transnr].atom == NULL) &&
1446 (state->trans[transnr].to >= 0)) {
1447 if (state->trans[transnr].to == statenr) {
1448 state->trans[transnr].to = -1;
1449#ifdef DEBUG_REGEXP_GRAPH
1450 printf("Removed loopback epsilon trans %d on %d\n",
1451 transnr, statenr);
1452#endif
1453 } else if (state->trans[transnr].count < 0) {
1454 int newto = state->trans[transnr].to;
1455
1456#ifdef DEBUG_REGEXP_GRAPH
1457 printf("Found epsilon trans %d from %d to %d\n",
1458 transnr, statenr, newto);
1459#endif
1460 state->mark = XML_REGEXP_MARK_START;
1461 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1462 newto, state->trans[transnr].counter);
1463 state->mark = XML_REGEXP_MARK_NORMAL;
1464#ifdef DEBUG_REGEXP_GRAPH
1465 } else {
1466 printf("Found counted transition %d on %d\n",
1467 transnr, statenr);
1468#endif
1469 }
1470 }
1471 }
1472 }
1473 /*
1474 * Eliminate the epsilon transitions
1475 */
1476 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1477 state = ctxt->states[statenr];
1478 if (state == NULL)
1479 continue;
1480 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1481 if ((state->trans[transnr].atom == NULL) &&
1482 (state->trans[transnr].count < 0) &&
1483 (state->trans[transnr].to >= 0)) {
1484 state->trans[transnr].to = -1;
1485 }
1486 }
1487 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001488
1489 /*
1490 * Use this pass to detect unreachable states too
1491 */
1492 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1493 state = ctxt->states[statenr];
1494 if (state != NULL)
1495 state->reached = 0;
1496 }
1497 state = ctxt->states[0];
1498 if (state != NULL)
1499 state->reached = 1;
1500 while (state != NULL) {
1501 xmlRegStatePtr target = NULL;
1502 state->reached = 2;
1503 /*
1504 * Mark all state reachable from the current reachable state
1505 */
1506 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1507 if ((state->trans[transnr].to >= 0) &&
1508 ((state->trans[transnr].atom != NULL) ||
1509 (state->trans[transnr].count >= 0))) {
1510 int newto = state->trans[transnr].to;
1511
1512 if (ctxt->states[newto] == NULL)
1513 continue;
1514 if (ctxt->states[newto]->reached == 0) {
1515 ctxt->states[newto]->reached = 1;
1516 target = ctxt->states[newto];
1517 }
1518 }
1519 }
1520 /*
1521 * find the next accessible state not explored
1522 */
1523 if (target == NULL) {
1524 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1525 state = ctxt->states[statenr];
1526 if ((state != NULL) && (state->reached == 1)) {
1527 target = state;
1528 break;
1529 }
1530 }
1531 }
1532 state = target;
1533 }
1534 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1535 state = ctxt->states[statenr];
1536 if ((state != NULL) && (state->reached == 0)) {
1537#ifdef DEBUG_REGEXP_GRAPH
1538 printf("Removed unreachable state %d\n", statenr);
1539#endif
1540 xmlRegFreeState(state);
1541 ctxt->states[statenr] = NULL;
1542 }
1543 }
1544
Daniel Veillard4255d502002-04-16 15:50:10 +00001545}
1546
Daniel Veillarde19fc232002-04-22 16:01:24 +00001547/**
1548 * xmlFACompareAtoms:
1549 * @atom1: an atom
1550 * @atom2: an atom
1551 *
1552 * Compares two atoms to check whether they are equivatents
1553 *
1554 * Returns 1 if yes and 0 otherwise
1555 */
1556static int
1557xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
1558 if (atom1 == atom2)
1559 return(1);
1560 if ((atom1 == NULL) || (atom2 == NULL))
1561 return(0);
1562
1563 if (atom1->type != atom2->type)
1564 return(0);
1565 switch (atom1->type) {
1566 case XML_REGEXP_STRING:
1567 return(xmlStrEqual((xmlChar *)atom1->valuep,
1568 (xmlChar *)atom2->valuep));
1569 case XML_REGEXP_EPSILON:
1570 return(1);
1571 case XML_REGEXP_CHARVAL:
1572 return(atom1->codepoint == atom2->codepoint);
1573 case XML_REGEXP_RANGES:
1574 TODO;
1575 return(0);
1576 default:
1577 break;
1578 }
1579 return(1);
1580}
1581
1582/**
1583 * xmlFARecurseDeterminism:
1584 * @ctxt: a regexp parser context
1585 *
1586 * Check whether the associated regexp is determinist,
1587 * should be called after xmlFAEliminateEpsilonTransitions()
1588 *
1589 */
1590static int
1591xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1592 int to, xmlRegAtomPtr atom) {
1593 int ret = 1;
1594 int transnr;
1595 xmlRegTransPtr t1;
1596
1597 if (state == NULL)
1598 return(ret);
1599 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1600 t1 = &(state->trans[transnr]);
1601 /*
1602 * check transitions conflicting with the one looked at
1603 */
1604 if (t1->atom == NULL) {
1605 if (t1->to == -1)
1606 continue;
1607 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1608 to, atom);
1609 if (ret == 0)
1610 return(0);
1611 continue;
1612 }
1613 if (t1->to != to)
1614 continue;
1615 if (xmlFACompareAtoms(t1->atom, atom))
1616 return(0);
1617 }
1618 return(ret);
1619}
1620
1621/**
1622 * xmlFAComputesDeterminism:
1623 * @ctxt: a regexp parser context
1624 *
1625 * Check whether the associated regexp is determinist,
1626 * should be called after xmlFAEliminateEpsilonTransitions()
1627 *
1628 */
1629static int
1630xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
1631 int statenr, transnr;
1632 xmlRegStatePtr state;
1633 xmlRegTransPtr t1, t2;
1634 int i;
1635 int ret = 1;
1636
Daniel Veillard4402ab42002-09-12 16:02:56 +00001637#ifdef DEBUG_REGEXP_GRAPH
1638 printf("xmlFAComputesDeterminism\n");
1639 xmlRegPrintCtxt(stdout, ctxt);
1640#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00001641 if (ctxt->determinist != -1)
1642 return(ctxt->determinist);
1643
1644 /*
1645 * Check for all states that there isn't 2 transitions
1646 * with the same atom and a different target.
1647 */
1648 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1649 state = ctxt->states[statenr];
1650 if (state == NULL)
1651 continue;
1652 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1653 t1 = &(state->trans[transnr]);
1654 /*
1655 * Determinism checks in case of counted or all transitions
1656 * will have to be handled separately
1657 */
1658 if (t1->atom == NULL)
1659 continue;
1660 if (t1->to == -1) /* eliminated */
1661 continue;
1662 for (i = 0;i < transnr;i++) {
1663 t2 = &(state->trans[i]);
1664 if (t2->to == -1) /* eliminated */
1665 continue;
1666 if (t2->atom != NULL) {
1667 if (t1->to == t2->to) {
1668 if (xmlFACompareAtoms(t1->atom, t2->atom))
1669 t2->to = -1; /* eliminate */
1670 } else {
1671 /* not determinist ! */
1672 if (xmlFACompareAtoms(t1->atom, t2->atom))
1673 ret = 0;
1674 }
1675 } else if (t1->to != -1) {
1676 /*
1677 * do the closure in case of remaining specific
1678 * epsilon transitions like choices or all
1679 */
1680 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1681 t2->to, t2->atom);
1682 if (ret == 0)
1683 return(0);
1684 }
1685 }
1686 if (ret == 0)
1687 break;
1688 }
1689 if (ret == 0)
1690 break;
1691 }
1692 ctxt->determinist = ret;
1693 return(ret);
1694}
1695
Daniel Veillard4255d502002-04-16 15:50:10 +00001696/************************************************************************
1697 * *
1698 * Routines to check input against transition atoms *
1699 * *
1700 ************************************************************************/
1701
1702static int
1703xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
1704 int start, int end, const xmlChar *blockName) {
1705 int ret = 0;
1706
1707 switch (type) {
1708 case XML_REGEXP_STRING:
1709 case XML_REGEXP_SUBREG:
1710 case XML_REGEXP_RANGES:
1711 case XML_REGEXP_EPSILON:
1712 return(-1);
1713 case XML_REGEXP_ANYCHAR:
1714 ret = ((codepoint != '\n') && (codepoint != '\r'));
1715 break;
1716 case XML_REGEXP_CHARVAL:
1717 ret = ((codepoint >= start) && (codepoint <= end));
1718 break;
1719 case XML_REGEXP_NOTSPACE:
1720 neg = !neg;
1721 case XML_REGEXP_ANYSPACE:
1722 ret = ((codepoint == '\n') || (codepoint == '\r') ||
1723 (codepoint == '\t') || (codepoint == ' '));
1724 break;
1725 case XML_REGEXP_NOTINITNAME:
1726 neg = !neg;
1727 case XML_REGEXP_INITNAME:
1728 ret = (xmlIsLetter(codepoint) ||
1729 (codepoint == '_') || (codepoint == ':'));
1730 break;
1731 case XML_REGEXP_NOTNAMECHAR:
1732 neg = !neg;
1733 case XML_REGEXP_NAMECHAR:
1734 ret = (xmlIsLetter(codepoint) || xmlIsDigit(codepoint) ||
1735 (codepoint == '.') || (codepoint == '-') ||
1736 (codepoint == '_') || (codepoint == ':') ||
1737 xmlIsCombining(codepoint) || xmlIsExtender(codepoint));
1738 break;
1739 case XML_REGEXP_NOTDECIMAL:
1740 neg = !neg;
1741 case XML_REGEXP_DECIMAL:
1742 ret = xmlUCSIsCatNd(codepoint);
1743 break;
1744 case XML_REGEXP_REALCHAR:
1745 neg = !neg;
1746 case XML_REGEXP_NOTREALCHAR:
1747 ret = xmlUCSIsCatP(codepoint);
1748 if (ret == 0)
1749 ret = xmlUCSIsCatZ(codepoint);
1750 if (ret == 0)
1751 ret = xmlUCSIsCatC(codepoint);
1752 break;
1753 case XML_REGEXP_LETTER:
1754 ret = xmlUCSIsCatL(codepoint);
1755 break;
1756 case XML_REGEXP_LETTER_UPPERCASE:
1757 ret = xmlUCSIsCatLu(codepoint);
1758 break;
1759 case XML_REGEXP_LETTER_LOWERCASE:
1760 ret = xmlUCSIsCatLl(codepoint);
1761 break;
1762 case XML_REGEXP_LETTER_TITLECASE:
1763 ret = xmlUCSIsCatLt(codepoint);
1764 break;
1765 case XML_REGEXP_LETTER_MODIFIER:
1766 ret = xmlUCSIsCatLm(codepoint);
1767 break;
1768 case XML_REGEXP_LETTER_OTHERS:
1769 ret = xmlUCSIsCatLo(codepoint);
1770 break;
1771 case XML_REGEXP_MARK:
1772 ret = xmlUCSIsCatM(codepoint);
1773 break;
1774 case XML_REGEXP_MARK_NONSPACING:
1775 ret = xmlUCSIsCatMn(codepoint);
1776 break;
1777 case XML_REGEXP_MARK_SPACECOMBINING:
1778 ret = xmlUCSIsCatMc(codepoint);
1779 break;
1780 case XML_REGEXP_MARK_ENCLOSING:
1781 ret = xmlUCSIsCatMe(codepoint);
1782 break;
1783 case XML_REGEXP_NUMBER:
1784 ret = xmlUCSIsCatN(codepoint);
1785 break;
1786 case XML_REGEXP_NUMBER_DECIMAL:
1787 ret = xmlUCSIsCatNd(codepoint);
1788 break;
1789 case XML_REGEXP_NUMBER_LETTER:
1790 ret = xmlUCSIsCatNl(codepoint);
1791 break;
1792 case XML_REGEXP_NUMBER_OTHERS:
1793 ret = xmlUCSIsCatNo(codepoint);
1794 break;
1795 case XML_REGEXP_PUNCT:
1796 ret = xmlUCSIsCatP(codepoint);
1797 break;
1798 case XML_REGEXP_PUNCT_CONNECTOR:
1799 ret = xmlUCSIsCatPc(codepoint);
1800 break;
1801 case XML_REGEXP_PUNCT_DASH:
1802 ret = xmlUCSIsCatPd(codepoint);
1803 break;
1804 case XML_REGEXP_PUNCT_OPEN:
1805 ret = xmlUCSIsCatPs(codepoint);
1806 break;
1807 case XML_REGEXP_PUNCT_CLOSE:
1808 ret = xmlUCSIsCatPe(codepoint);
1809 break;
1810 case XML_REGEXP_PUNCT_INITQUOTE:
1811 ret = xmlUCSIsCatPi(codepoint);
1812 break;
1813 case XML_REGEXP_PUNCT_FINQUOTE:
1814 ret = xmlUCSIsCatPf(codepoint);
1815 break;
1816 case XML_REGEXP_PUNCT_OTHERS:
1817 ret = xmlUCSIsCatPo(codepoint);
1818 break;
1819 case XML_REGEXP_SEPAR:
1820 ret = xmlUCSIsCatZ(codepoint);
1821 break;
1822 case XML_REGEXP_SEPAR_SPACE:
1823 ret = xmlUCSIsCatZs(codepoint);
1824 break;
1825 case XML_REGEXP_SEPAR_LINE:
1826 ret = xmlUCSIsCatZl(codepoint);
1827 break;
1828 case XML_REGEXP_SEPAR_PARA:
1829 ret = xmlUCSIsCatZp(codepoint);
1830 break;
1831 case XML_REGEXP_SYMBOL:
1832 ret = xmlUCSIsCatS(codepoint);
1833 break;
1834 case XML_REGEXP_SYMBOL_MATH:
1835 ret = xmlUCSIsCatSm(codepoint);
1836 break;
1837 case XML_REGEXP_SYMBOL_CURRENCY:
1838 ret = xmlUCSIsCatSc(codepoint);
1839 break;
1840 case XML_REGEXP_SYMBOL_MODIFIER:
1841 ret = xmlUCSIsCatSk(codepoint);
1842 break;
1843 case XML_REGEXP_SYMBOL_OTHERS:
1844 ret = xmlUCSIsCatSo(codepoint);
1845 break;
1846 case XML_REGEXP_OTHER:
1847 ret = xmlUCSIsCatC(codepoint);
1848 break;
1849 case XML_REGEXP_OTHER_CONTROL:
1850 ret = xmlUCSIsCatCc(codepoint);
1851 break;
1852 case XML_REGEXP_OTHER_FORMAT:
1853 ret = xmlUCSIsCatCf(codepoint);
1854 break;
1855 case XML_REGEXP_OTHER_PRIVATE:
1856 ret = xmlUCSIsCatCo(codepoint);
1857 break;
1858 case XML_REGEXP_OTHER_NA:
1859 /* ret = xmlUCSIsCatCn(codepoint); */
1860 /* Seems it doesn't exist anymore in recent Unicode releases */
1861 ret = 0;
1862 break;
1863 case XML_REGEXP_BLOCK_NAME:
1864 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
1865 break;
1866 }
1867 if (neg)
1868 return(!ret);
1869 return(ret);
1870}
1871
1872static int
1873xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
1874 int i, ret = 0;
1875 xmlRegRangePtr range;
1876
1877 if ((atom == NULL) || (!xmlIsChar(codepoint)))
1878 return(-1);
1879
1880 switch (atom->type) {
1881 case XML_REGEXP_SUBREG:
1882 case XML_REGEXP_EPSILON:
1883 return(-1);
1884 case XML_REGEXP_CHARVAL:
1885 return(codepoint == atom->codepoint);
1886 case XML_REGEXP_RANGES: {
1887 int accept = 0;
1888 for (i = 0;i < atom->nbRanges;i++) {
1889 range = atom->ranges[i];
1890 if (range->neg) {
1891 ret = xmlRegCheckCharacterRange(range->type, codepoint,
1892 0, range->start, range->end,
1893 range->blockName);
1894 if (ret != 0)
1895 return(0); /* excluded char */
1896 } else {
1897 ret = xmlRegCheckCharacterRange(range->type, codepoint,
1898 0, range->start, range->end,
1899 range->blockName);
1900 if (ret != 0)
1901 accept = 1; /* might still be excluded */
1902 }
1903 }
1904 return(accept);
1905 }
1906 case XML_REGEXP_STRING:
1907 printf("TODO: XML_REGEXP_STRING\n");
1908 return(-1);
1909 case XML_REGEXP_ANYCHAR:
1910 case XML_REGEXP_ANYSPACE:
1911 case XML_REGEXP_NOTSPACE:
1912 case XML_REGEXP_INITNAME:
1913 case XML_REGEXP_NOTINITNAME:
1914 case XML_REGEXP_NAMECHAR:
1915 case XML_REGEXP_NOTNAMECHAR:
1916 case XML_REGEXP_DECIMAL:
1917 case XML_REGEXP_NOTDECIMAL:
1918 case XML_REGEXP_REALCHAR:
1919 case XML_REGEXP_NOTREALCHAR:
1920 case XML_REGEXP_LETTER:
1921 case XML_REGEXP_LETTER_UPPERCASE:
1922 case XML_REGEXP_LETTER_LOWERCASE:
1923 case XML_REGEXP_LETTER_TITLECASE:
1924 case XML_REGEXP_LETTER_MODIFIER:
1925 case XML_REGEXP_LETTER_OTHERS:
1926 case XML_REGEXP_MARK:
1927 case XML_REGEXP_MARK_NONSPACING:
1928 case XML_REGEXP_MARK_SPACECOMBINING:
1929 case XML_REGEXP_MARK_ENCLOSING:
1930 case XML_REGEXP_NUMBER:
1931 case XML_REGEXP_NUMBER_DECIMAL:
1932 case XML_REGEXP_NUMBER_LETTER:
1933 case XML_REGEXP_NUMBER_OTHERS:
1934 case XML_REGEXP_PUNCT:
1935 case XML_REGEXP_PUNCT_CONNECTOR:
1936 case XML_REGEXP_PUNCT_DASH:
1937 case XML_REGEXP_PUNCT_OPEN:
1938 case XML_REGEXP_PUNCT_CLOSE:
1939 case XML_REGEXP_PUNCT_INITQUOTE:
1940 case XML_REGEXP_PUNCT_FINQUOTE:
1941 case XML_REGEXP_PUNCT_OTHERS:
1942 case XML_REGEXP_SEPAR:
1943 case XML_REGEXP_SEPAR_SPACE:
1944 case XML_REGEXP_SEPAR_LINE:
1945 case XML_REGEXP_SEPAR_PARA:
1946 case XML_REGEXP_SYMBOL:
1947 case XML_REGEXP_SYMBOL_MATH:
1948 case XML_REGEXP_SYMBOL_CURRENCY:
1949 case XML_REGEXP_SYMBOL_MODIFIER:
1950 case XML_REGEXP_SYMBOL_OTHERS:
1951 case XML_REGEXP_OTHER:
1952 case XML_REGEXP_OTHER_CONTROL:
1953 case XML_REGEXP_OTHER_FORMAT:
1954 case XML_REGEXP_OTHER_PRIVATE:
1955 case XML_REGEXP_OTHER_NA:
1956 case XML_REGEXP_BLOCK_NAME:
1957 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
1958 (const xmlChar *)atom->valuep);
1959 if (atom->neg)
1960 ret = !ret;
1961 break;
1962 }
1963 return(ret);
1964}
1965
1966/************************************************************************
1967 * *
1968 * Saving an restoring state of an execution context *
1969 * *
1970 ************************************************************************/
1971
1972#ifdef DEBUG_REGEXP_EXEC
1973static void
1974xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
1975 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
1976 if (exec->inputStack != NULL) {
1977 int i;
1978 printf(": ");
1979 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
1980 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
1981 } else {
1982 printf(": %s", &(exec->inputString[exec->index]));
1983 }
1984 printf("\n");
1985}
1986#endif
1987
1988static void
1989xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
1990#ifdef DEBUG_REGEXP_EXEC
1991 printf("saving ");
1992 exec->transno++;
1993 xmlFARegDebugExec(exec);
1994 exec->transno--;
1995#endif
1996
1997 if (exec->maxRollbacks == 0) {
1998 exec->maxRollbacks = 4;
1999 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2000 sizeof(xmlRegExecRollback));
2001 if (exec->rollbacks == NULL) {
2002 fprintf(stderr, "exec save: allocation failed");
2003 exec->maxRollbacks = 0;
2004 return;
2005 }
2006 memset(exec->rollbacks, 0,
2007 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2008 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2009 xmlRegExecRollback *tmp;
2010 int len = exec->maxRollbacks;
2011
2012 exec->maxRollbacks *= 2;
2013 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2014 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2015 if (tmp == NULL) {
2016 fprintf(stderr, "exec save: allocation failed");
2017 exec->maxRollbacks /= 2;
2018 return;
2019 }
2020 exec->rollbacks = tmp;
2021 tmp = &exec->rollbacks[len];
2022 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2023 }
2024 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2025 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2026 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2027 if (exec->comp->nbCounters > 0) {
2028 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2029 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2030 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2031 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2032 fprintf(stderr, "exec save: allocation failed");
2033 exec->status = -5;
2034 return;
2035 }
2036 }
2037 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2038 exec->comp->nbCounters * sizeof(int));
2039 }
2040 exec->nbRollbacks++;
2041}
2042
2043static void
2044xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2045 if (exec->nbRollbacks <= 0) {
2046 exec->status = -1;
2047#ifdef DEBUG_REGEXP_EXEC
2048 printf("rollback failed on empty stack\n");
2049#endif
2050 return;
2051 }
2052 exec->nbRollbacks--;
2053 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2054 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2055 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2056 if (exec->comp->nbCounters > 0) {
2057 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2058 fprintf(stderr, "exec save: allocation failed");
2059 exec->status = -6;
2060 return;
2061 }
2062 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2063 exec->comp->nbCounters * sizeof(int));
2064 }
2065
2066#ifdef DEBUG_REGEXP_EXEC
2067 printf("restored ");
2068 xmlFARegDebugExec(exec);
2069#endif
2070}
2071
2072/************************************************************************
2073 * *
2074 * Verifyer, running an input against a compiled regexp *
2075 * *
2076 ************************************************************************/
2077
2078static int
2079xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2080 xmlRegExecCtxt execval;
2081 xmlRegExecCtxtPtr exec = &execval;
2082 int ret, codepoint, len;
2083
2084 exec->inputString = content;
2085 exec->index = 0;
2086 exec->determinist = 1;
2087 exec->maxRollbacks = 0;
2088 exec->nbRollbacks = 0;
2089 exec->rollbacks = NULL;
2090 exec->status = 0;
2091 exec->comp = comp;
2092 exec->state = comp->states[0];
2093 exec->transno = 0;
2094 exec->transcount = 0;
2095 if (comp->nbCounters > 0) {
2096 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2097 if (exec->counts == NULL)
2098 return(-1);
2099 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2100 } else
2101 exec->counts = NULL;
2102 while ((exec->status == 0) &&
2103 ((exec->inputString[exec->index] != 0) ||
2104 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2105 xmlRegTransPtr trans;
2106 xmlRegAtomPtr atom;
2107
2108 /*
2109 * End of input on non-terminal state, rollback, however we may
2110 * still have epsilon like transition for counted transitions
2111 * on counters, in that case don't break too early.
2112 */
2113 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2114 goto rollback;
2115
2116 exec->transcount = 0;
2117 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2118 trans = &exec->state->trans[exec->transno];
2119 if (trans->to < 0)
2120 continue;
2121 atom = trans->atom;
2122 ret = 0;
2123 if (trans->count >= 0) {
2124 int count;
2125 xmlRegCounterPtr counter;
2126
2127 /*
2128 * A counted transition.
2129 */
2130
2131 count = exec->counts[trans->count];
2132 counter = &exec->comp->counters[trans->count];
2133#ifdef DEBUG_REGEXP_EXEC
2134 printf("testing count %d: val %d, min %d, max %d\n",
2135 trans->count, count, counter->min, counter->max);
2136#endif
2137 ret = ((count >= counter->min) && (count <= counter->max));
2138 } else if (atom == NULL) {
2139 fprintf(stderr, "epsilon transition left at runtime\n");
2140 exec->status = -2;
2141 break;
2142 } else if (exec->inputString[exec->index] != 0) {
2143 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2144 ret = xmlRegCheckCharacter(atom, codepoint);
2145 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2146 xmlRegStatePtr to = comp->states[trans->to];
2147
2148 /*
2149 * this is a multiple input sequence
2150 */
2151 if (exec->state->nbTrans > exec->transno + 1) {
2152 xmlFARegExecSave(exec);
2153 }
2154 exec->transcount = 1;
2155 do {
2156 /*
2157 * Try to progress as much as possible on the input
2158 */
2159 if (exec->transcount == atom->max) {
2160 break;
2161 }
2162 exec->index += len;
2163 /*
2164 * End of input: stop here
2165 */
2166 if (exec->inputString[exec->index] == 0) {
2167 exec->index -= len;
2168 break;
2169 }
2170 if (exec->transcount >= atom->min) {
2171 int transno = exec->transno;
2172 xmlRegStatePtr state = exec->state;
2173
2174 /*
2175 * The transition is acceptable save it
2176 */
2177 exec->transno = -1; /* trick */
2178 exec->state = to;
2179 xmlFARegExecSave(exec);
2180 exec->transno = transno;
2181 exec->state = state;
2182 }
2183 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2184 len);
2185 ret = xmlRegCheckCharacter(atom, codepoint);
2186 exec->transcount++;
2187 } while (ret == 1);
2188 if (exec->transcount < atom->min)
2189 ret = 0;
2190
2191 /*
2192 * If the last check failed but one transition was found
2193 * possible, rollback
2194 */
2195 if (ret < 0)
2196 ret = 0;
2197 if (ret == 0) {
2198 goto rollback;
2199 }
2200 }
2201 }
2202 if (ret == 1) {
2203 if (exec->state->nbTrans > exec->transno + 1) {
2204 xmlFARegExecSave(exec);
2205 }
2206 if (trans->counter >= 0) {
2207#ifdef DEBUG_REGEXP_EXEC
2208 printf("Increasing count %d\n", trans->counter);
2209#endif
2210 exec->counts[trans->counter]++;
2211 }
2212#ifdef DEBUG_REGEXP_EXEC
2213 printf("entering state %d\n", trans->to);
2214#endif
2215 exec->state = comp->states[trans->to];
2216 exec->transno = 0;
2217 if (trans->atom != NULL) {
2218 exec->index += len;
2219 }
2220 goto progress;
2221 } else if (ret < 0) {
2222 exec->status = -4;
2223 break;
2224 }
2225 }
2226 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2227rollback:
2228 /*
2229 * Failed to find a way out
2230 */
2231 exec->determinist = 0;
2232 xmlFARegExecRollBack(exec);
2233 }
2234progress:
2235 continue;
2236 }
2237 if (exec->rollbacks != NULL) {
2238 if (exec->counts != NULL) {
2239 int i;
2240
2241 for (i = 0;i < exec->maxRollbacks;i++)
2242 if (exec->rollbacks[i].counts != NULL)
2243 xmlFree(exec->rollbacks[i].counts);
2244 }
2245 xmlFree(exec->rollbacks);
2246 }
2247 if (exec->counts != NULL)
2248 xmlFree(exec->counts);
2249 if (exec->status == 0)
2250 return(1);
2251 if (exec->status == -1)
2252 return(0);
2253 return(exec->status);
2254}
2255
2256/************************************************************************
2257 * *
2258 * Progressive interface to the verifyer one atom at a time *
2259 * *
2260 ************************************************************************/
2261
2262/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002263 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00002264 * @comp: a precompiled regular expression
2265 * @callback: a callback function used for handling progresses in the
2266 * automata matching phase
2267 * @data: the context data associated to the callback in this context
2268 *
2269 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00002270 *
2271 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00002272 */
2273xmlRegExecCtxtPtr
2274xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
2275 xmlRegExecCtxtPtr exec;
2276
2277 if (comp == NULL)
2278 return(NULL);
2279 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
2280 if (exec == NULL) {
2281 return(NULL);
2282 }
2283 memset(exec, 0, sizeof(xmlRegExecCtxt));
2284 exec->inputString = NULL;
2285 exec->index = 0;
2286 exec->determinist = 1;
2287 exec->maxRollbacks = 0;
2288 exec->nbRollbacks = 0;
2289 exec->rollbacks = NULL;
2290 exec->status = 0;
2291 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00002292 if (comp->compact == NULL)
2293 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00002294 exec->transno = 0;
2295 exec->transcount = 0;
2296 exec->callback = callback;
2297 exec->data = data;
2298 if (comp->nbCounters > 0) {
2299 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2300 if (exec->counts == NULL) {
2301 xmlFree(exec);
2302 return(NULL);
2303 }
2304 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2305 } else
2306 exec->counts = NULL;
2307 exec->inputStackMax = 0;
2308 exec->inputStackNr = 0;
2309 exec->inputStack = NULL;
2310 return(exec);
2311}
2312
2313/**
2314 * xmlRegFreeExecCtxt:
2315 * @exec: a regular expression evaulation context
2316 *
2317 * Free the structures associated to a regular expression evaulation context.
2318 */
2319void
2320xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
2321 if (exec == NULL)
2322 return;
2323
2324 if (exec->rollbacks != NULL) {
2325 if (exec->counts != NULL) {
2326 int i;
2327
2328 for (i = 0;i < exec->maxRollbacks;i++)
2329 if (exec->rollbacks[i].counts != NULL)
2330 xmlFree(exec->rollbacks[i].counts);
2331 }
2332 xmlFree(exec->rollbacks);
2333 }
2334 if (exec->counts != NULL)
2335 xmlFree(exec->counts);
2336 if (exec->inputStack != NULL) {
2337 int i;
2338
Daniel Veillard32370232002-10-16 14:08:14 +00002339 for (i = 0;i < exec->inputStackNr;i++) {
2340 if (exec->inputStack[i].value != NULL)
2341 xmlFree(exec->inputStack[i].value);
2342 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002343 xmlFree(exec->inputStack);
2344 }
2345 xmlFree(exec);
2346}
2347
2348static void
2349xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2350 void *data) {
2351#ifdef DEBUG_PUSH
2352 printf("saving value: %d:%s\n", exec->inputStackNr, value);
2353#endif
2354 if (exec->inputStackMax == 0) {
2355 exec->inputStackMax = 4;
2356 exec->inputStack = (xmlRegInputTokenPtr)
2357 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
2358 if (exec->inputStack == NULL) {
2359 fprintf(stderr, "push input: allocation failed");
2360 exec->inputStackMax = 0;
2361 return;
2362 }
2363 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
2364 xmlRegInputTokenPtr tmp;
2365
2366 exec->inputStackMax *= 2;
2367 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
2368 exec->inputStackMax * sizeof(xmlRegInputToken));
2369 if (tmp == NULL) {
2370 fprintf(stderr, "push input: allocation failed");
2371 exec->inputStackMax /= 2;
2372 return;
2373 }
2374 exec->inputStack = tmp;
2375 }
2376 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
2377 exec->inputStack[exec->inputStackNr].data = data;
2378 exec->inputStackNr++;
2379 exec->inputStack[exec->inputStackNr].value = NULL;
2380 exec->inputStack[exec->inputStackNr].data = NULL;
2381}
2382
2383
2384/**
Daniel Veillard23e73572002-09-19 19:56:43 +00002385 * xmlRegCompactPushString:
2386 * @exec: a regexp execution context
2387 * @comp: the precompiled exec with a compact table
2388 * @value: a string token input
2389 * @data: data associated to the token to reuse in callbacks
2390 *
2391 * Push one input token in the execution context
2392 *
2393 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2394 * a negative value in case of error.
2395 */
2396static int
2397xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
2398 xmlRegexpPtr comp,
2399 const xmlChar *value,
2400 void *data) {
2401 int state = exec->index;
2402 int i, target;
2403
2404 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
2405 return(-1);
2406
2407 if (value == NULL) {
2408 /*
2409 * are we at a final state ?
2410 */
2411 if (comp->compact[state * (comp->nbstrings + 1)] ==
2412 XML_REGEXP_FINAL_STATE)
2413 return(1);
2414 return(0);
2415 }
2416
2417#ifdef DEBUG_PUSH
2418 printf("value pushed: %s\n", value);
2419#endif
2420
2421 /*
2422 * Examine all outside transition from current state
2423 */
2424 for (i = 0;i < comp->nbstrings;i++) {
2425 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
2426 if ((target > 0) && (target <= comp->nbstates)) {
2427 target--; /* to avoid 0 */
2428 if (xmlStrEqual(comp->stringMap[i], value)) {
2429 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00002430 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
2431 exec->callback(exec->data, value,
2432 comp->transdata[state * comp->nbstrings + i], data);
2433 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002434#ifdef DEBUG_PUSH
2435 printf("entering state %d\n", target);
2436#endif
2437 if (comp->compact[target * (comp->nbstrings + 1)] ==
2438 XML_REGEXP_FINAL_STATE)
2439 return(1);
2440 return(0);
2441 }
2442 }
2443 }
2444 /*
2445 * Failed to find an exit transition out from current state for the
2446 * current token
2447 */
2448#ifdef DEBUG_PUSH
2449 printf("failed to find a transition for %s on state %d\n", value, state);
2450#endif
2451 exec->status = -1;
2452 return(-1);
2453}
2454
2455/**
Daniel Veillard4255d502002-04-16 15:50:10 +00002456 * xmlRegExecPushString:
Daniel Veillardea7751d2002-12-20 00:16:24 +00002457 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00002458 * @value: a string token input
2459 * @data: data associated to the token to reuse in callbacks
2460 *
2461 * Push one input token in the execution context
2462 *
2463 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2464 * a negative value in case of error.
2465 */
2466int
2467xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2468 void *data) {
2469 xmlRegTransPtr trans;
2470 xmlRegAtomPtr atom;
2471 int ret;
2472 int final = 0;
2473
2474 if (exec == NULL)
2475 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00002476 if (exec->comp == NULL)
2477 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00002478 if (exec->status != 0)
2479 return(exec->status);
2480
Daniel Veillard23e73572002-09-19 19:56:43 +00002481 if (exec->comp->compact != NULL)
2482 return(xmlRegCompactPushString(exec, exec->comp, value, data));
2483
Daniel Veillard4255d502002-04-16 15:50:10 +00002484 if (value == NULL) {
2485 if (exec->state->type == XML_REGEXP_FINAL_STATE)
2486 return(1);
2487 final = 1;
2488 }
2489
2490#ifdef DEBUG_PUSH
2491 printf("value pushed: %s\n", value);
2492#endif
2493 /*
2494 * If we have an active rollback stack push the new value there
2495 * and get back to where we were left
2496 */
2497 if ((value != NULL) && (exec->inputStackNr > 0)) {
2498 xmlFARegExecSaveInputString(exec, value, data);
2499 value = exec->inputStack[exec->index].value;
2500 data = exec->inputStack[exec->index].data;
2501#ifdef DEBUG_PUSH
2502 printf("value loaded: %s\n", value);
2503#endif
2504 }
2505
2506 while ((exec->status == 0) &&
2507 ((value != NULL) ||
2508 ((final == 1) &&
2509 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
2510
2511 /*
2512 * End of input on non-terminal state, rollback, however we may
2513 * still have epsilon like transition for counted transitions
2514 * on counters, in that case don't break too early.
2515 */
Daniel Veillardb509f152002-04-17 16:28:10 +00002516 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00002517 goto rollback;
2518
2519 exec->transcount = 0;
2520 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2521 trans = &exec->state->trans[exec->transno];
2522 if (trans->to < 0)
2523 continue;
2524 atom = trans->atom;
2525 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00002526 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
2527 int i;
2528 int count;
2529 xmlRegTransPtr t;
2530 xmlRegCounterPtr counter;
2531
2532 ret = 0;
2533
2534#ifdef DEBUG_PUSH
2535 printf("testing all lax %d\n", trans->count);
2536#endif
2537 /*
2538 * Check all counted transitions from the current state
2539 */
2540 if ((value == NULL) && (final)) {
2541 ret = 1;
2542 } else if (value != NULL) {
2543 for (i = 0;i < exec->state->nbTrans;i++) {
2544 t = &exec->state->trans[i];
2545 if ((t->counter < 0) || (t == trans))
2546 continue;
2547 counter = &exec->comp->counters[t->counter];
2548 count = exec->counts[t->counter];
2549 if ((count < counter->max) &&
2550 (t->atom != NULL) &&
2551 (xmlStrEqual(value, t->atom->valuep))) {
2552 ret = 0;
2553 break;
2554 }
2555 if ((count >= counter->min) &&
2556 (count < counter->max) &&
2557 (xmlStrEqual(value, t->atom->valuep))) {
2558 ret = 1;
2559 break;
2560 }
2561 }
2562 }
2563 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00002564 int i;
2565 int count;
2566 xmlRegTransPtr t;
2567 xmlRegCounterPtr counter;
2568
2569 ret = 1;
2570
2571#ifdef DEBUG_PUSH
2572 printf("testing all %d\n", trans->count);
2573#endif
2574 /*
2575 * Check all counted transitions from the current state
2576 */
2577 for (i = 0;i < exec->state->nbTrans;i++) {
2578 t = &exec->state->trans[i];
2579 if ((t->counter < 0) || (t == trans))
2580 continue;
2581 counter = &exec->comp->counters[t->counter];
2582 count = exec->counts[t->counter];
2583 if ((count < counter->min) || (count > counter->max)) {
2584 ret = 0;
2585 break;
2586 }
2587 }
2588 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002589 int count;
2590 xmlRegCounterPtr counter;
2591
2592 /*
2593 * A counted transition.
2594 */
2595
2596 count = exec->counts[trans->count];
2597 counter = &exec->comp->counters[trans->count];
2598#ifdef DEBUG_PUSH
2599 printf("testing count %d: val %d, min %d, max %d\n",
2600 trans->count, count, counter->min, counter->max);
2601#endif
2602 ret = ((count >= counter->min) && (count <= counter->max));
2603 } else if (atom == NULL) {
2604 fprintf(stderr, "epsilon transition left at runtime\n");
2605 exec->status = -2;
2606 break;
2607 } else if (value != NULL) {
2608 ret = xmlStrEqual(value, atom->valuep);
Daniel Veillard441bc322002-04-20 17:38:48 +00002609 if ((ret == 1) && (trans->counter >= 0)) {
2610 xmlRegCounterPtr counter;
2611 int count;
2612
2613 count = exec->counts[trans->counter];
2614 counter = &exec->comp->counters[trans->counter];
2615 if (count >= counter->max)
2616 ret = 0;
2617 }
2618
Daniel Veillard4255d502002-04-16 15:50:10 +00002619 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2620 xmlRegStatePtr to = exec->comp->states[trans->to];
2621
2622 /*
2623 * this is a multiple input sequence
2624 */
2625 if (exec->state->nbTrans > exec->transno + 1) {
2626 if (exec->inputStackNr <= 0) {
2627 xmlFARegExecSaveInputString(exec, value, data);
2628 }
2629 xmlFARegExecSave(exec);
2630 }
2631 exec->transcount = 1;
2632 do {
2633 /*
2634 * Try to progress as much as possible on the input
2635 */
2636 if (exec->transcount == atom->max) {
2637 break;
2638 }
2639 exec->index++;
2640 value = exec->inputStack[exec->index].value;
2641 data = exec->inputStack[exec->index].data;
2642#ifdef DEBUG_PUSH
2643 printf("value loaded: %s\n", value);
2644#endif
2645
2646 /*
2647 * End of input: stop here
2648 */
2649 if (value == NULL) {
2650 exec->index --;
2651 break;
2652 }
2653 if (exec->transcount >= atom->min) {
2654 int transno = exec->transno;
2655 xmlRegStatePtr state = exec->state;
2656
2657 /*
2658 * The transition is acceptable save it
2659 */
2660 exec->transno = -1; /* trick */
2661 exec->state = to;
2662 if (exec->inputStackNr <= 0) {
2663 xmlFARegExecSaveInputString(exec, value, data);
2664 }
2665 xmlFARegExecSave(exec);
2666 exec->transno = transno;
2667 exec->state = state;
2668 }
2669 ret = xmlStrEqual(value, atom->valuep);
2670 exec->transcount++;
2671 } while (ret == 1);
2672 if (exec->transcount < atom->min)
2673 ret = 0;
2674
2675 /*
2676 * If the last check failed but one transition was found
2677 * possible, rollback
2678 */
2679 if (ret < 0)
2680 ret = 0;
2681 if (ret == 0) {
2682 goto rollback;
2683 }
2684 }
2685 }
2686 if (ret == 1) {
2687 if ((exec->callback != NULL) && (atom != NULL)) {
2688 exec->callback(exec->data, atom->valuep,
2689 atom->data, data);
2690 }
2691 if (exec->state->nbTrans > exec->transno + 1) {
2692 if (exec->inputStackNr <= 0) {
2693 xmlFARegExecSaveInputString(exec, value, data);
2694 }
2695 xmlFARegExecSave(exec);
2696 }
2697 if (trans->counter >= 0) {
2698#ifdef DEBUG_PUSH
2699 printf("Increasing count %d\n", trans->counter);
2700#endif
2701 exec->counts[trans->counter]++;
2702 }
2703#ifdef DEBUG_PUSH
2704 printf("entering state %d\n", trans->to);
2705#endif
2706 exec->state = exec->comp->states[trans->to];
2707 exec->transno = 0;
2708 if (trans->atom != NULL) {
2709 if (exec->inputStack != NULL) {
2710 exec->index++;
2711 if (exec->index < exec->inputStackNr) {
2712 value = exec->inputStack[exec->index].value;
2713 data = exec->inputStack[exec->index].data;
2714#ifdef DEBUG_PUSH
2715 printf("value loaded: %s\n", value);
2716#endif
2717 } else {
2718 value = NULL;
2719 data = NULL;
2720#ifdef DEBUG_PUSH
2721 printf("end of input\n");
2722#endif
2723 }
2724 } else {
2725 value = NULL;
2726 data = NULL;
2727#ifdef DEBUG_PUSH
2728 printf("end of input\n");
2729#endif
2730 }
2731 }
2732 goto progress;
2733 } else if (ret < 0) {
2734 exec->status = -4;
2735 break;
2736 }
2737 }
2738 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2739rollback:
2740 /*
2741 * Failed to find a way out
2742 */
2743 exec->determinist = 0;
2744 xmlFARegExecRollBack(exec);
2745 if (exec->status == 0) {
2746 value = exec->inputStack[exec->index].value;
2747 data = exec->inputStack[exec->index].data;
2748#ifdef DEBUG_PUSH
2749 printf("value loaded: %s\n", value);
2750#endif
2751 }
2752 }
2753progress:
2754 continue;
2755 }
2756 if (exec->status == 0) {
2757 return(exec->state->type == XML_REGEXP_FINAL_STATE);
2758 }
2759 return(exec->status);
2760}
2761
2762#if 0
2763static int
2764xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
2765 xmlRegTransPtr trans;
2766 xmlRegAtomPtr atom;
2767 int ret;
2768 int codepoint, len;
2769
2770 if (exec == NULL)
2771 return(-1);
2772 if (exec->status != 0)
2773 return(exec->status);
2774
2775 while ((exec->status == 0) &&
2776 ((exec->inputString[exec->index] != 0) ||
2777 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2778
2779 /*
2780 * End of input on non-terminal state, rollback, however we may
2781 * still have epsilon like transition for counted transitions
2782 * on counters, in that case don't break too early.
2783 */
2784 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2785 goto rollback;
2786
2787 exec->transcount = 0;
2788 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2789 trans = &exec->state->trans[exec->transno];
2790 if (trans->to < 0)
2791 continue;
2792 atom = trans->atom;
2793 ret = 0;
2794 if (trans->count >= 0) {
2795 int count;
2796 xmlRegCounterPtr counter;
2797
2798 /*
2799 * A counted transition.
2800 */
2801
2802 count = exec->counts[trans->count];
2803 counter = &exec->comp->counters[trans->count];
2804#ifdef DEBUG_REGEXP_EXEC
2805 printf("testing count %d: val %d, min %d, max %d\n",
2806 trans->count, count, counter->min, counter->max);
2807#endif
2808 ret = ((count >= counter->min) && (count <= counter->max));
2809 } else if (atom == NULL) {
2810 fprintf(stderr, "epsilon transition left at runtime\n");
2811 exec->status = -2;
2812 break;
2813 } else if (exec->inputString[exec->index] != 0) {
2814 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2815 ret = xmlRegCheckCharacter(atom, codepoint);
2816 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2817 xmlRegStatePtr to = exec->comp->states[trans->to];
2818
2819 /*
2820 * this is a multiple input sequence
2821 */
2822 if (exec->state->nbTrans > exec->transno + 1) {
2823 xmlFARegExecSave(exec);
2824 }
2825 exec->transcount = 1;
2826 do {
2827 /*
2828 * Try to progress as much as possible on the input
2829 */
2830 if (exec->transcount == atom->max) {
2831 break;
2832 }
2833 exec->index += len;
2834 /*
2835 * End of input: stop here
2836 */
2837 if (exec->inputString[exec->index] == 0) {
2838 exec->index -= len;
2839 break;
2840 }
2841 if (exec->transcount >= atom->min) {
2842 int transno = exec->transno;
2843 xmlRegStatePtr state = exec->state;
2844
2845 /*
2846 * The transition is acceptable save it
2847 */
2848 exec->transno = -1; /* trick */
2849 exec->state = to;
2850 xmlFARegExecSave(exec);
2851 exec->transno = transno;
2852 exec->state = state;
2853 }
2854 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2855 len);
2856 ret = xmlRegCheckCharacter(atom, codepoint);
2857 exec->transcount++;
2858 } while (ret == 1);
2859 if (exec->transcount < atom->min)
2860 ret = 0;
2861
2862 /*
2863 * If the last check failed but one transition was found
2864 * possible, rollback
2865 */
2866 if (ret < 0)
2867 ret = 0;
2868 if (ret == 0) {
2869 goto rollback;
2870 }
2871 }
2872 }
2873 if (ret == 1) {
2874 if (exec->state->nbTrans > exec->transno + 1) {
2875 xmlFARegExecSave(exec);
2876 }
2877 if (trans->counter >= 0) {
2878#ifdef DEBUG_REGEXP_EXEC
2879 printf("Increasing count %d\n", trans->counter);
2880#endif
2881 exec->counts[trans->counter]++;
2882 }
2883#ifdef DEBUG_REGEXP_EXEC
2884 printf("entering state %d\n", trans->to);
2885#endif
2886 exec->state = exec->comp->states[trans->to];
2887 exec->transno = 0;
2888 if (trans->atom != NULL) {
2889 exec->index += len;
2890 }
2891 goto progress;
2892 } else if (ret < 0) {
2893 exec->status = -4;
2894 break;
2895 }
2896 }
2897 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2898rollback:
2899 /*
2900 * Failed to find a way out
2901 */
2902 exec->determinist = 0;
2903 xmlFARegExecRollBack(exec);
2904 }
2905progress:
2906 continue;
2907 }
2908}
2909#endif
2910/************************************************************************
2911 * *
2912 * Parser for the Shemas Datatype Regular Expressions *
2913 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
2914 * *
2915 ************************************************************************/
2916
2917/**
2918 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00002919 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00002920 *
2921 * [10] Char ::= [^.\?*+()|#x5B#x5D]
2922 */
2923static int
2924xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
2925 int cur;
2926 int len;
2927
2928 cur = CUR_SCHAR(ctxt->cur, len);
2929 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
2930 (cur == '*') || (cur == '+') || (cur == '(') ||
2931 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
2932 (cur == 0x5D) || (cur == 0))
2933 return(-1);
2934 return(cur);
2935}
2936
2937/**
2938 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00002939 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00002940 *
2941 * [27] charProp ::= IsCategory | IsBlock
2942 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
2943 * Separators | Symbols | Others
2944 * [29] Letters ::= 'L' [ultmo]?
2945 * [30] Marks ::= 'M' [nce]?
2946 * [31] Numbers ::= 'N' [dlo]?
2947 * [32] Punctuation ::= 'P' [cdseifo]?
2948 * [33] Separators ::= 'Z' [slp]?
2949 * [34] Symbols ::= 'S' [mcko]?
2950 * [35] Others ::= 'C' [cfon]?
2951 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
2952 */
2953static void
2954xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
2955 int cur;
2956 xmlRegAtomType type = 0;
2957 xmlChar *blockName = NULL;
2958
2959 cur = CUR;
2960 if (cur == 'L') {
2961 NEXT;
2962 cur = CUR;
2963 if (cur == 'u') {
2964 NEXT;
2965 type = XML_REGEXP_LETTER_UPPERCASE;
2966 } else if (cur == 'l') {
2967 NEXT;
2968 type = XML_REGEXP_LETTER_LOWERCASE;
2969 } else if (cur == 't') {
2970 NEXT;
2971 type = XML_REGEXP_LETTER_TITLECASE;
2972 } else if (cur == 'm') {
2973 NEXT;
2974 type = XML_REGEXP_LETTER_MODIFIER;
2975 } else if (cur == 'o') {
2976 NEXT;
2977 type = XML_REGEXP_LETTER_OTHERS;
2978 } else {
2979 type = XML_REGEXP_LETTER;
2980 }
2981 } else if (cur == 'M') {
2982 NEXT;
2983 cur = CUR;
2984 if (cur == 'n') {
2985 NEXT;
2986 /* nonspacing */
2987 type = XML_REGEXP_MARK_NONSPACING;
2988 } else if (cur == 'c') {
2989 NEXT;
2990 /* spacing combining */
2991 type = XML_REGEXP_MARK_SPACECOMBINING;
2992 } else if (cur == 'e') {
2993 NEXT;
2994 /* enclosing */
2995 type = XML_REGEXP_MARK_ENCLOSING;
2996 } else {
2997 /* all marks */
2998 type = XML_REGEXP_MARK;
2999 }
3000 } else if (cur == 'N') {
3001 NEXT;
3002 cur = CUR;
3003 if (cur == 'd') {
3004 NEXT;
3005 /* digital */
3006 type = XML_REGEXP_NUMBER_DECIMAL;
3007 } else if (cur == 'l') {
3008 NEXT;
3009 /* letter */
3010 type = XML_REGEXP_NUMBER_LETTER;
3011 } else if (cur == 'o') {
3012 NEXT;
3013 /* other */
3014 type = XML_REGEXP_NUMBER_OTHERS;
3015 } else {
3016 /* all numbers */
3017 type = XML_REGEXP_NUMBER;
3018 }
3019 } else if (cur == 'P') {
3020 NEXT;
3021 cur = CUR;
3022 if (cur == 'c') {
3023 NEXT;
3024 /* connector */
3025 type = XML_REGEXP_PUNCT_CONNECTOR;
3026 } else if (cur == 'd') {
3027 NEXT;
3028 /* dash */
3029 type = XML_REGEXP_PUNCT_DASH;
3030 } else if (cur == 's') {
3031 NEXT;
3032 /* open */
3033 type = XML_REGEXP_PUNCT_OPEN;
3034 } else if (cur == 'e') {
3035 NEXT;
3036 /* close */
3037 type = XML_REGEXP_PUNCT_CLOSE;
3038 } else if (cur == 'i') {
3039 NEXT;
3040 /* initial quote */
3041 type = XML_REGEXP_PUNCT_INITQUOTE;
3042 } else if (cur == 'f') {
3043 NEXT;
3044 /* final quote */
3045 type = XML_REGEXP_PUNCT_FINQUOTE;
3046 } else if (cur == 'o') {
3047 NEXT;
3048 /* other */
3049 type = XML_REGEXP_PUNCT_OTHERS;
3050 } else {
3051 /* all punctuation */
3052 type = XML_REGEXP_PUNCT;
3053 }
3054 } else if (cur == 'Z') {
3055 NEXT;
3056 cur = CUR;
3057 if (cur == 's') {
3058 NEXT;
3059 /* space */
3060 type = XML_REGEXP_SEPAR_SPACE;
3061 } else if (cur == 'l') {
3062 NEXT;
3063 /* line */
3064 type = XML_REGEXP_SEPAR_LINE;
3065 } else if (cur == 'p') {
3066 NEXT;
3067 /* paragraph */
3068 type = XML_REGEXP_SEPAR_PARA;
3069 } else {
3070 /* all separators */
3071 type = XML_REGEXP_SEPAR;
3072 }
3073 } else if (cur == 'S') {
3074 NEXT;
3075 cur = CUR;
3076 if (cur == 'm') {
3077 NEXT;
3078 type = XML_REGEXP_SYMBOL_MATH;
3079 /* math */
3080 } else if (cur == 'c') {
3081 NEXT;
3082 type = XML_REGEXP_SYMBOL_CURRENCY;
3083 /* currency */
3084 } else if (cur == 'k') {
3085 NEXT;
3086 type = XML_REGEXP_SYMBOL_MODIFIER;
3087 /* modifiers */
3088 } else if (cur == 'o') {
3089 NEXT;
3090 type = XML_REGEXP_SYMBOL_OTHERS;
3091 /* other */
3092 } else {
3093 /* all symbols */
3094 type = XML_REGEXP_SYMBOL;
3095 }
3096 } else if (cur == 'C') {
3097 NEXT;
3098 cur = CUR;
3099 if (cur == 'c') {
3100 NEXT;
3101 /* control */
3102 type = XML_REGEXP_OTHER_CONTROL;
3103 } else if (cur == 'f') {
3104 NEXT;
3105 /* format */
3106 type = XML_REGEXP_OTHER_FORMAT;
3107 } else if (cur == 'o') {
3108 NEXT;
3109 /* private use */
3110 type = XML_REGEXP_OTHER_PRIVATE;
3111 } else if (cur == 'n') {
3112 NEXT;
3113 /* not assigned */
3114 type = XML_REGEXP_OTHER_NA;
3115 } else {
3116 /* all others */
3117 type = XML_REGEXP_OTHER;
3118 }
3119 } else if (cur == 'I') {
3120 const xmlChar *start;
3121 NEXT;
3122 cur = CUR;
3123 if (cur != 's') {
3124 ERROR("IsXXXX expected");
3125 return;
3126 }
3127 NEXT;
3128 start = ctxt->cur;
3129 cur = CUR;
3130 if (((cur >= 'a') && (cur <= 'z')) ||
3131 ((cur >= 'A') && (cur <= 'Z')) ||
3132 ((cur >= '0') && (cur <= '9')) ||
3133 (cur == 0x2D)) {
3134 NEXT;
3135 cur = CUR;
3136 while (((cur >= 'a') && (cur <= 'z')) ||
3137 ((cur >= 'A') && (cur <= 'Z')) ||
3138 ((cur >= '0') && (cur <= '9')) ||
3139 (cur == 0x2D)) {
3140 NEXT;
3141 cur = CUR;
3142 }
3143 }
3144 type = XML_REGEXP_BLOCK_NAME;
3145 blockName = xmlStrndup(start, ctxt->cur - start);
3146 } else {
3147 ERROR("Unknown char property");
3148 return;
3149 }
3150 if (ctxt->atom == NULL) {
3151 ctxt->atom = xmlRegNewAtom(ctxt, type);
3152 if (ctxt->atom != NULL)
3153 ctxt->atom->valuep = blockName;
3154 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3155 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3156 type, 0, 0, blockName);
3157 }
3158}
3159
3160/**
3161 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00003162 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003163 *
3164 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3165 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3166 * [25] catEsc ::= '\p{' charProp '}'
3167 * [26] complEsc ::= '\P{' charProp '}'
3168 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3169 */
3170static void
3171xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
3172 int cur;
3173
3174 if (CUR == '.') {
3175 if (ctxt->atom == NULL) {
3176 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
3177 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3178 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3179 XML_REGEXP_ANYCHAR, 0, 0, NULL);
3180 }
3181 NEXT;
3182 return;
3183 }
3184 if (CUR != '\\') {
3185 ERROR("Escaped sequence: expecting \\");
3186 return;
3187 }
3188 NEXT;
3189 cur = CUR;
3190 if (cur == 'p') {
3191 NEXT;
3192 if (CUR != '{') {
3193 ERROR("Expecting '{'");
3194 return;
3195 }
3196 NEXT;
3197 xmlFAParseCharProp(ctxt);
3198 if (CUR != '}') {
3199 ERROR("Expecting '}'");
3200 return;
3201 }
3202 NEXT;
3203 } else if (cur == 'P') {
3204 NEXT;
3205 if (CUR != '{') {
3206 ERROR("Expecting '{'");
3207 return;
3208 }
3209 NEXT;
3210 xmlFAParseCharProp(ctxt);
3211 ctxt->atom->neg = 1;
3212 if (CUR != '}') {
3213 ERROR("Expecting '}'");
3214 return;
3215 }
3216 NEXT;
3217 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
3218 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
3219 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
3220 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
3221 (cur == 0x5E)) {
3222 if (ctxt->atom == NULL) {
3223 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3224 if (ctxt->atom != NULL)
3225 ctxt->atom->codepoint = cur;
3226 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3227 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3228 XML_REGEXP_CHARVAL, cur, cur, NULL);
3229 }
3230 NEXT;
3231 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
3232 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
3233 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00003234 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00003235
3236 switch (cur) {
3237 case 's':
3238 type = XML_REGEXP_ANYSPACE;
3239 break;
3240 case 'S':
3241 type = XML_REGEXP_NOTSPACE;
3242 break;
3243 case 'i':
3244 type = XML_REGEXP_INITNAME;
3245 break;
3246 case 'I':
3247 type = XML_REGEXP_NOTINITNAME;
3248 break;
3249 case 'c':
3250 type = XML_REGEXP_NAMECHAR;
3251 break;
3252 case 'C':
3253 type = XML_REGEXP_NOTNAMECHAR;
3254 break;
3255 case 'd':
3256 type = XML_REGEXP_DECIMAL;
3257 break;
3258 case 'D':
3259 type = XML_REGEXP_NOTDECIMAL;
3260 break;
3261 case 'w':
3262 type = XML_REGEXP_REALCHAR;
3263 break;
3264 case 'W':
3265 type = XML_REGEXP_NOTREALCHAR;
3266 break;
3267 }
3268 NEXT;
3269 if (ctxt->atom == NULL) {
3270 ctxt->atom = xmlRegNewAtom(ctxt, type);
3271 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3272 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3273 type, 0, 0, NULL);
3274 }
3275 }
3276}
3277
3278/**
3279 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00003280 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003281 *
3282 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3283 */
3284static int
3285xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
3286 int ret = 0, cur;
3287
3288 if ((CUR != '&') || (NXT(1) != '#'))
3289 return(-1);
3290 NEXT;
3291 NEXT;
3292 cur = CUR;
3293 if (cur == 'x') {
3294 NEXT;
3295 cur = CUR;
3296 if (((cur >= '0') && (cur <= '9')) ||
3297 ((cur >= 'a') && (cur <= 'f')) ||
3298 ((cur >= 'A') && (cur <= 'F'))) {
3299 while (((cur >= '0') && (cur <= '9')) ||
3300 ((cur >= 'A') && (cur <= 'F'))) {
3301 if ((cur >= '0') && (cur <= '9'))
3302 ret = ret * 16 + cur - '0';
3303 else if ((cur >= 'a') && (cur <= 'f'))
3304 ret = ret * 16 + 10 + (cur - 'a');
3305 else
3306 ret = ret * 16 + 10 + (cur - 'A');
3307 NEXT;
3308 cur = CUR;
3309 }
3310 } else {
3311 ERROR("Char ref: expecting [0-9A-F]");
3312 return(-1);
3313 }
3314 } else {
3315 if ((cur >= '0') && (cur <= '9')) {
3316 while ((cur >= '0') && (cur <= '9')) {
3317 ret = ret * 10 + cur - '0';
3318 NEXT;
3319 cur = CUR;
3320 }
3321 } else {
3322 ERROR("Char ref: expecting [0-9]");
3323 return(-1);
3324 }
3325 }
3326 if (cur != ';') {
3327 ERROR("Char ref: expecting ';'");
3328 return(-1);
3329 } else {
3330 NEXT;
3331 }
3332 return(ret);
3333}
3334
3335/**
3336 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00003337 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003338 *
3339 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
3340 * [18] seRange ::= charOrEsc '-' charOrEsc
3341 * [20] charOrEsc ::= XmlChar | SingleCharEsc
3342 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
3343 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
3344 */
3345static void
3346xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
3347 int cur;
3348 int start = -1;
3349 int end = -1;
3350
3351 if ((CUR == '&') && (NXT(1) == '#')) {
3352 end = start = xmlFAParseCharRef(ctxt);
3353 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3354 XML_REGEXP_CHARVAL, start, end, NULL);
3355 return;
3356 }
3357 cur = CUR;
3358 if (cur == '\\') {
3359 NEXT;
3360 cur = CUR;
3361 switch (cur) {
3362 case 'n': start = 0xA; break;
3363 case 'r': start = 0xD; break;
3364 case 't': start = 0x9; break;
3365 case '\\': case '|': case '.': case '-': case '^': case '?':
3366 case '*': case '+': case '{': case '}': case '(': case ')':
3367 case '[': case ']':
3368 start = cur; break;
3369 default:
3370 ERROR("Invalid escape value");
3371 return;
3372 }
3373 end = start;
3374 } else if ((cur != 0x5B) && (cur != 0x5D)) {
3375 end = start = cur;
3376 } else {
3377 ERROR("Expecting a char range");
3378 return;
3379 }
3380 NEXT;
3381 if (start == '-') {
3382 return;
3383 }
3384 cur = CUR;
3385 if (cur != '-') {
3386 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3387 XML_REGEXP_CHARVAL, start, end, NULL);
3388 return;
3389 }
3390 NEXT;
3391 cur = CUR;
3392 if (cur == '\\') {
3393 NEXT;
3394 cur = CUR;
3395 switch (cur) {
3396 case 'n': end = 0xA; break;
3397 case 'r': end = 0xD; break;
3398 case 't': end = 0x9; break;
3399 case '\\': case '|': case '.': case '-': case '^': case '?':
3400 case '*': case '+': case '{': case '}': case '(': case ')':
3401 case '[': case ']':
3402 end = cur; break;
3403 default:
3404 ERROR("Invalid escape value");
3405 return;
3406 }
3407 } else if ((cur != 0x5B) && (cur != 0x5D)) {
3408 end = cur;
3409 } else {
3410 ERROR("Expecting the end of a char range");
3411 return;
3412 }
3413 NEXT;
3414 /* TODO check that the values are acceptable character ranges for XML */
3415 if (end < start) {
3416 ERROR("End of range is before start of range");
3417 } else {
3418 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3419 XML_REGEXP_CHARVAL, start, end, NULL);
3420 }
3421 return;
3422}
3423
3424/**
3425 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003426 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003427 *
3428 * [14] posCharGroup ::= ( charRange | charClassEsc )+
3429 */
3430static void
3431xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
3432 do {
3433 if ((CUR == '\\') || (CUR == '.')) {
3434 xmlFAParseCharClassEsc(ctxt);
3435 } else {
3436 xmlFAParseCharRange(ctxt);
3437 }
3438 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
3439 (ctxt->error == 0));
3440}
3441
3442/**
3443 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003444 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003445 *
3446 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
3447 * [15] negCharGroup ::= '^' posCharGroup
3448 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
3449 * [12] charClassExpr ::= '[' charGroup ']'
3450 */
3451static void
3452xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
3453 int n = ctxt->neg;
3454 while ((CUR != ']') && (ctxt->error == 0)) {
3455 if (CUR == '^') {
3456 int neg = ctxt->neg;
3457
3458 NEXT;
3459 ctxt->neg = !ctxt->neg;
3460 xmlFAParsePosCharGroup(ctxt);
3461 ctxt->neg = neg;
3462 } else if (CUR == '-') {
3463 NEXT;
3464 ctxt->neg = !ctxt->neg;
3465 if (CUR != '[') {
3466 ERROR("charClassExpr: '[' expected");
3467 break;
3468 }
3469 NEXT;
3470 xmlFAParseCharGroup(ctxt);
3471 if (CUR == ']') {
3472 NEXT;
3473 } else {
3474 ERROR("charClassExpr: ']' expected");
3475 break;
3476 }
3477 break;
3478 } else if (CUR != ']') {
3479 xmlFAParsePosCharGroup(ctxt);
3480 }
3481 }
3482 ctxt->neg = n;
3483}
3484
3485/**
3486 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00003487 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003488 *
3489 * [11] charClass ::= charClassEsc | charClassExpr
3490 * [12] charClassExpr ::= '[' charGroup ']'
3491 */
3492static void
3493xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
3494 if (CUR == '[') {
3495 NEXT;
3496 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
3497 if (ctxt->atom == NULL)
3498 return;
3499 xmlFAParseCharGroup(ctxt);
3500 if (CUR == ']') {
3501 NEXT;
3502 } else {
3503 ERROR("xmlFAParseCharClass: ']' expected");
3504 }
3505 } else {
3506 xmlFAParseCharClassEsc(ctxt);
3507 }
3508}
3509
3510/**
3511 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00003512 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003513 *
3514 * [8] QuantExact ::= [0-9]+
3515 */
3516static int
3517xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
3518 int ret = 0;
3519 int ok = 0;
3520
3521 while ((CUR >= '0') && (CUR <= '9')) {
3522 ret = ret * 10 + (CUR - '0');
3523 ok = 1;
3524 NEXT;
3525 }
3526 if (ok != 1) {
3527 return(-1);
3528 }
3529 return(ret);
3530}
3531
3532/**
3533 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00003534 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003535 *
3536 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
3537 * [5] quantity ::= quantRange | quantMin | QuantExact
3538 * [6] quantRange ::= QuantExact ',' QuantExact
3539 * [7] quantMin ::= QuantExact ','
3540 * [8] QuantExact ::= [0-9]+
3541 */
3542static int
3543xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
3544 int cur;
3545
3546 cur = CUR;
3547 if ((cur == '?') || (cur == '*') || (cur == '+')) {
3548 if (ctxt->atom != NULL) {
3549 if (cur == '?')
3550 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
3551 else if (cur == '*')
3552 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
3553 else if (cur == '+')
3554 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
3555 }
3556 NEXT;
3557 return(1);
3558 }
3559 if (cur == '{') {
3560 int min = 0, max = 0;
3561
3562 NEXT;
3563 cur = xmlFAParseQuantExact(ctxt);
3564 if (cur >= 0)
3565 min = cur;
3566 if (CUR == ',') {
3567 NEXT;
3568 cur = xmlFAParseQuantExact(ctxt);
3569 if (cur >= 0)
3570 max = cur;
3571 }
3572 if (CUR == '}') {
3573 NEXT;
3574 } else {
3575 ERROR("Unterminated quantifier");
3576 }
3577 if (max == 0)
3578 max = min;
3579 if (ctxt->atom != NULL) {
3580 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
3581 ctxt->atom->min = min;
3582 ctxt->atom->max = max;
3583 }
3584 return(1);
3585 }
3586 return(0);
3587}
3588
3589/**
3590 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00003591 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003592 *
3593 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
3594 */
3595static int
3596xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
3597 int codepoint, len;
3598
3599 codepoint = xmlFAIsChar(ctxt);
3600 if (codepoint > 0) {
3601 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3602 if (ctxt->atom == NULL)
3603 return(-1);
3604 codepoint = CUR_SCHAR(ctxt->cur, len);
3605 ctxt->atom->codepoint = codepoint;
3606 NEXTL(len);
3607 return(1);
3608 } else if (CUR == '|') {
3609 return(0);
3610 } else if (CUR == 0) {
3611 return(0);
3612 } else if (CUR == ')') {
3613 return(0);
3614 } else if (CUR == '(') {
3615 xmlRegStatePtr start, oldend;
3616
3617 NEXT;
3618 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
3619 start = ctxt->state;
3620 oldend = ctxt->end;
3621 ctxt->end = NULL;
3622 ctxt->atom = NULL;
3623 xmlFAParseRegExp(ctxt, 0);
3624 if (CUR == ')') {
3625 NEXT;
3626 } else {
3627 ERROR("xmlFAParseAtom: expecting ')'");
3628 }
3629 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
3630 if (ctxt->atom == NULL)
3631 return(-1);
3632 ctxt->atom->start = start;
3633 ctxt->atom->stop = ctxt->state;
3634 ctxt->end = oldend;
3635 return(1);
3636 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
3637 xmlFAParseCharClass(ctxt);
3638 return(1);
3639 }
3640 return(0);
3641}
3642
3643/**
3644 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00003645 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003646 *
3647 * [3] piece ::= atom quantifier?
3648 */
3649static int
3650xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
3651 int ret;
3652
3653 ctxt->atom = NULL;
3654 ret = xmlFAParseAtom(ctxt);
3655 if (ret == 0)
3656 return(0);
3657 if (ctxt->atom == NULL) {
3658 ERROR("internal: no atom generated");
3659 }
3660 xmlFAParseQuantifier(ctxt);
3661 return(1);
3662}
3663
3664/**
3665 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00003666 * @ctxt: a regexp parser context
3667 * @first: is taht the first
Daniel Veillard4255d502002-04-16 15:50:10 +00003668 *
3669 * [2] branch ::= piece*
3670 */
3671static void
3672xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) {
3673 xmlRegStatePtr previous;
3674 xmlRegAtomPtr prevatom = NULL;
3675 int ret;
3676
3677 previous = ctxt->state;
3678 ret = xmlFAParsePiece(ctxt);
3679 if (ret != 0) {
3680 if (first) {
3681 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom);
3682 previous = ctxt->state;
3683 } else {
3684 prevatom = ctxt->atom;
3685 }
3686 ctxt->atom = NULL;
3687 }
3688 while ((ret != 0) && (ctxt->error == 0)) {
3689 ret = xmlFAParsePiece(ctxt);
3690 if (ret != 0) {
3691 if (first) {
3692 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom);
3693 } else {
3694 xmlFAGenerateTransitions(ctxt, previous, NULL, prevatom);
3695 prevatom = ctxt->atom;
3696 }
3697 previous = ctxt->state;
3698 ctxt->atom = NULL;
3699 }
3700 }
3701 if (!first) {
3702 xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom);
3703 }
3704}
3705
3706/**
3707 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00003708 * @ctxt: a regexp parser context
3709 * @top: is that the top-level expressions ?
Daniel Veillard4255d502002-04-16 15:50:10 +00003710 *
3711 * [1] regExp ::= branch ( '|' branch )*
3712 */
3713static void
3714xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
3715 xmlRegStatePtr start, end, oldend;
3716
3717 oldend = ctxt->end;
3718
3719 start = ctxt->state;
3720 xmlFAParseBranch(ctxt, (ctxt->end == NULL));
3721 if (CUR != '|') {
3722 ctxt->end = ctxt->state;
3723 return;
3724 }
3725 end = ctxt->state;
3726 while ((CUR == '|') && (ctxt->error == 0)) {
3727 NEXT;
3728 ctxt->state = start;
3729 ctxt->end = end;
3730 xmlFAParseBranch(ctxt, 0);
3731 }
3732 if (!top)
3733 ctxt->end = oldend;
3734}
3735
3736/************************************************************************
3737 * *
3738 * The basic API *
3739 * *
3740 ************************************************************************/
3741
3742/**
3743 * xmlRegexpPrint:
3744 * @output: the file for the output debug
3745 * @regexp: the compiled regexp
3746 *
3747 * Print the content of the compiled regular expression
3748 */
3749void
3750xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
3751 int i;
3752
3753 fprintf(output, " regexp: ");
3754 if (regexp == NULL) {
3755 fprintf(output, "NULL\n");
3756 return;
3757 }
3758 fprintf(output, "'%s' ", regexp->string);
3759 fprintf(output, "\n");
3760 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
3761 for (i = 0;i < regexp->nbAtoms; i++) {
3762 fprintf(output, " %02d ", i);
3763 xmlRegPrintAtom(output, regexp->atoms[i]);
3764 }
3765 fprintf(output, "%d states:", regexp->nbStates);
3766 fprintf(output, "\n");
3767 for (i = 0;i < regexp->nbStates; i++) {
3768 xmlRegPrintState(output, regexp->states[i]);
3769 }
3770 fprintf(output, "%d counters:\n", regexp->nbCounters);
3771 for (i = 0;i < regexp->nbCounters; i++) {
3772 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
3773 regexp->counters[i].max);
3774 }
3775}
3776
3777/**
3778 * xmlRegexpCompile:
3779 * @regexp: a regular expression string
3780 *
3781 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
3782 * Appendix F and build an automata suitable for testing strings against
3783 * that regular expression
3784 *
3785 * Returns the compiled expression or NULL in case of error
3786 */
3787xmlRegexpPtr
3788xmlRegexpCompile(const xmlChar *regexp) {
3789 xmlRegexpPtr ret;
3790 xmlRegParserCtxtPtr ctxt;
3791
3792 ctxt = xmlRegNewParserCtxt(regexp);
3793 if (ctxt == NULL)
3794 return(NULL);
3795
3796 /* initialize the parser */
3797 ctxt->end = NULL;
3798 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
3799 xmlRegStatePush(ctxt, ctxt->start);
3800
3801 /* parse the expression building an automata */
3802 xmlFAParseRegExp(ctxt, 1);
3803 if (CUR != 0) {
3804 ERROR("xmlFAParseRegExp: extra characters");
3805 }
3806 ctxt->end = ctxt->state;
3807 ctxt->start->type = XML_REGEXP_START_STATE;
3808 ctxt->end->type = XML_REGEXP_FINAL_STATE;
3809
3810 /* remove the Epsilon except for counted transitions */
3811 xmlFAEliminateEpsilonTransitions(ctxt);
3812
3813
3814 if (ctxt->error != 0) {
3815 xmlRegFreeParserCtxt(ctxt);
3816 return(NULL);
3817 }
3818 ret = xmlRegEpxFromParse(ctxt);
3819 xmlRegFreeParserCtxt(ctxt);
3820 return(ret);
3821}
3822
3823/**
3824 * xmlRegexpExec:
3825 * @comp: the compiled regular expression
3826 * @content: the value to check against the regular expression
3827 *
3828 * Check if the regular expression generate the value
3829 *
3830 * Returns 1 if it matches, 0 if not and a negativa value in case of error
3831 */
3832int
3833xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
3834 if ((comp == NULL) || (content == NULL))
3835 return(-1);
3836 return(xmlFARegExec(comp, content));
3837}
3838
3839/**
Daniel Veillard23e73572002-09-19 19:56:43 +00003840 * xmlRegexpIsDeterminist:
3841 * @comp: the compiled regular expression
3842 *
3843 * Check if the regular expression is determinist
3844 *
3845 * Returns 1 if it yes, 0 if not and a negativa value in case of error
3846 */
3847int
3848xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
3849 xmlAutomataPtr am;
3850 int ret;
3851
3852 if (comp == NULL)
3853 return(-1);
3854 if (comp->determinist != -1)
3855 return(comp->determinist);
3856
3857 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00003858 if (am->states != NULL) {
3859 int i;
3860
3861 for (i = 0;i < am->nbStates;i++)
3862 xmlRegFreeState(am->states[i]);
3863 xmlFree(am->states);
3864 }
Daniel Veillard23e73572002-09-19 19:56:43 +00003865 am->nbAtoms = comp->nbAtoms;
3866 am->atoms = comp->atoms;
3867 am->nbStates = comp->nbStates;
3868 am->states = comp->states;
3869 am->determinist = -1;
3870 ret = xmlFAComputesDeterminism(am);
3871 am->atoms = NULL;
3872 am->states = NULL;
3873 xmlFreeAutomata(am);
3874 return(ret);
3875}
3876
3877/**
Daniel Veillard4255d502002-04-16 15:50:10 +00003878 * xmlRegFreeRegexp:
3879 * @regexp: the regexp
3880 *
3881 * Free a regexp
3882 */
3883void
3884xmlRegFreeRegexp(xmlRegexpPtr regexp) {
3885 int i;
3886 if (regexp == NULL)
3887 return;
3888
3889 if (regexp->string != NULL)
3890 xmlFree(regexp->string);
3891 if (regexp->states != NULL) {
3892 for (i = 0;i < regexp->nbStates;i++)
3893 xmlRegFreeState(regexp->states[i]);
3894 xmlFree(regexp->states);
3895 }
3896 if (regexp->atoms != NULL) {
3897 for (i = 0;i < regexp->nbAtoms;i++)
3898 xmlRegFreeAtom(regexp->atoms[i]);
3899 xmlFree(regexp->atoms);
3900 }
3901 if (regexp->counters != NULL)
3902 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00003903 if (regexp->compact != NULL)
3904 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00003905 if (regexp->transdata != NULL)
3906 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00003907 if (regexp->stringMap != NULL) {
3908 for (i = 0; i < regexp->nbstrings;i++)
3909 xmlFree(regexp->stringMap[i]);
3910 xmlFree(regexp->stringMap);
3911 }
3912
Daniel Veillard4255d502002-04-16 15:50:10 +00003913 xmlFree(regexp);
3914}
3915
3916#ifdef LIBXML_AUTOMATA_ENABLED
3917/************************************************************************
3918 * *
3919 * The Automata interface *
3920 * *
3921 ************************************************************************/
3922
3923/**
3924 * xmlNewAutomata:
3925 *
3926 * Create a new automata
3927 *
3928 * Returns the new object or NULL in case of failure
3929 */
3930xmlAutomataPtr
3931xmlNewAutomata(void) {
3932 xmlAutomataPtr ctxt;
3933
3934 ctxt = xmlRegNewParserCtxt(NULL);
3935 if (ctxt == NULL)
3936 return(NULL);
3937
3938 /* initialize the parser */
3939 ctxt->end = NULL;
3940 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
3941 xmlRegStatePush(ctxt, ctxt->start);
3942
3943 return(ctxt);
3944}
3945
3946/**
3947 * xmlFreeAutomata:
3948 * @am: an automata
3949 *
3950 * Free an automata
3951 */
3952void
3953xmlFreeAutomata(xmlAutomataPtr am) {
3954 if (am == NULL)
3955 return;
3956 xmlRegFreeParserCtxt(am);
3957}
3958
3959/**
3960 * xmlAutomataGetInitState:
3961 * @am: an automata
3962 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00003963 * Initial state lookup
3964 *
Daniel Veillard4255d502002-04-16 15:50:10 +00003965 * Returns the initial state of the automata
3966 */
3967xmlAutomataStatePtr
3968xmlAutomataGetInitState(xmlAutomataPtr am) {
3969 if (am == NULL)
3970 return(NULL);
3971 return(am->start);
3972}
3973
3974/**
3975 * xmlAutomataSetFinalState:
3976 * @am: an automata
3977 * @state: a state in this automata
3978 *
3979 * Makes that state a final state
3980 *
3981 * Returns 0 or -1 in case of error
3982 */
3983int
3984xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
3985 if ((am == NULL) || (state == NULL))
3986 return(-1);
3987 state->type = XML_REGEXP_FINAL_STATE;
3988 return(0);
3989}
3990
3991/**
3992 * xmlAutomataNewTransition:
3993 * @am: an automata
3994 * @from: the starting point of the transition
3995 * @to: the target point of the transition or NULL
3996 * @token: the input string associated to that transition
3997 * @data: data passed to the callback function if the transition is activated
3998 *
3999 * If @to is NULL, this create first a new target state in the automata
4000 * and then adds a transition from the @from state to the target state
4001 * activated by the value of @token
4002 *
4003 * Returns the target state or NULL in case of error
4004 */
4005xmlAutomataStatePtr
4006xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
4007 xmlAutomataStatePtr to, const xmlChar *token,
4008 void *data) {
4009 xmlRegAtomPtr atom;
4010
4011 if ((am == NULL) || (from == NULL) || (token == NULL))
4012 return(NULL);
4013 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4014 atom->data = data;
4015 if (atom == NULL)
4016 return(NULL);
4017 atom->valuep = xmlStrdup(token);
4018
4019 xmlFAGenerateTransitions(am, from, to, atom);
4020 if (to == NULL)
4021 return(am->state);
4022 return(to);
4023}
4024
4025/**
4026 * xmlAutomataNewCountTrans:
4027 * @am: an automata
4028 * @from: the starting point of the transition
4029 * @to: the target point of the transition or NULL
4030 * @token: the input string associated to that transition
4031 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004032 * @max: the maximum successive occurences of token
4033 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00004034 *
4035 * If @to is NULL, this create first a new target state in the automata
4036 * and then adds a transition from the @from state to the target state
4037 * activated by a succession of input of value @token and whose number
4038 * is between @min and @max
4039 *
4040 * Returns the target state or NULL in case of error
4041 */
4042xmlAutomataStatePtr
4043xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4044 xmlAutomataStatePtr to, const xmlChar *token,
4045 int min, int max, void *data) {
4046 xmlRegAtomPtr atom;
4047
4048 if ((am == NULL) || (from == NULL) || (token == NULL))
4049 return(NULL);
4050 if (min < 0)
4051 return(NULL);
4052 if ((max < min) || (max < 1))
4053 return(NULL);
4054 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4055 if (atom == NULL)
4056 return(NULL);
4057 atom->valuep = xmlStrdup(token);
4058 atom->data = data;
4059 if (min == 0)
4060 atom->min = 1;
4061 else
4062 atom->min = min;
4063 atom->max = max;
4064
4065 xmlFAGenerateTransitions(am, from, to, atom);
4066 if (to == NULL)
4067 to = am->state;
4068 if (to == NULL)
4069 return(NULL);
4070 if (min == 0)
4071 xmlFAGenerateEpsilonTransition(am, from, to);
4072 return(to);
4073}
4074
4075/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004076 * xmlAutomataNewOnceTrans:
4077 * @am: an automata
4078 * @from: the starting point of the transition
4079 * @to: the target point of the transition or NULL
4080 * @token: the input string associated to that transition
4081 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004082 * @max: the maximum successive occurences of token
4083 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00004084 *
4085 * If @to is NULL, this create first a new target state in the automata
4086 * and then adds a transition from the @from state to the target state
4087 * activated by a succession of input of value @token and whose number
4088 * is between @min and @max, moreover that transistion can only be crossed
4089 * once.
4090 *
4091 * Returns the target state or NULL in case of error
4092 */
4093xmlAutomataStatePtr
4094xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4095 xmlAutomataStatePtr to, const xmlChar *token,
4096 int min, int max, void *data) {
4097 xmlRegAtomPtr atom;
4098 int counter;
4099
4100 if ((am == NULL) || (from == NULL) || (token == NULL))
4101 return(NULL);
4102 if (min < 1)
4103 return(NULL);
4104 if ((max < min) || (max < 1))
4105 return(NULL);
4106 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4107 if (atom == NULL)
4108 return(NULL);
4109 atom->valuep = xmlStrdup(token);
4110 atom->data = data;
4111 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4112 if (min == 0)
4113 atom->min = 1;
4114 else
4115 atom->min = min;
4116 atom->max = max;
4117 /*
4118 * associate a counter to the transition.
4119 */
4120 counter = xmlRegGetCounter(am);
4121 am->counters[counter].min = 1;
4122 am->counters[counter].max = 1;
4123
4124 /* xmlFAGenerateTransitions(am, from, to, atom); */
4125 if (to == NULL) {
4126 to = xmlRegNewState(am);
4127 xmlRegStatePush(am, to);
4128 }
4129 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4130 xmlRegAtomPush(am, atom);
4131 am->state = to;
4132 if (to == NULL)
4133 to = am->state;
4134 if (to == NULL)
4135 return(NULL);
4136 return(to);
4137}
4138
4139/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004140 * xmlAutomataNewState:
4141 * @am: an automata
4142 *
4143 * Create a new disconnected state in the automata
4144 *
4145 * Returns the new state or NULL in case of error
4146 */
4147xmlAutomataStatePtr
4148xmlAutomataNewState(xmlAutomataPtr am) {
4149 xmlAutomataStatePtr to;
4150
4151 if (am == NULL)
4152 return(NULL);
4153 to = xmlRegNewState(am);
4154 xmlRegStatePush(am, to);
4155 return(to);
4156}
4157
4158/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004159 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00004160 * @am: an automata
4161 * @from: the starting point of the transition
4162 * @to: the target point of the transition or NULL
4163 *
4164 * If @to is NULL, this create first a new target state in the automata
4165 * and then adds a an epsilon transition from the @from state to the
4166 * target state
4167 *
4168 * Returns the target state or NULL in case of error
4169 */
4170xmlAutomataStatePtr
4171xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
4172 xmlAutomataStatePtr to) {
4173 if ((am == NULL) || (from == NULL))
4174 return(NULL);
4175 xmlFAGenerateEpsilonTransition(am, from, to);
4176 if (to == NULL)
4177 return(am->state);
4178 return(to);
4179}
4180
Daniel Veillardb509f152002-04-17 16:28:10 +00004181/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004182 * xmlAutomataNewAllTrans:
4183 * @am: an automata
4184 * @from: the starting point of the transition
4185 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004186 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00004187 *
4188 * If @to is NULL, this create first a new target state in the automata
4189 * and then adds a an ALL transition from the @from state to the
4190 * target state. That transition is an epsilon transition allowed only when
4191 * all transitions from the @from node have been activated.
4192 *
4193 * Returns the target state or NULL in case of error
4194 */
4195xmlAutomataStatePtr
4196xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00004197 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00004198 if ((am == NULL) || (from == NULL))
4199 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00004200 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00004201 if (to == NULL)
4202 return(am->state);
4203 return(to);
4204}
4205
4206/**
Daniel Veillardb509f152002-04-17 16:28:10 +00004207 * xmlAutomataNewCounter:
4208 * @am: an automata
4209 * @min: the minimal value on the counter
4210 * @max: the maximal value on the counter
4211 *
4212 * Create a new counter
4213 *
4214 * Returns the counter number or -1 in case of error
4215 */
4216int
4217xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
4218 int ret;
4219
4220 if (am == NULL)
4221 return(-1);
4222
4223 ret = xmlRegGetCounter(am);
4224 if (ret < 0)
4225 return(-1);
4226 am->counters[ret].min = min;
4227 am->counters[ret].max = max;
4228 return(ret);
4229}
4230
4231/**
4232 * xmlAutomataNewCountedTrans:
4233 * @am: an automata
4234 * @from: the starting point of the transition
4235 * @to: the target point of the transition or NULL
4236 * @counter: the counter associated to that transition
4237 *
4238 * If @to is NULL, this create first a new target state in the automata
4239 * and then adds an epsilon transition from the @from state to the target state
4240 * which will increment the counter provided
4241 *
4242 * Returns the target state or NULL in case of error
4243 */
4244xmlAutomataStatePtr
4245xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4246 xmlAutomataStatePtr to, int counter) {
4247 if ((am == NULL) || (from == NULL) || (counter < 0))
4248 return(NULL);
4249 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
4250 if (to == NULL)
4251 return(am->state);
4252 return(to);
4253}
4254
4255/**
4256 * xmlAutomataNewCounterTrans:
4257 * @am: an automata
4258 * @from: the starting point of the transition
4259 * @to: the target point of the transition or NULL
4260 * @counter: the counter associated to that transition
4261 *
4262 * If @to is NULL, this create first a new target state in the automata
4263 * and then adds an epsilon transition from the @from state to the target state
4264 * which will be allowed only if the counter is within the right range.
4265 *
4266 * Returns the target state or NULL in case of error
4267 */
4268xmlAutomataStatePtr
4269xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4270 xmlAutomataStatePtr to, int counter) {
4271 if ((am == NULL) || (from == NULL) || (counter < 0))
4272 return(NULL);
4273 xmlFAGenerateCountedTransition(am, from, to, counter);
4274 if (to == NULL)
4275 return(am->state);
4276 return(to);
4277}
Daniel Veillard4255d502002-04-16 15:50:10 +00004278
4279/**
4280 * xmlAutomataCompile:
4281 * @am: an automata
4282 *
4283 * Compile the automata into a Reg Exp ready for being executed.
4284 * The automata should be free after this point.
4285 *
4286 * Returns the compiled regexp or NULL in case of error
4287 */
4288xmlRegexpPtr
4289xmlAutomataCompile(xmlAutomataPtr am) {
4290 xmlRegexpPtr ret;
4291
4292 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00004293 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00004294 ret = xmlRegEpxFromParse(am);
4295
4296 return(ret);
4297}
Daniel Veillarde19fc232002-04-22 16:01:24 +00004298
4299/**
4300 * xmlAutomataIsDeterminist:
4301 * @am: an automata
4302 *
4303 * Checks if an automata is determinist.
4304 *
4305 * Returns 1 if true, 0 if not, and -1 in case of error
4306 */
4307int
4308xmlAutomataIsDeterminist(xmlAutomataPtr am) {
4309 int ret;
4310
4311 if (am == NULL)
4312 return(-1);
4313
4314 ret = xmlFAComputesDeterminism(am);
4315 return(ret);
4316}
Daniel Veillard4255d502002-04-16 15:50:10 +00004317#endif /* LIBXML_AUTOMATA_ENABLED */
4318#endif /* LIBXML_REGEXP_ENABLED */