blob: 8f1e9c1dc84068e8c3502a09e402fe0fe481c968 [file] [log] [blame]
Daniel Veillard4255d502002-04-16 15:50:10 +00001/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
6 * XML related specifications thise includes:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22#include <stdio.h>
23#include <string.h>
24#include <libxml/tree.h>
25#include <libxml/parserInternals.h>
26#include <libxml/xmlregexp.h>
27#include <libxml/xmlautomata.h>
28#include <libxml/xmlunicode.h>
29
30/* #define DEBUG_REGEXP_GRAPH */
31/* #define DEBUG_REGEXP_EXEC */
32/* #define DEBUG_PUSH */
Daniel Veillard23e73572002-09-19 19:56:43 +000033/* #define DEBUG_COMPACTION */
Daniel Veillard4255d502002-04-16 15:50:10 +000034
35#define ERROR(str) ctxt->error = 1; \
36 xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur)
37#define NEXT ctxt->cur++
38#define CUR (*(ctxt->cur))
39#define NXT(index) (ctxt->cur[index])
40
41#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
42#define NEXTL(l) ctxt->cur += l;
43
Daniel Veillarde19fc232002-04-22 16:01:24 +000044/**
45 * TODO:
46 *
47 * macro to flag unimplemented blocks
48 */
49#define TODO \
50 xmlGenericError(xmlGenericErrorContext, \
51 "Unimplemented block at %s:%d\n", \
52 __FILE__, __LINE__);
53
Daniel Veillard4255d502002-04-16 15:50:10 +000054
55/************************************************************************
56 * *
57 * Datatypes and structures *
58 * *
59 ************************************************************************/
60
61typedef enum {
62 XML_REGEXP_EPSILON = 1,
63 XML_REGEXP_CHARVAL,
64 XML_REGEXP_RANGES,
65 XML_REGEXP_SUBREG,
66 XML_REGEXP_STRING,
67 XML_REGEXP_ANYCHAR, /* . */
68 XML_REGEXP_ANYSPACE, /* \s */
69 XML_REGEXP_NOTSPACE, /* \S */
70 XML_REGEXP_INITNAME, /* \l */
71 XML_REGEXP_NOTINITNAME, /* \l */
72 XML_REGEXP_NAMECHAR, /* \c */
73 XML_REGEXP_NOTNAMECHAR, /* \C */
74 XML_REGEXP_DECIMAL, /* \d */
75 XML_REGEXP_NOTDECIMAL, /* \d */
76 XML_REGEXP_REALCHAR, /* \w */
77 XML_REGEXP_NOTREALCHAR, /* \w */
78 XML_REGEXP_LETTER,
79 XML_REGEXP_LETTER_UPPERCASE,
80 XML_REGEXP_LETTER_LOWERCASE,
81 XML_REGEXP_LETTER_TITLECASE,
82 XML_REGEXP_LETTER_MODIFIER,
83 XML_REGEXP_LETTER_OTHERS,
84 XML_REGEXP_MARK,
85 XML_REGEXP_MARK_NONSPACING,
86 XML_REGEXP_MARK_SPACECOMBINING,
87 XML_REGEXP_MARK_ENCLOSING,
88 XML_REGEXP_NUMBER,
89 XML_REGEXP_NUMBER_DECIMAL,
90 XML_REGEXP_NUMBER_LETTER,
91 XML_REGEXP_NUMBER_OTHERS,
92 XML_REGEXP_PUNCT,
93 XML_REGEXP_PUNCT_CONNECTOR,
94 XML_REGEXP_PUNCT_DASH,
95 XML_REGEXP_PUNCT_OPEN,
96 XML_REGEXP_PUNCT_CLOSE,
97 XML_REGEXP_PUNCT_INITQUOTE,
98 XML_REGEXP_PUNCT_FINQUOTE,
99 XML_REGEXP_PUNCT_OTHERS,
100 XML_REGEXP_SEPAR,
101 XML_REGEXP_SEPAR_SPACE,
102 XML_REGEXP_SEPAR_LINE,
103 XML_REGEXP_SEPAR_PARA,
104 XML_REGEXP_SYMBOL,
105 XML_REGEXP_SYMBOL_MATH,
106 XML_REGEXP_SYMBOL_CURRENCY,
107 XML_REGEXP_SYMBOL_MODIFIER,
108 XML_REGEXP_SYMBOL_OTHERS,
109 XML_REGEXP_OTHER,
110 XML_REGEXP_OTHER_CONTROL,
111 XML_REGEXP_OTHER_FORMAT,
112 XML_REGEXP_OTHER_PRIVATE,
113 XML_REGEXP_OTHER_NA,
114 XML_REGEXP_BLOCK_NAME
115} xmlRegAtomType;
116
117typedef enum {
118 XML_REGEXP_QUANT_EPSILON = 1,
119 XML_REGEXP_QUANT_ONCE,
120 XML_REGEXP_QUANT_OPT,
121 XML_REGEXP_QUANT_MULT,
122 XML_REGEXP_QUANT_PLUS,
Daniel Veillard7646b182002-04-20 06:41:40 +0000123 XML_REGEXP_QUANT_ONCEONLY,
124 XML_REGEXP_QUANT_ALL,
Daniel Veillard4255d502002-04-16 15:50:10 +0000125 XML_REGEXP_QUANT_RANGE
126} xmlRegQuantType;
127
128typedef enum {
129 XML_REGEXP_START_STATE = 1,
130 XML_REGEXP_FINAL_STATE,
131 XML_REGEXP_TRANS_STATE
132} xmlRegStateType;
133
134typedef enum {
135 XML_REGEXP_MARK_NORMAL = 0,
136 XML_REGEXP_MARK_START,
137 XML_REGEXP_MARK_VISITED
138} xmlRegMarkedType;
139
140typedef struct _xmlRegRange xmlRegRange;
141typedef xmlRegRange *xmlRegRangePtr;
142
143struct _xmlRegRange {
144 int neg;
145 xmlRegAtomType type;
146 int start;
147 int end;
148 xmlChar *blockName;
149};
150
151typedef struct _xmlRegAtom xmlRegAtom;
152typedef xmlRegAtom *xmlRegAtomPtr;
153
154typedef struct _xmlAutomataState xmlRegState;
155typedef xmlRegState *xmlRegStatePtr;
156
157struct _xmlRegAtom {
158 int no;
159 xmlRegAtomType type;
160 xmlRegQuantType quant;
161 int min;
162 int max;
163
164 void *valuep;
Daniel Veillarda646cfd2002-09-17 21:50:03 +0000165 void *valuep2;
Daniel Veillard4255d502002-04-16 15:50:10 +0000166 int neg;
167 int codepoint;
168 xmlRegStatePtr start;
169 xmlRegStatePtr stop;
170 int maxRanges;
171 int nbRanges;
172 xmlRegRangePtr *ranges;
173 void *data;
174};
175
176typedef struct _xmlRegCounter xmlRegCounter;
177typedef xmlRegCounter *xmlRegCounterPtr;
178
179struct _xmlRegCounter {
180 int min;
181 int max;
182};
183
184typedef struct _xmlRegTrans xmlRegTrans;
185typedef xmlRegTrans *xmlRegTransPtr;
186
187struct _xmlRegTrans {
188 xmlRegAtomPtr atom;
189 int to;
190 int counter;
191 int count;
192};
193
194struct _xmlAutomataState {
195 xmlRegStateType type;
196 xmlRegMarkedType mark;
Daniel Veillard23e73572002-09-19 19:56:43 +0000197 xmlRegMarkedType reached;
Daniel Veillard4255d502002-04-16 15:50:10 +0000198 int no;
199
200 int maxTrans;
201 int nbTrans;
202 xmlRegTrans *trans;
203};
204
205typedef struct _xmlAutomata xmlRegParserCtxt;
206typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
207
208struct _xmlAutomata {
209 xmlChar *string;
210 xmlChar *cur;
211
212 int error;
213 int neg;
214
215 xmlRegStatePtr start;
216 xmlRegStatePtr end;
217 xmlRegStatePtr state;
218
219 xmlRegAtomPtr atom;
220
221 int maxAtoms;
222 int nbAtoms;
223 xmlRegAtomPtr *atoms;
224
225 int maxStates;
226 int nbStates;
227 xmlRegStatePtr *states;
228
229 int maxCounters;
230 int nbCounters;
231 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000232
233 int determinist;
Daniel Veillard4255d502002-04-16 15:50:10 +0000234};
235
236struct _xmlRegexp {
237 xmlChar *string;
238 int nbStates;
239 xmlRegStatePtr *states;
240 int nbAtoms;
241 xmlRegAtomPtr *atoms;
242 int nbCounters;
243 xmlRegCounter *counters;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000244 int determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000245 /*
246 * That's the compact form for determinists automatas
247 */
248 int nbstates;
249 int *compact;
Daniel Veillard118aed72002-09-24 14:13:13 +0000250 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000251 int nbstrings;
252 xmlChar **stringMap;
Daniel Veillard4255d502002-04-16 15:50:10 +0000253};
254
255typedef struct _xmlRegExecRollback xmlRegExecRollback;
256typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
257
258struct _xmlRegExecRollback {
259 xmlRegStatePtr state;/* the current state */
260 int index; /* the index in the input stack */
261 int nextbranch; /* the next transition to explore in that state */
262 int *counts; /* save the automate state if it has some */
263};
264
265typedef struct _xmlRegInputToken xmlRegInputToken;
266typedef xmlRegInputToken *xmlRegInputTokenPtr;
267
268struct _xmlRegInputToken {
269 xmlChar *value;
270 void *data;
271};
272
273struct _xmlRegExecCtxt {
274 int status; /* execution status != 0 indicate an error */
275 int determinist; /* did we found an inderterministic behaviour */
276 xmlRegexpPtr comp; /* the compiled regexp */
277 xmlRegExecCallbacks callback;
278 void *data;
279
280 xmlRegStatePtr state;/* the current state */
281 int transno; /* the current transition on that state */
282 int transcount; /* the number of char in char counted transitions */
283
284 /*
285 * A stack of rollback states
286 */
287 int maxRollbacks;
288 int nbRollbacks;
289 xmlRegExecRollback *rollbacks;
290
291 /*
292 * The state of the automata if any
293 */
294 int *counts;
295
296 /*
297 * The input stack
298 */
299 int inputStackMax;
300 int inputStackNr;
301 int index;
302 int *charStack;
303 const xmlChar *inputString; /* when operating on characters */
304 xmlRegInputTokenPtr inputStack;/* when operating on strings */
305
306};
307
Daniel Veillard441bc322002-04-20 17:38:48 +0000308#define REGEXP_ALL_COUNTER 0x123456
309#define REGEXP_ALL_LAX_COUNTER 0x123457
Daniel Veillard7646b182002-04-20 06:41:40 +0000310
Daniel Veillard4255d502002-04-16 15:50:10 +0000311static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
Daniel Veillard23e73572002-09-19 19:56:43 +0000312static void xmlRegFreeState(xmlRegStatePtr state);
313static void xmlRegFreeAtom(xmlRegAtomPtr atom);
Daniel Veillard4255d502002-04-16 15:50:10 +0000314
315/************************************************************************
316 * *
317 * Allocation/Deallocation *
318 * *
319 ************************************************************************/
320
Daniel Veillard23e73572002-09-19 19:56:43 +0000321static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
Daniel Veillard4255d502002-04-16 15:50:10 +0000322/**
323 * xmlRegEpxFromParse:
324 * @ctxt: the parser context used to build it
325 *
326 * Allocate a new regexp and fill it with the reult from the parser
327 *
328 * Returns the new regexp or NULL in case of error
329 */
330static xmlRegexpPtr
331xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
332 xmlRegexpPtr ret;
333
334 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
335 if (ret == NULL)
336 return(NULL);
337 memset(ret, 0, sizeof(xmlRegexp));
338 ret->string = ctxt->string;
339 ctxt->string = NULL;
340 ret->nbStates = ctxt->nbStates;
341 ctxt->nbStates = 0;
342 ret->states = ctxt->states;
343 ctxt->states = NULL;
344 ret->nbAtoms = ctxt->nbAtoms;
345 ctxt->nbAtoms = 0;
346 ret->atoms = ctxt->atoms;
347 ctxt->atoms = NULL;
348 ret->nbCounters = ctxt->nbCounters;
349 ctxt->nbCounters = 0;
350 ret->counters = ctxt->counters;
351 ctxt->counters = NULL;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000352 ret->determinist = ctxt->determinist;
Daniel Veillard23e73572002-09-19 19:56:43 +0000353
354 if ((ret->determinist != 0) &&
355 (ret->nbCounters == 0) &&
Daniel Veillard118aed72002-09-24 14:13:13 +0000356 (ret->atoms != NULL) &&
Daniel Veillard23e73572002-09-19 19:56:43 +0000357 (ret->atoms[0] != NULL) &&
358 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
359 int i, j, nbstates = 0, nbatoms = 0;
360 int *stateRemap;
361 int *stringRemap;
362 int *transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000363 void **transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000364 xmlChar **stringMap;
365 xmlChar *value;
366
367 /*
368 * Switch to a compact representation
369 * 1/ counting the effective number of states left
370 * 2/ conting the unique number of atoms, and check that
371 * they are all of the string type
372 * 3/ build a table state x atom for the transitions
373 */
374
375 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
376 for (i = 0;i < ret->nbStates;i++) {
377 if (ret->states[i] != NULL) {
378 stateRemap[i] = nbstates;
379 nbstates++;
380 } else {
381 stateRemap[i] = -1;
382 }
383 }
384#ifdef DEBUG_COMPACTION
385 printf("Final: %d states\n", nbstates);
386#endif
387 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
388 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
389 for (i = 0;i < ret->nbAtoms;i++) {
390 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
391 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
392 value = ret->atoms[i]->valuep;
393 for (j = 0;j < nbatoms;j++) {
394 if (xmlStrEqual(stringMap[j], value)) {
395 stringRemap[i] = j;
396 break;
397 }
398 }
399 if (j >= nbatoms) {
400 stringRemap[i] = nbatoms;
401 stringMap[nbatoms] = xmlStrdup(value);
402 nbatoms++;
403 }
404 } else {
405 xmlFree(stateRemap);
406 xmlFree(stringRemap);
407 for (i = 0;i < nbatoms;i++)
408 xmlFree(stringMap[i]);
409 xmlFree(stringMap);
410 goto fail_compact;
411 }
412 }
413#ifdef DEBUG_COMPACTION
414 printf("Final: %d atoms\n", nbatoms);
415#endif
416
417 /*
418 * Allocate the transition table. The first entry for each
419 * state correspond to the state type.
420 */
421 transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int));
Daniel Veillard118aed72002-09-24 14:13:13 +0000422 transdata = NULL;
Daniel Veillard23e73572002-09-19 19:56:43 +0000423 memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int));
424
425 for (i = 0;i < ret->nbStates;i++) {
426 int stateno, atomno, targetno, prev;
427 xmlRegStatePtr state;
428 xmlRegTransPtr trans;
429
430 stateno = stateRemap[i];
431 if (stateno == -1)
432 continue;
433 state = ret->states[i];
434
435 transitions[stateno * (nbatoms + 1)] = state->type;
436
437 for (j = 0;j < state->nbTrans;j++) {
438 trans = &(state->trans[j]);
439 if ((trans->to == -1) || (trans->atom == NULL))
440 continue;
441 atomno = stringRemap[trans->atom->no];
Daniel Veillard118aed72002-09-24 14:13:13 +0000442 if ((trans->atom->data != NULL) && (transdata == NULL)) {
443 transdata = (void **) xmlMalloc(nbstates * nbatoms *
444 sizeof(void *));
445 if (transdata != NULL)
446 memset(transdata, 0,
447 nbstates * nbatoms * sizeof(void *));
448 }
Daniel Veillard23e73572002-09-19 19:56:43 +0000449 targetno = stateRemap[trans->to];
450 /*
451 * if the same atome can generate transition to 2 different
452 * states then it means the automata is not determinist and
453 * the compact form can't be used !
454 */
455 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
456 if (prev != 0) {
457 if (prev != targetno + 1) {
Daniel Veillard23e73572002-09-19 19:56:43 +0000458 ret->determinist = 0;
459#ifdef DEBUG_COMPACTION
460 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
461 i, j, trans->atom->no, trans->to, atomno, targetno);
462 printf(" previous to is %d\n", prev);
463#endif
464 ret->determinist = 0;
Daniel Veillard118aed72002-09-24 14:13:13 +0000465 if (transdata != NULL)
466 xmlFree(transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +0000467 xmlFree(transitions);
468 xmlFree(stateRemap);
469 xmlFree(stringRemap);
470 for (i = 0;i < nbatoms;i++)
471 xmlFree(stringMap[i]);
472 xmlFree(stringMap);
473 goto fail_compact;
474 }
475 } else {
476#if 0
477 printf("State %d trans %d: atom %d to %d : %d to %d\n",
478 i, j, trans->atom->no, trans->to, atomno, targetno);
479#endif
480 transitions[stateno * (nbatoms + 1) + atomno + 1] =
Daniel Veillard118aed72002-09-24 14:13:13 +0000481 targetno + 1; /* to avoid 0 */
482 if (transdata != NULL)
483 transdata[stateno * nbatoms + atomno] =
484 trans->atom->data;
Daniel Veillard23e73572002-09-19 19:56:43 +0000485 }
486 }
487 }
488 ret->determinist = 1;
489#ifdef DEBUG_COMPACTION
490 /*
491 * Debug
492 */
493 for (i = 0;i < nbstates;i++) {
494 for (j = 0;j < nbatoms + 1;j++) {
495 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
496 }
497 printf("\n");
498 }
499 printf("\n");
500#endif
501 /*
502 * Cleanup of the old data
503 */
504 if (ret->states != NULL) {
505 for (i = 0;i < ret->nbStates;i++)
506 xmlRegFreeState(ret->states[i]);
507 xmlFree(ret->states);
508 }
509 ret->states = NULL;
510 ret->nbStates = 0;
511 if (ret->atoms != NULL) {
512 for (i = 0;i < ret->nbAtoms;i++)
513 xmlRegFreeAtom(ret->atoms[i]);
514 xmlFree(ret->atoms);
515 }
516 ret->atoms = NULL;
517 ret->nbAtoms = 0;
518
519 ret->compact = transitions;
Daniel Veillard118aed72002-09-24 14:13:13 +0000520 ret->transdata = transdata;
Daniel Veillard23e73572002-09-19 19:56:43 +0000521 ret->stringMap = stringMap;
522 ret->nbstrings = nbatoms;
523 ret->nbstates = nbstates;
524 xmlFree(stateRemap);
525 xmlFree(stringRemap);
526 }
527fail_compact:
Daniel Veillard4255d502002-04-16 15:50:10 +0000528 return(ret);
529}
530
531/**
532 * xmlRegNewParserCtxt:
533 * @string: the string to parse
534 *
535 * Allocate a new regexp parser context
536 *
537 * Returns the new context or NULL in case of error
538 */
539static xmlRegParserCtxtPtr
540xmlRegNewParserCtxt(const xmlChar *string) {
541 xmlRegParserCtxtPtr ret;
542
543 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
544 if (ret == NULL)
545 return(NULL);
546 memset(ret, 0, sizeof(xmlRegParserCtxt));
547 if (string != NULL)
548 ret->string = xmlStrdup(string);
549 ret->cur = ret->string;
550 ret->neg = 0;
551 ret->error = 0;
Daniel Veillarde19fc232002-04-22 16:01:24 +0000552 ret->determinist = -1;
Daniel Veillard4255d502002-04-16 15:50:10 +0000553 return(ret);
554}
555
556/**
557 * xmlRegNewRange:
558 * @ctxt: the regexp parser context
559 * @neg: is that negative
560 * @type: the type of range
561 * @start: the start codepoint
562 * @end: the end codepoint
563 *
564 * Allocate a new regexp range
565 *
566 * Returns the new range or NULL in case of error
567 */
568static xmlRegRangePtr
569xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
570 int neg, xmlRegAtomType type, int start, int end) {
571 xmlRegRangePtr ret;
572
573 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
574 if (ret == NULL) {
575 ERROR("failed to allocate regexp range");
576 return(NULL);
577 }
578 ret->neg = neg;
579 ret->type = type;
580 ret->start = start;
581 ret->end = end;
582 return(ret);
583}
584
585/**
586 * xmlRegFreeRange:
587 * @range: the regexp range
588 *
589 * Free a regexp range
590 */
591static void
592xmlRegFreeRange(xmlRegRangePtr range) {
593 if (range == NULL)
594 return;
595
596 if (range->blockName != NULL)
597 xmlFree(range->blockName);
598 xmlFree(range);
599}
600
601/**
602 * xmlRegNewAtom:
603 * @ctxt: the regexp parser context
604 * @type: the type of atom
605 *
606 * Allocate a new regexp range
607 *
608 * Returns the new atom or NULL in case of error
609 */
610static xmlRegAtomPtr
611xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
612 xmlRegAtomPtr ret;
613
614 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
615 if (ret == NULL) {
616 ERROR("failed to allocate regexp atom");
617 return(NULL);
618 }
619 memset(ret, 0, sizeof(xmlRegAtom));
620 ret->type = type;
621 ret->quant = XML_REGEXP_QUANT_ONCE;
622 ret->min = 0;
623 ret->max = 0;
624 return(ret);
625}
626
627/**
628 * xmlRegFreeAtom:
629 * @atom: the regexp atom
630 *
631 * Free a regexp atom
632 */
633static void
634xmlRegFreeAtom(xmlRegAtomPtr atom) {
635 int i;
636
637 if (atom == NULL)
638 return;
639
640 for (i = 0;i < atom->nbRanges;i++)
641 xmlRegFreeRange(atom->ranges[i]);
642 if (atom->ranges != NULL)
643 xmlFree(atom->ranges);
644 if (atom->type == XML_REGEXP_STRING)
645 xmlFree(atom->valuep);
646 xmlFree(atom);
647}
648
649static xmlRegStatePtr
650xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
651 xmlRegStatePtr ret;
652
653 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
654 if (ret == NULL) {
655 ERROR("failed to allocate regexp state");
656 return(NULL);
657 }
658 memset(ret, 0, sizeof(xmlRegState));
659 ret->type = XML_REGEXP_TRANS_STATE;
660 ret->mark = XML_REGEXP_MARK_NORMAL;
661 return(ret);
662}
663
664/**
665 * xmlRegFreeState:
666 * @state: the regexp state
667 *
668 * Free a regexp state
669 */
670static void
671xmlRegFreeState(xmlRegStatePtr state) {
672 if (state == NULL)
673 return;
674
675 if (state->trans != NULL)
676 xmlFree(state->trans);
677 xmlFree(state);
678}
679
680/**
681 * xmlRegFreeParserCtxt:
682 * @ctxt: the regexp parser context
683 *
684 * Free a regexp parser context
685 */
686static void
687xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
688 int i;
689 if (ctxt == NULL)
690 return;
691
692 if (ctxt->string != NULL)
693 xmlFree(ctxt->string);
694 if (ctxt->states != NULL) {
695 for (i = 0;i < ctxt->nbStates;i++)
696 xmlRegFreeState(ctxt->states[i]);
697 xmlFree(ctxt->states);
698 }
699 if (ctxt->atoms != NULL) {
700 for (i = 0;i < ctxt->nbAtoms;i++)
701 xmlRegFreeAtom(ctxt->atoms[i]);
702 xmlFree(ctxt->atoms);
703 }
704 if (ctxt->counters != NULL)
705 xmlFree(ctxt->counters);
706 xmlFree(ctxt);
707}
708
709/************************************************************************
710 * *
711 * Display of Data structures *
712 * *
713 ************************************************************************/
714
715static void
716xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
717 switch (type) {
718 case XML_REGEXP_EPSILON:
719 fprintf(output, "epsilon "); break;
720 case XML_REGEXP_CHARVAL:
721 fprintf(output, "charval "); break;
722 case XML_REGEXP_RANGES:
723 fprintf(output, "ranges "); break;
724 case XML_REGEXP_SUBREG:
725 fprintf(output, "subexpr "); break;
726 case XML_REGEXP_STRING:
727 fprintf(output, "string "); break;
728 case XML_REGEXP_ANYCHAR:
729 fprintf(output, "anychar "); break;
730 case XML_REGEXP_ANYSPACE:
731 fprintf(output, "anyspace "); break;
732 case XML_REGEXP_NOTSPACE:
733 fprintf(output, "notspace "); break;
734 case XML_REGEXP_INITNAME:
735 fprintf(output, "initname "); break;
736 case XML_REGEXP_NOTINITNAME:
737 fprintf(output, "notinitname "); break;
738 case XML_REGEXP_NAMECHAR:
739 fprintf(output, "namechar "); break;
740 case XML_REGEXP_NOTNAMECHAR:
741 fprintf(output, "notnamechar "); break;
742 case XML_REGEXP_DECIMAL:
743 fprintf(output, "decimal "); break;
744 case XML_REGEXP_NOTDECIMAL:
745 fprintf(output, "notdecimal "); break;
746 case XML_REGEXP_REALCHAR:
747 fprintf(output, "realchar "); break;
748 case XML_REGEXP_NOTREALCHAR:
749 fprintf(output, "notrealchar "); break;
750 case XML_REGEXP_LETTER:
751 fprintf(output, "LETTER "); break;
752 case XML_REGEXP_LETTER_UPPERCASE:
753 fprintf(output, "LETTER_UPPERCASE "); break;
754 case XML_REGEXP_LETTER_LOWERCASE:
755 fprintf(output, "LETTER_LOWERCASE "); break;
756 case XML_REGEXP_LETTER_TITLECASE:
757 fprintf(output, "LETTER_TITLECASE "); break;
758 case XML_REGEXP_LETTER_MODIFIER:
759 fprintf(output, "LETTER_MODIFIER "); break;
760 case XML_REGEXP_LETTER_OTHERS:
761 fprintf(output, "LETTER_OTHERS "); break;
762 case XML_REGEXP_MARK:
763 fprintf(output, "MARK "); break;
764 case XML_REGEXP_MARK_NONSPACING:
765 fprintf(output, "MARK_NONSPACING "); break;
766 case XML_REGEXP_MARK_SPACECOMBINING:
767 fprintf(output, "MARK_SPACECOMBINING "); break;
768 case XML_REGEXP_MARK_ENCLOSING:
769 fprintf(output, "MARK_ENCLOSING "); break;
770 case XML_REGEXP_NUMBER:
771 fprintf(output, "NUMBER "); break;
772 case XML_REGEXP_NUMBER_DECIMAL:
773 fprintf(output, "NUMBER_DECIMAL "); break;
774 case XML_REGEXP_NUMBER_LETTER:
775 fprintf(output, "NUMBER_LETTER "); break;
776 case XML_REGEXP_NUMBER_OTHERS:
777 fprintf(output, "NUMBER_OTHERS "); break;
778 case XML_REGEXP_PUNCT:
779 fprintf(output, "PUNCT "); break;
780 case XML_REGEXP_PUNCT_CONNECTOR:
781 fprintf(output, "PUNCT_CONNECTOR "); break;
782 case XML_REGEXP_PUNCT_DASH:
783 fprintf(output, "PUNCT_DASH "); break;
784 case XML_REGEXP_PUNCT_OPEN:
785 fprintf(output, "PUNCT_OPEN "); break;
786 case XML_REGEXP_PUNCT_CLOSE:
787 fprintf(output, "PUNCT_CLOSE "); break;
788 case XML_REGEXP_PUNCT_INITQUOTE:
789 fprintf(output, "PUNCT_INITQUOTE "); break;
790 case XML_REGEXP_PUNCT_FINQUOTE:
791 fprintf(output, "PUNCT_FINQUOTE "); break;
792 case XML_REGEXP_PUNCT_OTHERS:
793 fprintf(output, "PUNCT_OTHERS "); break;
794 case XML_REGEXP_SEPAR:
795 fprintf(output, "SEPAR "); break;
796 case XML_REGEXP_SEPAR_SPACE:
797 fprintf(output, "SEPAR_SPACE "); break;
798 case XML_REGEXP_SEPAR_LINE:
799 fprintf(output, "SEPAR_LINE "); break;
800 case XML_REGEXP_SEPAR_PARA:
801 fprintf(output, "SEPAR_PARA "); break;
802 case XML_REGEXP_SYMBOL:
803 fprintf(output, "SYMBOL "); break;
804 case XML_REGEXP_SYMBOL_MATH:
805 fprintf(output, "SYMBOL_MATH "); break;
806 case XML_REGEXP_SYMBOL_CURRENCY:
807 fprintf(output, "SYMBOL_CURRENCY "); break;
808 case XML_REGEXP_SYMBOL_MODIFIER:
809 fprintf(output, "SYMBOL_MODIFIER "); break;
810 case XML_REGEXP_SYMBOL_OTHERS:
811 fprintf(output, "SYMBOL_OTHERS "); break;
812 case XML_REGEXP_OTHER:
813 fprintf(output, "OTHER "); break;
814 case XML_REGEXP_OTHER_CONTROL:
815 fprintf(output, "OTHER_CONTROL "); break;
816 case XML_REGEXP_OTHER_FORMAT:
817 fprintf(output, "OTHER_FORMAT "); break;
818 case XML_REGEXP_OTHER_PRIVATE:
819 fprintf(output, "OTHER_PRIVATE "); break;
820 case XML_REGEXP_OTHER_NA:
821 fprintf(output, "OTHER_NA "); break;
822 case XML_REGEXP_BLOCK_NAME:
823 fprintf(output, "BLOCK "); break;
824 }
825}
826
827static void
828xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
829 switch (type) {
830 case XML_REGEXP_QUANT_EPSILON:
831 fprintf(output, "epsilon "); break;
832 case XML_REGEXP_QUANT_ONCE:
833 fprintf(output, "once "); break;
834 case XML_REGEXP_QUANT_OPT:
835 fprintf(output, "? "); break;
836 case XML_REGEXP_QUANT_MULT:
837 fprintf(output, "* "); break;
838 case XML_REGEXP_QUANT_PLUS:
839 fprintf(output, "+ "); break;
840 case XML_REGEXP_QUANT_RANGE:
841 fprintf(output, "range "); break;
Daniel Veillard7646b182002-04-20 06:41:40 +0000842 case XML_REGEXP_QUANT_ONCEONLY:
843 fprintf(output, "onceonly "); break;
844 case XML_REGEXP_QUANT_ALL:
845 fprintf(output, "all "); break;
Daniel Veillard4255d502002-04-16 15:50:10 +0000846 }
847}
848static void
849xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
850 fprintf(output, " range: ");
851 if (range->neg)
852 fprintf(output, "negative ");
853 xmlRegPrintAtomType(output, range->type);
854 fprintf(output, "%c - %c\n", range->start, range->end);
855}
856
857static void
858xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
859 fprintf(output, " atom: ");
860 if (atom == NULL) {
861 fprintf(output, "NULL\n");
862 return;
863 }
864 xmlRegPrintAtomType(output, atom->type);
865 xmlRegPrintQuantType(output, atom->quant);
866 if (atom->quant == XML_REGEXP_QUANT_RANGE)
867 fprintf(output, "%d-%d ", atom->min, atom->max);
868 if (atom->type == XML_REGEXP_STRING)
869 fprintf(output, "'%s' ", (char *) atom->valuep);
870 if (atom->type == XML_REGEXP_CHARVAL)
871 fprintf(output, "char %c\n", atom->codepoint);
872 else if (atom->type == XML_REGEXP_RANGES) {
873 int i;
874 fprintf(output, "%d entries\n", atom->nbRanges);
875 for (i = 0; i < atom->nbRanges;i++)
876 xmlRegPrintRange(output, atom->ranges[i]);
877 } else if (atom->type == XML_REGEXP_SUBREG) {
878 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
879 } else {
880 fprintf(output, "\n");
881 }
882}
883
884static void
885xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
886 fprintf(output, " trans: ");
887 if (trans == NULL) {
888 fprintf(output, "NULL\n");
889 return;
890 }
891 if (trans->to < 0) {
892 fprintf(output, "removed\n");
893 return;
894 }
895 if (trans->counter >= 0) {
896 fprintf(output, "counted %d, ", trans->counter);
897 }
Daniel Veillard8a001f62002-04-20 07:24:11 +0000898 if (trans->count == REGEXP_ALL_COUNTER) {
899 fprintf(output, "all transition, ");
900 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +0000901 fprintf(output, "count based %d, ", trans->count);
902 }
903 if (trans->atom == NULL) {
904 fprintf(output, "epsilon to %d\n", trans->to);
905 return;
906 }
907 if (trans->atom->type == XML_REGEXP_CHARVAL)
908 fprintf(output, "char %c ", trans->atom->codepoint);
909 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
910}
911
912static void
913xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
914 int i;
915
916 fprintf(output, " state: ");
917 if (state == NULL) {
918 fprintf(output, "NULL\n");
919 return;
920 }
921 if (state->type == XML_REGEXP_START_STATE)
922 fprintf(output, "START ");
923 if (state->type == XML_REGEXP_FINAL_STATE)
924 fprintf(output, "FINAL ");
925
926 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
927 for (i = 0;i < state->nbTrans; i++) {
928 xmlRegPrintTrans(output, &(state->trans[i]));
929 }
930}
931
Daniel Veillard23e73572002-09-19 19:56:43 +0000932#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillard4255d502002-04-16 15:50:10 +0000933static void
934xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
935 int i;
936
937 fprintf(output, " ctxt: ");
938 if (ctxt == NULL) {
939 fprintf(output, "NULL\n");
940 return;
941 }
942 fprintf(output, "'%s' ", ctxt->string);
943 if (ctxt->error)
944 fprintf(output, "error ");
945 if (ctxt->neg)
946 fprintf(output, "neg ");
947 fprintf(output, "\n");
948 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
949 for (i = 0;i < ctxt->nbAtoms; i++) {
950 fprintf(output, " %02d ", i);
951 xmlRegPrintAtom(output, ctxt->atoms[i]);
952 }
953 if (ctxt->atom != NULL) {
954 fprintf(output, "current atom:\n");
955 xmlRegPrintAtom(output, ctxt->atom);
956 }
957 fprintf(output, "%d states:", ctxt->nbStates);
958 if (ctxt->start != NULL)
959 fprintf(output, " start: %d", ctxt->start->no);
960 if (ctxt->end != NULL)
961 fprintf(output, " end: %d", ctxt->end->no);
962 fprintf(output, "\n");
963 for (i = 0;i < ctxt->nbStates; i++) {
964 xmlRegPrintState(output, ctxt->states[i]);
965 }
966 fprintf(output, "%d counters:\n", ctxt->nbCounters);
967 for (i = 0;i < ctxt->nbCounters; i++) {
968 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
969 ctxt->counters[i].max);
970 }
971}
Daniel Veillard23e73572002-09-19 19:56:43 +0000972#endif
Daniel Veillard4255d502002-04-16 15:50:10 +0000973
974/************************************************************************
975 * *
976 * Finite Automata structures manipulations *
977 * *
978 ************************************************************************/
979
980static void
981xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
982 int neg, xmlRegAtomType type, int start, int end,
983 xmlChar *blockName) {
984 xmlRegRangePtr range;
985
986 if (atom == NULL) {
987 ERROR("add range: atom is NULL");
988 return;
989 }
990 if (atom->type != XML_REGEXP_RANGES) {
991 ERROR("add range: atom is not ranges");
992 return;
993 }
994 if (atom->maxRanges == 0) {
995 atom->maxRanges = 4;
996 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
997 sizeof(xmlRegRangePtr));
998 if (atom->ranges == NULL) {
999 ERROR("add range: allocation failed");
1000 atom->maxRanges = 0;
1001 return;
1002 }
1003 } else if (atom->nbRanges >= atom->maxRanges) {
1004 xmlRegRangePtr *tmp;
1005 atom->maxRanges *= 2;
1006 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1007 sizeof(xmlRegRangePtr));
1008 if (tmp == NULL) {
1009 ERROR("add range: allocation failed");
1010 atom->maxRanges /= 2;
1011 return;
1012 }
1013 atom->ranges = tmp;
1014 }
1015 range = xmlRegNewRange(ctxt, neg, type, start, end);
1016 if (range == NULL)
1017 return;
1018 range->blockName = blockName;
1019 atom->ranges[atom->nbRanges++] = range;
1020
1021}
1022
1023static int
1024xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1025 if (ctxt->maxCounters == 0) {
1026 ctxt->maxCounters = 4;
1027 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1028 sizeof(xmlRegCounter));
1029 if (ctxt->counters == NULL) {
1030 ERROR("reg counter: allocation failed");
1031 ctxt->maxCounters = 0;
1032 return(-1);
1033 }
1034 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1035 xmlRegCounter *tmp;
1036 ctxt->maxCounters *= 2;
1037 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1038 sizeof(xmlRegCounter));
1039 if (tmp == NULL) {
1040 ERROR("reg counter: allocation failed");
1041 ctxt->maxCounters /= 2;
1042 return(-1);
1043 }
1044 ctxt->counters = tmp;
1045 }
1046 ctxt->counters[ctxt->nbCounters].min = -1;
1047 ctxt->counters[ctxt->nbCounters].max = -1;
1048 return(ctxt->nbCounters++);
1049}
1050
1051static void
1052xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1053 if (atom == NULL) {
1054 ERROR("atom push: atom is NULL");
1055 return;
1056 }
1057 if (ctxt->maxAtoms == 0) {
1058 ctxt->maxAtoms = 4;
1059 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1060 sizeof(xmlRegAtomPtr));
1061 if (ctxt->atoms == NULL) {
1062 ERROR("atom push: allocation failed");
1063 ctxt->maxAtoms = 0;
1064 return;
1065 }
1066 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1067 xmlRegAtomPtr *tmp;
1068 ctxt->maxAtoms *= 2;
1069 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1070 sizeof(xmlRegAtomPtr));
1071 if (tmp == NULL) {
1072 ERROR("atom push: allocation failed");
1073 ctxt->maxAtoms /= 2;
1074 return;
1075 }
1076 ctxt->atoms = tmp;
1077 }
1078 atom->no = ctxt->nbAtoms;
1079 ctxt->atoms[ctxt->nbAtoms++] = atom;
1080}
1081
1082static void
1083xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1084 xmlRegAtomPtr atom, xmlRegStatePtr target,
1085 int counter, int count) {
1086 if (state == NULL) {
1087 ERROR("add state: state is NULL");
1088 return;
1089 }
1090 if (target == NULL) {
1091 ERROR("add state: target is NULL");
1092 return;
1093 }
1094 if (state->maxTrans == 0) {
1095 state->maxTrans = 4;
1096 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1097 sizeof(xmlRegTrans));
1098 if (state->trans == NULL) {
1099 ERROR("add range: allocation failed");
1100 state->maxTrans = 0;
1101 return;
1102 }
1103 } else if (state->nbTrans >= state->maxTrans) {
1104 xmlRegTrans *tmp;
1105 state->maxTrans *= 2;
1106 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1107 sizeof(xmlRegTrans));
1108 if (tmp == NULL) {
1109 ERROR("add range: allocation failed");
1110 state->maxTrans /= 2;
1111 return;
1112 }
1113 state->trans = tmp;
1114 }
1115#ifdef DEBUG_REGEXP_GRAPH
1116 printf("Add trans from %d to %d ", state->no, target->no);
Daniel Veillard8a001f62002-04-20 07:24:11 +00001117 if (count == REGEXP_ALL_COUNTER)
1118 printf("all transition");
Daniel Veillard4402ab42002-09-12 16:02:56 +00001119 else if (count >= 0)
Daniel Veillard4255d502002-04-16 15:50:10 +00001120 printf("count based %d", count);
1121 else if (counter >= 0)
1122 printf("counted %d", counter);
1123 else if (atom == NULL)
1124 printf("epsilon transition");
1125 printf("\n");
1126#endif
1127
1128 state->trans[state->nbTrans].atom = atom;
1129 state->trans[state->nbTrans].to = target->no;
1130 state->trans[state->nbTrans].counter = counter;
1131 state->trans[state->nbTrans].count = count;
1132 state->nbTrans++;
1133}
1134
1135static void
1136xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1137 if (ctxt->maxStates == 0) {
1138 ctxt->maxStates = 4;
1139 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1140 sizeof(xmlRegStatePtr));
1141 if (ctxt->states == NULL) {
1142 ERROR("add range: allocation failed");
1143 ctxt->maxStates = 0;
1144 return;
1145 }
1146 } else if (ctxt->nbStates >= ctxt->maxStates) {
1147 xmlRegStatePtr *tmp;
1148 ctxt->maxStates *= 2;
1149 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1150 sizeof(xmlRegStatePtr));
1151 if (tmp == NULL) {
1152 ERROR("add range: allocation failed");
1153 ctxt->maxStates /= 2;
1154 return;
1155 }
1156 ctxt->states = tmp;
1157 }
1158 state->no = ctxt->nbStates;
1159 ctxt->states[ctxt->nbStates++] = state;
1160}
1161
1162/**
Daniel Veillard7646b182002-04-20 06:41:40 +00001163 * xmlFAGenerateAllTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001164 * @ctxt: a regexp parser context
1165 * @from: the from state
1166 * @to: the target state or NULL for building a new one
1167 * @lax:
Daniel Veillard7646b182002-04-20 06:41:40 +00001168 *
1169 */
1170static void
1171xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
Daniel Veillard441bc322002-04-20 17:38:48 +00001172 xmlRegStatePtr from, xmlRegStatePtr to,
1173 int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00001174 if (to == NULL) {
1175 to = xmlRegNewState(ctxt);
1176 xmlRegStatePush(ctxt, to);
1177 ctxt->state = to;
1178 }
Daniel Veillard441bc322002-04-20 17:38:48 +00001179 if (lax)
1180 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1181 else
1182 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
Daniel Veillard7646b182002-04-20 06:41:40 +00001183}
1184
1185/**
Daniel Veillard4255d502002-04-16 15:50:10 +00001186 * xmlFAGenerateEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001187 * @ctxt: a regexp parser context
1188 * @from: the from state
1189 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001190 *
1191 */
1192static void
1193xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1194 xmlRegStatePtr from, xmlRegStatePtr to) {
1195 if (to == NULL) {
1196 to = xmlRegNewState(ctxt);
1197 xmlRegStatePush(ctxt, to);
1198 ctxt->state = to;
1199 }
1200 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1201}
1202
1203/**
1204 * xmlFAGenerateCountedEpsilonTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001205 * @ctxt: a regexp parser context
1206 * @from: the from state
1207 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001208 * counter: the counter for that transition
1209 *
1210 */
1211static void
1212xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1213 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1214 if (to == NULL) {
1215 to = xmlRegNewState(ctxt);
1216 xmlRegStatePush(ctxt, to);
1217 ctxt->state = to;
1218 }
1219 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1220}
1221
1222/**
1223 * xmlFAGenerateCountedTransition:
Daniel Veillard441bc322002-04-20 17:38:48 +00001224 * @ctxt: a regexp parser context
1225 * @from: the from state
1226 * @to: the target state or NULL for building a new one
Daniel Veillard4255d502002-04-16 15:50:10 +00001227 * counter: the counter for that transition
1228 *
1229 */
1230static void
1231xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1232 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1233 if (to == NULL) {
1234 to = xmlRegNewState(ctxt);
1235 xmlRegStatePush(ctxt, to);
1236 ctxt->state = to;
1237 }
1238 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1239}
1240
1241/**
1242 * xmlFAGenerateTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001243 * @ctxt: a regexp parser context
1244 * @from: the from state
1245 * @to: the target state or NULL for building a new one
1246 * @atom: the atom generating the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00001247 *
1248 */
1249static void
1250xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1251 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1252 if (atom == NULL) {
1253 ERROR("genrate transition: atom == NULL");
1254 return;
1255 }
1256 if (atom->type == XML_REGEXP_SUBREG) {
1257 /*
1258 * this is a subexpression handling one should not need to
1259 * create a new node excep for XML_REGEXP_QUANT_RANGE.
1260 */
1261 xmlRegAtomPush(ctxt, atom);
1262 if ((to != NULL) && (atom->stop != to) &&
1263 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1264 /*
1265 * Generate an epsilon transition to link to the target
1266 */
1267 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1268 }
1269 switch (atom->quant) {
1270 case XML_REGEXP_QUANT_OPT:
1271 atom->quant = XML_REGEXP_QUANT_ONCE;
1272 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1273 break;
1274 case XML_REGEXP_QUANT_MULT:
1275 atom->quant = XML_REGEXP_QUANT_ONCE;
1276 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1277 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1278 break;
1279 case XML_REGEXP_QUANT_PLUS:
1280 atom->quant = XML_REGEXP_QUANT_ONCE;
1281 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1282 break;
1283 case XML_REGEXP_QUANT_RANGE: {
1284 int counter;
1285 xmlRegStatePtr newstate;
1286
1287 /*
1288 * This one is nasty:
1289 * 1/ register a new counter
1290 * 2/ register an epsilon transition associated to
1291 * this counter going from atom->stop to atom->start
1292 * 3/ create a new state
1293 * 4/ generate a counted transition from atom->stop to
1294 * that state
1295 */
1296 counter = xmlRegGetCounter(ctxt);
1297 ctxt->counters[counter].min = atom->min - 1;
1298 ctxt->counters[counter].max = atom->max - 1;
1299 atom->min = 0;
1300 atom->max = 0;
1301 atom->quant = XML_REGEXP_QUANT_ONCE;
1302 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1303 atom->start, counter);
1304 if (to != NULL) {
1305 newstate = to;
1306 } else {
1307 newstate = xmlRegNewState(ctxt);
1308 xmlRegStatePush(ctxt, newstate);
1309 ctxt->state = newstate;
1310 }
1311 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1312 newstate, counter);
1313 }
1314 default:
1315 break;
1316 }
1317 return;
1318 } else {
1319 if (to == NULL) {
1320 to = xmlRegNewState(ctxt);
1321 xmlRegStatePush(ctxt, to);
1322 }
1323 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1324 xmlRegAtomPush(ctxt, atom);
1325 ctxt->state = to;
1326 }
1327 switch (atom->quant) {
1328 case XML_REGEXP_QUANT_OPT:
1329 atom->quant = XML_REGEXP_QUANT_ONCE;
1330 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1331 break;
1332 case XML_REGEXP_QUANT_MULT:
1333 atom->quant = XML_REGEXP_QUANT_ONCE;
1334 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1335 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1336 break;
1337 case XML_REGEXP_QUANT_PLUS:
1338 atom->quant = XML_REGEXP_QUANT_ONCE;
1339 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1340 break;
1341 default:
1342 break;
1343 }
1344}
1345
1346/**
1347 * xmlFAReduceEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001348 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001349 * @fromnr: the from state
1350 * @tonr: the to state
1351 * @cpunter: should that transition be associted to a counted
1352 *
1353 */
1354static void
1355xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1356 int tonr, int counter) {
1357 int transnr;
1358 xmlRegStatePtr from;
1359 xmlRegStatePtr to;
1360
1361#ifdef DEBUG_REGEXP_GRAPH
1362 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1363#endif
1364 from = ctxt->states[fromnr];
1365 if (from == NULL)
1366 return;
1367 to = ctxt->states[tonr];
1368 if (to == NULL)
1369 return;
1370 if ((to->mark == XML_REGEXP_MARK_START) ||
1371 (to->mark == XML_REGEXP_MARK_VISITED))
1372 return;
1373
1374 to->mark = XML_REGEXP_MARK_VISITED;
1375 if (to->type == XML_REGEXP_FINAL_STATE) {
1376#ifdef DEBUG_REGEXP_GRAPH
1377 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1378#endif
1379 from->type = XML_REGEXP_FINAL_STATE;
1380 }
1381 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1382 if (to->trans[transnr].atom == NULL) {
1383 /*
1384 * Don't remove counted transitions
1385 * Don't loop either
1386 */
Daniel Veillardb509f152002-04-17 16:28:10 +00001387 if (to->trans[transnr].to != fromnr) {
1388 if (to->trans[transnr].count >= 0) {
1389 int newto = to->trans[transnr].to;
1390
1391 xmlRegStateAddTrans(ctxt, from, NULL,
1392 ctxt->states[newto],
1393 -1, to->trans[transnr].count);
1394 } else {
Daniel Veillard4255d502002-04-16 15:50:10 +00001395#ifdef DEBUG_REGEXP_GRAPH
Daniel Veillardb509f152002-04-17 16:28:10 +00001396 printf("Found epsilon trans %d from %d to %d\n",
1397 transnr, tonr, to->trans[transnr].to);
Daniel Veillard4255d502002-04-16 15:50:10 +00001398#endif
Daniel Veillardb509f152002-04-17 16:28:10 +00001399 if (to->trans[transnr].counter >= 0) {
1400 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1401 to->trans[transnr].to,
1402 to->trans[transnr].counter);
1403 } else {
1404 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1405 to->trans[transnr].to,
1406 counter);
1407 }
1408 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001409 }
1410 } else {
1411 int newto = to->trans[transnr].to;
1412
Daniel Veillardb509f152002-04-17 16:28:10 +00001413 if (to->trans[transnr].counter >= 0) {
1414 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1415 ctxt->states[newto],
1416 to->trans[transnr].counter, -1);
1417 } else {
1418 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1419 ctxt->states[newto], counter, -1);
1420 }
Daniel Veillard4255d502002-04-16 15:50:10 +00001421 }
1422 }
1423 to->mark = XML_REGEXP_MARK_NORMAL;
1424}
1425
1426/**
1427 * xmlFAEliminateEpsilonTransitions:
Daniel Veillard441bc322002-04-20 17:38:48 +00001428 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00001429 *
1430 */
1431static void
1432xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1433 int statenr, transnr;
1434 xmlRegStatePtr state;
1435
1436 /*
1437 * build the completed transitions bypassing the epsilons
1438 * Use a marking algorithm to avoid loops
1439 */
1440 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1441 state = ctxt->states[statenr];
1442 if (state == NULL)
1443 continue;
1444 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1445 if ((state->trans[transnr].atom == NULL) &&
1446 (state->trans[transnr].to >= 0)) {
1447 if (state->trans[transnr].to == statenr) {
1448 state->trans[transnr].to = -1;
1449#ifdef DEBUG_REGEXP_GRAPH
1450 printf("Removed loopback epsilon trans %d on %d\n",
1451 transnr, statenr);
1452#endif
1453 } else if (state->trans[transnr].count < 0) {
1454 int newto = state->trans[transnr].to;
1455
1456#ifdef DEBUG_REGEXP_GRAPH
1457 printf("Found epsilon trans %d from %d to %d\n",
1458 transnr, statenr, newto);
1459#endif
1460 state->mark = XML_REGEXP_MARK_START;
1461 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1462 newto, state->trans[transnr].counter);
1463 state->mark = XML_REGEXP_MARK_NORMAL;
1464#ifdef DEBUG_REGEXP_GRAPH
1465 } else {
1466 printf("Found counted transition %d on %d\n",
1467 transnr, statenr);
1468#endif
1469 }
1470 }
1471 }
1472 }
1473 /*
1474 * Eliminate the epsilon transitions
1475 */
1476 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1477 state = ctxt->states[statenr];
1478 if (state == NULL)
1479 continue;
1480 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1481 if ((state->trans[transnr].atom == NULL) &&
1482 (state->trans[transnr].count < 0) &&
1483 (state->trans[transnr].to >= 0)) {
1484 state->trans[transnr].to = -1;
1485 }
1486 }
1487 }
Daniel Veillard23e73572002-09-19 19:56:43 +00001488
1489 /*
1490 * Use this pass to detect unreachable states too
1491 */
1492 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1493 state = ctxt->states[statenr];
1494 if (state != NULL)
1495 state->reached = 0;
1496 }
1497 state = ctxt->states[0];
1498 if (state != NULL)
1499 state->reached = 1;
1500 while (state != NULL) {
1501 xmlRegStatePtr target = NULL;
1502 state->reached = 2;
1503 /*
1504 * Mark all state reachable from the current reachable state
1505 */
1506 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1507 if ((state->trans[transnr].to >= 0) &&
1508 ((state->trans[transnr].atom != NULL) ||
1509 (state->trans[transnr].count >= 0))) {
1510 int newto = state->trans[transnr].to;
1511
1512 if (ctxt->states[newto] == NULL)
1513 continue;
1514 if (ctxt->states[newto]->reached == 0) {
1515 ctxt->states[newto]->reached = 1;
1516 target = ctxt->states[newto];
1517 }
1518 }
1519 }
1520 /*
1521 * find the next accessible state not explored
1522 */
1523 if (target == NULL) {
1524 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1525 state = ctxt->states[statenr];
1526 if ((state != NULL) && (state->reached == 1)) {
1527 target = state;
1528 break;
1529 }
1530 }
1531 }
1532 state = target;
1533 }
1534 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1535 state = ctxt->states[statenr];
1536 if ((state != NULL) && (state->reached == 0)) {
1537#ifdef DEBUG_REGEXP_GRAPH
1538 printf("Removed unreachable state %d\n", statenr);
1539#endif
1540 xmlRegFreeState(state);
1541 ctxt->states[statenr] = NULL;
1542 }
1543 }
1544
Daniel Veillard4255d502002-04-16 15:50:10 +00001545}
1546
Daniel Veillarde19fc232002-04-22 16:01:24 +00001547/**
1548 * xmlFACompareAtoms:
1549 * @atom1: an atom
1550 * @atom2: an atom
1551 *
1552 * Compares two atoms to check whether they are equivatents
1553 *
1554 * Returns 1 if yes and 0 otherwise
1555 */
1556static int
1557xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
1558 if (atom1 == atom2)
1559 return(1);
1560 if ((atom1 == NULL) || (atom2 == NULL))
1561 return(0);
1562
1563 if (atom1->type != atom2->type)
1564 return(0);
1565 switch (atom1->type) {
1566 case XML_REGEXP_STRING:
1567 return(xmlStrEqual((xmlChar *)atom1->valuep,
1568 (xmlChar *)atom2->valuep));
1569 case XML_REGEXP_EPSILON:
1570 return(1);
1571 case XML_REGEXP_CHARVAL:
1572 return(atom1->codepoint == atom2->codepoint);
1573 case XML_REGEXP_RANGES:
1574 TODO;
1575 return(0);
1576 default:
1577 break;
1578 }
1579 return(1);
1580}
1581
1582/**
1583 * xmlFARecurseDeterminism:
1584 * @ctxt: a regexp parser context
1585 *
1586 * Check whether the associated regexp is determinist,
1587 * should be called after xmlFAEliminateEpsilonTransitions()
1588 *
1589 */
1590static int
1591xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1592 int to, xmlRegAtomPtr atom) {
1593 int ret = 1;
1594 int transnr;
1595 xmlRegTransPtr t1;
1596
1597 if (state == NULL)
1598 return(ret);
1599 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1600 t1 = &(state->trans[transnr]);
1601 /*
1602 * check transitions conflicting with the one looked at
1603 */
1604 if (t1->atom == NULL) {
1605 if (t1->to == -1)
1606 continue;
1607 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1608 to, atom);
1609 if (ret == 0)
1610 return(0);
1611 continue;
1612 }
1613 if (t1->to != to)
1614 continue;
1615 if (xmlFACompareAtoms(t1->atom, atom))
1616 return(0);
1617 }
1618 return(ret);
1619}
1620
1621/**
1622 * xmlFAComputesDeterminism:
1623 * @ctxt: a regexp parser context
1624 *
1625 * Check whether the associated regexp is determinist,
1626 * should be called after xmlFAEliminateEpsilonTransitions()
1627 *
1628 */
1629static int
1630xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
1631 int statenr, transnr;
1632 xmlRegStatePtr state;
1633 xmlRegTransPtr t1, t2;
1634 int i;
1635 int ret = 1;
1636
Daniel Veillard4402ab42002-09-12 16:02:56 +00001637#ifdef DEBUG_REGEXP_GRAPH
1638 printf("xmlFAComputesDeterminism\n");
1639 xmlRegPrintCtxt(stdout, ctxt);
1640#endif
Daniel Veillarde19fc232002-04-22 16:01:24 +00001641 if (ctxt->determinist != -1)
1642 return(ctxt->determinist);
1643
1644 /*
1645 * Check for all states that there isn't 2 transitions
1646 * with the same atom and a different target.
1647 */
1648 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1649 state = ctxt->states[statenr];
1650 if (state == NULL)
1651 continue;
1652 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1653 t1 = &(state->trans[transnr]);
1654 /*
1655 * Determinism checks in case of counted or all transitions
1656 * will have to be handled separately
1657 */
1658 if (t1->atom == NULL)
1659 continue;
1660 if (t1->to == -1) /* eliminated */
1661 continue;
1662 for (i = 0;i < transnr;i++) {
1663 t2 = &(state->trans[i]);
1664 if (t2->to == -1) /* eliminated */
1665 continue;
1666 if (t2->atom != NULL) {
1667 if (t1->to == t2->to) {
1668 if (xmlFACompareAtoms(t1->atom, t2->atom))
1669 t2->to = -1; /* eliminate */
1670 } else {
1671 /* not determinist ! */
1672 if (xmlFACompareAtoms(t1->atom, t2->atom))
1673 ret = 0;
1674 }
1675 } else if (t1->to != -1) {
1676 /*
1677 * do the closure in case of remaining specific
1678 * epsilon transitions like choices or all
1679 */
1680 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1681 t2->to, t2->atom);
1682 if (ret == 0)
1683 return(0);
1684 }
1685 }
1686 if (ret == 0)
1687 break;
1688 }
1689 if (ret == 0)
1690 break;
1691 }
1692 ctxt->determinist = ret;
1693 return(ret);
1694}
1695
Daniel Veillard4255d502002-04-16 15:50:10 +00001696/************************************************************************
1697 * *
1698 * Routines to check input against transition atoms *
1699 * *
1700 ************************************************************************/
1701
1702static int
1703xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
1704 int start, int end, const xmlChar *blockName) {
1705 int ret = 0;
1706
1707 switch (type) {
1708 case XML_REGEXP_STRING:
1709 case XML_REGEXP_SUBREG:
1710 case XML_REGEXP_RANGES:
1711 case XML_REGEXP_EPSILON:
1712 return(-1);
1713 case XML_REGEXP_ANYCHAR:
1714 ret = ((codepoint != '\n') && (codepoint != '\r'));
1715 break;
1716 case XML_REGEXP_CHARVAL:
1717 ret = ((codepoint >= start) && (codepoint <= end));
1718 break;
1719 case XML_REGEXP_NOTSPACE:
1720 neg = !neg;
1721 case XML_REGEXP_ANYSPACE:
1722 ret = ((codepoint == '\n') || (codepoint == '\r') ||
1723 (codepoint == '\t') || (codepoint == ' '));
1724 break;
1725 case XML_REGEXP_NOTINITNAME:
1726 neg = !neg;
1727 case XML_REGEXP_INITNAME:
1728 ret = (xmlIsLetter(codepoint) ||
1729 (codepoint == '_') || (codepoint == ':'));
1730 break;
1731 case XML_REGEXP_NOTNAMECHAR:
1732 neg = !neg;
1733 case XML_REGEXP_NAMECHAR:
1734 ret = (xmlIsLetter(codepoint) || xmlIsDigit(codepoint) ||
1735 (codepoint == '.') || (codepoint == '-') ||
1736 (codepoint == '_') || (codepoint == ':') ||
1737 xmlIsCombining(codepoint) || xmlIsExtender(codepoint));
1738 break;
1739 case XML_REGEXP_NOTDECIMAL:
1740 neg = !neg;
1741 case XML_REGEXP_DECIMAL:
1742 ret = xmlUCSIsCatNd(codepoint);
1743 break;
1744 case XML_REGEXP_REALCHAR:
1745 neg = !neg;
1746 case XML_REGEXP_NOTREALCHAR:
1747 ret = xmlUCSIsCatP(codepoint);
1748 if (ret == 0)
1749 ret = xmlUCSIsCatZ(codepoint);
1750 if (ret == 0)
1751 ret = xmlUCSIsCatC(codepoint);
1752 break;
1753 case XML_REGEXP_LETTER:
1754 ret = xmlUCSIsCatL(codepoint);
1755 break;
1756 case XML_REGEXP_LETTER_UPPERCASE:
1757 ret = xmlUCSIsCatLu(codepoint);
1758 break;
1759 case XML_REGEXP_LETTER_LOWERCASE:
1760 ret = xmlUCSIsCatLl(codepoint);
1761 break;
1762 case XML_REGEXP_LETTER_TITLECASE:
1763 ret = xmlUCSIsCatLt(codepoint);
1764 break;
1765 case XML_REGEXP_LETTER_MODIFIER:
1766 ret = xmlUCSIsCatLm(codepoint);
1767 break;
1768 case XML_REGEXP_LETTER_OTHERS:
1769 ret = xmlUCSIsCatLo(codepoint);
1770 break;
1771 case XML_REGEXP_MARK:
1772 ret = xmlUCSIsCatM(codepoint);
1773 break;
1774 case XML_REGEXP_MARK_NONSPACING:
1775 ret = xmlUCSIsCatMn(codepoint);
1776 break;
1777 case XML_REGEXP_MARK_SPACECOMBINING:
1778 ret = xmlUCSIsCatMc(codepoint);
1779 break;
1780 case XML_REGEXP_MARK_ENCLOSING:
1781 ret = xmlUCSIsCatMe(codepoint);
1782 break;
1783 case XML_REGEXP_NUMBER:
1784 ret = xmlUCSIsCatN(codepoint);
1785 break;
1786 case XML_REGEXP_NUMBER_DECIMAL:
1787 ret = xmlUCSIsCatNd(codepoint);
1788 break;
1789 case XML_REGEXP_NUMBER_LETTER:
1790 ret = xmlUCSIsCatNl(codepoint);
1791 break;
1792 case XML_REGEXP_NUMBER_OTHERS:
1793 ret = xmlUCSIsCatNo(codepoint);
1794 break;
1795 case XML_REGEXP_PUNCT:
1796 ret = xmlUCSIsCatP(codepoint);
1797 break;
1798 case XML_REGEXP_PUNCT_CONNECTOR:
1799 ret = xmlUCSIsCatPc(codepoint);
1800 break;
1801 case XML_REGEXP_PUNCT_DASH:
1802 ret = xmlUCSIsCatPd(codepoint);
1803 break;
1804 case XML_REGEXP_PUNCT_OPEN:
1805 ret = xmlUCSIsCatPs(codepoint);
1806 break;
1807 case XML_REGEXP_PUNCT_CLOSE:
1808 ret = xmlUCSIsCatPe(codepoint);
1809 break;
1810 case XML_REGEXP_PUNCT_INITQUOTE:
1811 ret = xmlUCSIsCatPi(codepoint);
1812 break;
1813 case XML_REGEXP_PUNCT_FINQUOTE:
1814 ret = xmlUCSIsCatPf(codepoint);
1815 break;
1816 case XML_REGEXP_PUNCT_OTHERS:
1817 ret = xmlUCSIsCatPo(codepoint);
1818 break;
1819 case XML_REGEXP_SEPAR:
1820 ret = xmlUCSIsCatZ(codepoint);
1821 break;
1822 case XML_REGEXP_SEPAR_SPACE:
1823 ret = xmlUCSIsCatZs(codepoint);
1824 break;
1825 case XML_REGEXP_SEPAR_LINE:
1826 ret = xmlUCSIsCatZl(codepoint);
1827 break;
1828 case XML_REGEXP_SEPAR_PARA:
1829 ret = xmlUCSIsCatZp(codepoint);
1830 break;
1831 case XML_REGEXP_SYMBOL:
1832 ret = xmlUCSIsCatS(codepoint);
1833 break;
1834 case XML_REGEXP_SYMBOL_MATH:
1835 ret = xmlUCSIsCatSm(codepoint);
1836 break;
1837 case XML_REGEXP_SYMBOL_CURRENCY:
1838 ret = xmlUCSIsCatSc(codepoint);
1839 break;
1840 case XML_REGEXP_SYMBOL_MODIFIER:
1841 ret = xmlUCSIsCatSk(codepoint);
1842 break;
1843 case XML_REGEXP_SYMBOL_OTHERS:
1844 ret = xmlUCSIsCatSo(codepoint);
1845 break;
1846 case XML_REGEXP_OTHER:
1847 ret = xmlUCSIsCatC(codepoint);
1848 break;
1849 case XML_REGEXP_OTHER_CONTROL:
1850 ret = xmlUCSIsCatCc(codepoint);
1851 break;
1852 case XML_REGEXP_OTHER_FORMAT:
1853 ret = xmlUCSIsCatCf(codepoint);
1854 break;
1855 case XML_REGEXP_OTHER_PRIVATE:
1856 ret = xmlUCSIsCatCo(codepoint);
1857 break;
1858 case XML_REGEXP_OTHER_NA:
1859 /* ret = xmlUCSIsCatCn(codepoint); */
1860 /* Seems it doesn't exist anymore in recent Unicode releases */
1861 ret = 0;
1862 break;
1863 case XML_REGEXP_BLOCK_NAME:
1864 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
1865 break;
1866 }
1867 if (neg)
1868 return(!ret);
1869 return(ret);
1870}
1871
1872static int
1873xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
1874 int i, ret = 0;
1875 xmlRegRangePtr range;
1876
1877 if ((atom == NULL) || (!xmlIsChar(codepoint)))
1878 return(-1);
1879
1880 switch (atom->type) {
1881 case XML_REGEXP_SUBREG:
1882 case XML_REGEXP_EPSILON:
1883 return(-1);
1884 case XML_REGEXP_CHARVAL:
1885 return(codepoint == atom->codepoint);
1886 case XML_REGEXP_RANGES: {
1887 int accept = 0;
1888 for (i = 0;i < atom->nbRanges;i++) {
1889 range = atom->ranges[i];
1890 if (range->neg) {
1891 ret = xmlRegCheckCharacterRange(range->type, codepoint,
1892 0, range->start, range->end,
1893 range->blockName);
1894 if (ret != 0)
1895 return(0); /* excluded char */
1896 } else {
1897 ret = xmlRegCheckCharacterRange(range->type, codepoint,
1898 0, range->start, range->end,
1899 range->blockName);
1900 if (ret != 0)
1901 accept = 1; /* might still be excluded */
1902 }
1903 }
1904 return(accept);
1905 }
1906 case XML_REGEXP_STRING:
1907 printf("TODO: XML_REGEXP_STRING\n");
1908 return(-1);
1909 case XML_REGEXP_ANYCHAR:
1910 case XML_REGEXP_ANYSPACE:
1911 case XML_REGEXP_NOTSPACE:
1912 case XML_REGEXP_INITNAME:
1913 case XML_REGEXP_NOTINITNAME:
1914 case XML_REGEXP_NAMECHAR:
1915 case XML_REGEXP_NOTNAMECHAR:
1916 case XML_REGEXP_DECIMAL:
1917 case XML_REGEXP_NOTDECIMAL:
1918 case XML_REGEXP_REALCHAR:
1919 case XML_REGEXP_NOTREALCHAR:
1920 case XML_REGEXP_LETTER:
1921 case XML_REGEXP_LETTER_UPPERCASE:
1922 case XML_REGEXP_LETTER_LOWERCASE:
1923 case XML_REGEXP_LETTER_TITLECASE:
1924 case XML_REGEXP_LETTER_MODIFIER:
1925 case XML_REGEXP_LETTER_OTHERS:
1926 case XML_REGEXP_MARK:
1927 case XML_REGEXP_MARK_NONSPACING:
1928 case XML_REGEXP_MARK_SPACECOMBINING:
1929 case XML_REGEXP_MARK_ENCLOSING:
1930 case XML_REGEXP_NUMBER:
1931 case XML_REGEXP_NUMBER_DECIMAL:
1932 case XML_REGEXP_NUMBER_LETTER:
1933 case XML_REGEXP_NUMBER_OTHERS:
1934 case XML_REGEXP_PUNCT:
1935 case XML_REGEXP_PUNCT_CONNECTOR:
1936 case XML_REGEXP_PUNCT_DASH:
1937 case XML_REGEXP_PUNCT_OPEN:
1938 case XML_REGEXP_PUNCT_CLOSE:
1939 case XML_REGEXP_PUNCT_INITQUOTE:
1940 case XML_REGEXP_PUNCT_FINQUOTE:
1941 case XML_REGEXP_PUNCT_OTHERS:
1942 case XML_REGEXP_SEPAR:
1943 case XML_REGEXP_SEPAR_SPACE:
1944 case XML_REGEXP_SEPAR_LINE:
1945 case XML_REGEXP_SEPAR_PARA:
1946 case XML_REGEXP_SYMBOL:
1947 case XML_REGEXP_SYMBOL_MATH:
1948 case XML_REGEXP_SYMBOL_CURRENCY:
1949 case XML_REGEXP_SYMBOL_MODIFIER:
1950 case XML_REGEXP_SYMBOL_OTHERS:
1951 case XML_REGEXP_OTHER:
1952 case XML_REGEXP_OTHER_CONTROL:
1953 case XML_REGEXP_OTHER_FORMAT:
1954 case XML_REGEXP_OTHER_PRIVATE:
1955 case XML_REGEXP_OTHER_NA:
1956 case XML_REGEXP_BLOCK_NAME:
1957 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
1958 (const xmlChar *)atom->valuep);
1959 if (atom->neg)
1960 ret = !ret;
1961 break;
1962 }
1963 return(ret);
1964}
1965
1966/************************************************************************
1967 * *
1968 * Saving an restoring state of an execution context *
1969 * *
1970 ************************************************************************/
1971
1972#ifdef DEBUG_REGEXP_EXEC
1973static void
1974xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
1975 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
1976 if (exec->inputStack != NULL) {
1977 int i;
1978 printf(": ");
1979 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
1980 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
1981 } else {
1982 printf(": %s", &(exec->inputString[exec->index]));
1983 }
1984 printf("\n");
1985}
1986#endif
1987
1988static void
1989xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
1990#ifdef DEBUG_REGEXP_EXEC
1991 printf("saving ");
1992 exec->transno++;
1993 xmlFARegDebugExec(exec);
1994 exec->transno--;
1995#endif
1996
1997 if (exec->maxRollbacks == 0) {
1998 exec->maxRollbacks = 4;
1999 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2000 sizeof(xmlRegExecRollback));
2001 if (exec->rollbacks == NULL) {
2002 fprintf(stderr, "exec save: allocation failed");
2003 exec->maxRollbacks = 0;
2004 return;
2005 }
2006 memset(exec->rollbacks, 0,
2007 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2008 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2009 xmlRegExecRollback *tmp;
2010 int len = exec->maxRollbacks;
2011
2012 exec->maxRollbacks *= 2;
2013 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2014 exec->maxRollbacks * sizeof(xmlRegExecRollback));
2015 if (tmp == NULL) {
2016 fprintf(stderr, "exec save: allocation failed");
2017 exec->maxRollbacks /= 2;
2018 return;
2019 }
2020 exec->rollbacks = tmp;
2021 tmp = &exec->rollbacks[len];
2022 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2023 }
2024 exec->rollbacks[exec->nbRollbacks].state = exec->state;
2025 exec->rollbacks[exec->nbRollbacks].index = exec->index;
2026 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2027 if (exec->comp->nbCounters > 0) {
2028 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2029 exec->rollbacks[exec->nbRollbacks].counts = (int *)
2030 xmlMalloc(exec->comp->nbCounters * sizeof(int));
2031 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2032 fprintf(stderr, "exec save: allocation failed");
2033 exec->status = -5;
2034 return;
2035 }
2036 }
2037 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2038 exec->comp->nbCounters * sizeof(int));
2039 }
2040 exec->nbRollbacks++;
2041}
2042
2043static void
2044xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2045 if (exec->nbRollbacks <= 0) {
2046 exec->status = -1;
2047#ifdef DEBUG_REGEXP_EXEC
2048 printf("rollback failed on empty stack\n");
2049#endif
2050 return;
2051 }
2052 exec->nbRollbacks--;
2053 exec->state = exec->rollbacks[exec->nbRollbacks].state;
2054 exec->index = exec->rollbacks[exec->nbRollbacks].index;
2055 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2056 if (exec->comp->nbCounters > 0) {
2057 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2058 fprintf(stderr, "exec save: allocation failed");
2059 exec->status = -6;
2060 return;
2061 }
2062 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2063 exec->comp->nbCounters * sizeof(int));
2064 }
2065
2066#ifdef DEBUG_REGEXP_EXEC
2067 printf("restored ");
2068 xmlFARegDebugExec(exec);
2069#endif
2070}
2071
2072/************************************************************************
2073 * *
2074 * Verifyer, running an input against a compiled regexp *
2075 * *
2076 ************************************************************************/
2077
2078static int
2079xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2080 xmlRegExecCtxt execval;
2081 xmlRegExecCtxtPtr exec = &execval;
2082 int ret, codepoint, len;
2083
2084 exec->inputString = content;
2085 exec->index = 0;
2086 exec->determinist = 1;
2087 exec->maxRollbacks = 0;
2088 exec->nbRollbacks = 0;
2089 exec->rollbacks = NULL;
2090 exec->status = 0;
2091 exec->comp = comp;
2092 exec->state = comp->states[0];
2093 exec->transno = 0;
2094 exec->transcount = 0;
2095 if (comp->nbCounters > 0) {
2096 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2097 if (exec->counts == NULL)
2098 return(-1);
2099 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2100 } else
2101 exec->counts = NULL;
2102 while ((exec->status == 0) &&
2103 ((exec->inputString[exec->index] != 0) ||
2104 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2105 xmlRegTransPtr trans;
2106 xmlRegAtomPtr atom;
2107
2108 /*
2109 * End of input on non-terminal state, rollback, however we may
2110 * still have epsilon like transition for counted transitions
2111 * on counters, in that case don't break too early.
2112 */
2113 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2114 goto rollback;
2115
2116 exec->transcount = 0;
2117 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2118 trans = &exec->state->trans[exec->transno];
2119 if (trans->to < 0)
2120 continue;
2121 atom = trans->atom;
2122 ret = 0;
2123 if (trans->count >= 0) {
2124 int count;
2125 xmlRegCounterPtr counter;
2126
2127 /*
2128 * A counted transition.
2129 */
2130
2131 count = exec->counts[trans->count];
2132 counter = &exec->comp->counters[trans->count];
2133#ifdef DEBUG_REGEXP_EXEC
2134 printf("testing count %d: val %d, min %d, max %d\n",
2135 trans->count, count, counter->min, counter->max);
2136#endif
2137 ret = ((count >= counter->min) && (count <= counter->max));
2138 } else if (atom == NULL) {
2139 fprintf(stderr, "epsilon transition left at runtime\n");
2140 exec->status = -2;
2141 break;
2142 } else if (exec->inputString[exec->index] != 0) {
2143 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2144 ret = xmlRegCheckCharacter(atom, codepoint);
2145 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2146 xmlRegStatePtr to = comp->states[trans->to];
2147
2148 /*
2149 * this is a multiple input sequence
2150 */
2151 if (exec->state->nbTrans > exec->transno + 1) {
2152 xmlFARegExecSave(exec);
2153 }
2154 exec->transcount = 1;
2155 do {
2156 /*
2157 * Try to progress as much as possible on the input
2158 */
2159 if (exec->transcount == atom->max) {
2160 break;
2161 }
2162 exec->index += len;
2163 /*
2164 * End of input: stop here
2165 */
2166 if (exec->inputString[exec->index] == 0) {
2167 exec->index -= len;
2168 break;
2169 }
2170 if (exec->transcount >= atom->min) {
2171 int transno = exec->transno;
2172 xmlRegStatePtr state = exec->state;
2173
2174 /*
2175 * The transition is acceptable save it
2176 */
2177 exec->transno = -1; /* trick */
2178 exec->state = to;
2179 xmlFARegExecSave(exec);
2180 exec->transno = transno;
2181 exec->state = state;
2182 }
2183 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2184 len);
2185 ret = xmlRegCheckCharacter(atom, codepoint);
2186 exec->transcount++;
2187 } while (ret == 1);
2188 if (exec->transcount < atom->min)
2189 ret = 0;
2190
2191 /*
2192 * If the last check failed but one transition was found
2193 * possible, rollback
2194 */
2195 if (ret < 0)
2196 ret = 0;
2197 if (ret == 0) {
2198 goto rollback;
2199 }
2200 }
2201 }
2202 if (ret == 1) {
2203 if (exec->state->nbTrans > exec->transno + 1) {
2204 xmlFARegExecSave(exec);
2205 }
2206 if (trans->counter >= 0) {
2207#ifdef DEBUG_REGEXP_EXEC
2208 printf("Increasing count %d\n", trans->counter);
2209#endif
2210 exec->counts[trans->counter]++;
2211 }
2212#ifdef DEBUG_REGEXP_EXEC
2213 printf("entering state %d\n", trans->to);
2214#endif
2215 exec->state = comp->states[trans->to];
2216 exec->transno = 0;
2217 if (trans->atom != NULL) {
2218 exec->index += len;
2219 }
2220 goto progress;
2221 } else if (ret < 0) {
2222 exec->status = -4;
2223 break;
2224 }
2225 }
2226 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2227rollback:
2228 /*
2229 * Failed to find a way out
2230 */
2231 exec->determinist = 0;
2232 xmlFARegExecRollBack(exec);
2233 }
2234progress:
2235 continue;
2236 }
2237 if (exec->rollbacks != NULL) {
2238 if (exec->counts != NULL) {
2239 int i;
2240
2241 for (i = 0;i < exec->maxRollbacks;i++)
2242 if (exec->rollbacks[i].counts != NULL)
2243 xmlFree(exec->rollbacks[i].counts);
2244 }
2245 xmlFree(exec->rollbacks);
2246 }
2247 if (exec->counts != NULL)
2248 xmlFree(exec->counts);
2249 if (exec->status == 0)
2250 return(1);
2251 if (exec->status == -1)
2252 return(0);
2253 return(exec->status);
2254}
2255
2256/************************************************************************
2257 * *
2258 * Progressive interface to the verifyer one atom at a time *
2259 * *
2260 ************************************************************************/
2261
2262/**
Daniel Veillard01c13b52002-12-10 15:19:08 +00002263 * xmlRegNewExecCtxt:
Daniel Veillard4255d502002-04-16 15:50:10 +00002264 * @comp: a precompiled regular expression
2265 * @callback: a callback function used for handling progresses in the
2266 * automata matching phase
2267 * @data: the context data associated to the callback in this context
2268 *
2269 * Build a context used for progressive evaluation of a regexp.
Daniel Veillard01c13b52002-12-10 15:19:08 +00002270 *
2271 * Returns the new context
Daniel Veillard4255d502002-04-16 15:50:10 +00002272 */
2273xmlRegExecCtxtPtr
2274xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
2275 xmlRegExecCtxtPtr exec;
2276
2277 if (comp == NULL)
2278 return(NULL);
2279 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
2280 if (exec == NULL) {
2281 return(NULL);
2282 }
2283 memset(exec, 0, sizeof(xmlRegExecCtxt));
2284 exec->inputString = NULL;
2285 exec->index = 0;
2286 exec->determinist = 1;
2287 exec->maxRollbacks = 0;
2288 exec->nbRollbacks = 0;
2289 exec->rollbacks = NULL;
2290 exec->status = 0;
2291 exec->comp = comp;
Daniel Veillard23e73572002-09-19 19:56:43 +00002292 if (comp->compact == NULL)
2293 exec->state = comp->states[0];
Daniel Veillard4255d502002-04-16 15:50:10 +00002294 exec->transno = 0;
2295 exec->transcount = 0;
2296 exec->callback = callback;
2297 exec->data = data;
2298 if (comp->nbCounters > 0) {
2299 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2300 if (exec->counts == NULL) {
2301 xmlFree(exec);
2302 return(NULL);
2303 }
2304 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2305 } else
2306 exec->counts = NULL;
2307 exec->inputStackMax = 0;
2308 exec->inputStackNr = 0;
2309 exec->inputStack = NULL;
2310 return(exec);
2311}
2312
2313/**
2314 * xmlRegFreeExecCtxt:
2315 * @exec: a regular expression evaulation context
2316 *
2317 * Free the structures associated to a regular expression evaulation context.
2318 */
2319void
2320xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
2321 if (exec == NULL)
2322 return;
2323
2324 if (exec->rollbacks != NULL) {
2325 if (exec->counts != NULL) {
2326 int i;
2327
2328 for (i = 0;i < exec->maxRollbacks;i++)
2329 if (exec->rollbacks[i].counts != NULL)
2330 xmlFree(exec->rollbacks[i].counts);
2331 }
2332 xmlFree(exec->rollbacks);
2333 }
2334 if (exec->counts != NULL)
2335 xmlFree(exec->counts);
2336 if (exec->inputStack != NULL) {
2337 int i;
2338
Daniel Veillard32370232002-10-16 14:08:14 +00002339 for (i = 0;i < exec->inputStackNr;i++) {
2340 if (exec->inputStack[i].value != NULL)
2341 xmlFree(exec->inputStack[i].value);
2342 }
Daniel Veillard4255d502002-04-16 15:50:10 +00002343 xmlFree(exec->inputStack);
2344 }
2345 xmlFree(exec);
2346}
2347
2348static void
2349xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2350 void *data) {
2351#ifdef DEBUG_PUSH
2352 printf("saving value: %d:%s\n", exec->inputStackNr, value);
2353#endif
2354 if (exec->inputStackMax == 0) {
2355 exec->inputStackMax = 4;
2356 exec->inputStack = (xmlRegInputTokenPtr)
2357 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
2358 if (exec->inputStack == NULL) {
2359 fprintf(stderr, "push input: allocation failed");
2360 exec->inputStackMax = 0;
2361 return;
2362 }
2363 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
2364 xmlRegInputTokenPtr tmp;
2365
2366 exec->inputStackMax *= 2;
2367 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
2368 exec->inputStackMax * sizeof(xmlRegInputToken));
2369 if (tmp == NULL) {
2370 fprintf(stderr, "push input: allocation failed");
2371 exec->inputStackMax /= 2;
2372 return;
2373 }
2374 exec->inputStack = tmp;
2375 }
2376 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
2377 exec->inputStack[exec->inputStackNr].data = data;
2378 exec->inputStackNr++;
2379 exec->inputStack[exec->inputStackNr].value = NULL;
2380 exec->inputStack[exec->inputStackNr].data = NULL;
2381}
2382
2383
2384/**
Daniel Veillard23e73572002-09-19 19:56:43 +00002385 * xmlRegCompactPushString:
2386 * @exec: a regexp execution context
2387 * @comp: the precompiled exec with a compact table
2388 * @value: a string token input
2389 * @data: data associated to the token to reuse in callbacks
2390 *
2391 * Push one input token in the execution context
2392 *
2393 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2394 * a negative value in case of error.
2395 */
2396static int
2397xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
2398 xmlRegexpPtr comp,
2399 const xmlChar *value,
2400 void *data) {
2401 int state = exec->index;
2402 int i, target;
2403
2404 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
2405 return(-1);
2406
2407 if (value == NULL) {
2408 /*
2409 * are we at a final state ?
2410 */
2411 if (comp->compact[state * (comp->nbstrings + 1)] ==
2412 XML_REGEXP_FINAL_STATE)
2413 return(1);
2414 return(0);
2415 }
2416
2417#ifdef DEBUG_PUSH
2418 printf("value pushed: %s\n", value);
2419#endif
2420
2421 /*
2422 * Examine all outside transition from current state
2423 */
2424 for (i = 0;i < comp->nbstrings;i++) {
2425 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
2426 if ((target > 0) && (target <= comp->nbstates)) {
2427 target--; /* to avoid 0 */
2428 if (xmlStrEqual(comp->stringMap[i], value)) {
2429 exec->index = target;
Daniel Veillard118aed72002-09-24 14:13:13 +00002430 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
2431 exec->callback(exec->data, value,
2432 comp->transdata[state * comp->nbstrings + i], data);
2433 }
Daniel Veillard23e73572002-09-19 19:56:43 +00002434#ifdef DEBUG_PUSH
2435 printf("entering state %d\n", target);
2436#endif
2437 if (comp->compact[target * (comp->nbstrings + 1)] ==
2438 XML_REGEXP_FINAL_STATE)
2439 return(1);
2440 return(0);
2441 }
2442 }
2443 }
2444 /*
2445 * Failed to find an exit transition out from current state for the
2446 * current token
2447 */
2448#ifdef DEBUG_PUSH
2449 printf("failed to find a transition for %s on state %d\n", value, state);
2450#endif
2451 exec->status = -1;
2452 return(-1);
2453}
2454
2455/**
Daniel Veillard4255d502002-04-16 15:50:10 +00002456 * xmlRegExecPushString:
Daniel Veillardea7751d2002-12-20 00:16:24 +00002457 * @exec: a regexp execution context or NULL to indicate the end
Daniel Veillard4255d502002-04-16 15:50:10 +00002458 * @value: a string token input
2459 * @data: data associated to the token to reuse in callbacks
2460 *
2461 * Push one input token in the execution context
2462 *
2463 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2464 * a negative value in case of error.
2465 */
2466int
2467xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2468 void *data) {
2469 xmlRegTransPtr trans;
2470 xmlRegAtomPtr atom;
2471 int ret;
2472 int final = 0;
2473
2474 if (exec == NULL)
2475 return(-1);
Daniel Veillard23e73572002-09-19 19:56:43 +00002476 if (exec->comp == NULL)
2477 return(-1);
Daniel Veillard4255d502002-04-16 15:50:10 +00002478 if (exec->status != 0)
2479 return(exec->status);
2480
Daniel Veillard23e73572002-09-19 19:56:43 +00002481 if (exec->comp->compact != NULL)
2482 return(xmlRegCompactPushString(exec, exec->comp, value, data));
2483
Daniel Veillard4255d502002-04-16 15:50:10 +00002484 if (value == NULL) {
2485 if (exec->state->type == XML_REGEXP_FINAL_STATE)
2486 return(1);
2487 final = 1;
2488 }
2489
2490#ifdef DEBUG_PUSH
2491 printf("value pushed: %s\n", value);
2492#endif
2493 /*
2494 * If we have an active rollback stack push the new value there
2495 * and get back to where we were left
2496 */
2497 if ((value != NULL) && (exec->inputStackNr > 0)) {
2498 xmlFARegExecSaveInputString(exec, value, data);
2499 value = exec->inputStack[exec->index].value;
2500 data = exec->inputStack[exec->index].data;
2501#ifdef DEBUG_PUSH
2502 printf("value loaded: %s\n", value);
2503#endif
2504 }
2505
2506 while ((exec->status == 0) &&
2507 ((value != NULL) ||
2508 ((final == 1) &&
2509 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
2510
2511 /*
2512 * End of input on non-terminal state, rollback, however we may
2513 * still have epsilon like transition for counted transitions
2514 * on counters, in that case don't break too early.
2515 */
Daniel Veillardb509f152002-04-17 16:28:10 +00002516 if ((value == NULL) && (exec->counts == NULL))
Daniel Veillard4255d502002-04-16 15:50:10 +00002517 goto rollback;
2518
2519 exec->transcount = 0;
2520 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2521 trans = &exec->state->trans[exec->transno];
2522 if (trans->to < 0)
2523 continue;
2524 atom = trans->atom;
2525 ret = 0;
Daniel Veillard441bc322002-04-20 17:38:48 +00002526 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
2527 int i;
2528 int count;
2529 xmlRegTransPtr t;
2530 xmlRegCounterPtr counter;
2531
2532 ret = 0;
2533
2534#ifdef DEBUG_PUSH
2535 printf("testing all lax %d\n", trans->count);
2536#endif
2537 /*
2538 * Check all counted transitions from the current state
2539 */
2540 if ((value == NULL) && (final)) {
2541 ret = 1;
2542 } else if (value != NULL) {
2543 for (i = 0;i < exec->state->nbTrans;i++) {
2544 t = &exec->state->trans[i];
2545 if ((t->counter < 0) || (t == trans))
2546 continue;
2547 counter = &exec->comp->counters[t->counter];
2548 count = exec->counts[t->counter];
2549 if ((count < counter->max) &&
2550 (t->atom != NULL) &&
2551 (xmlStrEqual(value, t->atom->valuep))) {
2552 ret = 0;
2553 break;
2554 }
2555 if ((count >= counter->min) &&
2556 (count < counter->max) &&
2557 (xmlStrEqual(value, t->atom->valuep))) {
2558 ret = 1;
2559 break;
2560 }
2561 }
2562 }
2563 } else if (trans->count == REGEXP_ALL_COUNTER) {
Daniel Veillard8a001f62002-04-20 07:24:11 +00002564 int i;
2565 int count;
2566 xmlRegTransPtr t;
2567 xmlRegCounterPtr counter;
2568
2569 ret = 1;
2570
2571#ifdef DEBUG_PUSH
2572 printf("testing all %d\n", trans->count);
2573#endif
2574 /*
2575 * Check all counted transitions from the current state
2576 */
2577 for (i = 0;i < exec->state->nbTrans;i++) {
2578 t = &exec->state->trans[i];
2579 if ((t->counter < 0) || (t == trans))
2580 continue;
2581 counter = &exec->comp->counters[t->counter];
2582 count = exec->counts[t->counter];
2583 if ((count < counter->min) || (count > counter->max)) {
2584 ret = 0;
2585 break;
2586 }
2587 }
2588 } else if (trans->count >= 0) {
Daniel Veillard4255d502002-04-16 15:50:10 +00002589 int count;
2590 xmlRegCounterPtr counter;
2591
2592 /*
2593 * A counted transition.
2594 */
2595
2596 count = exec->counts[trans->count];
2597 counter = &exec->comp->counters[trans->count];
2598#ifdef DEBUG_PUSH
2599 printf("testing count %d: val %d, min %d, max %d\n",
2600 trans->count, count, counter->min, counter->max);
2601#endif
2602 ret = ((count >= counter->min) && (count <= counter->max));
2603 } else if (atom == NULL) {
2604 fprintf(stderr, "epsilon transition left at runtime\n");
2605 exec->status = -2;
2606 break;
2607 } else if (value != NULL) {
2608 ret = xmlStrEqual(value, atom->valuep);
Daniel Veillard441bc322002-04-20 17:38:48 +00002609 if ((ret == 1) && (trans->counter >= 0)) {
2610 xmlRegCounterPtr counter;
2611 int count;
2612
2613 count = exec->counts[trans->counter];
2614 counter = &exec->comp->counters[trans->counter];
2615 if (count >= counter->max)
2616 ret = 0;
2617 }
2618
Daniel Veillard4255d502002-04-16 15:50:10 +00002619 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2620 xmlRegStatePtr to = exec->comp->states[trans->to];
2621
2622 /*
2623 * this is a multiple input sequence
2624 */
2625 if (exec->state->nbTrans > exec->transno + 1) {
2626 if (exec->inputStackNr <= 0) {
2627 xmlFARegExecSaveInputString(exec, value, data);
2628 }
2629 xmlFARegExecSave(exec);
2630 }
2631 exec->transcount = 1;
2632 do {
2633 /*
2634 * Try to progress as much as possible on the input
2635 */
2636 if (exec->transcount == atom->max) {
2637 break;
2638 }
2639 exec->index++;
2640 value = exec->inputStack[exec->index].value;
2641 data = exec->inputStack[exec->index].data;
2642#ifdef DEBUG_PUSH
2643 printf("value loaded: %s\n", value);
2644#endif
2645
2646 /*
2647 * End of input: stop here
2648 */
2649 if (value == NULL) {
2650 exec->index --;
2651 break;
2652 }
2653 if (exec->transcount >= atom->min) {
2654 int transno = exec->transno;
2655 xmlRegStatePtr state = exec->state;
2656
2657 /*
2658 * The transition is acceptable save it
2659 */
2660 exec->transno = -1; /* trick */
2661 exec->state = to;
2662 if (exec->inputStackNr <= 0) {
2663 xmlFARegExecSaveInputString(exec, value, data);
2664 }
2665 xmlFARegExecSave(exec);
2666 exec->transno = transno;
2667 exec->state = state;
2668 }
2669 ret = xmlStrEqual(value, atom->valuep);
2670 exec->transcount++;
2671 } while (ret == 1);
2672 if (exec->transcount < atom->min)
2673 ret = 0;
2674
2675 /*
2676 * If the last check failed but one transition was found
2677 * possible, rollback
2678 */
2679 if (ret < 0)
2680 ret = 0;
2681 if (ret == 0) {
2682 goto rollback;
2683 }
2684 }
2685 }
2686 if (ret == 1) {
2687 if ((exec->callback != NULL) && (atom != NULL)) {
2688 exec->callback(exec->data, atom->valuep,
2689 atom->data, data);
2690 }
2691 if (exec->state->nbTrans > exec->transno + 1) {
2692 if (exec->inputStackNr <= 0) {
2693 xmlFARegExecSaveInputString(exec, value, data);
2694 }
2695 xmlFARegExecSave(exec);
2696 }
2697 if (trans->counter >= 0) {
2698#ifdef DEBUG_PUSH
2699 printf("Increasing count %d\n", trans->counter);
2700#endif
2701 exec->counts[trans->counter]++;
2702 }
2703#ifdef DEBUG_PUSH
2704 printf("entering state %d\n", trans->to);
2705#endif
2706 exec->state = exec->comp->states[trans->to];
2707 exec->transno = 0;
2708 if (trans->atom != NULL) {
2709 if (exec->inputStack != NULL) {
2710 exec->index++;
2711 if (exec->index < exec->inputStackNr) {
2712 value = exec->inputStack[exec->index].value;
2713 data = exec->inputStack[exec->index].data;
2714#ifdef DEBUG_PUSH
2715 printf("value loaded: %s\n", value);
2716#endif
2717 } else {
2718 value = NULL;
2719 data = NULL;
2720#ifdef DEBUG_PUSH
2721 printf("end of input\n");
2722#endif
2723 }
2724 } else {
2725 value = NULL;
2726 data = NULL;
2727#ifdef DEBUG_PUSH
2728 printf("end of input\n");
2729#endif
2730 }
2731 }
2732 goto progress;
2733 } else if (ret < 0) {
2734 exec->status = -4;
2735 break;
2736 }
2737 }
2738 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2739rollback:
2740 /*
2741 * Failed to find a way out
2742 */
2743 exec->determinist = 0;
2744 xmlFARegExecRollBack(exec);
2745 if (exec->status == 0) {
2746 value = exec->inputStack[exec->index].value;
2747 data = exec->inputStack[exec->index].data;
2748#ifdef DEBUG_PUSH
2749 printf("value loaded: %s\n", value);
2750#endif
2751 }
2752 }
2753progress:
2754 continue;
2755 }
2756 if (exec->status == 0) {
2757 return(exec->state->type == XML_REGEXP_FINAL_STATE);
2758 }
2759 return(exec->status);
2760}
2761
Daniel Veillard52b48c72003-04-13 19:53:42 +00002762/**
2763 * xmlRegExecPushString2:
2764 * @exec: a regexp execution context or NULL to indicate the end
2765 * @value: the first string token input
2766 * @value2: the second string token input
2767 * @data: data associated to the token to reuse in callbacks
2768 *
2769 * Push one input token in the execution context
2770 *
2771 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2772 * a negative value in case of error.
2773 */
2774int
2775xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
2776 const xmlChar *value2, void *data) {
2777 xmlChar buf[150];
2778 int lenn, lenp, ret;
2779 xmlChar *str;
2780
2781 if (exec == NULL)
2782 return(-1);
2783 if (exec->comp == NULL)
2784 return(-1);
2785 if (exec->status != 0)
2786 return(exec->status);
2787
2788 if (value2 == NULL)
2789 return(xmlRegExecPushString(exec, value, data));
2790
2791 lenn = strlen((char *) value2);
2792 lenp = strlen((char *) value);
2793
2794 if (150 < lenn + lenp + 2) {
Daniel Veillard3c908dc2003-04-19 00:07:51 +00002795 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00002796 if (str == NULL) {
2797 exec->status = -1;
2798 return(-1);
2799 }
2800 } else {
2801 str = buf;
2802 }
2803 memcpy(&str[0], value, lenp);
2804 str[lenp] = '|';
2805 memcpy(&str[lenp + 1], value2, lenn);
2806 str[lenn + lenp + 1] = 0;
2807
2808 if (exec->comp->compact != NULL)
2809 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
2810 else
2811 ret = xmlRegExecPushString(exec, str, data);
2812
2813 if (str != buf)
2814 xmlFree(buf);
2815 return(ret);
2816}
2817
Daniel Veillard4255d502002-04-16 15:50:10 +00002818#if 0
2819static int
2820xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
2821 xmlRegTransPtr trans;
2822 xmlRegAtomPtr atom;
2823 int ret;
2824 int codepoint, len;
2825
2826 if (exec == NULL)
2827 return(-1);
2828 if (exec->status != 0)
2829 return(exec->status);
2830
2831 while ((exec->status == 0) &&
2832 ((exec->inputString[exec->index] != 0) ||
2833 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2834
2835 /*
2836 * End of input on non-terminal state, rollback, however we may
2837 * still have epsilon like transition for counted transitions
2838 * on counters, in that case don't break too early.
2839 */
2840 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2841 goto rollback;
2842
2843 exec->transcount = 0;
2844 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2845 trans = &exec->state->trans[exec->transno];
2846 if (trans->to < 0)
2847 continue;
2848 atom = trans->atom;
2849 ret = 0;
2850 if (trans->count >= 0) {
2851 int count;
2852 xmlRegCounterPtr counter;
2853
2854 /*
2855 * A counted transition.
2856 */
2857
2858 count = exec->counts[trans->count];
2859 counter = &exec->comp->counters[trans->count];
2860#ifdef DEBUG_REGEXP_EXEC
2861 printf("testing count %d: val %d, min %d, max %d\n",
2862 trans->count, count, counter->min, counter->max);
2863#endif
2864 ret = ((count >= counter->min) && (count <= counter->max));
2865 } else if (atom == NULL) {
2866 fprintf(stderr, "epsilon transition left at runtime\n");
2867 exec->status = -2;
2868 break;
2869 } else if (exec->inputString[exec->index] != 0) {
2870 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2871 ret = xmlRegCheckCharacter(atom, codepoint);
2872 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2873 xmlRegStatePtr to = exec->comp->states[trans->to];
2874
2875 /*
2876 * this is a multiple input sequence
2877 */
2878 if (exec->state->nbTrans > exec->transno + 1) {
2879 xmlFARegExecSave(exec);
2880 }
2881 exec->transcount = 1;
2882 do {
2883 /*
2884 * Try to progress as much as possible on the input
2885 */
2886 if (exec->transcount == atom->max) {
2887 break;
2888 }
2889 exec->index += len;
2890 /*
2891 * End of input: stop here
2892 */
2893 if (exec->inputString[exec->index] == 0) {
2894 exec->index -= len;
2895 break;
2896 }
2897 if (exec->transcount >= atom->min) {
2898 int transno = exec->transno;
2899 xmlRegStatePtr state = exec->state;
2900
2901 /*
2902 * The transition is acceptable save it
2903 */
2904 exec->transno = -1; /* trick */
2905 exec->state = to;
2906 xmlFARegExecSave(exec);
2907 exec->transno = transno;
2908 exec->state = state;
2909 }
2910 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2911 len);
2912 ret = xmlRegCheckCharacter(atom, codepoint);
2913 exec->transcount++;
2914 } while (ret == 1);
2915 if (exec->transcount < atom->min)
2916 ret = 0;
2917
2918 /*
2919 * If the last check failed but one transition was found
2920 * possible, rollback
2921 */
2922 if (ret < 0)
2923 ret = 0;
2924 if (ret == 0) {
2925 goto rollback;
2926 }
2927 }
2928 }
2929 if (ret == 1) {
2930 if (exec->state->nbTrans > exec->transno + 1) {
2931 xmlFARegExecSave(exec);
2932 }
2933 if (trans->counter >= 0) {
2934#ifdef DEBUG_REGEXP_EXEC
2935 printf("Increasing count %d\n", trans->counter);
2936#endif
2937 exec->counts[trans->counter]++;
2938 }
2939#ifdef DEBUG_REGEXP_EXEC
2940 printf("entering state %d\n", trans->to);
2941#endif
2942 exec->state = exec->comp->states[trans->to];
2943 exec->transno = 0;
2944 if (trans->atom != NULL) {
2945 exec->index += len;
2946 }
2947 goto progress;
2948 } else if (ret < 0) {
2949 exec->status = -4;
2950 break;
2951 }
2952 }
2953 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2954rollback:
2955 /*
2956 * Failed to find a way out
2957 */
2958 exec->determinist = 0;
2959 xmlFARegExecRollBack(exec);
2960 }
2961progress:
2962 continue;
2963 }
2964}
2965#endif
2966/************************************************************************
2967 * *
2968 * Parser for the Shemas Datatype Regular Expressions *
2969 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
2970 * *
2971 ************************************************************************/
2972
2973/**
2974 * xmlFAIsChar:
Daniel Veillard441bc322002-04-20 17:38:48 +00002975 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00002976 *
2977 * [10] Char ::= [^.\?*+()|#x5B#x5D]
2978 */
2979static int
2980xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
2981 int cur;
2982 int len;
2983
2984 cur = CUR_SCHAR(ctxt->cur, len);
2985 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
2986 (cur == '*') || (cur == '+') || (cur == '(') ||
2987 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
2988 (cur == 0x5D) || (cur == 0))
2989 return(-1);
2990 return(cur);
2991}
2992
2993/**
2994 * xmlFAParseCharProp:
Daniel Veillard441bc322002-04-20 17:38:48 +00002995 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00002996 *
2997 * [27] charProp ::= IsCategory | IsBlock
2998 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
2999 * Separators | Symbols | Others
3000 * [29] Letters ::= 'L' [ultmo]?
3001 * [30] Marks ::= 'M' [nce]?
3002 * [31] Numbers ::= 'N' [dlo]?
3003 * [32] Punctuation ::= 'P' [cdseifo]?
3004 * [33] Separators ::= 'Z' [slp]?
3005 * [34] Symbols ::= 'S' [mcko]?
3006 * [35] Others ::= 'C' [cfon]?
3007 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
3008 */
3009static void
3010xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
3011 int cur;
3012 xmlRegAtomType type = 0;
3013 xmlChar *blockName = NULL;
3014
3015 cur = CUR;
3016 if (cur == 'L') {
3017 NEXT;
3018 cur = CUR;
3019 if (cur == 'u') {
3020 NEXT;
3021 type = XML_REGEXP_LETTER_UPPERCASE;
3022 } else if (cur == 'l') {
3023 NEXT;
3024 type = XML_REGEXP_LETTER_LOWERCASE;
3025 } else if (cur == 't') {
3026 NEXT;
3027 type = XML_REGEXP_LETTER_TITLECASE;
3028 } else if (cur == 'm') {
3029 NEXT;
3030 type = XML_REGEXP_LETTER_MODIFIER;
3031 } else if (cur == 'o') {
3032 NEXT;
3033 type = XML_REGEXP_LETTER_OTHERS;
3034 } else {
3035 type = XML_REGEXP_LETTER;
3036 }
3037 } else if (cur == 'M') {
3038 NEXT;
3039 cur = CUR;
3040 if (cur == 'n') {
3041 NEXT;
3042 /* nonspacing */
3043 type = XML_REGEXP_MARK_NONSPACING;
3044 } else if (cur == 'c') {
3045 NEXT;
3046 /* spacing combining */
3047 type = XML_REGEXP_MARK_SPACECOMBINING;
3048 } else if (cur == 'e') {
3049 NEXT;
3050 /* enclosing */
3051 type = XML_REGEXP_MARK_ENCLOSING;
3052 } else {
3053 /* all marks */
3054 type = XML_REGEXP_MARK;
3055 }
3056 } else if (cur == 'N') {
3057 NEXT;
3058 cur = CUR;
3059 if (cur == 'd') {
3060 NEXT;
3061 /* digital */
3062 type = XML_REGEXP_NUMBER_DECIMAL;
3063 } else if (cur == 'l') {
3064 NEXT;
3065 /* letter */
3066 type = XML_REGEXP_NUMBER_LETTER;
3067 } else if (cur == 'o') {
3068 NEXT;
3069 /* other */
3070 type = XML_REGEXP_NUMBER_OTHERS;
3071 } else {
3072 /* all numbers */
3073 type = XML_REGEXP_NUMBER;
3074 }
3075 } else if (cur == 'P') {
3076 NEXT;
3077 cur = CUR;
3078 if (cur == 'c') {
3079 NEXT;
3080 /* connector */
3081 type = XML_REGEXP_PUNCT_CONNECTOR;
3082 } else if (cur == 'd') {
3083 NEXT;
3084 /* dash */
3085 type = XML_REGEXP_PUNCT_DASH;
3086 } else if (cur == 's') {
3087 NEXT;
3088 /* open */
3089 type = XML_REGEXP_PUNCT_OPEN;
3090 } else if (cur == 'e') {
3091 NEXT;
3092 /* close */
3093 type = XML_REGEXP_PUNCT_CLOSE;
3094 } else if (cur == 'i') {
3095 NEXT;
3096 /* initial quote */
3097 type = XML_REGEXP_PUNCT_INITQUOTE;
3098 } else if (cur == 'f') {
3099 NEXT;
3100 /* final quote */
3101 type = XML_REGEXP_PUNCT_FINQUOTE;
3102 } else if (cur == 'o') {
3103 NEXT;
3104 /* other */
3105 type = XML_REGEXP_PUNCT_OTHERS;
3106 } else {
3107 /* all punctuation */
3108 type = XML_REGEXP_PUNCT;
3109 }
3110 } else if (cur == 'Z') {
3111 NEXT;
3112 cur = CUR;
3113 if (cur == 's') {
3114 NEXT;
3115 /* space */
3116 type = XML_REGEXP_SEPAR_SPACE;
3117 } else if (cur == 'l') {
3118 NEXT;
3119 /* line */
3120 type = XML_REGEXP_SEPAR_LINE;
3121 } else if (cur == 'p') {
3122 NEXT;
3123 /* paragraph */
3124 type = XML_REGEXP_SEPAR_PARA;
3125 } else {
3126 /* all separators */
3127 type = XML_REGEXP_SEPAR;
3128 }
3129 } else if (cur == 'S') {
3130 NEXT;
3131 cur = CUR;
3132 if (cur == 'm') {
3133 NEXT;
3134 type = XML_REGEXP_SYMBOL_MATH;
3135 /* math */
3136 } else if (cur == 'c') {
3137 NEXT;
3138 type = XML_REGEXP_SYMBOL_CURRENCY;
3139 /* currency */
3140 } else if (cur == 'k') {
3141 NEXT;
3142 type = XML_REGEXP_SYMBOL_MODIFIER;
3143 /* modifiers */
3144 } else if (cur == 'o') {
3145 NEXT;
3146 type = XML_REGEXP_SYMBOL_OTHERS;
3147 /* other */
3148 } else {
3149 /* all symbols */
3150 type = XML_REGEXP_SYMBOL;
3151 }
3152 } else if (cur == 'C') {
3153 NEXT;
3154 cur = CUR;
3155 if (cur == 'c') {
3156 NEXT;
3157 /* control */
3158 type = XML_REGEXP_OTHER_CONTROL;
3159 } else if (cur == 'f') {
3160 NEXT;
3161 /* format */
3162 type = XML_REGEXP_OTHER_FORMAT;
3163 } else if (cur == 'o') {
3164 NEXT;
3165 /* private use */
3166 type = XML_REGEXP_OTHER_PRIVATE;
3167 } else if (cur == 'n') {
3168 NEXT;
3169 /* not assigned */
3170 type = XML_REGEXP_OTHER_NA;
3171 } else {
3172 /* all others */
3173 type = XML_REGEXP_OTHER;
3174 }
3175 } else if (cur == 'I') {
3176 const xmlChar *start;
3177 NEXT;
3178 cur = CUR;
3179 if (cur != 's') {
3180 ERROR("IsXXXX expected");
3181 return;
3182 }
3183 NEXT;
3184 start = ctxt->cur;
3185 cur = CUR;
3186 if (((cur >= 'a') && (cur <= 'z')) ||
3187 ((cur >= 'A') && (cur <= 'Z')) ||
3188 ((cur >= '0') && (cur <= '9')) ||
3189 (cur == 0x2D)) {
3190 NEXT;
3191 cur = CUR;
3192 while (((cur >= 'a') && (cur <= 'z')) ||
3193 ((cur >= 'A') && (cur <= 'Z')) ||
3194 ((cur >= '0') && (cur <= '9')) ||
3195 (cur == 0x2D)) {
3196 NEXT;
3197 cur = CUR;
3198 }
3199 }
3200 type = XML_REGEXP_BLOCK_NAME;
3201 blockName = xmlStrndup(start, ctxt->cur - start);
3202 } else {
3203 ERROR("Unknown char property");
3204 return;
3205 }
3206 if (ctxt->atom == NULL) {
3207 ctxt->atom = xmlRegNewAtom(ctxt, type);
3208 if (ctxt->atom != NULL)
3209 ctxt->atom->valuep = blockName;
3210 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3211 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3212 type, 0, 0, blockName);
3213 }
3214}
3215
3216/**
3217 * xmlFAParseCharClassEsc:
Daniel Veillard441bc322002-04-20 17:38:48 +00003218 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003219 *
3220 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3221 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3222 * [25] catEsc ::= '\p{' charProp '}'
3223 * [26] complEsc ::= '\P{' charProp '}'
3224 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3225 */
3226static void
3227xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
3228 int cur;
3229
3230 if (CUR == '.') {
3231 if (ctxt->atom == NULL) {
3232 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
3233 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3234 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3235 XML_REGEXP_ANYCHAR, 0, 0, NULL);
3236 }
3237 NEXT;
3238 return;
3239 }
3240 if (CUR != '\\') {
3241 ERROR("Escaped sequence: expecting \\");
3242 return;
3243 }
3244 NEXT;
3245 cur = CUR;
3246 if (cur == 'p') {
3247 NEXT;
3248 if (CUR != '{') {
3249 ERROR("Expecting '{'");
3250 return;
3251 }
3252 NEXT;
3253 xmlFAParseCharProp(ctxt);
3254 if (CUR != '}') {
3255 ERROR("Expecting '}'");
3256 return;
3257 }
3258 NEXT;
3259 } else if (cur == 'P') {
3260 NEXT;
3261 if (CUR != '{') {
3262 ERROR("Expecting '{'");
3263 return;
3264 }
3265 NEXT;
3266 xmlFAParseCharProp(ctxt);
3267 ctxt->atom->neg = 1;
3268 if (CUR != '}') {
3269 ERROR("Expecting '}'");
3270 return;
3271 }
3272 NEXT;
3273 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
3274 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
3275 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
3276 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
3277 (cur == 0x5E)) {
3278 if (ctxt->atom == NULL) {
3279 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3280 if (ctxt->atom != NULL)
3281 ctxt->atom->codepoint = cur;
3282 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3283 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3284 XML_REGEXP_CHARVAL, cur, cur, NULL);
3285 }
3286 NEXT;
3287 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
3288 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
3289 (cur == 'w') || (cur == 'W')) {
Daniel Veillardb509f152002-04-17 16:28:10 +00003290 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
Daniel Veillard4255d502002-04-16 15:50:10 +00003291
3292 switch (cur) {
3293 case 's':
3294 type = XML_REGEXP_ANYSPACE;
3295 break;
3296 case 'S':
3297 type = XML_REGEXP_NOTSPACE;
3298 break;
3299 case 'i':
3300 type = XML_REGEXP_INITNAME;
3301 break;
3302 case 'I':
3303 type = XML_REGEXP_NOTINITNAME;
3304 break;
3305 case 'c':
3306 type = XML_REGEXP_NAMECHAR;
3307 break;
3308 case 'C':
3309 type = XML_REGEXP_NOTNAMECHAR;
3310 break;
3311 case 'd':
3312 type = XML_REGEXP_DECIMAL;
3313 break;
3314 case 'D':
3315 type = XML_REGEXP_NOTDECIMAL;
3316 break;
3317 case 'w':
3318 type = XML_REGEXP_REALCHAR;
3319 break;
3320 case 'W':
3321 type = XML_REGEXP_NOTREALCHAR;
3322 break;
3323 }
3324 NEXT;
3325 if (ctxt->atom == NULL) {
3326 ctxt->atom = xmlRegNewAtom(ctxt, type);
3327 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3328 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3329 type, 0, 0, NULL);
3330 }
3331 }
3332}
3333
3334/**
3335 * xmlFAParseCharRef:
Daniel Veillard441bc322002-04-20 17:38:48 +00003336 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003337 *
3338 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3339 */
3340static int
3341xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
3342 int ret = 0, cur;
3343
3344 if ((CUR != '&') || (NXT(1) != '#'))
3345 return(-1);
3346 NEXT;
3347 NEXT;
3348 cur = CUR;
3349 if (cur == 'x') {
3350 NEXT;
3351 cur = CUR;
3352 if (((cur >= '0') && (cur <= '9')) ||
3353 ((cur >= 'a') && (cur <= 'f')) ||
3354 ((cur >= 'A') && (cur <= 'F'))) {
3355 while (((cur >= '0') && (cur <= '9')) ||
3356 ((cur >= 'A') && (cur <= 'F'))) {
3357 if ((cur >= '0') && (cur <= '9'))
3358 ret = ret * 16 + cur - '0';
3359 else if ((cur >= 'a') && (cur <= 'f'))
3360 ret = ret * 16 + 10 + (cur - 'a');
3361 else
3362 ret = ret * 16 + 10 + (cur - 'A');
3363 NEXT;
3364 cur = CUR;
3365 }
3366 } else {
3367 ERROR("Char ref: expecting [0-9A-F]");
3368 return(-1);
3369 }
3370 } else {
3371 if ((cur >= '0') && (cur <= '9')) {
3372 while ((cur >= '0') && (cur <= '9')) {
3373 ret = ret * 10 + cur - '0';
3374 NEXT;
3375 cur = CUR;
3376 }
3377 } else {
3378 ERROR("Char ref: expecting [0-9]");
3379 return(-1);
3380 }
3381 }
3382 if (cur != ';') {
3383 ERROR("Char ref: expecting ';'");
3384 return(-1);
3385 } else {
3386 NEXT;
3387 }
3388 return(ret);
3389}
3390
3391/**
3392 * xmlFAParseCharRange:
Daniel Veillard441bc322002-04-20 17:38:48 +00003393 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003394 *
3395 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
3396 * [18] seRange ::= charOrEsc '-' charOrEsc
3397 * [20] charOrEsc ::= XmlChar | SingleCharEsc
3398 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
3399 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
3400 */
3401static void
3402xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
3403 int cur;
3404 int start = -1;
3405 int end = -1;
3406
3407 if ((CUR == '&') && (NXT(1) == '#')) {
3408 end = start = xmlFAParseCharRef(ctxt);
3409 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3410 XML_REGEXP_CHARVAL, start, end, NULL);
3411 return;
3412 }
3413 cur = CUR;
3414 if (cur == '\\') {
3415 NEXT;
3416 cur = CUR;
3417 switch (cur) {
3418 case 'n': start = 0xA; break;
3419 case 'r': start = 0xD; break;
3420 case 't': start = 0x9; break;
3421 case '\\': case '|': case '.': case '-': case '^': case '?':
3422 case '*': case '+': case '{': case '}': case '(': case ')':
3423 case '[': case ']':
3424 start = cur; break;
3425 default:
3426 ERROR("Invalid escape value");
3427 return;
3428 }
3429 end = start;
3430 } else if ((cur != 0x5B) && (cur != 0x5D)) {
3431 end = start = cur;
3432 } else {
3433 ERROR("Expecting a char range");
3434 return;
3435 }
3436 NEXT;
3437 if (start == '-') {
3438 return;
3439 }
3440 cur = CUR;
3441 if (cur != '-') {
3442 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3443 XML_REGEXP_CHARVAL, start, end, NULL);
3444 return;
3445 }
3446 NEXT;
3447 cur = CUR;
3448 if (cur == '\\') {
3449 NEXT;
3450 cur = CUR;
3451 switch (cur) {
3452 case 'n': end = 0xA; break;
3453 case 'r': end = 0xD; break;
3454 case 't': end = 0x9; break;
3455 case '\\': case '|': case '.': case '-': case '^': case '?':
3456 case '*': case '+': case '{': case '}': case '(': case ')':
3457 case '[': case ']':
3458 end = cur; break;
3459 default:
3460 ERROR("Invalid escape value");
3461 return;
3462 }
3463 } else if ((cur != 0x5B) && (cur != 0x5D)) {
3464 end = cur;
3465 } else {
3466 ERROR("Expecting the end of a char range");
3467 return;
3468 }
3469 NEXT;
3470 /* TODO check that the values are acceptable character ranges for XML */
3471 if (end < start) {
3472 ERROR("End of range is before start of range");
3473 } else {
3474 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3475 XML_REGEXP_CHARVAL, start, end, NULL);
3476 }
3477 return;
3478}
3479
3480/**
3481 * xmlFAParsePosCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003482 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003483 *
3484 * [14] posCharGroup ::= ( charRange | charClassEsc )+
3485 */
3486static void
3487xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
3488 do {
3489 if ((CUR == '\\') || (CUR == '.')) {
3490 xmlFAParseCharClassEsc(ctxt);
3491 } else {
3492 xmlFAParseCharRange(ctxt);
3493 }
3494 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
3495 (ctxt->error == 0));
3496}
3497
3498/**
3499 * xmlFAParseCharGroup:
Daniel Veillard441bc322002-04-20 17:38:48 +00003500 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003501 *
3502 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
3503 * [15] negCharGroup ::= '^' posCharGroup
3504 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
3505 * [12] charClassExpr ::= '[' charGroup ']'
3506 */
3507static void
3508xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
3509 int n = ctxt->neg;
3510 while ((CUR != ']') && (ctxt->error == 0)) {
3511 if (CUR == '^') {
3512 int neg = ctxt->neg;
3513
3514 NEXT;
3515 ctxt->neg = !ctxt->neg;
3516 xmlFAParsePosCharGroup(ctxt);
3517 ctxt->neg = neg;
3518 } else if (CUR == '-') {
3519 NEXT;
3520 ctxt->neg = !ctxt->neg;
3521 if (CUR != '[') {
3522 ERROR("charClassExpr: '[' expected");
3523 break;
3524 }
3525 NEXT;
3526 xmlFAParseCharGroup(ctxt);
3527 if (CUR == ']') {
3528 NEXT;
3529 } else {
3530 ERROR("charClassExpr: ']' expected");
3531 break;
3532 }
3533 break;
3534 } else if (CUR != ']') {
3535 xmlFAParsePosCharGroup(ctxt);
3536 }
3537 }
3538 ctxt->neg = n;
3539}
3540
3541/**
3542 * xmlFAParseCharClass:
Daniel Veillard441bc322002-04-20 17:38:48 +00003543 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003544 *
3545 * [11] charClass ::= charClassEsc | charClassExpr
3546 * [12] charClassExpr ::= '[' charGroup ']'
3547 */
3548static void
3549xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
3550 if (CUR == '[') {
3551 NEXT;
3552 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
3553 if (ctxt->atom == NULL)
3554 return;
3555 xmlFAParseCharGroup(ctxt);
3556 if (CUR == ']') {
3557 NEXT;
3558 } else {
3559 ERROR("xmlFAParseCharClass: ']' expected");
3560 }
3561 } else {
3562 xmlFAParseCharClassEsc(ctxt);
3563 }
3564}
3565
3566/**
3567 * xmlFAParseQuantExact:
Daniel Veillard441bc322002-04-20 17:38:48 +00003568 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003569 *
3570 * [8] QuantExact ::= [0-9]+
3571 */
3572static int
3573xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
3574 int ret = 0;
3575 int ok = 0;
3576
3577 while ((CUR >= '0') && (CUR <= '9')) {
3578 ret = ret * 10 + (CUR - '0');
3579 ok = 1;
3580 NEXT;
3581 }
3582 if (ok != 1) {
3583 return(-1);
3584 }
3585 return(ret);
3586}
3587
3588/**
3589 * xmlFAParseQuantifier:
Daniel Veillard441bc322002-04-20 17:38:48 +00003590 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003591 *
3592 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
3593 * [5] quantity ::= quantRange | quantMin | QuantExact
3594 * [6] quantRange ::= QuantExact ',' QuantExact
3595 * [7] quantMin ::= QuantExact ','
3596 * [8] QuantExact ::= [0-9]+
3597 */
3598static int
3599xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
3600 int cur;
3601
3602 cur = CUR;
3603 if ((cur == '?') || (cur == '*') || (cur == '+')) {
3604 if (ctxt->atom != NULL) {
3605 if (cur == '?')
3606 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
3607 else if (cur == '*')
3608 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
3609 else if (cur == '+')
3610 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
3611 }
3612 NEXT;
3613 return(1);
3614 }
3615 if (cur == '{') {
3616 int min = 0, max = 0;
3617
3618 NEXT;
3619 cur = xmlFAParseQuantExact(ctxt);
3620 if (cur >= 0)
3621 min = cur;
3622 if (CUR == ',') {
3623 NEXT;
3624 cur = xmlFAParseQuantExact(ctxt);
3625 if (cur >= 0)
3626 max = cur;
3627 }
3628 if (CUR == '}') {
3629 NEXT;
3630 } else {
3631 ERROR("Unterminated quantifier");
3632 }
3633 if (max == 0)
3634 max = min;
3635 if (ctxt->atom != NULL) {
3636 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
3637 ctxt->atom->min = min;
3638 ctxt->atom->max = max;
3639 }
3640 return(1);
3641 }
3642 return(0);
3643}
3644
3645/**
3646 * xmlFAParseAtom:
Daniel Veillard441bc322002-04-20 17:38:48 +00003647 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003648 *
3649 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
3650 */
3651static int
3652xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
3653 int codepoint, len;
3654
3655 codepoint = xmlFAIsChar(ctxt);
3656 if (codepoint > 0) {
3657 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3658 if (ctxt->atom == NULL)
3659 return(-1);
3660 codepoint = CUR_SCHAR(ctxt->cur, len);
3661 ctxt->atom->codepoint = codepoint;
3662 NEXTL(len);
3663 return(1);
3664 } else if (CUR == '|') {
3665 return(0);
3666 } else if (CUR == 0) {
3667 return(0);
3668 } else if (CUR == ')') {
3669 return(0);
3670 } else if (CUR == '(') {
3671 xmlRegStatePtr start, oldend;
3672
3673 NEXT;
3674 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
3675 start = ctxt->state;
3676 oldend = ctxt->end;
3677 ctxt->end = NULL;
3678 ctxt->atom = NULL;
3679 xmlFAParseRegExp(ctxt, 0);
3680 if (CUR == ')') {
3681 NEXT;
3682 } else {
3683 ERROR("xmlFAParseAtom: expecting ')'");
3684 }
3685 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
3686 if (ctxt->atom == NULL)
3687 return(-1);
3688 ctxt->atom->start = start;
3689 ctxt->atom->stop = ctxt->state;
3690 ctxt->end = oldend;
3691 return(1);
3692 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
3693 xmlFAParseCharClass(ctxt);
3694 return(1);
3695 }
3696 return(0);
3697}
3698
3699/**
3700 * xmlFAParsePiece:
Daniel Veillard441bc322002-04-20 17:38:48 +00003701 * @ctxt: a regexp parser context
Daniel Veillard4255d502002-04-16 15:50:10 +00003702 *
3703 * [3] piece ::= atom quantifier?
3704 */
3705static int
3706xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
3707 int ret;
3708
3709 ctxt->atom = NULL;
3710 ret = xmlFAParseAtom(ctxt);
3711 if (ret == 0)
3712 return(0);
3713 if (ctxt->atom == NULL) {
3714 ERROR("internal: no atom generated");
3715 }
3716 xmlFAParseQuantifier(ctxt);
3717 return(1);
3718}
3719
3720/**
3721 * xmlFAParseBranch:
Daniel Veillard441bc322002-04-20 17:38:48 +00003722 * @ctxt: a regexp parser context
3723 * @first: is taht the first
Daniel Veillard4255d502002-04-16 15:50:10 +00003724 *
3725 * [2] branch ::= piece*
3726 */
3727static void
3728xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) {
3729 xmlRegStatePtr previous;
3730 xmlRegAtomPtr prevatom = NULL;
3731 int ret;
3732
3733 previous = ctxt->state;
3734 ret = xmlFAParsePiece(ctxt);
3735 if (ret != 0) {
3736 if (first) {
3737 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom);
3738 previous = ctxt->state;
3739 } else {
3740 prevatom = ctxt->atom;
3741 }
3742 ctxt->atom = NULL;
3743 }
3744 while ((ret != 0) && (ctxt->error == 0)) {
3745 ret = xmlFAParsePiece(ctxt);
3746 if (ret != 0) {
3747 if (first) {
3748 xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom);
3749 } else {
3750 xmlFAGenerateTransitions(ctxt, previous, NULL, prevatom);
3751 prevatom = ctxt->atom;
3752 }
3753 previous = ctxt->state;
3754 ctxt->atom = NULL;
3755 }
3756 }
3757 if (!first) {
3758 xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom);
3759 }
3760}
3761
3762/**
3763 * xmlFAParseRegExp:
Daniel Veillard441bc322002-04-20 17:38:48 +00003764 * @ctxt: a regexp parser context
3765 * @top: is that the top-level expressions ?
Daniel Veillard4255d502002-04-16 15:50:10 +00003766 *
3767 * [1] regExp ::= branch ( '|' branch )*
3768 */
3769static void
3770xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
3771 xmlRegStatePtr start, end, oldend;
3772
3773 oldend = ctxt->end;
3774
3775 start = ctxt->state;
3776 xmlFAParseBranch(ctxt, (ctxt->end == NULL));
3777 if (CUR != '|') {
3778 ctxt->end = ctxt->state;
3779 return;
3780 }
3781 end = ctxt->state;
3782 while ((CUR == '|') && (ctxt->error == 0)) {
3783 NEXT;
3784 ctxt->state = start;
3785 ctxt->end = end;
3786 xmlFAParseBranch(ctxt, 0);
3787 }
3788 if (!top)
3789 ctxt->end = oldend;
3790}
3791
3792/************************************************************************
3793 * *
3794 * The basic API *
3795 * *
3796 ************************************************************************/
3797
3798/**
3799 * xmlRegexpPrint:
3800 * @output: the file for the output debug
3801 * @regexp: the compiled regexp
3802 *
3803 * Print the content of the compiled regular expression
3804 */
3805void
3806xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
3807 int i;
3808
3809 fprintf(output, " regexp: ");
3810 if (regexp == NULL) {
3811 fprintf(output, "NULL\n");
3812 return;
3813 }
3814 fprintf(output, "'%s' ", regexp->string);
3815 fprintf(output, "\n");
3816 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
3817 for (i = 0;i < regexp->nbAtoms; i++) {
3818 fprintf(output, " %02d ", i);
3819 xmlRegPrintAtom(output, regexp->atoms[i]);
3820 }
3821 fprintf(output, "%d states:", regexp->nbStates);
3822 fprintf(output, "\n");
3823 for (i = 0;i < regexp->nbStates; i++) {
3824 xmlRegPrintState(output, regexp->states[i]);
3825 }
3826 fprintf(output, "%d counters:\n", regexp->nbCounters);
3827 for (i = 0;i < regexp->nbCounters; i++) {
3828 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
3829 regexp->counters[i].max);
3830 }
3831}
3832
3833/**
3834 * xmlRegexpCompile:
3835 * @regexp: a regular expression string
3836 *
3837 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
3838 * Appendix F and build an automata suitable for testing strings against
3839 * that regular expression
3840 *
3841 * Returns the compiled expression or NULL in case of error
3842 */
3843xmlRegexpPtr
3844xmlRegexpCompile(const xmlChar *regexp) {
3845 xmlRegexpPtr ret;
3846 xmlRegParserCtxtPtr ctxt;
3847
3848 ctxt = xmlRegNewParserCtxt(regexp);
3849 if (ctxt == NULL)
3850 return(NULL);
3851
3852 /* initialize the parser */
3853 ctxt->end = NULL;
3854 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
3855 xmlRegStatePush(ctxt, ctxt->start);
3856
3857 /* parse the expression building an automata */
3858 xmlFAParseRegExp(ctxt, 1);
3859 if (CUR != 0) {
3860 ERROR("xmlFAParseRegExp: extra characters");
3861 }
3862 ctxt->end = ctxt->state;
3863 ctxt->start->type = XML_REGEXP_START_STATE;
3864 ctxt->end->type = XML_REGEXP_FINAL_STATE;
3865
3866 /* remove the Epsilon except for counted transitions */
3867 xmlFAEliminateEpsilonTransitions(ctxt);
3868
3869
3870 if (ctxt->error != 0) {
3871 xmlRegFreeParserCtxt(ctxt);
3872 return(NULL);
3873 }
3874 ret = xmlRegEpxFromParse(ctxt);
3875 xmlRegFreeParserCtxt(ctxt);
3876 return(ret);
3877}
3878
3879/**
3880 * xmlRegexpExec:
3881 * @comp: the compiled regular expression
3882 * @content: the value to check against the regular expression
3883 *
3884 * Check if the regular expression generate the value
3885 *
3886 * Returns 1 if it matches, 0 if not and a negativa value in case of error
3887 */
3888int
3889xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
3890 if ((comp == NULL) || (content == NULL))
3891 return(-1);
3892 return(xmlFARegExec(comp, content));
3893}
3894
3895/**
Daniel Veillard23e73572002-09-19 19:56:43 +00003896 * xmlRegexpIsDeterminist:
3897 * @comp: the compiled regular expression
3898 *
3899 * Check if the regular expression is determinist
3900 *
3901 * Returns 1 if it yes, 0 if not and a negativa value in case of error
3902 */
3903int
3904xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
3905 xmlAutomataPtr am;
3906 int ret;
3907
3908 if (comp == NULL)
3909 return(-1);
3910 if (comp->determinist != -1)
3911 return(comp->determinist);
3912
3913 am = xmlNewAutomata();
Daniel Veillardbd9afb52002-09-25 22:25:35 +00003914 if (am->states != NULL) {
3915 int i;
3916
3917 for (i = 0;i < am->nbStates;i++)
3918 xmlRegFreeState(am->states[i]);
3919 xmlFree(am->states);
3920 }
Daniel Veillard23e73572002-09-19 19:56:43 +00003921 am->nbAtoms = comp->nbAtoms;
3922 am->atoms = comp->atoms;
3923 am->nbStates = comp->nbStates;
3924 am->states = comp->states;
3925 am->determinist = -1;
3926 ret = xmlFAComputesDeterminism(am);
3927 am->atoms = NULL;
3928 am->states = NULL;
3929 xmlFreeAutomata(am);
3930 return(ret);
3931}
3932
3933/**
Daniel Veillard4255d502002-04-16 15:50:10 +00003934 * xmlRegFreeRegexp:
3935 * @regexp: the regexp
3936 *
3937 * Free a regexp
3938 */
3939void
3940xmlRegFreeRegexp(xmlRegexpPtr regexp) {
3941 int i;
3942 if (regexp == NULL)
3943 return;
3944
3945 if (regexp->string != NULL)
3946 xmlFree(regexp->string);
3947 if (regexp->states != NULL) {
3948 for (i = 0;i < regexp->nbStates;i++)
3949 xmlRegFreeState(regexp->states[i]);
3950 xmlFree(regexp->states);
3951 }
3952 if (regexp->atoms != NULL) {
3953 for (i = 0;i < regexp->nbAtoms;i++)
3954 xmlRegFreeAtom(regexp->atoms[i]);
3955 xmlFree(regexp->atoms);
3956 }
3957 if (regexp->counters != NULL)
3958 xmlFree(regexp->counters);
Daniel Veillard23e73572002-09-19 19:56:43 +00003959 if (regexp->compact != NULL)
3960 xmlFree(regexp->compact);
Daniel Veillard118aed72002-09-24 14:13:13 +00003961 if (regexp->transdata != NULL)
3962 xmlFree(regexp->transdata);
Daniel Veillard23e73572002-09-19 19:56:43 +00003963 if (regexp->stringMap != NULL) {
3964 for (i = 0; i < regexp->nbstrings;i++)
3965 xmlFree(regexp->stringMap[i]);
3966 xmlFree(regexp->stringMap);
3967 }
3968
Daniel Veillard4255d502002-04-16 15:50:10 +00003969 xmlFree(regexp);
3970}
3971
3972#ifdef LIBXML_AUTOMATA_ENABLED
3973/************************************************************************
3974 * *
3975 * The Automata interface *
3976 * *
3977 ************************************************************************/
3978
3979/**
3980 * xmlNewAutomata:
3981 *
3982 * Create a new automata
3983 *
3984 * Returns the new object or NULL in case of failure
3985 */
3986xmlAutomataPtr
3987xmlNewAutomata(void) {
3988 xmlAutomataPtr ctxt;
3989
3990 ctxt = xmlRegNewParserCtxt(NULL);
3991 if (ctxt == NULL)
3992 return(NULL);
3993
3994 /* initialize the parser */
3995 ctxt->end = NULL;
3996 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
3997 xmlRegStatePush(ctxt, ctxt->start);
3998
3999 return(ctxt);
4000}
4001
4002/**
4003 * xmlFreeAutomata:
4004 * @am: an automata
4005 *
4006 * Free an automata
4007 */
4008void
4009xmlFreeAutomata(xmlAutomataPtr am) {
4010 if (am == NULL)
4011 return;
4012 xmlRegFreeParserCtxt(am);
4013}
4014
4015/**
4016 * xmlAutomataGetInitState:
4017 * @am: an automata
4018 *
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004019 * Initial state lookup
4020 *
Daniel Veillard4255d502002-04-16 15:50:10 +00004021 * Returns the initial state of the automata
4022 */
4023xmlAutomataStatePtr
4024xmlAutomataGetInitState(xmlAutomataPtr am) {
4025 if (am == NULL)
4026 return(NULL);
4027 return(am->start);
4028}
4029
4030/**
4031 * xmlAutomataSetFinalState:
4032 * @am: an automata
4033 * @state: a state in this automata
4034 *
4035 * Makes that state a final state
4036 *
4037 * Returns 0 or -1 in case of error
4038 */
4039int
4040xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
4041 if ((am == NULL) || (state == NULL))
4042 return(-1);
4043 state->type = XML_REGEXP_FINAL_STATE;
4044 return(0);
4045}
4046
4047/**
4048 * xmlAutomataNewTransition:
4049 * @am: an automata
4050 * @from: the starting point of the transition
4051 * @to: the target point of the transition or NULL
4052 * @token: the input string associated to that transition
4053 * @data: data passed to the callback function if the transition is activated
4054 *
4055 * If @to is NULL, this create first a new target state in the automata
4056 * and then adds a transition from the @from state to the target state
4057 * activated by the value of @token
4058 *
4059 * Returns the target state or NULL in case of error
4060 */
4061xmlAutomataStatePtr
4062xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
4063 xmlAutomataStatePtr to, const xmlChar *token,
4064 void *data) {
4065 xmlRegAtomPtr atom;
4066
4067 if ((am == NULL) || (from == NULL) || (token == NULL))
4068 return(NULL);
4069 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4070 atom->data = data;
4071 if (atom == NULL)
4072 return(NULL);
4073 atom->valuep = xmlStrdup(token);
4074
4075 xmlFAGenerateTransitions(am, from, to, atom);
4076 if (to == NULL)
4077 return(am->state);
4078 return(to);
4079}
4080
4081/**
Daniel Veillard52b48c72003-04-13 19:53:42 +00004082 * xmlAutomataNewTransition2:
4083 * @am: an automata
4084 * @from: the starting point of the transition
4085 * @to: the target point of the transition or NULL
4086 * @token: the first input string associated to that transition
4087 * @token2: the second input string associated to that transition
4088 * @data: data passed to the callback function if the transition is activated
4089 *
4090 * If @to is NULL, this create first a new target state in the automata
4091 * and then adds a transition from the @from state to the target state
4092 * activated by the value of @token
4093 *
4094 * Returns the target state or NULL in case of error
4095 */
4096xmlAutomataStatePtr
4097xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4098 xmlAutomataStatePtr to, const xmlChar *token,
4099 const xmlChar *token2, void *data) {
4100 xmlRegAtomPtr atom;
4101
4102 if ((am == NULL) || (from == NULL) || (token == NULL))
4103 return(NULL);
4104 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4105 atom->data = data;
4106 if (atom == NULL)
4107 return(NULL);
4108 if ((token2 == NULL) || (*token2 == 0)) {
4109 atom->valuep = xmlStrdup(token);
4110 } else {
4111 int lenn, lenp;
4112 xmlChar *str;
4113
4114 lenn = strlen((char *) token2);
4115 lenp = strlen((char *) token);
4116
Daniel Veillard3c908dc2003-04-19 00:07:51 +00004117 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
Daniel Veillard52b48c72003-04-13 19:53:42 +00004118 if (str == NULL) {
4119 xmlRegFreeAtom(atom);
4120 return(NULL);
4121 }
4122 memcpy(&str[0], token, lenp);
4123 str[lenp] = '|';
4124 memcpy(&str[lenp + 1], token2, lenn);
4125 str[lenn + lenp + 1] = 0;
4126
4127 atom->valuep = str;
4128 }
4129
4130 xmlFAGenerateTransitions(am, from, to, atom);
4131 if (to == NULL)
4132 return(am->state);
4133 return(to);
4134}
4135
4136/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004137 * xmlAutomataNewCountTrans:
4138 * @am: an automata
4139 * @from: the starting point of the transition
4140 * @to: the target point of the transition or NULL
4141 * @token: the input string associated to that transition
4142 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004143 * @max: the maximum successive occurences of token
4144 * @data: data associated to the transition
Daniel Veillard4255d502002-04-16 15:50:10 +00004145 *
4146 * If @to is NULL, this create first a new target state in the automata
4147 * and then adds a transition from the @from state to the target state
4148 * activated by a succession of input of value @token and whose number
4149 * is between @min and @max
4150 *
4151 * Returns the target state or NULL in case of error
4152 */
4153xmlAutomataStatePtr
4154xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4155 xmlAutomataStatePtr to, const xmlChar *token,
4156 int min, int max, void *data) {
4157 xmlRegAtomPtr atom;
4158
4159 if ((am == NULL) || (from == NULL) || (token == NULL))
4160 return(NULL);
4161 if (min < 0)
4162 return(NULL);
4163 if ((max < min) || (max < 1))
4164 return(NULL);
4165 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4166 if (atom == NULL)
4167 return(NULL);
4168 atom->valuep = xmlStrdup(token);
4169 atom->data = data;
4170 if (min == 0)
4171 atom->min = 1;
4172 else
4173 atom->min = min;
4174 atom->max = max;
4175
4176 xmlFAGenerateTransitions(am, from, to, atom);
4177 if (to == NULL)
4178 to = am->state;
4179 if (to == NULL)
4180 return(NULL);
4181 if (min == 0)
4182 xmlFAGenerateEpsilonTransition(am, from, to);
4183 return(to);
4184}
4185
4186/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004187 * xmlAutomataNewOnceTrans:
4188 * @am: an automata
4189 * @from: the starting point of the transition
4190 * @to: the target point of the transition or NULL
4191 * @token: the input string associated to that transition
4192 * @min: the minimum successive occurences of token
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004193 * @max: the maximum successive occurences of token
4194 * @data: data associated to the transition
Daniel Veillard7646b182002-04-20 06:41:40 +00004195 *
4196 * If @to is NULL, this create first a new target state in the automata
4197 * and then adds a transition from the @from state to the target state
4198 * activated by a succession of input of value @token and whose number
4199 * is between @min and @max, moreover that transistion can only be crossed
4200 * once.
4201 *
4202 * Returns the target state or NULL in case of error
4203 */
4204xmlAutomataStatePtr
4205xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4206 xmlAutomataStatePtr to, const xmlChar *token,
4207 int min, int max, void *data) {
4208 xmlRegAtomPtr atom;
4209 int counter;
4210
4211 if ((am == NULL) || (from == NULL) || (token == NULL))
4212 return(NULL);
4213 if (min < 1)
4214 return(NULL);
4215 if ((max < min) || (max < 1))
4216 return(NULL);
4217 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4218 if (atom == NULL)
4219 return(NULL);
4220 atom->valuep = xmlStrdup(token);
4221 atom->data = data;
4222 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4223 if (min == 0)
4224 atom->min = 1;
4225 else
4226 atom->min = min;
4227 atom->max = max;
4228 /*
4229 * associate a counter to the transition.
4230 */
4231 counter = xmlRegGetCounter(am);
4232 am->counters[counter].min = 1;
4233 am->counters[counter].max = 1;
4234
4235 /* xmlFAGenerateTransitions(am, from, to, atom); */
4236 if (to == NULL) {
4237 to = xmlRegNewState(am);
4238 xmlRegStatePush(am, to);
4239 }
4240 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4241 xmlRegAtomPush(am, atom);
4242 am->state = to;
4243 if (to == NULL)
4244 to = am->state;
4245 if (to == NULL)
4246 return(NULL);
4247 return(to);
4248}
4249
4250/**
Daniel Veillard4255d502002-04-16 15:50:10 +00004251 * xmlAutomataNewState:
4252 * @am: an automata
4253 *
4254 * Create a new disconnected state in the automata
4255 *
4256 * Returns the new state or NULL in case of error
4257 */
4258xmlAutomataStatePtr
4259xmlAutomataNewState(xmlAutomataPtr am) {
4260 xmlAutomataStatePtr to;
4261
4262 if (am == NULL)
4263 return(NULL);
4264 to = xmlRegNewState(am);
4265 xmlRegStatePush(am, to);
4266 return(to);
4267}
4268
4269/**
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004270 * xmlAutomataNewEpsilon:
Daniel Veillard4255d502002-04-16 15:50:10 +00004271 * @am: an automata
4272 * @from: the starting point of the transition
4273 * @to: the target point of the transition or NULL
4274 *
4275 * If @to is NULL, this create first a new target state in the automata
4276 * and then adds a an epsilon transition from the @from state to the
4277 * target state
4278 *
4279 * Returns the target state or NULL in case of error
4280 */
4281xmlAutomataStatePtr
4282xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
4283 xmlAutomataStatePtr to) {
4284 if ((am == NULL) || (from == NULL))
4285 return(NULL);
4286 xmlFAGenerateEpsilonTransition(am, from, to);
4287 if (to == NULL)
4288 return(am->state);
4289 return(to);
4290}
4291
Daniel Veillardb509f152002-04-17 16:28:10 +00004292/**
Daniel Veillard7646b182002-04-20 06:41:40 +00004293 * xmlAutomataNewAllTrans:
4294 * @am: an automata
4295 * @from: the starting point of the transition
4296 * @to: the target point of the transition or NULL
Daniel Veillarda9b66d02002-12-11 14:23:49 +00004297 * @lax: allow to transition if not all all transitions have been activated
Daniel Veillard7646b182002-04-20 06:41:40 +00004298 *
4299 * If @to is NULL, this create first a new target state in the automata
4300 * and then adds a an ALL transition from the @from state to the
4301 * target state. That transition is an epsilon transition allowed only when
4302 * all transitions from the @from node have been activated.
4303 *
4304 * Returns the target state or NULL in case of error
4305 */
4306xmlAutomataStatePtr
4307xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
Daniel Veillard441bc322002-04-20 17:38:48 +00004308 xmlAutomataStatePtr to, int lax) {
Daniel Veillard7646b182002-04-20 06:41:40 +00004309 if ((am == NULL) || (from == NULL))
4310 return(NULL);
Daniel Veillard441bc322002-04-20 17:38:48 +00004311 xmlFAGenerateAllTransition(am, from, to, lax);
Daniel Veillard7646b182002-04-20 06:41:40 +00004312 if (to == NULL)
4313 return(am->state);
4314 return(to);
4315}
4316
4317/**
Daniel Veillardb509f152002-04-17 16:28:10 +00004318 * xmlAutomataNewCounter:
4319 * @am: an automata
4320 * @min: the minimal value on the counter
4321 * @max: the maximal value on the counter
4322 *
4323 * Create a new counter
4324 *
4325 * Returns the counter number or -1 in case of error
4326 */
4327int
4328xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
4329 int ret;
4330
4331 if (am == NULL)
4332 return(-1);
4333
4334 ret = xmlRegGetCounter(am);
4335 if (ret < 0)
4336 return(-1);
4337 am->counters[ret].min = min;
4338 am->counters[ret].max = max;
4339 return(ret);
4340}
4341
4342/**
4343 * xmlAutomataNewCountedTrans:
4344 * @am: an automata
4345 * @from: the starting point of the transition
4346 * @to: the target point of the transition or NULL
4347 * @counter: the counter associated to that transition
4348 *
4349 * If @to is NULL, this create first a new target state in the automata
4350 * and then adds an epsilon transition from the @from state to the target state
4351 * which will increment the counter provided
4352 *
4353 * Returns the target state or NULL in case of error
4354 */
4355xmlAutomataStatePtr
4356xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4357 xmlAutomataStatePtr to, int counter) {
4358 if ((am == NULL) || (from == NULL) || (counter < 0))
4359 return(NULL);
4360 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
4361 if (to == NULL)
4362 return(am->state);
4363 return(to);
4364}
4365
4366/**
4367 * xmlAutomataNewCounterTrans:
4368 * @am: an automata
4369 * @from: the starting point of the transition
4370 * @to: the target point of the transition or NULL
4371 * @counter: the counter associated to that transition
4372 *
4373 * If @to is NULL, this create first a new target state in the automata
4374 * and then adds an epsilon transition from the @from state to the target state
4375 * which will be allowed only if the counter is within the right range.
4376 *
4377 * Returns the target state or NULL in case of error
4378 */
4379xmlAutomataStatePtr
4380xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4381 xmlAutomataStatePtr to, int counter) {
4382 if ((am == NULL) || (from == NULL) || (counter < 0))
4383 return(NULL);
4384 xmlFAGenerateCountedTransition(am, from, to, counter);
4385 if (to == NULL)
4386 return(am->state);
4387 return(to);
4388}
Daniel Veillard4255d502002-04-16 15:50:10 +00004389
4390/**
4391 * xmlAutomataCompile:
4392 * @am: an automata
4393 *
4394 * Compile the automata into a Reg Exp ready for being executed.
4395 * The automata should be free after this point.
4396 *
4397 * Returns the compiled regexp or NULL in case of error
4398 */
4399xmlRegexpPtr
4400xmlAutomataCompile(xmlAutomataPtr am) {
4401 xmlRegexpPtr ret;
4402
4403 xmlFAEliminateEpsilonTransitions(am);
Daniel Veillard23e73572002-09-19 19:56:43 +00004404 /* xmlFAComputesDeterminism(am); */
Daniel Veillard4255d502002-04-16 15:50:10 +00004405 ret = xmlRegEpxFromParse(am);
4406
4407 return(ret);
4408}
Daniel Veillarde19fc232002-04-22 16:01:24 +00004409
4410/**
4411 * xmlAutomataIsDeterminist:
4412 * @am: an automata
4413 *
4414 * Checks if an automata is determinist.
4415 *
4416 * Returns 1 if true, 0 if not, and -1 in case of error
4417 */
4418int
4419xmlAutomataIsDeterminist(xmlAutomataPtr am) {
4420 int ret;
4421
4422 if (am == NULL)
4423 return(-1);
4424
4425 ret = xmlFAComputesDeterminism(am);
4426 return(ret);
4427}
Daniel Veillard4255d502002-04-16 15:50:10 +00004428#endif /* LIBXML_AUTOMATA_ENABLED */
4429#endif /* LIBXML_REGEXP_ENABLED */