/*
Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
See the file COPYING for copying permission.
*/

/* Fallback used when the including file has not supplied IS_INVALID_CHAR:
   no multi-byte sequence is ever reported as invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Expands to the `case BT_LEADn:` arm of a byte-type switch for an n-byte
   sequence.  Relies on `enc` and `end` being in scope at the expansion site.
   - fewer than n bytes left before end: returns XML_TOK_PARTIAL_CHAR;
   - IS_INVALID_CHAR accepts the sequence as invalid: stores ptr in
     *nextTokPtr and returns XML_TOK_INVALID;
   - otherwise advances ptr by n and leaves the switch.
   n is used for token pasting, so it must be a literal 2, 3 or 4. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - (ptr) < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += n; \
      break;

/* Expands to the switch arms that reject bad input: the three multi-byte
   lead cases (via INVALID_LEAD_CASE) followed by BT_NONXML, BT_MALFORM and
   BT_TRAIL, all of which store ptr in *nextTokPtr and return
   XML_TOK_INVALID. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;

/* Expands to the `case BT_LEADn:` arm used while scanning the inside of a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain,
   returns XML_TOK_INVALID (with *nextTokPtr = ptr) when IS_NAME_CHAR rejects
   the n-byte sequence, and otherwise advances ptr by n.
   n is used for token pasting, so it must be a literal 2, 3 or 4. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NAME_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += n; \
     break;

/* Expands to all switch arms that accept one name character and advance ptr
   past it.  BT_NONASCII is first validated with IS_NAME_CHAR_MINBPC (on
   failure: *nextTokPtr = ptr, return XML_TOK_INVALID) and then shares the
   MINBPC advance with BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS.
   Multi-byte lead cases are handled by CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)

/* Expands to the `case BT_LEADn:` arm used for the first character of a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain,
   returns XML_TOK_INVALID (with *nextTokPtr = ptr) when IS_NMSTRT_CHAR
   rejects the n-byte sequence, and otherwise advances ptr by n.
   n is used for token pasting, so it must be a literal 2, 3 or 4. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += n; \
     break;

/* Expands to all switch arms that accept one name-start character and
   advance ptr past it.  BT_NONASCII is first validated with
   IS_NMSTRT_CHAR_MINBPC (on failure: *nextTokPtr = ptr, return
   XML_TOK_INVALID) and then shares the MINBPC advance with BT_NMSTRT and
   BT_HEX.  Multi-byte lead cases are handled by CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)

/* Name-decoration hook applied to every function defined in this file.
   When the including file has not defined PREFIX, identifiers are used
   unchanged. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif

88/* ptr points to character following "<!-" */
89
90static
91int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
92 const char **nextTokPtr)
93{
94 if (ptr != end) {
95 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
96 *nextTokPtr = ptr;
97 return XML_TOK_INVALID;
98 }
99 ptr += MINBPC(enc);
100 while (ptr != end) {
101 switch (BYTE_TYPE(enc, ptr)) {
102 INVALID_CASES(ptr, nextTokPtr)
103 case BT_MINUS:
104 if ((ptr += MINBPC(enc)) == end)
105 return XML_TOK_PARTIAL;
106 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
107 if ((ptr += MINBPC(enc)) == end)
108 return XML_TOK_PARTIAL;
109 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
110 *nextTokPtr = ptr;
111 return XML_TOK_INVALID;
112 }
113 *nextTokPtr = ptr + MINBPC(enc);
114 return XML_TOK_COMMENT;
115 }
116 break;
117 default:
118 ptr += MINBPC(enc);
119 break;
120 }
121 }
122 }
123 return XML_TOK_PARTIAL;
124}
125
126/* ptr points to character following "<!" */
127
128static
129int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
130 const char **nextTokPtr)
131{
132 if (ptr == end)
133 return XML_TOK_PARTIAL;
134 switch (BYTE_TYPE(enc, ptr)) {
135 case BT_MINUS:
136 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
137 case BT_LSQB:
138 *nextTokPtr = ptr + MINBPC(enc);
139 return XML_TOK_COND_SECT_OPEN;
140 case BT_NMSTRT:
141 case BT_HEX:
142 ptr += MINBPC(enc);
143 break;
144 default:
145 *nextTokPtr = ptr;
146 return XML_TOK_INVALID;
147 }
148 while (ptr != end) {
149 switch (BYTE_TYPE(enc, ptr)) {
150 case BT_PERCNT:
151 if (ptr + MINBPC(enc) == end)
152 return XML_TOK_PARTIAL;
153 /* don't allow <!ENTITY% foo "whatever"> */
154 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
155 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
156 *nextTokPtr = ptr;
157 return XML_TOK_INVALID;
158 }
159 /* fall through */
160 case BT_S: case BT_CR: case BT_LF:
161 *nextTokPtr = ptr;
162 return XML_TOK_DECL_OPEN;
163 case BT_NMSTRT:
164 case BT_HEX:
165 ptr += MINBPC(enc);
166 break;
167 default:
168 *nextTokPtr = ptr;
169 return XML_TOK_INVALID;
170 }
171 }
172 return XML_TOK_PARTIAL;
173}
174
175static
176int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
177{
178 int upper = 0;
179 *tokPtr = XML_TOK_PI;
180 if (end - ptr != MINBPC(enc)*3)
181 return 1;
182 switch (BYTE_TO_ASCII(enc, ptr)) {
183 case ASCII_x:
184 break;
185 case ASCII_X:
186 upper = 1;
187 break;
188 default:
189 return 1;
190 }
191 ptr += MINBPC(enc);
192 switch (BYTE_TO_ASCII(enc, ptr)) {
193 case ASCII_m:
194 break;
195 case ASCII_M:
196 upper = 1;
197 break;
198 default:
199 return 1;
200 }
201 ptr += MINBPC(enc);
202 switch (BYTE_TO_ASCII(enc, ptr)) {
203 case ASCII_l:
204 break;
205 case ASCII_L:
206 upper = 1;
207 break;
208 default:
209 return 1;
210 }
211 if (upper)
212 return 0;
213 *tokPtr = XML_TOK_XML_DECL;
214 return 1;
215}
216
217/* ptr points to character following "<?" */
218
219static
220int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
221 const char **nextTokPtr)
222{
223 int tok;
224 const char *target = ptr;
225 if (ptr == end)
226 return XML_TOK_PARTIAL;
227 switch (BYTE_TYPE(enc, ptr)) {
228 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
229 default:
230 *nextTokPtr = ptr;
231 return XML_TOK_INVALID;
232 }
233 while (ptr != end) {
234 switch (BYTE_TYPE(enc, ptr)) {
235 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
236 case BT_S: case BT_CR: case BT_LF:
237 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
238 *nextTokPtr = ptr;
239 return XML_TOK_INVALID;
240 }
241 ptr += MINBPC(enc);
242 while (ptr != end) {
243 switch (BYTE_TYPE(enc, ptr)) {
244 INVALID_CASES(ptr, nextTokPtr)
245 case BT_QUEST:
246 ptr += MINBPC(enc);
247 if (ptr == end)
248 return XML_TOK_PARTIAL;
249 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
250 *nextTokPtr = ptr + MINBPC(enc);
251 return tok;
252 }
253 break;
254 default:
255 ptr += MINBPC(enc);
256 break;
257 }
258 }
259 return XML_TOK_PARTIAL;
260 case BT_QUEST:
261 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
262 *nextTokPtr = ptr;
263 return XML_TOK_INVALID;
264 }
265 ptr += MINBPC(enc);
266 if (ptr == end)
267 return XML_TOK_PARTIAL;
268 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
269 *nextTokPtr = ptr + MINBPC(enc);
270 return tok;
271 }
272 /* fall through */
273 default:
274 *nextTokPtr = ptr;
275 return XML_TOK_INVALID;
276 }
277 }
278 return XML_TOK_PARTIAL;
279}
280
281
282static
283int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
284 const char **nextTokPtr)
285{
286 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB };
287 int i;
288 /* CDATA[ */
289 if (end - ptr < 6 * MINBPC(enc))
290 return XML_TOK_PARTIAL;
291 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
292 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
293 *nextTokPtr = ptr;
294 return XML_TOK_INVALID;
295 }
296 }
297 *nextTokPtr = ptr;
298 return XML_TOK_CDATA_SECT_OPEN;
299}
300
301static
302int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
303 const char **nextTokPtr)
304{
305 if (ptr == end)
306 return XML_TOK_NONE;
307 if (MINBPC(enc) > 1) {
308 size_t n = end - ptr;
309 if (n & (MINBPC(enc) - 1)) {
310 n &= ~(MINBPC(enc) - 1);
311 if (n == 0)
312 return XML_TOK_PARTIAL;
313 end = ptr + n;
314 }
315 }
316 switch (BYTE_TYPE(enc, ptr)) {
317 case BT_RSQB:
318 ptr += MINBPC(enc);
319 if (ptr == end)
320 return XML_TOK_PARTIAL;
321 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
322 break;
323 ptr += MINBPC(enc);
324 if (ptr == end)
325 return XML_TOK_PARTIAL;
326 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
327 ptr -= MINBPC(enc);
328 break;
329 }
330 *nextTokPtr = ptr + MINBPC(enc);
331 return XML_TOK_CDATA_SECT_CLOSE;
332 case BT_CR:
333 ptr += MINBPC(enc);
334 if (ptr == end)
335 return XML_TOK_PARTIAL;
336 if (BYTE_TYPE(enc, ptr) == BT_LF)
337 ptr += MINBPC(enc);
338 *nextTokPtr = ptr;
339 return XML_TOK_DATA_NEWLINE;
340 case BT_LF:
341 *nextTokPtr = ptr + MINBPC(enc);
342 return XML_TOK_DATA_NEWLINE;
343 INVALID_CASES(ptr, nextTokPtr)
344 default:
345 ptr += MINBPC(enc);
346 break;
347 }
348 while (ptr != end) {
349 switch (BYTE_TYPE(enc, ptr)) {
350#define LEAD_CASE(n) \
351 case BT_LEAD ## n: \
352 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
353 *nextTokPtr = ptr; \
354 return XML_TOK_DATA_CHARS; \
355 } \
356 ptr += n; \
357 break;
358 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
359#undef LEAD_CASE
360 case BT_NONXML:
361 case BT_MALFORM:
362 case BT_TRAIL:
363 case BT_CR:
364 case BT_LF:
365 case BT_RSQB:
366 *nextTokPtr = ptr;
367 return XML_TOK_DATA_CHARS;
368 default:
369 ptr += MINBPC(enc);
370 break;
371 }
372 }
373 *nextTokPtr = ptr;
374 return XML_TOK_DATA_CHARS;
375}
376
377/* ptr points to character following "</" */
378
379static
380int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
381 const char **nextTokPtr)
382{
383 if (ptr == end)
384 return XML_TOK_PARTIAL;
385 switch (BYTE_TYPE(enc, ptr)) {
386 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
387 default:
388 *nextTokPtr = ptr;
389 return XML_TOK_INVALID;
390 }
391 while (ptr != end) {
392 switch (BYTE_TYPE(enc, ptr)) {
393 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
394 case BT_S: case BT_CR: case BT_LF:
395 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
396 switch (BYTE_TYPE(enc, ptr)) {
397 case BT_S: case BT_CR: case BT_LF:
398 break;
399 case BT_GT:
400 *nextTokPtr = ptr + MINBPC(enc);
401 return XML_TOK_END_TAG;
402 default:
403 *nextTokPtr = ptr;
404 return XML_TOK_INVALID;
405 }
406 }
407 return XML_TOK_PARTIAL;
408#ifdef XML_NS
409 case BT_COLON:
410 /* no need to check qname syntax here, since end-tag must match exactly */
411 ptr += MINBPC(enc);
412 break;
413#endif
414 case BT_GT:
415 *nextTokPtr = ptr + MINBPC(enc);
416 return XML_TOK_END_TAG;
417 default:
418 *nextTokPtr = ptr;
419 return XML_TOK_INVALID;
420 }
421 }
422 return XML_TOK_PARTIAL;
423}
424
425/* ptr points to character following "&#X" */
426
427static
428int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
429 const char **nextTokPtr)
430{
431 if (ptr != end) {
432 switch (BYTE_TYPE(enc, ptr)) {
433 case BT_DIGIT:
434 case BT_HEX:
435 break;
436 default:
437 *nextTokPtr = ptr;
438 return XML_TOK_INVALID;
439 }
440 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
441 switch (BYTE_TYPE(enc, ptr)) {
442 case BT_DIGIT:
443 case BT_HEX:
444 break;
445 case BT_SEMI:
446 *nextTokPtr = ptr + MINBPC(enc);
447 return XML_TOK_CHAR_REF;
448 default:
449 *nextTokPtr = ptr;
450 return XML_TOK_INVALID;
451 }
452 }
453 }
454 return XML_TOK_PARTIAL;
455}
456
457/* ptr points to character following "&#" */
458
459static
460int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
461 const char **nextTokPtr)
462{
463 if (ptr != end) {
464 if (CHAR_MATCHES(enc, ptr, ASCII_x))
465 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
466 switch (BYTE_TYPE(enc, ptr)) {
467 case BT_DIGIT:
468 break;
469 default:
470 *nextTokPtr = ptr;
471 return XML_TOK_INVALID;
472 }
473 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
474 switch (BYTE_TYPE(enc, ptr)) {
475 case BT_DIGIT:
476 break;
477 case BT_SEMI:
478 *nextTokPtr = ptr + MINBPC(enc);
479 return XML_TOK_CHAR_REF;
480 default:
481 *nextTokPtr = ptr;
482 return XML_TOK_INVALID;
483 }
484 }
485 }
486 return XML_TOK_PARTIAL;
487}
488
489/* ptr points to character following "&" */
490
491static
492int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
493 const char **nextTokPtr)
494{
495 if (ptr == end)
496 return XML_TOK_PARTIAL;
497 switch (BYTE_TYPE(enc, ptr)) {
498 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
499 case BT_NUM:
500 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
501 default:
502 *nextTokPtr = ptr;
503 return XML_TOK_INVALID;
504 }
505 while (ptr != end) {
506 switch (BYTE_TYPE(enc, ptr)) {
507 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
508 case BT_SEMI:
509 *nextTokPtr = ptr + MINBPC(enc);
510 return XML_TOK_ENTITY_REF;
511 default:
512 *nextTokPtr = ptr;
513 return XML_TOK_INVALID;
514 }
515 }
516 return XML_TOK_PARTIAL;
517}
518
519/* ptr points to character following first character of attribute name */
520
521static
522int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
523 const char **nextTokPtr)
524{
525#ifdef XML_NS
526 int hadColon = 0;
527#endif
528 while (ptr != end) {
529 switch (BYTE_TYPE(enc, ptr)) {
530 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
531#ifdef XML_NS
532 case BT_COLON:
533 if (hadColon) {
534 *nextTokPtr = ptr;
535 return XML_TOK_INVALID;
536 }
537 hadColon = 1;
538 ptr += MINBPC(enc);
539 if (ptr == end)
540 return XML_TOK_PARTIAL;
541 switch (BYTE_TYPE(enc, ptr)) {
542 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
543 default:
544 *nextTokPtr = ptr;
545 return XML_TOK_INVALID;
546 }
547 break;
548#endif
549 case BT_S: case BT_CR: case BT_LF:
550 for (;;) {
551 int t;
552
553 ptr += MINBPC(enc);
554 if (ptr == end)
555 return XML_TOK_PARTIAL;
556 t = BYTE_TYPE(enc, ptr);
557 if (t == BT_EQUALS)
558 break;
559 switch (t) {
560 case BT_S:
561 case BT_LF:
562 case BT_CR:
563 break;
564 default:
565 *nextTokPtr = ptr;
566 return XML_TOK_INVALID;
567 }
568 }
569 /* fall through */
570 case BT_EQUALS:
571 {
572 int open;
573#ifdef XML_NS
574 hadColon = 0;
575#endif
576 for (;;) {
577
578 ptr += MINBPC(enc);
579 if (ptr == end)
580 return XML_TOK_PARTIAL;
581 open = BYTE_TYPE(enc, ptr);
582 if (open == BT_QUOT || open == BT_APOS)
583 break;
584 switch (open) {
585 case BT_S:
586 case BT_LF:
587 case BT_CR:
588 break;
589 default:
590 *nextTokPtr = ptr;
591 return XML_TOK_INVALID;
592 }
593 }
594 ptr += MINBPC(enc);
595 /* in attribute value */
596 for (;;) {
597 int t;
598 if (ptr == end)
599 return XML_TOK_PARTIAL;
600 t = BYTE_TYPE(enc, ptr);
601 if (t == open)
602 break;
603 switch (t) {
604 INVALID_CASES(ptr, nextTokPtr)
605 case BT_AMP:
606 {
607 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
608 if (tok <= 0) {
609 if (tok == XML_TOK_INVALID)
610 *nextTokPtr = ptr;
611 return tok;
612 }
613 break;
614 }
615 case BT_LT:
616 *nextTokPtr = ptr;
617 return XML_TOK_INVALID;
618 default:
619 ptr += MINBPC(enc);
620 break;
621 }
622 }
623 ptr += MINBPC(enc);
624 if (ptr == end)
625 return XML_TOK_PARTIAL;
626 switch (BYTE_TYPE(enc, ptr)) {
627 case BT_S:
628 case BT_CR:
629 case BT_LF:
630 break;
631 case BT_SOL:
632 goto sol;
633 case BT_GT:
634 goto gt;
635 default:
636 *nextTokPtr = ptr;
637 return XML_TOK_INVALID;
638 }
639 /* ptr points to closing quote */
640 for (;;) {
641 ptr += MINBPC(enc);
642 if (ptr == end)
643 return XML_TOK_PARTIAL;
644 switch (BYTE_TYPE(enc, ptr)) {
645 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
646 case BT_S: case BT_CR: case BT_LF:
647 continue;
648 case BT_GT:
649 gt:
650 *nextTokPtr = ptr + MINBPC(enc);
651 return XML_TOK_START_TAG_WITH_ATTS;
652 case BT_SOL:
653 sol:
654 ptr += MINBPC(enc);
655 if (ptr == end)
656 return XML_TOK_PARTIAL;
657 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
658 *nextTokPtr = ptr;
659 return XML_TOK_INVALID;
660 }
661 *nextTokPtr = ptr + MINBPC(enc);
662 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
663 default:
664 *nextTokPtr = ptr;
665 return XML_TOK_INVALID;
666 }
667 break;
668 }
669 break;
670 }
671 default:
672 *nextTokPtr = ptr;
673 return XML_TOK_INVALID;
674 }
675 }
676 return XML_TOK_PARTIAL;
677}
678
679/* ptr points to character following "<" */
680
681static
682int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
683 const char **nextTokPtr)
684{
685#ifdef XML_NS
686 int hadColon;
687#endif
688 if (ptr == end)
689 return XML_TOK_PARTIAL;
690 switch (BYTE_TYPE(enc, ptr)) {
691 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
692 case BT_EXCL:
693 if ((ptr += MINBPC(enc)) == end)
694 return XML_TOK_PARTIAL;
695 switch (BYTE_TYPE(enc, ptr)) {
696 case BT_MINUS:
697 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
698 case BT_LSQB:
699 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
700 }
701 *nextTokPtr = ptr;
702 return XML_TOK_INVALID;
703 case BT_QUEST:
704 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
705 case BT_SOL:
706 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
707 default:
708 *nextTokPtr = ptr;
709 return XML_TOK_INVALID;
710 }
711#ifdef XML_NS
712 hadColon = 0;
713#endif
714 /* we have a start-tag */
715 while (ptr != end) {
716 switch (BYTE_TYPE(enc, ptr)) {
717 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
718#ifdef XML_NS
719 case BT_COLON:
720 if (hadColon) {
721 *nextTokPtr = ptr;
722 return XML_TOK_INVALID;
723 }
724 hadColon = 1;
725 ptr += MINBPC(enc);
726 if (ptr == end)
727 return XML_TOK_PARTIAL;
728 switch (BYTE_TYPE(enc, ptr)) {
729 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
730 default:
731 *nextTokPtr = ptr;
732 return XML_TOK_INVALID;
733 }
734 break;
735#endif
736 case BT_S: case BT_CR: case BT_LF:
737 {
738 ptr += MINBPC(enc);
739 while (ptr != end) {
740 switch (BYTE_TYPE(enc, ptr)) {
741 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
742 case BT_GT:
743 goto gt;
744 case BT_SOL:
745 goto sol;
746 case BT_S: case BT_CR: case BT_LF:
747 ptr += MINBPC(enc);
748 continue;
749 default:
750 *nextTokPtr = ptr;
751 return XML_TOK_INVALID;
752 }
753 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
754 }
755 return XML_TOK_PARTIAL;
756 }
757 case BT_GT:
758 gt:
759 *nextTokPtr = ptr + MINBPC(enc);
760 return XML_TOK_START_TAG_NO_ATTS;
761 case BT_SOL:
762 sol:
763 ptr += MINBPC(enc);
764 if (ptr == end)
765 return XML_TOK_PARTIAL;
766 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
767 *nextTokPtr = ptr;
768 return XML_TOK_INVALID;
769 }
770 *nextTokPtr = ptr + MINBPC(enc);
771 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
772 default:
773 *nextTokPtr = ptr;
774 return XML_TOK_INVALID;
775 }
776 }
777 return XML_TOK_PARTIAL;
778}
779
780static
781int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
782 const char **nextTokPtr)
783{
784 if (ptr == end)
785 return XML_TOK_NONE;
786 if (MINBPC(enc) > 1) {
787 size_t n = end - ptr;
788 if (n & (MINBPC(enc) - 1)) {
789 n &= ~(MINBPC(enc) - 1);
790 if (n == 0)
791 return XML_TOK_PARTIAL;
792 end = ptr + n;
793 }
794 }
795 switch (BYTE_TYPE(enc, ptr)) {
796 case BT_LT:
797 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
798 case BT_AMP:
799 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
800 case BT_CR:
801 ptr += MINBPC(enc);
802 if (ptr == end)
803 return XML_TOK_TRAILING_CR;
804 if (BYTE_TYPE(enc, ptr) == BT_LF)
805 ptr += MINBPC(enc);
806 *nextTokPtr = ptr;
807 return XML_TOK_DATA_NEWLINE;
808 case BT_LF:
809 *nextTokPtr = ptr + MINBPC(enc);
810 return XML_TOK_DATA_NEWLINE;
811 case BT_RSQB:
812 ptr += MINBPC(enc);
813 if (ptr == end)
814 return XML_TOK_TRAILING_RSQB;
815 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
816 break;
817 ptr += MINBPC(enc);
818 if (ptr == end)
819 return XML_TOK_TRAILING_RSQB;
820 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
821 ptr -= MINBPC(enc);
822 break;
823 }
824 *nextTokPtr = ptr;
825 return XML_TOK_INVALID;
826 INVALID_CASES(ptr, nextTokPtr)
827 default:
828 ptr += MINBPC(enc);
829 break;
830 }
831 while (ptr != end) {
832 switch (BYTE_TYPE(enc, ptr)) {
833#define LEAD_CASE(n) \
834 case BT_LEAD ## n: \
835 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
836 *nextTokPtr = ptr; \
837 return XML_TOK_DATA_CHARS; \
838 } \
839 ptr += n; \
840 break;
841 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
842#undef LEAD_CASE
843 case BT_RSQB:
844 if (ptr + MINBPC(enc) != end) {
845 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
846 ptr += MINBPC(enc);
847 break;
848 }
849 if (ptr + 2*MINBPC(enc) != end) {
850 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
851 ptr += MINBPC(enc);
852 break;
853 }
854 *nextTokPtr = ptr + 2*MINBPC(enc);
855 return XML_TOK_INVALID;
856 }
857 }
858 /* fall through */
859 case BT_AMP:
860 case BT_LT:
861 case BT_NONXML:
862 case BT_MALFORM:
863 case BT_TRAIL:
864 case BT_CR:
865 case BT_LF:
866 *nextTokPtr = ptr;
867 return XML_TOK_DATA_CHARS;
868 default:
869 ptr += MINBPC(enc);
870 break;
871 }
872 }
873 *nextTokPtr = ptr;
874 return XML_TOK_DATA_CHARS;
875}
876
877/* ptr points to character following "%" */
878
879static
880int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
881 const char **nextTokPtr)
882{
883 if (ptr == end)
884 return XML_TOK_PARTIAL;
885 switch (BYTE_TYPE(enc, ptr)) {
886 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888 *nextTokPtr = ptr;
889 return XML_TOK_PERCENT;
890 default:
891 *nextTokPtr = ptr;
892 return XML_TOK_INVALID;
893 }
894 while (ptr != end) {
895 switch (BYTE_TYPE(enc, ptr)) {
896 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897 case BT_SEMI:
898 *nextTokPtr = ptr + MINBPC(enc);
899 return XML_TOK_PARAM_ENTITY_REF;
900 default:
901 *nextTokPtr = ptr;
902 return XML_TOK_INVALID;
903 }
904 }
905 return XML_TOK_PARTIAL;
906}
907
908static
909int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910 const char **nextTokPtr)
911{
912 if (ptr == end)
913 return XML_TOK_PARTIAL;
914 switch (BYTE_TYPE(enc, ptr)) {
915 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
916 default:
917 *nextTokPtr = ptr;
918 return XML_TOK_INVALID;
919 }
920 while (ptr != end) {
921 switch (BYTE_TYPE(enc, ptr)) {
922 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
923 case BT_CR: case BT_LF: case BT_S:
924 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
925 *nextTokPtr = ptr;
926 return XML_TOK_POUND_NAME;
927 default:
928 *nextTokPtr = ptr;
929 return XML_TOK_INVALID;
930 }
931 }
932 return -XML_TOK_POUND_NAME;
933}
934
935static
936int PREFIX(scanLit)(int open, const ENCODING *enc,
937 const char *ptr, const char *end,
938 const char **nextTokPtr)
939{
940 while (ptr != end) {
941 int t = BYTE_TYPE(enc, ptr);
942 switch (t) {
943 INVALID_CASES(ptr, nextTokPtr)
944 case BT_QUOT:
945 case BT_APOS:
946 ptr += MINBPC(enc);
947 if (t != open)
948 break;
949 if (ptr == end)
950 return -XML_TOK_LITERAL;
951 *nextTokPtr = ptr;
952 switch (BYTE_TYPE(enc, ptr)) {
953 case BT_S: case BT_CR: case BT_LF:
954 case BT_GT: case BT_PERCNT: case BT_LSQB:
955 return XML_TOK_LITERAL;
956 default:
957 return XML_TOK_INVALID;
958 }
959 default:
960 ptr += MINBPC(enc);
961 break;
962 }
963 }
964 return XML_TOK_PARTIAL;
965}
966
967static
968int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
969 const char **nextTokPtr)
970{
971 int tok;
972 if (ptr == end)
973 return XML_TOK_NONE;
974 if (MINBPC(enc) > 1) {
975 size_t n = end - ptr;
976 if (n & (MINBPC(enc) - 1)) {
977 n &= ~(MINBPC(enc) - 1);
978 if (n == 0)
979 return XML_TOK_PARTIAL;
980 end = ptr + n;
981 }
982 }
983 switch (BYTE_TYPE(enc, ptr)) {
984 case BT_QUOT:
985 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
986 case BT_APOS:
987 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
988 case BT_LT:
989 {
990 ptr += MINBPC(enc);
991 if (ptr == end)
992 return XML_TOK_PARTIAL;
993 switch (BYTE_TYPE(enc, ptr)) {
994 case BT_EXCL:
995 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996 case BT_QUEST:
997 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
998 case BT_NMSTRT:
999 case BT_HEX:
1000 case BT_NONASCII:
1001 case BT_LEAD2:
1002 case BT_LEAD3:
1003 case BT_LEAD4:
1004 *nextTokPtr = ptr - MINBPC(enc);
1005 return XML_TOK_INSTANCE_START;
1006 }
1007 *nextTokPtr = ptr;
1008 return XML_TOK_INVALID;
1009 }
1010 case BT_CR:
1011 if (ptr + MINBPC(enc) == end)
1012 return -XML_TOK_PROLOG_S;
1013 /* fall through */
1014 case BT_S: case BT_LF:
1015 for (;;) {
1016 ptr += MINBPC(enc);
1017 if (ptr == end)
1018 break;
1019 switch (BYTE_TYPE(enc, ptr)) {
1020 case BT_S: case BT_LF:
1021 break;
1022 case BT_CR:
1023 /* don't split CR/LF pair */
1024 if (ptr + MINBPC(enc) != end)
1025 break;
1026 /* fall through */
1027 default:
1028 *nextTokPtr = ptr;
1029 return XML_TOK_PROLOG_S;
1030 }
1031 }
1032 *nextTokPtr = ptr;
1033 return XML_TOK_PROLOG_S;
1034 case BT_PERCNT:
1035 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1036 case BT_COMMA:
1037 *nextTokPtr = ptr + MINBPC(enc);
1038 return XML_TOK_COMMA;
1039 case BT_LSQB:
1040 *nextTokPtr = ptr + MINBPC(enc);
1041 return XML_TOK_OPEN_BRACKET;
1042 case BT_RSQB:
1043 ptr += MINBPC(enc);
1044 if (ptr == end)
1045 return -XML_TOK_CLOSE_BRACKET;
1046 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1047 if (ptr + MINBPC(enc) == end)
1048 return XML_TOK_PARTIAL;
1049 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050 *nextTokPtr = ptr + 2*MINBPC(enc);
1051 return XML_TOK_COND_SECT_CLOSE;
1052 }
1053 }
1054 *nextTokPtr = ptr;
1055 return XML_TOK_CLOSE_BRACKET;
1056 case BT_LPAR:
1057 *nextTokPtr = ptr + MINBPC(enc);
1058 return XML_TOK_OPEN_PAREN;
1059 case BT_RPAR:
1060 ptr += MINBPC(enc);
1061 if (ptr == end)
1062 return -XML_TOK_CLOSE_PAREN;
1063 switch (BYTE_TYPE(enc, ptr)) {
1064 case BT_AST:
1065 *nextTokPtr = ptr + MINBPC(enc);
1066 return XML_TOK_CLOSE_PAREN_ASTERISK;
1067 case BT_QUEST:
1068 *nextTokPtr = ptr + MINBPC(enc);
1069 return XML_TOK_CLOSE_PAREN_QUESTION;
1070 case BT_PLUS:
1071 *nextTokPtr = ptr + MINBPC(enc);
1072 return XML_TOK_CLOSE_PAREN_PLUS;
1073 case BT_CR: case BT_LF: case BT_S:
1074 case BT_GT: case BT_COMMA: case BT_VERBAR:
1075 case BT_RPAR:
1076 *nextTokPtr = ptr;
1077 return XML_TOK_CLOSE_PAREN;
1078 }
1079 *nextTokPtr = ptr;
1080 return XML_TOK_INVALID;
1081 case BT_VERBAR:
1082 *nextTokPtr = ptr + MINBPC(enc);
1083 return XML_TOK_OR;
1084 case BT_GT:
1085 *nextTokPtr = ptr + MINBPC(enc);
1086 return XML_TOK_DECL_CLOSE;
1087 case BT_NUM:
1088 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089#define LEAD_CASE(n) \
1090 case BT_LEAD ## n: \
1091 if (end - ptr < n) \
1092 return XML_TOK_PARTIAL_CHAR; \
1093 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094 ptr += n; \
1095 tok = XML_TOK_NAME; \
1096 break; \
1097 } \
1098 if (IS_NAME_CHAR(enc, ptr, n)) { \
1099 ptr += n; \
1100 tok = XML_TOK_NMTOKEN; \
1101 break; \
1102 } \
1103 *nextTokPtr = ptr; \
1104 return XML_TOK_INVALID;
1105 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106#undef LEAD_CASE
1107 case BT_NMSTRT:
1108 case BT_HEX:
1109 tok = XML_TOK_NAME;
1110 ptr += MINBPC(enc);
1111 break;
1112 case BT_DIGIT:
1113 case BT_NAME:
1114 case BT_MINUS:
1115#ifdef XML_NS
1116 case BT_COLON:
1117#endif
1118 tok = XML_TOK_NMTOKEN;
1119 ptr += MINBPC(enc);
1120 break;
1121 case BT_NONASCII:
1122 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123 ptr += MINBPC(enc);
1124 tok = XML_TOK_NAME;
1125 break;
1126 }
1127 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128 ptr += MINBPC(enc);
1129 tok = XML_TOK_NMTOKEN;
1130 break;
1131 }
1132 /* fall through */
1133 default:
1134 *nextTokPtr = ptr;
1135 return XML_TOK_INVALID;
1136 }
1137 while (ptr != end) {
1138 switch (BYTE_TYPE(enc, ptr)) {
1139 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140 case BT_GT: case BT_RPAR: case BT_COMMA:
1141 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142 case BT_S: case BT_CR: case BT_LF:
1143 *nextTokPtr = ptr;
1144 return tok;
1145#ifdef XML_NS
1146 case BT_COLON:
1147 ptr += MINBPC(enc);
1148 switch (tok) {
1149 case XML_TOK_NAME:
1150 if (ptr == end)
1151 return XML_TOK_PARTIAL;
1152 tok = XML_TOK_PREFIXED_NAME;
1153 switch (BYTE_TYPE(enc, ptr)) {
1154 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1155 default:
1156 tok = XML_TOK_NMTOKEN;
1157 break;
1158 }
1159 break;
1160 case XML_TOK_PREFIXED_NAME:
1161 tok = XML_TOK_NMTOKEN;
1162 break;
1163 }
1164 break;
1165#endif
1166 case BT_PLUS:
1167 if (tok == XML_TOK_NMTOKEN) {
1168 *nextTokPtr = ptr;
1169 return XML_TOK_INVALID;
1170 }
1171 *nextTokPtr = ptr + MINBPC(enc);
1172 return XML_TOK_NAME_PLUS;
1173 case BT_AST:
1174 if (tok == XML_TOK_NMTOKEN) {
1175 *nextTokPtr = ptr;
1176 return XML_TOK_INVALID;
1177 }
1178 *nextTokPtr = ptr + MINBPC(enc);
1179 return XML_TOK_NAME_ASTERISK;
1180 case BT_QUEST:
1181 if (tok == XML_TOK_NMTOKEN) {
1182 *nextTokPtr = ptr;
1183 return XML_TOK_INVALID;
1184 }
1185 *nextTokPtr = ptr + MINBPC(enc);
1186 return XML_TOK_NAME_QUESTION;
1187 default:
1188 *nextTokPtr = ptr;
1189 return XML_TOK_INVALID;
1190 }
1191 }
1192 return -tok;
1193}
1194
1195static
1196int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1197 const char **nextTokPtr)
1198{
1199 const char *start;
1200 if (ptr == end)
1201 return XML_TOK_NONE;
1202 start = ptr;
1203 while (ptr != end) {
1204 switch (BYTE_TYPE(enc, ptr)) {
1205#define LEAD_CASE(n) \
1206 case BT_LEAD ## n: ptr += n; break;
1207 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1208#undef LEAD_CASE
1209 case BT_AMP:
1210 if (ptr == start)
1211 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1212 *nextTokPtr = ptr;
1213 return XML_TOK_DATA_CHARS;
1214 case BT_LT:
1215 /* this is for inside entity references */
1216 *nextTokPtr = ptr;
1217 return XML_TOK_INVALID;
1218 case BT_LF:
1219 if (ptr == start) {
1220 *nextTokPtr = ptr + MINBPC(enc);
1221 return XML_TOK_DATA_NEWLINE;
1222 }
1223 *nextTokPtr = ptr;
1224 return XML_TOK_DATA_CHARS;
1225 case BT_CR:
1226 if (ptr == start) {
1227 ptr += MINBPC(enc);
1228 if (ptr == end)
1229 return XML_TOK_TRAILING_CR;
1230 if (BYTE_TYPE(enc, ptr) == BT_LF)
1231 ptr += MINBPC(enc);
1232 *nextTokPtr = ptr;
1233 return XML_TOK_DATA_NEWLINE;
1234 }
1235 *nextTokPtr = ptr;
1236 return XML_TOK_DATA_CHARS;
1237 case BT_S:
1238 if (ptr == start) {
1239 *nextTokPtr = ptr + MINBPC(enc);
1240 return XML_TOK_ATTRIBUTE_VALUE_S;
1241 }
1242 *nextTokPtr = ptr;
1243 return XML_TOK_DATA_CHARS;
1244 default:
1245 ptr += MINBPC(enc);
1246 break;
1247 }
1248 }
1249 *nextTokPtr = ptr;
1250 return XML_TOK_DATA_CHARS;
1251}
1252
1253static
1254int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1255 const char **nextTokPtr)
1256{
1257 const char *start;
1258 if (ptr == end)
1259 return XML_TOK_NONE;
1260 start = ptr;
1261 while (ptr != end) {
1262 switch (BYTE_TYPE(enc, ptr)) {
1263#define LEAD_CASE(n) \
1264 case BT_LEAD ## n: ptr += n; break;
1265 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1266#undef LEAD_CASE
1267 case BT_AMP:
1268 if (ptr == start)
1269 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1270 *nextTokPtr = ptr;
1271 return XML_TOK_DATA_CHARS;
1272 case BT_PERCNT:
1273 if (ptr == start) {
1274 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1275 end, nextTokPtr);
1276 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1277 }
1278 *nextTokPtr = ptr;
1279 return XML_TOK_DATA_CHARS;
1280 case BT_LF:
1281 if (ptr == start) {
1282 *nextTokPtr = ptr + MINBPC(enc);
1283 return XML_TOK_DATA_NEWLINE;
1284 }
1285 *nextTokPtr = ptr;
1286 return XML_TOK_DATA_CHARS;
1287 case BT_CR:
1288 if (ptr == start) {
1289 ptr += MINBPC(enc);
1290 if (ptr == end)
1291 return XML_TOK_TRAILING_CR;
1292 if (BYTE_TYPE(enc, ptr) == BT_LF)
1293 ptr += MINBPC(enc);
1294 *nextTokPtr = ptr;
1295 return XML_TOK_DATA_NEWLINE;
1296 }
1297 *nextTokPtr = ptr;
1298 return XML_TOK_DATA_CHARS;
1299 default:
1300 ptr += MINBPC(enc);
1301 break;
1302 }
1303 }
1304 *nextTokPtr = ptr;
1305 return XML_TOK_DATA_CHARS;
1306}
1307
#ifdef XML_DTD

/* Skips the body of an ignored conditional section, tracking nested
   "<![" ... "]]>" pairs with a depth counter.

   Returns:
     XML_TOK_IGNORE_SECT   a "]]>" was found at nesting depth 0;
                           *nextTokPtr is just past it.
     XML_TOK_INVALID       a character was rejected by INVALID_CASES;
                           *nextTokPtr is that character.
     XML_TOK_PARTIAL_CHAR  a multi-byte sequence is cut off by end.
     XML_TOK_PARTIAL       end was reached first. */
static
int PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                             const char **nextTokPtr)
{
  int level = 0;
  if (MINBPC(enc) > 1) {
    /* ignore trailing bytes that do not make up a whole character */
    size_t n = (size_t)(end - ptr);
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      end = ptr + n;
    }
  }
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    INVALID_CASES(ptr, nextTokPtr)
    case BT_LT:
      if ((ptr += MINBPC(enc)) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
        if ((ptr += MINBPC(enc)) == end)
          return XML_TOK_PARTIAL;
        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
          /* nested "<![" */
          ++level;
          ptr += MINBPC(enc);
        }
      }
      break;
    case BT_RSQB:
      if ((ptr += MINBPC(enc)) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
        if ((ptr += MINBPC(enc)) == end)
          return XML_TOK_PARTIAL;
        if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
          ptr += MINBPC(enc);
          if (level == 0) {
            *nextTokPtr = ptr;
            return XML_TOK_IGNORE_SECT;
          }
          --level;
        }
      }
      break;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  return XML_TOK_PARTIAL;
}

#endif /* XML_DTD */

1363static
1364int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1365 const char **badPtr)
1366{
1367 ptr += MINBPC(enc);
1368 end -= MINBPC(enc);
1369 for (; ptr != end; ptr += MINBPC(enc)) {
1370 switch (BYTE_TYPE(enc, ptr)) {
1371 case BT_DIGIT:
1372 case BT_HEX:
1373 case BT_MINUS:
1374 case BT_APOS:
1375 case BT_LPAR:
1376 case BT_RPAR:
1377 case BT_PLUS:
1378 case BT_COMMA:
1379 case BT_SOL:
1380 case BT_EQUALS:
1381 case BT_QUEST:
1382 case BT_CR:
1383 case BT_LF:
1384 case BT_SEMI:
1385 case BT_EXCL:
1386 case BT_AST:
1387 case BT_PERCNT:
1388 case BT_NUM:
1389#ifdef XML_NS
1390 case BT_COLON:
1391#endif
1392 break;
1393 case BT_S:
1394 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1395 *badPtr = ptr;
1396 return 0;
1397 }
1398 break;
1399 case BT_NAME:
1400 case BT_NMSTRT:
1401 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1402 break;
1403 default:
1404 switch (BYTE_TO_ASCII(enc, ptr)) {
1405 case 0x24: /* $ */
1406 case 0x40: /* @ */
1407 break;
1408 default:
1409 *badPtr = ptr;
1410 return 0;
1411 }
1412 break;
1413 }
1414 }
1415 return 1;
1416}
1417
1418/* This must only be called for a well-formed start-tag or empty element tag.
1419Returns the number of attributes. Pointers to the first attsMax attributes
1420are stored in atts. */
1421
1422static
1423int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1424 int attsMax, ATTRIBUTE *atts)
1425{
1426 enum { other, inName, inValue } state = inName;
1427 int nAtts = 0;
1428 int open = 0; /* defined when state == inValue;
1429 initialization just to shut up compilers */
1430
1431 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1432 switch (BYTE_TYPE(enc, ptr)) {
1433#define START_NAME \
1434 if (state == other) { \
1435 if (nAtts < attsMax) { \
1436 atts[nAtts].name = ptr; \
1437 atts[nAtts].normalized = 1; \
1438 } \
1439 state = inName; \
1440 }
1441#define LEAD_CASE(n) \
1442 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1443 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1444#undef LEAD_CASE
1445 case BT_NONASCII:
1446 case BT_NMSTRT:
1447 case BT_HEX:
1448 START_NAME
1449 break;
1450#undef START_NAME
1451 case BT_QUOT:
1452 if (state != inValue) {
1453 if (nAtts < attsMax)
1454 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1455 state = inValue;
1456 open = BT_QUOT;
1457 }
1458 else if (open == BT_QUOT) {
1459 state = other;
1460 if (nAtts < attsMax)
1461 atts[nAtts].valueEnd = ptr;
1462 nAtts++;
1463 }
1464 break;
1465 case BT_APOS:
1466 if (state != inValue) {
1467 if (nAtts < attsMax)
1468 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1469 state = inValue;
1470 open = BT_APOS;
1471 }
1472 else if (open == BT_APOS) {
1473 state = other;
1474 if (nAtts < attsMax)
1475 atts[nAtts].valueEnd = ptr;
1476 nAtts++;
1477 }
1478 break;
1479 case BT_AMP:
1480 if (nAtts < attsMax)
1481 atts[nAtts].normalized = 0;
1482 break;
1483 case BT_S:
1484 if (state == inName)
1485 state = other;
1486 else if (state == inValue
1487 && nAtts < attsMax
1488 && atts[nAtts].normalized
1489 && (ptr == atts[nAtts].valuePtr
1490 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1491 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1492 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1493 atts[nAtts].normalized = 0;
1494 break;
1495 case BT_CR: case BT_LF:
1496 /* This case ensures that the first attribute name is counted
1497 Apart from that we could just change state on the quote. */
1498 if (state == inName)
1499 state = other;
1500 else if (state == inValue && nAtts < attsMax)
1501 atts[nAtts].normalized = 0;
1502 break;
1503 case BT_GT:
1504 case BT_SOL:
1505 if (state != inValue)
1506 return nAtts;
1507 break;
1508 default:
1509 break;
1510 }
1511 }
1512 /* not reached */
1513}
1514
1515static
1516int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1517{
1518 int result = 0;
1519 /* skip &# */
1520 ptr += 2*MINBPC(enc);
1521 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1522 for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1523 int c = BYTE_TO_ASCII(enc, ptr);
1524 switch (c) {
1525 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1526 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1527 result <<= 4;
1528 result |= (c - ASCII_0);
1529 break;
1530 case ASCII_A: case ASCII_B: case ASCII_C: case ASCII_D: case ASCII_E: case ASCII_F:
1531 result <<= 4;
1532 result += 10 + (c - ASCII_A);
1533 break;
1534 case ASCII_a: case ASCII_b: case ASCII_c: case ASCII_d: case ASCII_e: case ASCII_f:
1535 result <<= 4;
1536 result += 10 + (c - ASCII_a);
1537 break;
1538 }
1539 if (result >= 0x110000)
1540 return -1;
1541 }
1542 }
1543 else {
1544 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1545 int c = BYTE_TO_ASCII(enc, ptr);
1546 result *= 10;
1547 result += (c - ASCII_0);
1548 if (result >= 0x110000)
1549 return -1;
1550 }
1551 }
1552 return checkCharRefNumber(result);
1553}
1554
1555static
1556int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
1557{
1558 switch ((end - ptr)/MINBPC(enc)) {
1559 case 2:
1560 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1561 switch (BYTE_TO_ASCII(enc, ptr)) {
1562 case ASCII_l:
1563 return ASCII_LT;
1564 case ASCII_g:
1565 return ASCII_GT;
1566 }
1567 }
1568 break;
1569 case 3:
1570 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1571 ptr += MINBPC(enc);
1572 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1573 ptr += MINBPC(enc);
1574 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1575 return ASCII_AMP;
1576 }
1577 }
1578 break;
1579 case 4:
1580 switch (BYTE_TO_ASCII(enc, ptr)) {
1581 case ASCII_q:
1582 ptr += MINBPC(enc);
1583 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1584 ptr += MINBPC(enc);
1585 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1586 ptr += MINBPC(enc);
1587 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1588 return ASCII_QUOT;
1589 }
1590 }
1591 break;
1592 case ASCII_a:
1593 ptr += MINBPC(enc);
1594 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1595 ptr += MINBPC(enc);
1596 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1597 ptr += MINBPC(enc);
1598 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1599 return ASCII_APOS;
1600 }
1601 }
1602 break;
1603 }
1604 }
1605 return 0;
1606}
1607
1608static
1609int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1610{
1611 for (;;) {
1612 switch (BYTE_TYPE(enc, ptr1)) {
1613#define LEAD_CASE(n) \
1614 case BT_LEAD ## n: \
1615 if (*ptr1++ != *ptr2++) \
1616 return 0;
1617 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1618#undef LEAD_CASE
1619 /* fall through */
1620 if (*ptr1++ != *ptr2++)
1621 return 0;
1622 break;
1623 case BT_NONASCII:
1624 case BT_NMSTRT:
1625#ifdef XML_NS
1626 case BT_COLON:
1627#endif
1628 case BT_HEX:
1629 case BT_DIGIT:
1630 case BT_NAME:
1631 case BT_MINUS:
1632 if (*ptr2++ != *ptr1++)
1633 return 0;
1634 if (MINBPC(enc) > 1) {
1635 if (*ptr2++ != *ptr1++)
1636 return 0;
1637 if (MINBPC(enc) > 2) {
1638 if (*ptr2++ != *ptr1++)
1639 return 0;
1640 if (MINBPC(enc) > 3) {
1641 if (*ptr2++ != *ptr1++)
1642 return 0;
1643 }
1644 }
1645 }
1646 break;
1647 default:
1648 if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1649 return 1;
1650 switch (BYTE_TYPE(enc, ptr2)) {
1651 case BT_LEAD2:
1652 case BT_LEAD3:
1653 case BT_LEAD4:
1654 case BT_NONASCII:
1655 case BT_NMSTRT:
1656#ifdef XML_NS
1657 case BT_COLON:
1658#endif
1659 case BT_HEX:
1660 case BT_DIGIT:
1661 case BT_NAME:
1662 case BT_MINUS:
1663 return 0;
1664 default:
1665 return 1;
1666 }
1667 }
1668 }
1669 /* not reached */
1670}
1671
1672static
1673int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1674 const char *end1, const char *ptr2)
1675{
1676 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1677 if (ptr1 == end1)
1678 return 0;
1679 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1680 return 0;
1681 }
1682 return ptr1 == end1;
1683}
1684
1685static
1686int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1687{
1688 const char *start = ptr;
1689 for (;;) {
1690 switch (BYTE_TYPE(enc, ptr)) {
1691#define LEAD_CASE(n) \
1692 case BT_LEAD ## n: ptr += n; break;
1693 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1694#undef LEAD_CASE
1695 case BT_NONASCII:
1696 case BT_NMSTRT:
1697#ifdef XML_NS
1698 case BT_COLON:
1699#endif
1700 case BT_HEX:
1701 case BT_DIGIT:
1702 case BT_NAME:
1703 case BT_MINUS:
1704 ptr += MINBPC(enc);
1705 break;
1706 default:
1707 return ptr - start;
1708 }
1709 }
1710}
1711
1712static
1713const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1714{
1715 for (;;) {
1716 switch (BYTE_TYPE(enc, ptr)) {
1717 case BT_LF:
1718 case BT_CR:
1719 case BT_S:
1720 ptr += MINBPC(enc);
1721 break;
1722 default:
1723 return ptr;
1724 }
1725 }
1726}
1727
1728static
1729void PREFIX(updatePosition)(const ENCODING *enc,
1730 const char *ptr,
1731 const char *end,
1732 POSITION *pos)
1733{
1734 while (ptr != end) {
1735 switch (BYTE_TYPE(enc, ptr)) {
1736#define LEAD_CASE(n) \
1737 case BT_LEAD ## n: \
1738 ptr += n; \
1739 break;
1740 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1741#undef LEAD_CASE
1742 case BT_LF:
1743 pos->columnNumber = (unsigned)-1;
1744 pos->lineNumber++;
1745 ptr += MINBPC(enc);
1746 break;
1747 case BT_CR:
1748 pos->lineNumber++;
1749 ptr += MINBPC(enc);
1750 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1751 ptr += MINBPC(enc);
1752 pos->columnNumber = (unsigned)-1;
1753 break;
1754 default:
1755 ptr += MINBPC(enc);
1756 break;
1757 }
1758 pos->columnNumber++;
1759 }
1760}
1761
1762#undef DO_LEAD_CASE
1763#undef MULTIBYTE_CASES
1764#undef INVALID_CASES
1765#undef CHECK_NAME_CASE
1766#undef CHECK_NAME_CASES
1767#undef CHECK_NMSTRT_CASE
1768#undef CHECK_NMSTRT_CASES