/* This file is included!
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000-2017 Expat development team
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

Gregory P. Smith7c6309c2012-07-14 14:12:35 -070033#ifdef XML_TOK_IMPL_C
34
/* Default used when IS_INVALID_CHAR has not already been defined:
   no n-byte sequence is reported as invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Expands to the `case BT_LEAD<n>:` arm of a byte-type switch for an
   n-byte character.  Uses `enc` and `end` from the expansion site.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - IS_INVALID_CHAR rejects the sequence: stores ptr in *nextTokPtr and
     returns XML_TOK_INVALID;
   - otherwise advances ptr by n and leaves the switch. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Expands to the switch arms shared by scanners that only need to reject
   bad input: the 2-, 3- and 4-byte lead cases (see INVALID_LEAD_CASE) plus
   BT_NONXML, BT_MALFORM and BT_TRAIL, which store ptr in *nextTokPtr and
   return XML_TOK_INVALID. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;

/* Expands to the `case BT_LEAD<n>:` arm that accepts an n-byte name
   character.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - the sequence is rejected by IS_INVALID_CHAR or is not a name character:
     stores ptr in *nextTokPtr and returns XML_TOK_INVALID;
   - otherwise advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test comes first so that a malformed multi-byte
   sequence is never handed to the name-character lookup. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Expands to every switch arm that consumes one name character:
   - BT_NONASCII must satisfy IS_NAME_CHAR_MINBPC, otherwise ptr is stored in
     *nextTokPtr and XML_TOK_INVALID is returned;
   - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS advance ptr by
     MINBPC(enc);
   - 2-, 3- and 4-byte lead bytes are handled by CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)

/* Expands to the `case BT_LEAD<n>:` arm that accepts an n-byte
   name-start character.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - the sequence is rejected by IS_INVALID_CHAR or is not a name-start
     character: stores ptr in *nextTokPtr and returns XML_TOK_INVALID;
   - otherwise advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test comes first so that a malformed multi-byte
   sequence is never handed to the name-start lookup. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Expands to every switch arm that consumes one name-start character:
   - BT_NONASCII must satisfy IS_NMSTRT_CHAR_MINBPC, otherwise ptr is stored
     in *nextTokPtr and XML_TOK_INVALID is returned;
   - BT_NMSTRT and BT_HEX advance ptr by MINBPC(enc);
   - 2-, 3- and 4-byte lead bytes are handled by CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)

/* Default used when PREFIX has not already been defined: identifiers are
   left unchanged. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif


/* True when at least `count` characters of MINBPC(enc) bytes each lie
   between ptr and end.  Every argument is parenthesized so that expressions
   such as `ptr + 1` or `1 + 3` are evaluated as a unit. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one character lies between ptr and end. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless at least
   `count` characters are available. */
#define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Makes the enclosing function return XML_TOK_PARTIAL unless at least one
   character is available. */
#define REQUIRE_CHAR(enc, ptr, end) \
    REQUIRE_CHARS(enc, ptr, end, 1)


Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000137/* ptr points to character following "<!-" */
138
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000139static int PTRCALL
140PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
141 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000142{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200143 if (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000144 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
145 *nextTokPtr = ptr;
146 return XML_TOK_INVALID;
147 }
148 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200149 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000150 switch (BYTE_TYPE(enc, ptr)) {
151 INVALID_CASES(ptr, nextTokPtr)
152 case BT_MINUS:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200153 ptr += MINBPC(enc);
154 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000155 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200156 ptr += MINBPC(enc);
157 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000158 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
159 *nextTokPtr = ptr;
160 return XML_TOK_INVALID;
161 }
162 *nextTokPtr = ptr + MINBPC(enc);
163 return XML_TOK_COMMENT;
164 }
165 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000166 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000167 ptr += MINBPC(enc);
168 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000169 }
170 }
171 }
172 return XML_TOK_PARTIAL;
173}
174
175/* ptr points to character following "<!" */
176
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000177static int PTRCALL
178PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
179 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000180{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200181 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000182 switch (BYTE_TYPE(enc, ptr)) {
183 case BT_MINUS:
184 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
185 case BT_LSQB:
186 *nextTokPtr = ptr + MINBPC(enc);
187 return XML_TOK_COND_SECT_OPEN;
188 case BT_NMSTRT:
189 case BT_HEX:
190 ptr += MINBPC(enc);
191 break;
192 default:
193 *nextTokPtr = ptr;
194 return XML_TOK_INVALID;
195 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200196 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000197 switch (BYTE_TYPE(enc, ptr)) {
198 case BT_PERCNT:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200199 REQUIRE_CHARS(enc, ptr, end, 2);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000200 /* don't allow <!ENTITY% foo "whatever"> */
201 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
202 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000203 *nextTokPtr = ptr;
204 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000205 }
206 /* fall through */
207 case BT_S: case BT_CR: case BT_LF:
208 *nextTokPtr = ptr;
209 return XML_TOK_DECL_OPEN;
210 case BT_NMSTRT:
211 case BT_HEX:
212 ptr += MINBPC(enc);
213 break;
214 default:
215 *nextTokPtr = ptr;
216 return XML_TOK_INVALID;
217 }
218 }
219 return XML_TOK_PARTIAL;
220}
221
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000222static int PTRCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200223PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000224 const char *end, int *tokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000225{
226 int upper = 0;
227 *tokPtr = XML_TOK_PI;
228 if (end - ptr != MINBPC(enc)*3)
229 return 1;
230 switch (BYTE_TO_ASCII(enc, ptr)) {
231 case ASCII_x:
232 break;
233 case ASCII_X:
234 upper = 1;
235 break;
236 default:
237 return 1;
238 }
239 ptr += MINBPC(enc);
240 switch (BYTE_TO_ASCII(enc, ptr)) {
241 case ASCII_m:
242 break;
243 case ASCII_M:
244 upper = 1;
245 break;
246 default:
247 return 1;
248 }
249 ptr += MINBPC(enc);
250 switch (BYTE_TO_ASCII(enc, ptr)) {
251 case ASCII_l:
252 break;
253 case ASCII_L:
254 upper = 1;
255 break;
256 default:
257 return 1;
258 }
259 if (upper)
260 return 0;
261 *tokPtr = XML_TOK_XML_DECL;
262 return 1;
263}
264
265/* ptr points to character following "<?" */
266
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000267static int PTRCALL
268PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
269 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000270{
271 int tok;
272 const char *target = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200273 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000274 switch (BYTE_TYPE(enc, ptr)) {
275 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
276 default:
277 *nextTokPtr = ptr;
278 return XML_TOK_INVALID;
279 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200280 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000281 switch (BYTE_TYPE(enc, ptr)) {
282 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
283 case BT_S: case BT_CR: case BT_LF:
284 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000285 *nextTokPtr = ptr;
286 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000287 }
288 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200289 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000290 switch (BYTE_TYPE(enc, ptr)) {
291 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000292 case BT_QUEST:
293 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200294 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000295 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
296 *nextTokPtr = ptr + MINBPC(enc);
297 return tok;
298 }
299 break;
300 default:
301 ptr += MINBPC(enc);
302 break;
303 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000304 }
305 return XML_TOK_PARTIAL;
306 case BT_QUEST:
307 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000308 *nextTokPtr = ptr;
309 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000310 }
311 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200312 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000313 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000314 *nextTokPtr = ptr + MINBPC(enc);
315 return tok;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000316 }
317 /* fall through */
318 default:
319 *nextTokPtr = ptr;
320 return XML_TOK_INVALID;
321 }
322 }
323 return XML_TOK_PARTIAL;
324}
325
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000326static int PTRCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200327PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000328 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000329{
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000330 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
331 ASCII_T, ASCII_A, ASCII_LSQB };
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000332 int i;
333 /* CDATA[ */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200334 REQUIRE_CHARS(enc, ptr, end, 6);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000335 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
336 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
337 *nextTokPtr = ptr;
338 return XML_TOK_INVALID;
339 }
340 }
341 *nextTokPtr = ptr;
342 return XML_TOK_CDATA_SECT_OPEN;
343}
344
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000345static int PTRCALL
346PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
347 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000348{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200349 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000350 return XML_TOK_NONE;
351 if (MINBPC(enc) > 1) {
352 size_t n = end - ptr;
353 if (n & (MINBPC(enc) - 1)) {
354 n &= ~(MINBPC(enc) - 1);
355 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000356 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000357 end = ptr + n;
358 }
359 }
360 switch (BYTE_TYPE(enc, ptr)) {
361 case BT_RSQB:
362 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200363 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000364 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
365 break;
366 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200367 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000368 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
369 ptr -= MINBPC(enc);
370 break;
371 }
372 *nextTokPtr = ptr + MINBPC(enc);
373 return XML_TOK_CDATA_SECT_CLOSE;
374 case BT_CR:
375 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200376 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000377 if (BYTE_TYPE(enc, ptr) == BT_LF)
378 ptr += MINBPC(enc);
379 *nextTokPtr = ptr;
380 return XML_TOK_DATA_NEWLINE;
381 case BT_LF:
382 *nextTokPtr = ptr + MINBPC(enc);
383 return XML_TOK_DATA_NEWLINE;
384 INVALID_CASES(ptr, nextTokPtr)
385 default:
386 ptr += MINBPC(enc);
387 break;
388 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200389 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000390 switch (BYTE_TYPE(enc, ptr)) {
391#define LEAD_CASE(n) \
392 case BT_LEAD ## n: \
393 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000394 *nextTokPtr = ptr; \
395 return XML_TOK_DATA_CHARS; \
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000396 } \
397 ptr += n; \
398 break;
399 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
400#undef LEAD_CASE
401 case BT_NONXML:
402 case BT_MALFORM:
403 case BT_TRAIL:
404 case BT_CR:
405 case BT_LF:
406 case BT_RSQB:
407 *nextTokPtr = ptr;
408 return XML_TOK_DATA_CHARS;
409 default:
410 ptr += MINBPC(enc);
411 break;
412 }
413 }
414 *nextTokPtr = ptr;
415 return XML_TOK_DATA_CHARS;
416}
417
418/* ptr points to character following "</" */
419
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000420static int PTRCALL
421PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
422 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000423{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200424 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000425 switch (BYTE_TYPE(enc, ptr)) {
426 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
427 default:
428 *nextTokPtr = ptr;
429 return XML_TOK_INVALID;
430 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200431 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000432 switch (BYTE_TYPE(enc, ptr)) {
433 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
434 case BT_S: case BT_CR: case BT_LF:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200435 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000436 switch (BYTE_TYPE(enc, ptr)) {
437 case BT_S: case BT_CR: case BT_LF:
438 break;
439 case BT_GT:
440 *nextTokPtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000441 return XML_TOK_END_TAG;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000442 default:
443 *nextTokPtr = ptr;
444 return XML_TOK_INVALID;
445 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000446 }
447 return XML_TOK_PARTIAL;
448#ifdef XML_NS
449 case BT_COLON:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000450 /* no need to check qname syntax here,
451 since end-tag must match exactly */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000452 ptr += MINBPC(enc);
453 break;
454#endif
455 case BT_GT:
456 *nextTokPtr = ptr + MINBPC(enc);
457 return XML_TOK_END_TAG;
458 default:
459 *nextTokPtr = ptr;
460 return XML_TOK_INVALID;
461 }
462 }
463 return XML_TOK_PARTIAL;
464}
465
466/* ptr points to character following "&#X" */
467
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000468static int PTRCALL
469PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
470 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000471{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200472 if (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000473 switch (BYTE_TYPE(enc, ptr)) {
474 case BT_DIGIT:
475 case BT_HEX:
476 break;
477 default:
478 *nextTokPtr = ptr;
479 return XML_TOK_INVALID;
480 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200481 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000482 switch (BYTE_TYPE(enc, ptr)) {
483 case BT_DIGIT:
484 case BT_HEX:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000485 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000486 case BT_SEMI:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000487 *nextTokPtr = ptr + MINBPC(enc);
488 return XML_TOK_CHAR_REF;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000489 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000490 *nextTokPtr = ptr;
491 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000492 }
493 }
494 }
495 return XML_TOK_PARTIAL;
496}
497
498/* ptr points to character following "&#" */
499
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000500static int PTRCALL
501PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
502 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000503{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200504 if (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000505 if (CHAR_MATCHES(enc, ptr, ASCII_x))
506 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
507 switch (BYTE_TYPE(enc, ptr)) {
508 case BT_DIGIT:
509 break;
510 default:
511 *nextTokPtr = ptr;
512 return XML_TOK_INVALID;
513 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200514 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000515 switch (BYTE_TYPE(enc, ptr)) {
516 case BT_DIGIT:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000517 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000518 case BT_SEMI:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000519 *nextTokPtr = ptr + MINBPC(enc);
520 return XML_TOK_CHAR_REF;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000521 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000522 *nextTokPtr = ptr;
523 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000524 }
525 }
526 }
527 return XML_TOK_PARTIAL;
528}
529
530/* ptr points to character following "&" */
531
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000532static int PTRCALL
533PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
534 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000535{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200536 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000537 switch (BYTE_TYPE(enc, ptr)) {
538 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
539 case BT_NUM:
540 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
541 default:
542 *nextTokPtr = ptr;
543 return XML_TOK_INVALID;
544 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200545 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000546 switch (BYTE_TYPE(enc, ptr)) {
547 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
548 case BT_SEMI:
549 *nextTokPtr = ptr + MINBPC(enc);
550 return XML_TOK_ENTITY_REF;
551 default:
552 *nextTokPtr = ptr;
553 return XML_TOK_INVALID;
554 }
555 }
556 return XML_TOK_PARTIAL;
557}
558
559/* ptr points to character following first character of attribute name */
560
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000561static int PTRCALL
562PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
563 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000564{
565#ifdef XML_NS
566 int hadColon = 0;
567#endif
Victor Stinner23ec4b52017-06-15 00:54:36 +0200568 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000569 switch (BYTE_TYPE(enc, ptr)) {
570 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
571#ifdef XML_NS
572 case BT_COLON:
573 if (hadColon) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000574 *nextTokPtr = ptr;
575 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000576 }
577 hadColon = 1;
578 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200579 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000580 switch (BYTE_TYPE(enc, ptr)) {
581 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
582 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000583 *nextTokPtr = ptr;
584 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000585 }
586 break;
587#endif
588 case BT_S: case BT_CR: case BT_LF:
589 for (;;) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000590 int t;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000591
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000592 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200593 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000594 t = BYTE_TYPE(enc, ptr);
595 if (t == BT_EQUALS)
596 break;
597 switch (t) {
598 case BT_S:
599 case BT_LF:
600 case BT_CR:
601 break;
602 default:
603 *nextTokPtr = ptr;
604 return XML_TOK_INVALID;
605 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000606 }
Benjamin Peterson5033aa72018-09-10 21:04:00 -0700607 /* fall through */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000608 case BT_EQUALS:
609 {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000610 int open;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000611#ifdef XML_NS
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000612 hadColon = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000613#endif
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000614 for (;;) {
615 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200616 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000617 open = BYTE_TYPE(enc, ptr);
618 if (open == BT_QUOT || open == BT_APOS)
619 break;
620 switch (open) {
621 case BT_S:
622 case BT_LF:
623 case BT_CR:
624 break;
625 default:
626 *nextTokPtr = ptr;
627 return XML_TOK_INVALID;
628 }
629 }
630 ptr += MINBPC(enc);
631 /* in attribute value */
632 for (;;) {
633 int t;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200634 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000635 t = BYTE_TYPE(enc, ptr);
636 if (t == open)
637 break;
638 switch (t) {
639 INVALID_CASES(ptr, nextTokPtr)
640 case BT_AMP:
641 {
642 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
643 if (tok <= 0) {
644 if (tok == XML_TOK_INVALID)
645 *nextTokPtr = ptr;
646 return tok;
647 }
648 break;
649 }
650 case BT_LT:
651 *nextTokPtr = ptr;
652 return XML_TOK_INVALID;
653 default:
654 ptr += MINBPC(enc);
655 break;
656 }
657 }
658 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200659 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000660 switch (BYTE_TYPE(enc, ptr)) {
661 case BT_S:
662 case BT_CR:
663 case BT_LF:
664 break;
665 case BT_SOL:
666 goto sol;
667 case BT_GT:
668 goto gt;
669 default:
670 *nextTokPtr = ptr;
671 return XML_TOK_INVALID;
672 }
673 /* ptr points to closing quote */
674 for (;;) {
675 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200676 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000677 switch (BYTE_TYPE(enc, ptr)) {
678 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
679 case BT_S: case BT_CR: case BT_LF:
680 continue;
681 case BT_GT:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000682 gt:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000683 *nextTokPtr = ptr + MINBPC(enc);
684 return XML_TOK_START_TAG_WITH_ATTS;
685 case BT_SOL:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000686 sol:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000687 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200688 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000689 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
690 *nextTokPtr = ptr;
691 return XML_TOK_INVALID;
692 }
693 *nextTokPtr = ptr + MINBPC(enc);
694 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
695 default:
696 *nextTokPtr = ptr;
697 return XML_TOK_INVALID;
698 }
699 break;
700 }
701 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000702 }
703 default:
704 *nextTokPtr = ptr;
705 return XML_TOK_INVALID;
706 }
707 }
708 return XML_TOK_PARTIAL;
709}
710
711/* ptr points to character following "<" */
712
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000713static int PTRCALL
714PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
715 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000716{
717#ifdef XML_NS
718 int hadColon;
719#endif
Victor Stinner23ec4b52017-06-15 00:54:36 +0200720 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000721 switch (BYTE_TYPE(enc, ptr)) {
722 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
723 case BT_EXCL:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200724 ptr += MINBPC(enc);
725 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000726 switch (BYTE_TYPE(enc, ptr)) {
727 case BT_MINUS:
728 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
729 case BT_LSQB:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000730 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
731 end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000732 }
733 *nextTokPtr = ptr;
734 return XML_TOK_INVALID;
735 case BT_QUEST:
736 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
737 case BT_SOL:
738 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
739 default:
740 *nextTokPtr = ptr;
741 return XML_TOK_INVALID;
742 }
743#ifdef XML_NS
744 hadColon = 0;
745#endif
746 /* we have a start-tag */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200747 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000748 switch (BYTE_TYPE(enc, ptr)) {
749 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
750#ifdef XML_NS
751 case BT_COLON:
752 if (hadColon) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000753 *nextTokPtr = ptr;
754 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000755 }
756 hadColon = 1;
757 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200758 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000759 switch (BYTE_TYPE(enc, ptr)) {
760 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
761 default:
762 *nextTokPtr = ptr;
763 return XML_TOK_INVALID;
764 }
765 break;
766#endif
767 case BT_S: case BT_CR: case BT_LF:
768 {
769 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200770 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000771 switch (BYTE_TYPE(enc, ptr)) {
772 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
773 case BT_GT:
774 goto gt;
775 case BT_SOL:
776 goto sol;
777 case BT_S: case BT_CR: case BT_LF:
778 ptr += MINBPC(enc);
779 continue;
780 default:
781 *nextTokPtr = ptr;
782 return XML_TOK_INVALID;
783 }
784 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
785 }
786 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000787 }
788 case BT_GT:
789 gt:
790 *nextTokPtr = ptr + MINBPC(enc);
791 return XML_TOK_START_TAG_NO_ATTS;
792 case BT_SOL:
793 sol:
794 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200795 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000796 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000797 *nextTokPtr = ptr;
798 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000799 }
800 *nextTokPtr = ptr + MINBPC(enc);
801 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
802 default:
803 *nextTokPtr = ptr;
804 return XML_TOK_INVALID;
805 }
806 }
807 return XML_TOK_PARTIAL;
808}
809
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000810static int PTRCALL
811PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
812 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000813{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200814 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000815 return XML_TOK_NONE;
816 if (MINBPC(enc) > 1) {
817 size_t n = end - ptr;
818 if (n & (MINBPC(enc) - 1)) {
819 n &= ~(MINBPC(enc) - 1);
820 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000821 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000822 end = ptr + n;
823 }
824 }
825 switch (BYTE_TYPE(enc, ptr)) {
826 case BT_LT:
827 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
828 case BT_AMP:
829 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
830 case BT_CR:
831 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200832 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000833 return XML_TOK_TRAILING_CR;
834 if (BYTE_TYPE(enc, ptr) == BT_LF)
835 ptr += MINBPC(enc);
836 *nextTokPtr = ptr;
837 return XML_TOK_DATA_NEWLINE;
838 case BT_LF:
839 *nextTokPtr = ptr + MINBPC(enc);
840 return XML_TOK_DATA_NEWLINE;
841 case BT_RSQB:
842 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200843 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000844 return XML_TOK_TRAILING_RSQB;
845 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
846 break;
847 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200848 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000849 return XML_TOK_TRAILING_RSQB;
850 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
851 ptr -= MINBPC(enc);
852 break;
853 }
854 *nextTokPtr = ptr;
855 return XML_TOK_INVALID;
856 INVALID_CASES(ptr, nextTokPtr)
857 default:
858 ptr += MINBPC(enc);
859 break;
860 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200861 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000862 switch (BYTE_TYPE(enc, ptr)) {
863#define LEAD_CASE(n) \
864 case BT_LEAD ## n: \
865 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000866 *nextTokPtr = ptr; \
867 return XML_TOK_DATA_CHARS; \
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000868 } \
869 ptr += n; \
870 break;
871 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
872#undef LEAD_CASE
873 case BT_RSQB:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200874 if (HAS_CHARS(enc, ptr, end, 2)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000875 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
876 ptr += MINBPC(enc);
877 break;
878 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200879 if (HAS_CHARS(enc, ptr, end, 3)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000880 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
881 ptr += MINBPC(enc);
882 break;
883 }
884 *nextTokPtr = ptr + 2*MINBPC(enc);
885 return XML_TOK_INVALID;
886 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000887 }
888 /* fall through */
889 case BT_AMP:
890 case BT_LT:
891 case BT_NONXML:
892 case BT_MALFORM:
893 case BT_TRAIL:
894 case BT_CR:
895 case BT_LF:
896 *nextTokPtr = ptr;
897 return XML_TOK_DATA_CHARS;
898 default:
899 ptr += MINBPC(enc);
900 break;
901 }
902 }
903 *nextTokPtr = ptr;
904 return XML_TOK_DATA_CHARS;
905}
906
907/* ptr points to character following "%" */
908
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000909static int PTRCALL
910PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
911 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000912{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200913 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000914 switch (BYTE_TYPE(enc, ptr)) {
915 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
916 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
917 *nextTokPtr = ptr;
918 return XML_TOK_PERCENT;
919 default:
920 *nextTokPtr = ptr;
921 return XML_TOK_INVALID;
922 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200923 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000924 switch (BYTE_TYPE(enc, ptr)) {
925 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
926 case BT_SEMI:
927 *nextTokPtr = ptr + MINBPC(enc);
928 return XML_TOK_PARAM_ENTITY_REF;
929 default:
930 *nextTokPtr = ptr;
931 return XML_TOK_INVALID;
932 }
933 }
934 return XML_TOK_PARTIAL;
935}
936
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000937static int PTRCALL
938PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
939 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000940{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200941 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000942 switch (BYTE_TYPE(enc, ptr)) {
943 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
944 default:
945 *nextTokPtr = ptr;
946 return XML_TOK_INVALID;
947 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200948 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000949 switch (BYTE_TYPE(enc, ptr)) {
950 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
951 case BT_CR: case BT_LF: case BT_S:
952 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
953 *nextTokPtr = ptr;
954 return XML_TOK_POUND_NAME;
955 default:
956 *nextTokPtr = ptr;
957 return XML_TOK_INVALID;
958 }
959 }
960 return -XML_TOK_POUND_NAME;
961}
962
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000963static int PTRCALL
964PREFIX(scanLit)(int open, const ENCODING *enc,
965 const char *ptr, const char *end,
966 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000967{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200968 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000969 int t = BYTE_TYPE(enc, ptr);
970 switch (t) {
971 INVALID_CASES(ptr, nextTokPtr)
972 case BT_QUOT:
973 case BT_APOS:
974 ptr += MINBPC(enc);
975 if (t != open)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000976 break;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200977 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000978 return -XML_TOK_LITERAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000979 *nextTokPtr = ptr;
980 switch (BYTE_TYPE(enc, ptr)) {
981 case BT_S: case BT_CR: case BT_LF:
982 case BT_GT: case BT_PERCNT: case BT_LSQB:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000983 return XML_TOK_LITERAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000984 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000985 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000986 }
987 default:
988 ptr += MINBPC(enc);
989 break;
990 }
991 }
992 return XML_TOK_PARTIAL;
993}
994
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000995static int PTRCALL
996PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
997 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000998{
999 int tok;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001000 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001001 return XML_TOK_NONE;
1002 if (MINBPC(enc) > 1) {
1003 size_t n = end - ptr;
1004 if (n & (MINBPC(enc) - 1)) {
1005 n &= ~(MINBPC(enc) - 1);
1006 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001007 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001008 end = ptr + n;
1009 }
1010 }
1011 switch (BYTE_TYPE(enc, ptr)) {
1012 case BT_QUOT:
1013 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1014 case BT_APOS:
1015 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1016 case BT_LT:
1017 {
1018 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001019 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001020 switch (BYTE_TYPE(enc, ptr)) {
1021 case BT_EXCL:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001022 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001023 case BT_QUEST:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001024 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001025 case BT_NMSTRT:
1026 case BT_HEX:
1027 case BT_NONASCII:
1028 case BT_LEAD2:
1029 case BT_LEAD3:
1030 case BT_LEAD4:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001031 *nextTokPtr = ptr - MINBPC(enc);
1032 return XML_TOK_INSTANCE_START;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001033 }
1034 *nextTokPtr = ptr;
1035 return XML_TOK_INVALID;
1036 }
1037 case BT_CR:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001038 if (ptr + MINBPC(enc) == end) {
1039 *nextTokPtr = end;
1040 /* indicate that this might be part of a CR/LF pair */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001041 return -XML_TOK_PROLOG_S;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001042 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001043 /* fall through */
1044 case BT_S: case BT_LF:
1045 for (;;) {
1046 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001047 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001048 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001049 switch (BYTE_TYPE(enc, ptr)) {
1050 case BT_S: case BT_LF:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001051 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001052 case BT_CR:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001053 /* don't split CR/LF pair */
1054 if (ptr + MINBPC(enc) != end)
1055 break;
1056 /* fall through */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001057 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001058 *nextTokPtr = ptr;
1059 return XML_TOK_PROLOG_S;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001060 }
1061 }
1062 *nextTokPtr = ptr;
1063 return XML_TOK_PROLOG_S;
1064 case BT_PERCNT:
1065 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1066 case BT_COMMA:
1067 *nextTokPtr = ptr + MINBPC(enc);
1068 return XML_TOK_COMMA;
1069 case BT_LSQB:
1070 *nextTokPtr = ptr + MINBPC(enc);
1071 return XML_TOK_OPEN_BRACKET;
1072 case BT_RSQB:
1073 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001074 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001075 return -XML_TOK_CLOSE_BRACKET;
1076 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001077 REQUIRE_CHARS(enc, ptr, end, 2);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001078 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001079 *nextTokPtr = ptr + 2*MINBPC(enc);
1080 return XML_TOK_COND_SECT_CLOSE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001081 }
1082 }
1083 *nextTokPtr = ptr;
1084 return XML_TOK_CLOSE_BRACKET;
1085 case BT_LPAR:
1086 *nextTokPtr = ptr + MINBPC(enc);
1087 return XML_TOK_OPEN_PAREN;
1088 case BT_RPAR:
1089 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001090 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001091 return -XML_TOK_CLOSE_PAREN;
1092 switch (BYTE_TYPE(enc, ptr)) {
1093 case BT_AST:
1094 *nextTokPtr = ptr + MINBPC(enc);
1095 return XML_TOK_CLOSE_PAREN_ASTERISK;
1096 case BT_QUEST:
1097 *nextTokPtr = ptr + MINBPC(enc);
1098 return XML_TOK_CLOSE_PAREN_QUESTION;
1099 case BT_PLUS:
1100 *nextTokPtr = ptr + MINBPC(enc);
1101 return XML_TOK_CLOSE_PAREN_PLUS;
1102 case BT_CR: case BT_LF: case BT_S:
1103 case BT_GT: case BT_COMMA: case BT_VERBAR:
1104 case BT_RPAR:
1105 *nextTokPtr = ptr;
1106 return XML_TOK_CLOSE_PAREN;
1107 }
1108 *nextTokPtr = ptr;
1109 return XML_TOK_INVALID;
1110 case BT_VERBAR:
1111 *nextTokPtr = ptr + MINBPC(enc);
1112 return XML_TOK_OR;
1113 case BT_GT:
1114 *nextTokPtr = ptr + MINBPC(enc);
1115 return XML_TOK_DECL_CLOSE;
1116 case BT_NUM:
1117 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1118#define LEAD_CASE(n) \
1119 case BT_LEAD ## n: \
1120 if (end - ptr < n) \
1121 return XML_TOK_PARTIAL_CHAR; \
1122 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1123 ptr += n; \
1124 tok = XML_TOK_NAME; \
1125 break; \
1126 } \
1127 if (IS_NAME_CHAR(enc, ptr, n)) { \
1128 ptr += n; \
1129 tok = XML_TOK_NMTOKEN; \
1130 break; \
1131 } \
1132 *nextTokPtr = ptr; \
1133 return XML_TOK_INVALID;
1134 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1135#undef LEAD_CASE
1136 case BT_NMSTRT:
1137 case BT_HEX:
1138 tok = XML_TOK_NAME;
1139 ptr += MINBPC(enc);
1140 break;
1141 case BT_DIGIT:
1142 case BT_NAME:
1143 case BT_MINUS:
1144#ifdef XML_NS
1145 case BT_COLON:
1146#endif
1147 tok = XML_TOK_NMTOKEN;
1148 ptr += MINBPC(enc);
1149 break;
1150 case BT_NONASCII:
1151 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1152 ptr += MINBPC(enc);
1153 tok = XML_TOK_NAME;
1154 break;
1155 }
1156 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1157 ptr += MINBPC(enc);
1158 tok = XML_TOK_NMTOKEN;
1159 break;
1160 }
1161 /* fall through */
1162 default:
1163 *nextTokPtr = ptr;
1164 return XML_TOK_INVALID;
1165 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001166 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001167 switch (BYTE_TYPE(enc, ptr)) {
1168 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1169 case BT_GT: case BT_RPAR: case BT_COMMA:
1170 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1171 case BT_S: case BT_CR: case BT_LF:
1172 *nextTokPtr = ptr;
1173 return tok;
1174#ifdef XML_NS
1175 case BT_COLON:
1176 ptr += MINBPC(enc);
1177 switch (tok) {
1178 case XML_TOK_NAME:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001179 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001180 tok = XML_TOK_PREFIXED_NAME;
1181 switch (BYTE_TYPE(enc, ptr)) {
1182 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1183 default:
1184 tok = XML_TOK_NMTOKEN;
1185 break;
1186 }
1187 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001188 case XML_TOK_PREFIXED_NAME:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001189 tok = XML_TOK_NMTOKEN;
1190 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001191 }
1192 break;
1193#endif
1194 case BT_PLUS:
1195 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001196 *nextTokPtr = ptr;
1197 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001198 }
1199 *nextTokPtr = ptr + MINBPC(enc);
1200 return XML_TOK_NAME_PLUS;
1201 case BT_AST:
1202 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001203 *nextTokPtr = ptr;
1204 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001205 }
1206 *nextTokPtr = ptr + MINBPC(enc);
1207 return XML_TOK_NAME_ASTERISK;
1208 case BT_QUEST:
1209 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001210 *nextTokPtr = ptr;
1211 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001212 }
1213 *nextTokPtr = ptr + MINBPC(enc);
1214 return XML_TOK_NAME_QUESTION;
1215 default:
1216 *nextTokPtr = ptr;
1217 return XML_TOK_INVALID;
1218 }
1219 }
1220 return -tok;
1221}
1222
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001223static int PTRCALL
1224PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1225 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001226{
1227 const char *start;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001228 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001229 return XML_TOK_NONE;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001230 else if (! HAS_CHAR(enc, ptr, end)) {
1231 /* This line cannot be executed. The incoming data has already
1232 * been tokenized once, so incomplete characters like this have
1233 * already been eliminated from the input. Retaining the paranoia
1234 * check is still valuable, however.
1235 */
1236 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1237 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001238 start = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001239 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001240 switch (BYTE_TYPE(enc, ptr)) {
1241#define LEAD_CASE(n) \
1242 case BT_LEAD ## n: ptr += n; break;
1243 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1244#undef LEAD_CASE
1245 case BT_AMP:
1246 if (ptr == start)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001247 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001248 *nextTokPtr = ptr;
1249 return XML_TOK_DATA_CHARS;
1250 case BT_LT:
1251 /* this is for inside entity references */
1252 *nextTokPtr = ptr;
1253 return XML_TOK_INVALID;
1254 case BT_LF:
1255 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001256 *nextTokPtr = ptr + MINBPC(enc);
1257 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001258 }
1259 *nextTokPtr = ptr;
1260 return XML_TOK_DATA_CHARS;
1261 case BT_CR:
1262 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001263 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001264 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001265 return XML_TOK_TRAILING_CR;
1266 if (BYTE_TYPE(enc, ptr) == BT_LF)
1267 ptr += MINBPC(enc);
1268 *nextTokPtr = ptr;
1269 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001270 }
1271 *nextTokPtr = ptr;
1272 return XML_TOK_DATA_CHARS;
1273 case BT_S:
1274 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001275 *nextTokPtr = ptr + MINBPC(enc);
1276 return XML_TOK_ATTRIBUTE_VALUE_S;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001277 }
1278 *nextTokPtr = ptr;
1279 return XML_TOK_DATA_CHARS;
1280 default:
1281 ptr += MINBPC(enc);
1282 break;
1283 }
1284 }
1285 *nextTokPtr = ptr;
1286 return XML_TOK_DATA_CHARS;
1287}
1288
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001289static int PTRCALL
1290PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1291 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001292{
1293 const char *start;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001294 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001295 return XML_TOK_NONE;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001296 else if (! HAS_CHAR(enc, ptr, end)) {
1297 /* This line cannot be executed. The incoming data has already
1298 * been tokenized once, so incomplete characters like this have
1299 * already been eliminated from the input. Retaining the paranoia
1300 * check is still valuable, however.
1301 */
1302 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1303 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001304 start = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001305 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001306 switch (BYTE_TYPE(enc, ptr)) {
1307#define LEAD_CASE(n) \
1308 case BT_LEAD ## n: ptr += n; break;
1309 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1310#undef LEAD_CASE
1311 case BT_AMP:
1312 if (ptr == start)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001313 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001314 *nextTokPtr = ptr;
1315 return XML_TOK_DATA_CHARS;
1316 case BT_PERCNT:
1317 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001318 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1319 end, nextTokPtr);
1320 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001321 }
1322 *nextTokPtr = ptr;
1323 return XML_TOK_DATA_CHARS;
1324 case BT_LF:
1325 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001326 *nextTokPtr = ptr + MINBPC(enc);
1327 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001328 }
1329 *nextTokPtr = ptr;
1330 return XML_TOK_DATA_CHARS;
1331 case BT_CR:
1332 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001333 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001334 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001335 return XML_TOK_TRAILING_CR;
1336 if (BYTE_TYPE(enc, ptr) == BT_LF)
1337 ptr += MINBPC(enc);
1338 *nextTokPtr = ptr;
1339 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001340 }
1341 *nextTokPtr = ptr;
1342 return XML_TOK_DATA_CHARS;
1343 default:
1344 ptr += MINBPC(enc);
1345 break;
1346 }
1347 }
1348 *nextTokPtr = ptr;
1349 return XML_TOK_DATA_CHARS;
1350}
1351
1352#ifdef XML_DTD
1353
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001354static int PTRCALL
1355PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1356 const char *end, const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001357{
1358 int level = 0;
1359 if (MINBPC(enc) > 1) {
1360 size_t n = end - ptr;
1361 if (n & (MINBPC(enc) - 1)) {
1362 n &= ~(MINBPC(enc) - 1);
1363 end = ptr + n;
1364 }
1365 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001366 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001367 switch (BYTE_TYPE(enc, ptr)) {
1368 INVALID_CASES(ptr, nextTokPtr)
1369 case BT_LT:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001370 ptr += MINBPC(enc);
1371 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001372 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001373 ptr += MINBPC(enc);
1374 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001375 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1376 ++level;
1377 ptr += MINBPC(enc);
1378 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001379 }
1380 break;
1381 case BT_RSQB:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001382 ptr += MINBPC(enc);
1383 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001384 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001385 ptr += MINBPC(enc);
1386 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001387 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1388 ptr += MINBPC(enc);
1389 if (level == 0) {
1390 *nextTokPtr = ptr;
1391 return XML_TOK_IGNORE_SECT;
1392 }
1393 --level;
1394 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001395 }
1396 break;
1397 default:
1398 ptr += MINBPC(enc);
1399 break;
1400 }
1401 }
1402 return XML_TOK_PARTIAL;
1403}
1404
1405#endif /* XML_DTD */
1406
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001407static int PTRCALL
1408PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1409 const char **badPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001410{
1411 ptr += MINBPC(enc);
1412 end -= MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001413 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001414 switch (BYTE_TYPE(enc, ptr)) {
1415 case BT_DIGIT:
1416 case BT_HEX:
1417 case BT_MINUS:
1418 case BT_APOS:
1419 case BT_LPAR:
1420 case BT_RPAR:
1421 case BT_PLUS:
1422 case BT_COMMA:
1423 case BT_SOL:
1424 case BT_EQUALS:
1425 case BT_QUEST:
1426 case BT_CR:
1427 case BT_LF:
1428 case BT_SEMI:
1429 case BT_EXCL:
1430 case BT_AST:
1431 case BT_PERCNT:
1432 case BT_NUM:
1433#ifdef XML_NS
1434 case BT_COLON:
1435#endif
1436 break;
1437 case BT_S:
1438 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001439 *badPtr = ptr;
1440 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001441 }
1442 break;
1443 case BT_NAME:
1444 case BT_NMSTRT:
1445 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001446 break;
Benjamin Peterson5033aa72018-09-10 21:04:00 -07001447 /* fall through */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001448 default:
1449 switch (BYTE_TO_ASCII(enc, ptr)) {
1450 case 0x24: /* $ */
1451 case 0x40: /* @ */
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001452 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001453 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001454 *badPtr = ptr;
1455 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001456 }
1457 break;
1458 }
1459 }
1460 return 1;
1461}
1462
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001463/* This must only be called for a well-formed start-tag or empty
1464 element tag. Returns the number of attributes. Pointers to the
1465 first attsMax attributes are stored in atts.
1466*/
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001467
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001468static int PTRCALL
1469PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1470 int attsMax, ATTRIBUTE *atts)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001471{
1472 enum { other, inName, inValue } state = inName;
1473 int nAtts = 0;
1474 int open = 0; /* defined when state == inValue;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001475 initialization just to shut up compilers */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001476
1477 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1478 switch (BYTE_TYPE(enc, ptr)) {
1479#define START_NAME \
1480 if (state == other) { \
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001481 if (nAtts < attsMax) { \
1482 atts[nAtts].name = ptr; \
1483 atts[nAtts].normalized = 1; \
1484 } \
1485 state = inName; \
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001486 }
1487#define LEAD_CASE(n) \
1488 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1489 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1490#undef LEAD_CASE
1491 case BT_NONASCII:
1492 case BT_NMSTRT:
1493 case BT_HEX:
1494 START_NAME
1495 break;
1496#undef START_NAME
1497 case BT_QUOT:
1498 if (state != inValue) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001499 if (nAtts < attsMax)
1500 atts[nAtts].valuePtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001501 state = inValue;
1502 open = BT_QUOT;
1503 }
1504 else if (open == BT_QUOT) {
1505 state = other;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001506 if (nAtts < attsMax)
1507 atts[nAtts].valueEnd = ptr;
1508 nAtts++;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001509 }
1510 break;
1511 case BT_APOS:
1512 if (state != inValue) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001513 if (nAtts < attsMax)
1514 atts[nAtts].valuePtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001515 state = inValue;
1516 open = BT_APOS;
1517 }
1518 else if (open == BT_APOS) {
1519 state = other;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001520 if (nAtts < attsMax)
1521 atts[nAtts].valueEnd = ptr;
1522 nAtts++;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001523 }
1524 break;
1525 case BT_AMP:
1526 if (nAtts < attsMax)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001527 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001528 break;
1529 case BT_S:
1530 if (state == inName)
1531 state = other;
1532 else if (state == inValue
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001533 && nAtts < attsMax
1534 && atts[nAtts].normalized
1535 && (ptr == atts[nAtts].valuePtr
1536 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1537 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1538 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1539 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001540 break;
1541 case BT_CR: case BT_LF:
1542 /* This case ensures that the first attribute name is counted
1543 Apart from that we could just change state on the quote. */
1544 if (state == inName)
1545 state = other;
1546 else if (state == inValue && nAtts < attsMax)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001547 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001548 break;
1549 case BT_GT:
1550 case BT_SOL:
1551 if (state != inValue)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001552 return nAtts;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001553 break;
1554 default:
1555 break;
1556 }
1557 }
1558 /* not reached */
1559}
1560
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001561static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +02001562PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001563{
1564 int result = 0;
1565 /* skip &# */
1566 ptr += 2*MINBPC(enc);
1567 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001568 for (ptr += MINBPC(enc);
1569 !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1570 ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001571 int c = BYTE_TO_ASCII(enc, ptr);
1572 switch (c) {
1573 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1574 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001575 result <<= 4;
1576 result |= (c - ASCII_0);
1577 break;
1578 case ASCII_A: case ASCII_B: case ASCII_C:
1579 case ASCII_D: case ASCII_E: case ASCII_F:
1580 result <<= 4;
1581 result += 10 + (c - ASCII_A);
1582 break;
1583 case ASCII_a: case ASCII_b: case ASCII_c:
1584 case ASCII_d: case ASCII_e: case ASCII_f:
1585 result <<= 4;
1586 result += 10 + (c - ASCII_a);
1587 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001588 }
1589 if (result >= 0x110000)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001590 return -1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001591 }
1592 }
1593 else {
1594 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1595 int c = BYTE_TO_ASCII(enc, ptr);
1596 result *= 10;
1597 result += (c - ASCII_0);
1598 if (result >= 0x110000)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001599 return -1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001600 }
1601 }
1602 return checkCharRefNumber(result);
1603}
1604
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001605static int PTRCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +02001606PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001607 const char *end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001608{
1609 switch ((end - ptr)/MINBPC(enc)) {
1610 case 2:
1611 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1612 switch (BYTE_TO_ASCII(enc, ptr)) {
1613 case ASCII_l:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001614 return ASCII_LT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001615 case ASCII_g:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001616 return ASCII_GT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001617 }
1618 }
1619 break;
1620 case 3:
1621 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1622 ptr += MINBPC(enc);
1623 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001624 ptr += MINBPC(enc);
1625 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1626 return ASCII_AMP;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001627 }
1628 }
1629 break;
1630 case 4:
1631 switch (BYTE_TO_ASCII(enc, ptr)) {
1632 case ASCII_q:
1633 ptr += MINBPC(enc);
1634 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001635 ptr += MINBPC(enc);
1636 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1637 ptr += MINBPC(enc);
1638 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1639 return ASCII_QUOT;
1640 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001641 }
1642 break;
1643 case ASCII_a:
1644 ptr += MINBPC(enc);
1645 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001646 ptr += MINBPC(enc);
1647 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1648 ptr += MINBPC(enc);
1649 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1650 return ASCII_APOS;
1651 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001652 }
1653 break;
1654 }
1655 }
1656 return 0;
1657}
1658
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001659static int PTRCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +02001660PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001661 const char *end1, const char *ptr2)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001662{
1663 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
Victor Stinner93d0cb52017-08-18 23:43:54 +02001664 if (end1 - ptr1 < MINBPC(enc)) {
Benjamin Peterson5033aa72018-09-10 21:04:00 -07001665 /* This line cannot be executed. The incoming data has already
1666 * been tokenized once, so incomplete characters like this have
Victor Stinner93d0cb52017-08-18 23:43:54 +02001667 * already been eliminated from the input. Retaining the
1668 * paranoia check is still valuable, however.
1669 */
1670 return 0; /* LCOV_EXCL_LINE */
1671 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001672 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1673 return 0;
1674 }
1675 return ptr1 == end1;
1676}
1677
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001678static int PTRFASTCALL
1679PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001680{
1681 const char *start = ptr;
1682 for (;;) {
1683 switch (BYTE_TYPE(enc, ptr)) {
1684#define LEAD_CASE(n) \
1685 case BT_LEAD ## n: ptr += n; break;
1686 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1687#undef LEAD_CASE
1688 case BT_NONASCII:
1689 case BT_NMSTRT:
1690#ifdef XML_NS
1691 case BT_COLON:
1692#endif
1693 case BT_HEX:
1694 case BT_DIGIT:
1695 case BT_NAME:
1696 case BT_MINUS:
1697 ptr += MINBPC(enc);
1698 break;
1699 default:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001700 return (int)(ptr - start);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001701 }
1702 }
1703}
1704
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001705static const char * PTRFASTCALL
1706PREFIX(skipS)(const ENCODING *enc, const char *ptr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001707{
1708 for (;;) {
1709 switch (BYTE_TYPE(enc, ptr)) {
1710 case BT_LF:
1711 case BT_CR:
1712 case BT_S:
1713 ptr += MINBPC(enc);
1714 break;
1715 default:
1716 return ptr;
1717 }
1718 }
1719}
1720
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001721static void PTRCALL
1722PREFIX(updatePosition)(const ENCODING *enc,
1723 const char *ptr,
1724 const char *end,
1725 POSITION *pos)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001726{
Victor Stinner23ec4b52017-06-15 00:54:36 +02001727 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001728 switch (BYTE_TYPE(enc, ptr)) {
1729#define LEAD_CASE(n) \
1730 case BT_LEAD ## n: \
1731 ptr += n; \
1732 break;
1733 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1734#undef LEAD_CASE
1735 case BT_LF:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001736 pos->columnNumber = (XML_Size)-1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001737 pos->lineNumber++;
1738 ptr += MINBPC(enc);
1739 break;
1740 case BT_CR:
1741 pos->lineNumber++;
1742 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001743 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001744 ptr += MINBPC(enc);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001745 pos->columnNumber = (XML_Size)-1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001746 break;
1747 default:
1748 ptr += MINBPC(enc);
1749 break;
1750 }
1751 pos->columnNumber++;
1752 }
1753}
1754
1755#undef DO_LEAD_CASE
1756#undef MULTIBYTE_CASES
1757#undef INVALID_CASES
1758#undef CHECK_NAME_CASE
1759#undef CHECK_NAME_CASES
1760#undef CHECK_NMSTRT_CASE
1761#undef CHECK_NMSTRT_CASES
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001762
Gregory P. Smith7c6309c2012-07-14 14:12:35 -07001763#endif /* XML_TOK_IMPL_C */