blob: c209221cd79d135bfc034f0f67277261f6057fca [file] [log] [blame]
/* This file is included!
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000-2017 Expat development team
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

Gregory P. Smith7c6309c2012-07-14 14:12:35 -070033#ifdef XML_TOK_IMPL_C
34
Benjamin Peterson52b94082019-09-25 21:33:58 -070035# ifndef IS_INVALID_CHAR
36# define IS_INVALID_CHAR(enc, ptr, n) (0)
37# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000038
Benjamin Peterson52b94082019-09-25 21:33:58 -070039# define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
40 case BT_LEAD##n: \
41 if (end - ptr < n) \
42 return XML_TOK_PARTIAL_CHAR; \
43 if (IS_INVALID_CHAR(enc, ptr, n)) { \
44 *(nextTokPtr) = (ptr); \
45 return XML_TOK_INVALID; \
46 } \
47 ptr += n; \
48 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000049
Benjamin Peterson52b94082019-09-25 21:33:58 -070050# define INVALID_CASES(ptr, nextTokPtr) \
51 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
52 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
53 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
54 case BT_NONXML: \
55 case BT_MALFORM: \
56 case BT_TRAIL: \
57 *(nextTokPtr) = (ptr); \
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000058 return XML_TOK_INVALID;
59
/* Expands to the switch case for an n-byte lead byte (BT_LEAD<n>) at a
   position where a name character is required.
   - Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.
   - Returns XML_TOK_INVALID (with *nextTokPtr = ptr) when the sequence is
     malformed according to IS_INVALID_CHAR, or is well-formed but not a
     name character according to IS_NAME_CHAR.
   - Otherwise advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test comes first so that IS_NAME_CHAR is never asked
   to classify a malformed sequence (the missing check behind
   CVE-2022-25235). */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Expands to all switch cases that accept one name character at ptr.
   BT_NONASCII is accepted only if IS_NAME_CHAR_MINBPC agrees (otherwise
   XML_TOK_INVALID at ptr); it then shares the single-unit advance with
   BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS.  Multi-byte lead
   bytes are delegated to CHECK_NAME_CASE for 2, 3 and 4 bytes. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)

/* Expands to the switch case for an n-byte lead byte (BT_LEAD<n>) at a
   position where a name-start character is required.
   - Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.
   - Returns XML_TOK_INVALID (with *nextTokPtr = ptr) when the sequence is
     malformed according to IS_INVALID_CHAR, or is well-formed but not a
     name-start character according to IS_NMSTRT_CHAR.
   - Otherwise advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test comes first so that IS_NMSTRT_CHAR is never
   asked to classify a malformed sequence (the missing check behind
   CVE-2022-25235). */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Expands to all switch cases that accept one name-start character at ptr.
   BT_NONASCII is accepted only if IS_NMSTRT_CHAR_MINBPC agrees (otherwise
   XML_TOK_INVALID at ptr); it then shares the single-unit advance with
   BT_NMSTRT and BT_HEX.  Multi-byte lead bytes are delegated to
   CHECK_NMSTRT_CASE for 2, 3 and 4 bytes. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)

/* Name-decoration hook: unless the includer supplies its own PREFIX, the
   functions below keep their plain identifiers. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif

/* True when at least `count` minimum-width units (MINBPC(enc) bytes each)
   lie between ptr and end.  Every argument is parenthesized so that
   expression arguments such as `n - 1` keep their intended meaning. */
#  define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-width unit remains. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   units remain.  Expands to a braced block; callers follow it with ';'. */
#  define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Single-unit form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)

Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000132/* ptr points to character following "<!-" */
133
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000134static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700135PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
136 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200137 if (HAS_CHAR(enc, ptr, end)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700138 if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000139 *nextTokPtr = ptr;
140 return XML_TOK_INVALID;
141 }
142 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200143 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000144 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700145 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000146 case BT_MINUS:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200147 ptr += MINBPC(enc);
148 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000149 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200150 ptr += MINBPC(enc);
151 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700152 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000153 *nextTokPtr = ptr;
154 return XML_TOK_INVALID;
155 }
156 *nextTokPtr = ptr + MINBPC(enc);
157 return XML_TOK_COMMENT;
158 }
159 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000160 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000161 ptr += MINBPC(enc);
162 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000163 }
164 }
165 }
166 return XML_TOK_PARTIAL;
167}
168
169/* ptr points to character following "<!" */
170
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000171static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700172PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
173 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200174 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000175 switch (BYTE_TYPE(enc, ptr)) {
176 case BT_MINUS:
177 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
178 case BT_LSQB:
179 *nextTokPtr = ptr + MINBPC(enc);
180 return XML_TOK_COND_SECT_OPEN;
181 case BT_NMSTRT:
182 case BT_HEX:
183 ptr += MINBPC(enc);
184 break;
185 default:
186 *nextTokPtr = ptr;
187 return XML_TOK_INVALID;
188 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200189 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000190 switch (BYTE_TYPE(enc, ptr)) {
191 case BT_PERCNT:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200192 REQUIRE_CHARS(enc, ptr, end, 2);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000193 /* don't allow <!ENTITY% foo "whatever"> */
194 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700195 case BT_S:
196 case BT_CR:
197 case BT_LF:
198 case BT_PERCNT:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000199 *nextTokPtr = ptr;
200 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000201 }
202 /* fall through */
Benjamin Peterson52b94082019-09-25 21:33:58 -0700203 case BT_S:
204 case BT_CR:
205 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000206 *nextTokPtr = ptr;
207 return XML_TOK_DECL_OPEN;
208 case BT_NMSTRT:
209 case BT_HEX:
210 ptr += MINBPC(enc);
211 break;
212 default:
213 *nextTokPtr = ptr;
214 return XML_TOK_INVALID;
215 }
216 }
217 return XML_TOK_PARTIAL;
218}
219
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000220static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700221PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
222 int *tokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000223 int upper = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700224 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000225 *tokPtr = XML_TOK_PI;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700226 if (end - ptr != MINBPC(enc) * 3)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000227 return 1;
228 switch (BYTE_TO_ASCII(enc, ptr)) {
229 case ASCII_x:
230 break;
231 case ASCII_X:
232 upper = 1;
233 break;
234 default:
235 return 1;
236 }
237 ptr += MINBPC(enc);
238 switch (BYTE_TO_ASCII(enc, ptr)) {
239 case ASCII_m:
240 break;
241 case ASCII_M:
242 upper = 1;
243 break;
244 default:
245 return 1;
246 }
247 ptr += MINBPC(enc);
248 switch (BYTE_TO_ASCII(enc, ptr)) {
249 case ASCII_l:
250 break;
251 case ASCII_L:
252 upper = 1;
253 break;
254 default:
255 return 1;
256 }
257 if (upper)
258 return 0;
259 *tokPtr = XML_TOK_XML_DECL;
260 return 1;
261}
262
263/* ptr points to character following "<?" */
264
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000265static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700266PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
267 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000268 int tok;
269 const char *target = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200270 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000271 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700272 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000273 default:
274 *nextTokPtr = ptr;
275 return XML_TOK_INVALID;
276 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200277 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000278 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700279 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
280 case BT_S:
281 case BT_CR:
282 case BT_LF:
283 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000284 *nextTokPtr = ptr;
285 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000286 }
287 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200288 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000289 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700290 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000291 case BT_QUEST:
292 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200293 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000294 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
295 *nextTokPtr = ptr + MINBPC(enc);
296 return tok;
297 }
298 break;
299 default:
300 ptr += MINBPC(enc);
301 break;
302 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000303 }
304 return XML_TOK_PARTIAL;
305 case BT_QUEST:
Benjamin Peterson52b94082019-09-25 21:33:58 -0700306 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000307 *nextTokPtr = ptr;
308 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000309 }
310 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200311 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000312 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000313 *nextTokPtr = ptr + MINBPC(enc);
314 return tok;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000315 }
316 /* fall through */
317 default:
318 *nextTokPtr = ptr;
319 return XML_TOK_INVALID;
320 }
321 }
322 return XML_TOK_PARTIAL;
323}
324
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000325static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700326PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
327 const char **nextTokPtr) {
328 static const char CDATA_LSQB[]
329 = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000330 int i;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700331 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000332 /* CDATA[ */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200333 REQUIRE_CHARS(enc, ptr, end, 6);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000334 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700335 if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000336 *nextTokPtr = ptr;
337 return XML_TOK_INVALID;
338 }
339 }
340 *nextTokPtr = ptr;
341 return XML_TOK_CDATA_SECT_OPEN;
342}
343
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000344static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700345PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
346 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200347 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000348 return XML_TOK_NONE;
349 if (MINBPC(enc) > 1) {
350 size_t n = end - ptr;
351 if (n & (MINBPC(enc) - 1)) {
352 n &= ~(MINBPC(enc) - 1);
353 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000354 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000355 end = ptr + n;
356 }
357 }
358 switch (BYTE_TYPE(enc, ptr)) {
359 case BT_RSQB:
360 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200361 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700362 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000363 break;
364 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200365 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700366 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000367 ptr -= MINBPC(enc);
368 break;
369 }
370 *nextTokPtr = ptr + MINBPC(enc);
371 return XML_TOK_CDATA_SECT_CLOSE;
372 case BT_CR:
373 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200374 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000375 if (BYTE_TYPE(enc, ptr) == BT_LF)
376 ptr += MINBPC(enc);
377 *nextTokPtr = ptr;
378 return XML_TOK_DATA_NEWLINE;
379 case BT_LF:
380 *nextTokPtr = ptr + MINBPC(enc);
381 return XML_TOK_DATA_NEWLINE;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700382 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000383 default:
384 ptr += MINBPC(enc);
385 break;
386 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200387 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000388 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700389# define LEAD_CASE(n) \
390 case BT_LEAD##n: \
391 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
392 *nextTokPtr = ptr; \
393 return XML_TOK_DATA_CHARS; \
394 } \
395 ptr += n; \
396 break;
397 LEAD_CASE(2)
398 LEAD_CASE(3)
399 LEAD_CASE(4)
400# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000401 case BT_NONXML:
402 case BT_MALFORM:
403 case BT_TRAIL:
404 case BT_CR:
405 case BT_LF:
406 case BT_RSQB:
407 *nextTokPtr = ptr;
408 return XML_TOK_DATA_CHARS;
409 default:
410 ptr += MINBPC(enc);
411 break;
412 }
413 }
414 *nextTokPtr = ptr;
415 return XML_TOK_DATA_CHARS;
416}
417
418/* ptr points to character following "</" */
419
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000420static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700421PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
422 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200423 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000424 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700425 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000426 default:
427 *nextTokPtr = ptr;
428 return XML_TOK_INVALID;
429 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200430 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000431 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700432 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
433 case BT_S:
434 case BT_CR:
435 case BT_LF:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200436 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000437 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700438 case BT_S:
439 case BT_CR:
440 case BT_LF:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000441 break;
442 case BT_GT:
443 *nextTokPtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000444 return XML_TOK_END_TAG;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000445 default:
446 *nextTokPtr = ptr;
447 return XML_TOK_INVALID;
448 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000449 }
450 return XML_TOK_PARTIAL;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700451# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000452 case BT_COLON:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000453 /* no need to check qname syntax here,
454 since end-tag must match exactly */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000455 ptr += MINBPC(enc);
456 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700457# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000458 case BT_GT:
459 *nextTokPtr = ptr + MINBPC(enc);
460 return XML_TOK_END_TAG;
461 default:
462 *nextTokPtr = ptr;
463 return XML_TOK_INVALID;
464 }
465 }
466 return XML_TOK_PARTIAL;
467}
468
469/* ptr points to character following "&#X" */
470
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000471static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700472PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
473 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200474 if (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000475 switch (BYTE_TYPE(enc, ptr)) {
476 case BT_DIGIT:
477 case BT_HEX:
478 break;
479 default:
480 *nextTokPtr = ptr;
481 return XML_TOK_INVALID;
482 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200483 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000484 switch (BYTE_TYPE(enc, ptr)) {
485 case BT_DIGIT:
486 case BT_HEX:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000487 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000488 case BT_SEMI:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000489 *nextTokPtr = ptr + MINBPC(enc);
490 return XML_TOK_CHAR_REF;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000491 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000492 *nextTokPtr = ptr;
493 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000494 }
495 }
496 }
497 return XML_TOK_PARTIAL;
498}
499
500/* ptr points to character following "&#" */
501
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000502static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700503PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
504 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200505 if (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000506 if (CHAR_MATCHES(enc, ptr, ASCII_x))
507 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
508 switch (BYTE_TYPE(enc, ptr)) {
509 case BT_DIGIT:
510 break;
511 default:
512 *nextTokPtr = ptr;
513 return XML_TOK_INVALID;
514 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200515 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000516 switch (BYTE_TYPE(enc, ptr)) {
517 case BT_DIGIT:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000518 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000519 case BT_SEMI:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000520 *nextTokPtr = ptr + MINBPC(enc);
521 return XML_TOK_CHAR_REF;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000522 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000523 *nextTokPtr = ptr;
524 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000525 }
526 }
527 }
528 return XML_TOK_PARTIAL;
529}
530
531/* ptr points to character following "&" */
532
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000533static int PTRCALL
534PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700535 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200536 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000537 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700538 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000539 case BT_NUM:
540 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
541 default:
542 *nextTokPtr = ptr;
543 return XML_TOK_INVALID;
544 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200545 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000546 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700547 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000548 case BT_SEMI:
549 *nextTokPtr = ptr + MINBPC(enc);
550 return XML_TOK_ENTITY_REF;
551 default:
552 *nextTokPtr = ptr;
553 return XML_TOK_INVALID;
554 }
555 }
556 return XML_TOK_PARTIAL;
557}
558
559/* ptr points to character following first character of attribute name */
560
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000561static int PTRCALL
562PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700563 const char **nextTokPtr) {
564# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000565 int hadColon = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700566# endif
Victor Stinner23ec4b52017-06-15 00:54:36 +0200567 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000568 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700569 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
570# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000571 case BT_COLON:
572 if (hadColon) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000573 *nextTokPtr = ptr;
574 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000575 }
576 hadColon = 1;
577 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200578 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000579 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700580 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000581 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000582 *nextTokPtr = ptr;
583 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000584 }
585 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700586# endif
587 case BT_S:
588 case BT_CR:
589 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000590 for (;;) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000591 int t;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000592
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000593 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200594 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000595 t = BYTE_TYPE(enc, ptr);
596 if (t == BT_EQUALS)
597 break;
598 switch (t) {
599 case BT_S:
600 case BT_LF:
601 case BT_CR:
602 break;
603 default:
604 *nextTokPtr = ptr;
605 return XML_TOK_INVALID;
606 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000607 }
Benjamin Peterson5033aa72018-09-10 21:04:00 -0700608 /* fall through */
Benjamin Peterson52b94082019-09-25 21:33:58 -0700609 case BT_EQUALS: {
610 int open;
611# ifdef XML_NS
612 hadColon = 0;
613# endif
614 for (;;) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000615 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200616 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700617 open = BYTE_TYPE(enc, ptr);
618 if (open == BT_QUOT || open == BT_APOS)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000619 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700620 switch (open) {
621 case BT_S:
622 case BT_LF:
623 case BT_CR:
624 break;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000625 default:
626 *nextTokPtr = ptr;
627 return XML_TOK_INVALID;
628 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700629 }
630 ptr += MINBPC(enc);
631 /* in attribute value */
632 for (;;) {
633 int t;
634 REQUIRE_CHAR(enc, ptr, end);
635 t = BYTE_TYPE(enc, ptr);
636 if (t == open)
637 break;
638 switch (t) {
639 INVALID_CASES(ptr, nextTokPtr)
640 case BT_AMP: {
641 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
642 if (tok <= 0) {
643 if (tok == XML_TOK_INVALID)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000644 *nextTokPtr = ptr;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700645 return tok;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000646 }
647 break;
648 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700649 case BT_LT:
650 *nextTokPtr = ptr;
651 return XML_TOK_INVALID;
652 default:
653 ptr += MINBPC(enc);
654 break;
655 }
656 }
657 ptr += MINBPC(enc);
658 REQUIRE_CHAR(enc, ptr, end);
659 switch (BYTE_TYPE(enc, ptr)) {
660 case BT_S:
661 case BT_CR:
662 case BT_LF:
663 break;
664 case BT_SOL:
665 goto sol;
666 case BT_GT:
667 goto gt;
668 default:
669 *nextTokPtr = ptr;
670 return XML_TOK_INVALID;
671 }
672 /* ptr points to closing quote */
673 for (;;) {
674 ptr += MINBPC(enc);
675 REQUIRE_CHAR(enc, ptr, end);
676 switch (BYTE_TYPE(enc, ptr)) {
677 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
678 case BT_S:
679 case BT_CR:
680 case BT_LF:
681 continue;
682 case BT_GT:
683 gt:
684 *nextTokPtr = ptr + MINBPC(enc);
685 return XML_TOK_START_TAG_WITH_ATTS;
686 case BT_SOL:
687 sol:
688 ptr += MINBPC(enc);
689 REQUIRE_CHAR(enc, ptr, end);
690 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
691 *nextTokPtr = ptr;
692 return XML_TOK_INVALID;
693 }
694 *nextTokPtr = ptr + MINBPC(enc);
695 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
696 default:
697 *nextTokPtr = ptr;
698 return XML_TOK_INVALID;
699 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000700 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000701 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700702 break;
703 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000704 default:
705 *nextTokPtr = ptr;
706 return XML_TOK_INVALID;
707 }
708 }
709 return XML_TOK_PARTIAL;
710}
711
712/* ptr points to character following "<" */
713
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000714static int PTRCALL
715PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700716 const char **nextTokPtr) {
717# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000718 int hadColon;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700719# endif
Victor Stinner23ec4b52017-06-15 00:54:36 +0200720 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000721 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700722 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000723 case BT_EXCL:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200724 ptr += MINBPC(enc);
725 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000726 switch (BYTE_TYPE(enc, ptr)) {
727 case BT_MINUS:
728 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
729 case BT_LSQB:
Benjamin Peterson52b94082019-09-25 21:33:58 -0700730 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000731 }
732 *nextTokPtr = ptr;
733 return XML_TOK_INVALID;
734 case BT_QUEST:
735 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
736 case BT_SOL:
737 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
738 default:
739 *nextTokPtr = ptr;
740 return XML_TOK_INVALID;
741 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700742# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000743 hadColon = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700744# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000745 /* we have a start-tag */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200746 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000747 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700748 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
749# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000750 case BT_COLON:
751 if (hadColon) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000752 *nextTokPtr = ptr;
753 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000754 }
755 hadColon = 1;
756 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200757 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000758 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700759 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000760 default:
761 *nextTokPtr = ptr;
762 return XML_TOK_INVALID;
763 }
764 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700765# endif
766 case BT_S:
767 case BT_CR:
768 case BT_LF: {
769 ptr += MINBPC(enc);
770 while (HAS_CHAR(enc, ptr, end)) {
771 switch (BYTE_TYPE(enc, ptr)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000772 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Benjamin Peterson52b94082019-09-25 21:33:58 -0700773 case BT_GT:
774 goto gt;
775 case BT_SOL:
776 goto sol;
777 case BT_S:
778 case BT_CR:
779 case BT_LF:
780 ptr += MINBPC(enc);
781 continue;
782 default:
783 *nextTokPtr = ptr;
784 return XML_TOK_INVALID;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000785 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700786 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000787 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700788 return XML_TOK_PARTIAL;
789 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000790 case BT_GT:
791 gt:
792 *nextTokPtr = ptr + MINBPC(enc);
793 return XML_TOK_START_TAG_NO_ATTS;
794 case BT_SOL:
795 sol:
796 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200797 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700798 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000799 *nextTokPtr = ptr;
800 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000801 }
802 *nextTokPtr = ptr + MINBPC(enc);
803 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
804 default:
805 *nextTokPtr = ptr;
806 return XML_TOK_INVALID;
807 }
808 }
809 return XML_TOK_PARTIAL;
810}
811
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000812static int PTRCALL
813PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700814 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200815 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000816 return XML_TOK_NONE;
817 if (MINBPC(enc) > 1) {
818 size_t n = end - ptr;
819 if (n & (MINBPC(enc) - 1)) {
820 n &= ~(MINBPC(enc) - 1);
821 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000822 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000823 end = ptr + n;
824 }
825 }
826 switch (BYTE_TYPE(enc, ptr)) {
827 case BT_LT:
828 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
829 case BT_AMP:
830 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
831 case BT_CR:
832 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200833 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000834 return XML_TOK_TRAILING_CR;
835 if (BYTE_TYPE(enc, ptr) == BT_LF)
836 ptr += MINBPC(enc);
837 *nextTokPtr = ptr;
838 return XML_TOK_DATA_NEWLINE;
839 case BT_LF:
840 *nextTokPtr = ptr + MINBPC(enc);
841 return XML_TOK_DATA_NEWLINE;
842 case BT_RSQB:
843 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200844 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000845 return XML_TOK_TRAILING_RSQB;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700846 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000847 break;
848 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200849 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000850 return XML_TOK_TRAILING_RSQB;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700851 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000852 ptr -= MINBPC(enc);
853 break;
854 }
855 *nextTokPtr = ptr;
856 return XML_TOK_INVALID;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700857 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000858 default:
859 ptr += MINBPC(enc);
860 break;
861 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200862 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000863 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700864# define LEAD_CASE(n) \
865 case BT_LEAD##n: \
866 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
867 *nextTokPtr = ptr; \
868 return XML_TOK_DATA_CHARS; \
869 } \
870 ptr += n; \
871 break;
872 LEAD_CASE(2)
873 LEAD_CASE(3)
874 LEAD_CASE(4)
875# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000876 case BT_RSQB:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200877 if (HAS_CHARS(enc, ptr, end, 2)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700878 if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
879 ptr += MINBPC(enc);
880 break;
881 }
882 if (HAS_CHARS(enc, ptr, end, 3)) {
883 if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
884 ptr += MINBPC(enc);
885 break;
886 }
887 *nextTokPtr = ptr + 2 * MINBPC(enc);
888 return XML_TOK_INVALID;
889 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000890 }
891 /* fall through */
892 case BT_AMP:
893 case BT_LT:
894 case BT_NONXML:
895 case BT_MALFORM:
896 case BT_TRAIL:
897 case BT_CR:
898 case BT_LF:
899 *nextTokPtr = ptr;
900 return XML_TOK_DATA_CHARS;
901 default:
902 ptr += MINBPC(enc);
903 break;
904 }
905 }
906 *nextTokPtr = ptr;
907 return XML_TOK_DATA_CHARS;
908}
909
910/* ptr points to character following "%" */
911
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000912static int PTRCALL
913PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700914 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200915 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000916 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700917 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
918 case BT_S:
919 case BT_LF:
920 case BT_CR:
921 case BT_PERCNT:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000922 *nextTokPtr = ptr;
923 return XML_TOK_PERCENT;
924 default:
925 *nextTokPtr = ptr;
926 return XML_TOK_INVALID;
927 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200928 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000929 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700930 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000931 case BT_SEMI:
932 *nextTokPtr = ptr + MINBPC(enc);
933 return XML_TOK_PARAM_ENTITY_REF;
934 default:
935 *nextTokPtr = ptr;
936 return XML_TOK_INVALID;
937 }
938 }
939 return XML_TOK_PARTIAL;
940}
941
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000942static int PTRCALL
943PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700944 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200945 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000946 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700947 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000948 default:
949 *nextTokPtr = ptr;
950 return XML_TOK_INVALID;
951 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200952 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000953 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700954 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
955 case BT_CR:
956 case BT_LF:
957 case BT_S:
958 case BT_RPAR:
959 case BT_GT:
960 case BT_PERCNT:
961 case BT_VERBAR:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000962 *nextTokPtr = ptr;
963 return XML_TOK_POUND_NAME;
964 default:
965 *nextTokPtr = ptr;
966 return XML_TOK_INVALID;
967 }
968 }
969 return -XML_TOK_POUND_NAME;
970}
971
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000972static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700973PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
974 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200975 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000976 int t = BYTE_TYPE(enc, ptr);
977 switch (t) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700978 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000979 case BT_QUOT:
980 case BT_APOS:
981 ptr += MINBPC(enc);
982 if (t != open)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000983 break;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200984 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000985 return -XML_TOK_LITERAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000986 *nextTokPtr = ptr;
987 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700988 case BT_S:
989 case BT_CR:
990 case BT_LF:
991 case BT_GT:
992 case BT_PERCNT:
993 case BT_LSQB:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000994 return XML_TOK_LITERAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000995 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000996 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000997 }
998 default:
999 ptr += MINBPC(enc);
1000 break;
1001 }
1002 }
1003 return XML_TOK_PARTIAL;
1004}
1005
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001006static int PTRCALL
1007PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -07001008 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001009 int tok;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001010 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001011 return XML_TOK_NONE;
1012 if (MINBPC(enc) > 1) {
1013 size_t n = end - ptr;
1014 if (n & (MINBPC(enc) - 1)) {
1015 n &= ~(MINBPC(enc) - 1);
1016 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001017 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001018 end = ptr + n;
1019 }
1020 }
1021 switch (BYTE_TYPE(enc, ptr)) {
1022 case BT_QUOT:
1023 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1024 case BT_APOS:
1025 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
Benjamin Peterson52b94082019-09-25 21:33:58 -07001026 case BT_LT: {
1027 ptr += MINBPC(enc);
1028 REQUIRE_CHAR(enc, ptr, end);
1029 switch (BYTE_TYPE(enc, ptr)) {
1030 case BT_EXCL:
1031 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1032 case BT_QUEST:
1033 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1034 case BT_NMSTRT:
1035 case BT_HEX:
1036 case BT_NONASCII:
1037 case BT_LEAD2:
1038 case BT_LEAD3:
1039 case BT_LEAD4:
1040 *nextTokPtr = ptr - MINBPC(enc);
1041 return XML_TOK_INSTANCE_START;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001042 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001043 *nextTokPtr = ptr;
1044 return XML_TOK_INVALID;
1045 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001046 case BT_CR:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001047 if (ptr + MINBPC(enc) == end) {
1048 *nextTokPtr = end;
1049 /* indicate that this might be part of a CR/LF pair */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001050 return -XML_TOK_PROLOG_S;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001051 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001052 /* fall through */
Benjamin Peterson52b94082019-09-25 21:33:58 -07001053 case BT_S:
1054 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001055 for (;;) {
1056 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001057 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001058 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001059 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001060 case BT_S:
1061 case BT_LF:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001062 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001063 case BT_CR:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001064 /* don't split CR/LF pair */
1065 if (ptr + MINBPC(enc) != end)
1066 break;
1067 /* fall through */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001068 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001069 *nextTokPtr = ptr;
1070 return XML_TOK_PROLOG_S;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001071 }
1072 }
1073 *nextTokPtr = ptr;
1074 return XML_TOK_PROLOG_S;
1075 case BT_PERCNT:
1076 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1077 case BT_COMMA:
1078 *nextTokPtr = ptr + MINBPC(enc);
1079 return XML_TOK_COMMA;
1080 case BT_LSQB:
1081 *nextTokPtr = ptr + MINBPC(enc);
1082 return XML_TOK_OPEN_BRACKET;
1083 case BT_RSQB:
1084 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001085 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001086 return -XML_TOK_CLOSE_BRACKET;
1087 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001088 REQUIRE_CHARS(enc, ptr, end, 2);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001089 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001090 *nextTokPtr = ptr + 2 * MINBPC(enc);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001091 return XML_TOK_COND_SECT_CLOSE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001092 }
1093 }
1094 *nextTokPtr = ptr;
1095 return XML_TOK_CLOSE_BRACKET;
1096 case BT_LPAR:
1097 *nextTokPtr = ptr + MINBPC(enc);
1098 return XML_TOK_OPEN_PAREN;
1099 case BT_RPAR:
1100 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001101 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001102 return -XML_TOK_CLOSE_PAREN;
1103 switch (BYTE_TYPE(enc, ptr)) {
1104 case BT_AST:
1105 *nextTokPtr = ptr + MINBPC(enc);
1106 return XML_TOK_CLOSE_PAREN_ASTERISK;
1107 case BT_QUEST:
1108 *nextTokPtr = ptr + MINBPC(enc);
1109 return XML_TOK_CLOSE_PAREN_QUESTION;
1110 case BT_PLUS:
1111 *nextTokPtr = ptr + MINBPC(enc);
1112 return XML_TOK_CLOSE_PAREN_PLUS;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001113 case BT_CR:
1114 case BT_LF:
1115 case BT_S:
1116 case BT_GT:
1117 case BT_COMMA:
1118 case BT_VERBAR:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001119 case BT_RPAR:
1120 *nextTokPtr = ptr;
1121 return XML_TOK_CLOSE_PAREN;
1122 }
1123 *nextTokPtr = ptr;
1124 return XML_TOK_INVALID;
1125 case BT_VERBAR:
1126 *nextTokPtr = ptr + MINBPC(enc);
1127 return XML_TOK_OR;
1128 case BT_GT:
1129 *nextTokPtr = ptr + MINBPC(enc);
1130 return XML_TOK_DECL_CLOSE;
1131 case BT_NUM:
1132 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Benjamin Peterson52b94082019-09-25 21:33:58 -07001133# define LEAD_CASE(n) \
1134 case BT_LEAD##n: \
1135 if (end - ptr < n) \
1136 return XML_TOK_PARTIAL_CHAR; \
1137 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1138 ptr += n; \
1139 tok = XML_TOK_NAME; \
1140 break; \
1141 } \
1142 if (IS_NAME_CHAR(enc, ptr, n)) { \
1143 ptr += n; \
1144 tok = XML_TOK_NMTOKEN; \
1145 break; \
1146 } \
1147 *nextTokPtr = ptr; \
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001148 return XML_TOK_INVALID;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001149 LEAD_CASE(2)
1150 LEAD_CASE(3)
1151 LEAD_CASE(4)
1152# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001153 case BT_NMSTRT:
1154 case BT_HEX:
1155 tok = XML_TOK_NAME;
1156 ptr += MINBPC(enc);
1157 break;
1158 case BT_DIGIT:
1159 case BT_NAME:
1160 case BT_MINUS:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001161# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001162 case BT_COLON:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001163# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001164 tok = XML_TOK_NMTOKEN;
1165 ptr += MINBPC(enc);
1166 break;
1167 case BT_NONASCII:
1168 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1169 ptr += MINBPC(enc);
1170 tok = XML_TOK_NAME;
1171 break;
1172 }
1173 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1174 ptr += MINBPC(enc);
1175 tok = XML_TOK_NMTOKEN;
1176 break;
1177 }
1178 /* fall through */
1179 default:
1180 *nextTokPtr = ptr;
1181 return XML_TOK_INVALID;
1182 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001183 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001184 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001185 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1186 case BT_GT:
1187 case BT_RPAR:
1188 case BT_COMMA:
1189 case BT_VERBAR:
1190 case BT_LSQB:
1191 case BT_PERCNT:
1192 case BT_S:
1193 case BT_CR:
1194 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001195 *nextTokPtr = ptr;
1196 return tok;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001197# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001198 case BT_COLON:
1199 ptr += MINBPC(enc);
1200 switch (tok) {
1201 case XML_TOK_NAME:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001202 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001203 tok = XML_TOK_PREFIXED_NAME;
1204 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001205 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001206 default:
1207 tok = XML_TOK_NMTOKEN;
1208 break;
1209 }
1210 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001211 case XML_TOK_PREFIXED_NAME:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001212 tok = XML_TOK_NMTOKEN;
1213 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001214 }
1215 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001216# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001217 case BT_PLUS:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001218 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001219 *nextTokPtr = ptr;
1220 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001221 }
1222 *nextTokPtr = ptr + MINBPC(enc);
1223 return XML_TOK_NAME_PLUS;
1224 case BT_AST:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001225 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001226 *nextTokPtr = ptr;
1227 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001228 }
1229 *nextTokPtr = ptr + MINBPC(enc);
1230 return XML_TOK_NAME_ASTERISK;
1231 case BT_QUEST:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001232 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001233 *nextTokPtr = ptr;
1234 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001235 }
1236 *nextTokPtr = ptr + MINBPC(enc);
1237 return XML_TOK_NAME_QUESTION;
1238 default:
1239 *nextTokPtr = ptr;
1240 return XML_TOK_INVALID;
1241 }
1242 }
1243 return -tok;
1244}
1245
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001246static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001247PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1248 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001249 const char *start;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001250 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001251 return XML_TOK_NONE;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001252 else if (! HAS_CHAR(enc, ptr, end)) {
1253 /* This line cannot be executed. The incoming data has already
1254 * been tokenized once, so incomplete characters like this have
1255 * already been eliminated from the input. Retaining the paranoia
1256 * check is still valuable, however.
1257 */
1258 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1259 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001260 start = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001261 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001262 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001263# define LEAD_CASE(n) \
1264 case BT_LEAD##n: \
1265 ptr += n; \
1266 break;
1267 LEAD_CASE(2)
1268 LEAD_CASE(3)
1269 LEAD_CASE(4)
1270# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001271 case BT_AMP:
1272 if (ptr == start)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001273 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001274 *nextTokPtr = ptr;
1275 return XML_TOK_DATA_CHARS;
1276 case BT_LT:
1277 /* this is for inside entity references */
1278 *nextTokPtr = ptr;
1279 return XML_TOK_INVALID;
1280 case BT_LF:
1281 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001282 *nextTokPtr = ptr + MINBPC(enc);
1283 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001284 }
1285 *nextTokPtr = ptr;
1286 return XML_TOK_DATA_CHARS;
1287 case BT_CR:
1288 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001289 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001290 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001291 return XML_TOK_TRAILING_CR;
1292 if (BYTE_TYPE(enc, ptr) == BT_LF)
1293 ptr += MINBPC(enc);
1294 *nextTokPtr = ptr;
1295 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001296 }
1297 *nextTokPtr = ptr;
1298 return XML_TOK_DATA_CHARS;
1299 case BT_S:
1300 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001301 *nextTokPtr = ptr + MINBPC(enc);
1302 return XML_TOK_ATTRIBUTE_VALUE_S;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001303 }
1304 *nextTokPtr = ptr;
1305 return XML_TOK_DATA_CHARS;
1306 default:
1307 ptr += MINBPC(enc);
1308 break;
1309 }
1310 }
1311 *nextTokPtr = ptr;
1312 return XML_TOK_DATA_CHARS;
1313}
1314
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001315static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001316PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1317 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001318 const char *start;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001319 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001320 return XML_TOK_NONE;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001321 else if (! HAS_CHAR(enc, ptr, end)) {
1322 /* This line cannot be executed. The incoming data has already
1323 * been tokenized once, so incomplete characters like this have
1324 * already been eliminated from the input. Retaining the paranoia
1325 * check is still valuable, however.
1326 */
1327 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1328 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001329 start = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001330 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001331 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001332# define LEAD_CASE(n) \
1333 case BT_LEAD##n: \
1334 ptr += n; \
1335 break;
1336 LEAD_CASE(2)
1337 LEAD_CASE(3)
1338 LEAD_CASE(4)
1339# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001340 case BT_AMP:
1341 if (ptr == start)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001342 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001343 *nextTokPtr = ptr;
1344 return XML_TOK_DATA_CHARS;
1345 case BT_PERCNT:
1346 if (ptr == start) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001347 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001348 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001349 }
1350 *nextTokPtr = ptr;
1351 return XML_TOK_DATA_CHARS;
1352 case BT_LF:
1353 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001354 *nextTokPtr = ptr + MINBPC(enc);
1355 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001356 }
1357 *nextTokPtr = ptr;
1358 return XML_TOK_DATA_CHARS;
1359 case BT_CR:
1360 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001361 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001362 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001363 return XML_TOK_TRAILING_CR;
1364 if (BYTE_TYPE(enc, ptr) == BT_LF)
1365 ptr += MINBPC(enc);
1366 *nextTokPtr = ptr;
1367 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001368 }
1369 *nextTokPtr = ptr;
1370 return XML_TOK_DATA_CHARS;
1371 default:
1372 ptr += MINBPC(enc);
1373 break;
1374 }
1375 }
1376 *nextTokPtr = ptr;
1377 return XML_TOK_DATA_CHARS;
1378}
1379
Benjamin Peterson52b94082019-09-25 21:33:58 -07001380# ifdef XML_DTD
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001381
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001382static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001383PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1384 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001385 int level = 0;
1386 if (MINBPC(enc) > 1) {
1387 size_t n = end - ptr;
1388 if (n & (MINBPC(enc) - 1)) {
1389 n &= ~(MINBPC(enc) - 1);
1390 end = ptr + n;
1391 }
1392 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001393 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001394 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001395 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001396 case BT_LT:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001397 ptr += MINBPC(enc);
1398 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001399 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001400 ptr += MINBPC(enc);
1401 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001402 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1403 ++level;
1404 ptr += MINBPC(enc);
1405 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001406 }
1407 break;
1408 case BT_RSQB:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001409 ptr += MINBPC(enc);
1410 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001411 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001412 ptr += MINBPC(enc);
1413 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001414 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1415 ptr += MINBPC(enc);
1416 if (level == 0) {
1417 *nextTokPtr = ptr;
1418 return XML_TOK_IGNORE_SECT;
1419 }
1420 --level;
1421 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001422 }
1423 break;
1424 default:
1425 ptr += MINBPC(enc);
1426 break;
1427 }
1428 }
1429 return XML_TOK_PARTIAL;
1430}
1431
Benjamin Peterson52b94082019-09-25 21:33:58 -07001432# endif /* XML_DTD */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001433
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001434static int PTRCALL
1435PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -07001436 const char **badPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001437 ptr += MINBPC(enc);
1438 end -= MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001439 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001440 switch (BYTE_TYPE(enc, ptr)) {
1441 case BT_DIGIT:
1442 case BT_HEX:
1443 case BT_MINUS:
1444 case BT_APOS:
1445 case BT_LPAR:
1446 case BT_RPAR:
1447 case BT_PLUS:
1448 case BT_COMMA:
1449 case BT_SOL:
1450 case BT_EQUALS:
1451 case BT_QUEST:
1452 case BT_CR:
1453 case BT_LF:
1454 case BT_SEMI:
1455 case BT_EXCL:
1456 case BT_AST:
1457 case BT_PERCNT:
1458 case BT_NUM:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001459# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001460 case BT_COLON:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001461# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001462 break;
1463 case BT_S:
1464 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001465 *badPtr = ptr;
1466 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001467 }
1468 break;
1469 case BT_NAME:
1470 case BT_NMSTRT:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001471 if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001472 break;
Benjamin Peterson5033aa72018-09-10 21:04:00 -07001473 /* fall through */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001474 default:
1475 switch (BYTE_TO_ASCII(enc, ptr)) {
1476 case 0x24: /* $ */
1477 case 0x40: /* @ */
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001478 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001479 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001480 *badPtr = ptr;
1481 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001482 }
1483 break;
1484 }
1485 }
1486 return 1;
1487}
1488
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001489/* This must only be called for a well-formed start-tag or empty
1490 element tag. Returns the number of attributes. Pointers to the
1491 first attsMax attributes are stored in atts.
1492*/
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001493
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001494static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001495PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1496 ATTRIBUTE *atts) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001497 enum { other, inName, inValue } state = inName;
1498 int nAtts = 0;
1499 int open = 0; /* defined when state == inValue;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001500 initialization just to shut up compilers */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001501
1502 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1503 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001504# define START_NAME \
1505 if (state == other) { \
1506 if (nAtts < attsMax) { \
1507 atts[nAtts].name = ptr; \
1508 atts[nAtts].normalized = 1; \
1509 } \
1510 state = inName; \
1511 }
1512# define LEAD_CASE(n) \
1513 case BT_LEAD##n: \
1514 START_NAME ptr += (n - MINBPC(enc)); \
1515 break;
1516 LEAD_CASE(2)
1517 LEAD_CASE(3)
1518 LEAD_CASE(4)
1519# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001520 case BT_NONASCII:
1521 case BT_NMSTRT:
1522 case BT_HEX:
1523 START_NAME
1524 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001525# undef START_NAME
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001526 case BT_QUOT:
1527 if (state != inValue) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001528 if (nAtts < attsMax)
1529 atts[nAtts].valuePtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001530 state = inValue;
1531 open = BT_QUOT;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001532 } else if (open == BT_QUOT) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001533 state = other;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001534 if (nAtts < attsMax)
1535 atts[nAtts].valueEnd = ptr;
1536 nAtts++;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001537 }
1538 break;
1539 case BT_APOS:
1540 if (state != inValue) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001541 if (nAtts < attsMax)
1542 atts[nAtts].valuePtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001543 state = inValue;
1544 open = BT_APOS;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001545 } else if (open == BT_APOS) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001546 state = other;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001547 if (nAtts < attsMax)
1548 atts[nAtts].valueEnd = ptr;
1549 nAtts++;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001550 }
1551 break;
1552 case BT_AMP:
1553 if (nAtts < attsMax)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001554 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001555 break;
1556 case BT_S:
1557 if (state == inName)
1558 state = other;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001559 else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001560 && (ptr == atts[nAtts].valuePtr
1561 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1562 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1563 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1564 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001565 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001566 case BT_CR:
1567 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001568 /* This case ensures that the first attribute name is counted
1569 Apart from that we could just change state on the quote. */
1570 if (state == inName)
1571 state = other;
1572 else if (state == inValue && nAtts < attsMax)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001573 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001574 break;
1575 case BT_GT:
1576 case BT_SOL:
1577 if (state != inValue)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001578 return nAtts;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001579 break;
1580 default:
1581 break;
1582 }
1583 }
1584 /* not reached */
1585}
1586
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001587static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001588PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001589 int result = 0;
1590 /* skip &# */
Benjamin Peterson52b94082019-09-25 21:33:58 -07001591 UNUSED_P(enc);
1592 ptr += 2 * MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001593 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001594 for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001595 ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001596 int c = BYTE_TO_ASCII(enc, ptr);
1597 switch (c) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001598 case ASCII_0:
1599 case ASCII_1:
1600 case ASCII_2:
1601 case ASCII_3:
1602 case ASCII_4:
1603 case ASCII_5:
1604 case ASCII_6:
1605 case ASCII_7:
1606 case ASCII_8:
1607 case ASCII_9:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001608 result <<= 4;
1609 result |= (c - ASCII_0);
1610 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001611 case ASCII_A:
1612 case ASCII_B:
1613 case ASCII_C:
1614 case ASCII_D:
1615 case ASCII_E:
1616 case ASCII_F:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001617 result <<= 4;
1618 result += 10 + (c - ASCII_A);
1619 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001620 case ASCII_a:
1621 case ASCII_b:
1622 case ASCII_c:
1623 case ASCII_d:
1624 case ASCII_e:
1625 case ASCII_f:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001626 result <<= 4;
1627 result += 10 + (c - ASCII_a);
1628 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001629 }
1630 if (result >= 0x110000)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001631 return -1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001632 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001633 } else {
1634 for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001635 int c = BYTE_TO_ASCII(enc, ptr);
1636 result *= 10;
1637 result += (c - ASCII_0);
1638 if (result >= 0x110000)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001639 return -1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001640 }
1641 }
1642 return checkCharRefNumber(result);
1643}
1644
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001645static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001646PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1647 const char *end) {
1648 UNUSED_P(enc);
1649 switch ((end - ptr) / MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001650 case 2:
1651 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1652 switch (BYTE_TO_ASCII(enc, ptr)) {
1653 case ASCII_l:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001654 return ASCII_LT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001655 case ASCII_g:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001656 return ASCII_GT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001657 }
1658 }
1659 break;
1660 case 3:
1661 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1662 ptr += MINBPC(enc);
1663 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001664 ptr += MINBPC(enc);
1665 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1666 return ASCII_AMP;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001667 }
1668 }
1669 break;
1670 case 4:
1671 switch (BYTE_TO_ASCII(enc, ptr)) {
1672 case ASCII_q:
1673 ptr += MINBPC(enc);
1674 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001675 ptr += MINBPC(enc);
1676 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1677 ptr += MINBPC(enc);
1678 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1679 return ASCII_QUOT;
1680 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001681 }
1682 break;
1683 case ASCII_a:
1684 ptr += MINBPC(enc);
1685 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001686 ptr += MINBPC(enc);
1687 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1688 ptr += MINBPC(enc);
1689 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1690 return ASCII_APOS;
1691 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001692 }
1693 break;
1694 }
1695 }
1696 return 0;
1697}
1698
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001699static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001700PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1701 const char *end1, const char *ptr2) {
1702 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001703 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
Victor Stinner93d0cb52017-08-18 23:43:54 +02001704 if (end1 - ptr1 < MINBPC(enc)) {
Benjamin Peterson5033aa72018-09-10 21:04:00 -07001705 /* This line cannot be executed. The incoming data has already
1706 * been tokenized once, so incomplete characters like this have
Victor Stinner93d0cb52017-08-18 23:43:54 +02001707 * already been eliminated from the input. Retaining the
1708 * paranoia check is still valuable, however.
1709 */
1710 return 0; /* LCOV_EXCL_LINE */
1711 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001712 if (! CHAR_MATCHES(enc, ptr1, *ptr2))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001713 return 0;
1714 }
1715 return ptr1 == end1;
1716}
1717
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001718static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001719PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001720 const char *start = ptr;
1721 for (;;) {
1722 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001723# define LEAD_CASE(n) \
1724 case BT_LEAD##n: \
1725 ptr += n; \
1726 break;
1727 LEAD_CASE(2)
1728 LEAD_CASE(3)
1729 LEAD_CASE(4)
1730# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001731 case BT_NONASCII:
1732 case BT_NMSTRT:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001733# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001734 case BT_COLON:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001735# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001736 case BT_HEX:
1737 case BT_DIGIT:
1738 case BT_NAME:
1739 case BT_MINUS:
1740 ptr += MINBPC(enc);
1741 break;
1742 default:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001743 return (int)(ptr - start);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001744 }
1745 }
1746}
1747
Benjamin Peterson52b94082019-09-25 21:33:58 -07001748static const char *PTRFASTCALL
1749PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001750 for (;;) {
1751 switch (BYTE_TYPE(enc, ptr)) {
1752 case BT_LF:
1753 case BT_CR:
1754 case BT_S:
1755 ptr += MINBPC(enc);
1756 break;
1757 default:
1758 return ptr;
1759 }
1760 }
1761}
1762
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001763static void PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001764PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1765 POSITION *pos) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001766 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001767 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001768# define LEAD_CASE(n) \
1769 case BT_LEAD##n: \
1770 ptr += n; \
1771 break;
1772 LEAD_CASE(2)
1773 LEAD_CASE(3)
1774 LEAD_CASE(4)
1775# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001776 case BT_LF:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001777 pos->columnNumber = (XML_Size)-1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001778 pos->lineNumber++;
1779 ptr += MINBPC(enc);
1780 break;
1781 case BT_CR:
1782 pos->lineNumber++;
1783 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001784 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001785 ptr += MINBPC(enc);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001786 pos->columnNumber = (XML_Size)-1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001787 break;
1788 default:
1789 ptr += MINBPC(enc);
1790 break;
1791 }
1792 pos->columnNumber++;
1793 }
1794}
1795
Benjamin Peterson52b94082019-09-25 21:33:58 -07001796# undef DO_LEAD_CASE
1797# undef MULTIBYTE_CASES
1798# undef INVALID_CASES
1799# undef CHECK_NAME_CASE
1800# undef CHECK_NAME_CASES
1801# undef CHECK_NMSTRT_CASE
1802# undef CHECK_NMSTRT_CASES
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001803
Gregory P. Smith7c6309c2012-07-14 14:12:35 -07001804#endif /* XML_TOK_IMPL_C */