blob: 0430591b42636edf9bd430d88857dd2e3aabd06a [file] [log] [blame]
/* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
   Copyright (c) 2002 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
   Copyright (c) 2016-2021 Sebastian Pipping <sebastian@pipping.org>
   Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
   Copyright (c) 2018 Benjamin Peterson <benjamin@python.org>
   Copyright (c) 2018 Anton Maklakov <antmak.pub@gmail.com>
   Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
   Copyright (c) 2020 Boris Kolpackov <boris@codesynthesis.com>
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
40
Gregory P. Smith7c6309c2012-07-14 14:12:35 -070041#ifdef XML_TOK_IMPL_C
42
/* Fallback used when the including file did not provide IS_INVALID_CHAR
   (i.e. for UTF-16 and XML_MIN_SIZE not defined): no sequence is ever
   reported as invalid. */
#  if ! defined(IS_INVALID_CHAR)
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif

/* Expands to the `case BT_LEAD<n>:` arm of a switch over a byte type.
   Uses `enc` and `end` from the expansion site in addition to its arguments.
     - fewer than n bytes left           -> return XML_TOK_PARTIAL_CHAR
     - IS_INVALID_CHAR(enc, ptr, n) true -> *nextTokPtr = ptr,
                                            return XML_TOK_INVALID
     - otherwise                         -> advance ptr by n and leave the
                                            switch */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n) {                                                       \
      return XML_TOK_PARTIAL_CHAR;                                             \
    } else if (IS_INVALID_CHAR(enc, ptr, n)) {                                 \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000057
/* Switch arms shared by every scanner that must reject bad input:
   the three multi-byte lead classes go through INVALID_LEAD_CASE, while
   BT_TRAIL, BT_MALFORM and BT_NONXML bytes are reported as XML_TOK_INVALID
   at ptr straight away. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                        \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                        \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                        \
  case BT_TRAIL:                                                               \
  case BT_MALFORM:                                                             \
  case BT_NONXML:                                                              \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
67
/* Expands to the `case BT_LEAD<n>:` arm used while scanning a name.
     - fewer than n bytes left -> return XML_TOK_PARTIAL_CHAR
     - the n-byte sequence is malformed (IS_INVALID_CHAR) or is not a name
       character (IS_NAME_CHAR) -> *nextTokPtr = ptr, return XML_TOK_INVALID
     - otherwise               -> advance ptr by n and leave the switch

   The IS_INVALID_CHAR test must come first: IS_NAME_CHAR only classifies the
   sequence and must not be trusted to reject malformed ones. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000078
/* Switch arms that consume one name character at ptr.
   A BT_NONASCII unit is first checked with IS_NAME_CHAR_MINBPC and reported
   as XML_TOK_INVALID at ptr when it fails; it then shares the single-unit
   advance with the plain name byte types. Multi-byte lead classes are
   delegated to CHECK_NAME_CASE. Every accepting arm ends by leaving the
   switch. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_MINUS:                                                               \
  case BT_NAME:                                                                \
  case BT_DIGIT:                                                               \
  case BT_HEX:                                                                 \
  case BT_NMSTRT:                                                              \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000096
/* Expands to the `case BT_LEAD<n>:` arm used for the first character of a
   name.
     - fewer than n bytes left -> return XML_TOK_PARTIAL_CHAR
     - the n-byte sequence is malformed (IS_INVALID_CHAR) or is not a
       name-start character (IS_NMSTRT_CHAR)
                               -> *nextTokPtr = ptr, return XML_TOK_INVALID
     - otherwise               -> advance ptr by n and leave the switch

   The IS_INVALID_CHAR test must come first: IS_NMSTRT_CHAR only classifies
   the sequence and must not be trusted to reject malformed ones. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000107
/* Switch arms that consume one name-start character at ptr.
   A BT_NONASCII unit is first checked with IS_NMSTRT_CHAR_MINBPC and
   reported as XML_TOK_INVALID at ptr when it fails; it then shares the
   single-unit advance with BT_HEX and BT_NMSTRT. Multi-byte lead classes are
   delegated to CHECK_NMSTRT_CASE. Every accepting arm ends by leaving the
   switch. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_HEX:                                                                 \
  case BT_NMSTRT:                                                              \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000122
/* Default name decoration: when the including file supplies no PREFIX,
   function names are used as written. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif

/* True when at least `count` minimum-size code units (MINBPC(enc) bytes
   each) lie between ptr and end. Every argument is parenthesized so that
   expressions such as `ptr + 1` or `n + 1` expand with the intended
   precedence. */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* Single-unit form of HAS_CHARS. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count` units
   are available. Expands to a brace block; expansion sites follow it with
   a semicolon. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    {                                                                          \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    }

/* Single-unit form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200139
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000140/* ptr points to character following "<!-" */
141
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000142static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700143PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
144 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200145 if (HAS_CHAR(enc, ptr, end)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700146 if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000147 *nextTokPtr = ptr;
148 return XML_TOK_INVALID;
149 }
150 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200151 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000152 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700153 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000154 case BT_MINUS:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200155 ptr += MINBPC(enc);
156 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000157 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200158 ptr += MINBPC(enc);
159 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700160 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000161 *nextTokPtr = ptr;
162 return XML_TOK_INVALID;
163 }
164 *nextTokPtr = ptr + MINBPC(enc);
165 return XML_TOK_COMMENT;
166 }
167 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000168 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000169 ptr += MINBPC(enc);
170 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000171 }
172 }
173 }
174 return XML_TOK_PARTIAL;
175}
176
177/* ptr points to character following "<!" */
178
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000179static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700180PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
181 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200182 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000183 switch (BYTE_TYPE(enc, ptr)) {
184 case BT_MINUS:
185 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
186 case BT_LSQB:
187 *nextTokPtr = ptr + MINBPC(enc);
188 return XML_TOK_COND_SECT_OPEN;
189 case BT_NMSTRT:
190 case BT_HEX:
191 ptr += MINBPC(enc);
192 break;
193 default:
194 *nextTokPtr = ptr;
195 return XML_TOK_INVALID;
196 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200197 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000198 switch (BYTE_TYPE(enc, ptr)) {
199 case BT_PERCNT:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200200 REQUIRE_CHARS(enc, ptr, end, 2);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000201 /* don't allow <!ENTITY% foo "whatever"> */
202 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700203 case BT_S:
204 case BT_CR:
205 case BT_LF:
206 case BT_PERCNT:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000207 *nextTokPtr = ptr;
208 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000209 }
210 /* fall through */
Benjamin Peterson52b94082019-09-25 21:33:58 -0700211 case BT_S:
212 case BT_CR:
213 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000214 *nextTokPtr = ptr;
215 return XML_TOK_DECL_OPEN;
216 case BT_NMSTRT:
217 case BT_HEX:
218 ptr += MINBPC(enc);
219 break;
220 default:
221 *nextTokPtr = ptr;
222 return XML_TOK_INVALID;
223 }
224 }
225 return XML_TOK_PARTIAL;
226}
227
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000228static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700229PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
230 int *tokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000231 int upper = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700232 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000233 *tokPtr = XML_TOK_PI;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700234 if (end - ptr != MINBPC(enc) * 3)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000235 return 1;
236 switch (BYTE_TO_ASCII(enc, ptr)) {
237 case ASCII_x:
238 break;
239 case ASCII_X:
240 upper = 1;
241 break;
242 default:
243 return 1;
244 }
245 ptr += MINBPC(enc);
246 switch (BYTE_TO_ASCII(enc, ptr)) {
247 case ASCII_m:
248 break;
249 case ASCII_M:
250 upper = 1;
251 break;
252 default:
253 return 1;
254 }
255 ptr += MINBPC(enc);
256 switch (BYTE_TO_ASCII(enc, ptr)) {
257 case ASCII_l:
258 break;
259 case ASCII_L:
260 upper = 1;
261 break;
262 default:
263 return 1;
264 }
265 if (upper)
266 return 0;
267 *tokPtr = XML_TOK_XML_DECL;
268 return 1;
269}
270
271/* ptr points to character following "<?" */
272
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000273static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700274PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
275 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000276 int tok;
277 const char *target = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200278 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000279 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700280 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000281 default:
282 *nextTokPtr = ptr;
283 return XML_TOK_INVALID;
284 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200285 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000286 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700287 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
288 case BT_S:
289 case BT_CR:
290 case BT_LF:
291 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000292 *nextTokPtr = ptr;
293 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000294 }
295 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200296 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000297 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700298 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000299 case BT_QUEST:
300 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200301 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000302 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
303 *nextTokPtr = ptr + MINBPC(enc);
304 return tok;
305 }
306 break;
307 default:
308 ptr += MINBPC(enc);
309 break;
310 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000311 }
312 return XML_TOK_PARTIAL;
313 case BT_QUEST:
Benjamin Peterson52b94082019-09-25 21:33:58 -0700314 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000315 *nextTokPtr = ptr;
316 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000317 }
318 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200319 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000320 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000321 *nextTokPtr = ptr + MINBPC(enc);
322 return tok;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000323 }
324 /* fall through */
325 default:
326 *nextTokPtr = ptr;
327 return XML_TOK_INVALID;
328 }
329 }
330 return XML_TOK_PARTIAL;
331}
332
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000333static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700334PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
335 const char **nextTokPtr) {
336 static const char CDATA_LSQB[]
337 = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000338 int i;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700339 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000340 /* CDATA[ */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200341 REQUIRE_CHARS(enc, ptr, end, 6);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000342 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700343 if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000344 *nextTokPtr = ptr;
345 return XML_TOK_INVALID;
346 }
347 }
348 *nextTokPtr = ptr;
349 return XML_TOK_CDATA_SECT_OPEN;
350}
351
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000352static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700353PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
354 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200355 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000356 return XML_TOK_NONE;
357 if (MINBPC(enc) > 1) {
358 size_t n = end - ptr;
359 if (n & (MINBPC(enc) - 1)) {
360 n &= ~(MINBPC(enc) - 1);
361 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000362 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000363 end = ptr + n;
364 }
365 }
366 switch (BYTE_TYPE(enc, ptr)) {
367 case BT_RSQB:
368 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200369 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700370 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000371 break;
372 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200373 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700374 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000375 ptr -= MINBPC(enc);
376 break;
377 }
378 *nextTokPtr = ptr + MINBPC(enc);
379 return XML_TOK_CDATA_SECT_CLOSE;
380 case BT_CR:
381 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200382 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000383 if (BYTE_TYPE(enc, ptr) == BT_LF)
384 ptr += MINBPC(enc);
385 *nextTokPtr = ptr;
386 return XML_TOK_DATA_NEWLINE;
387 case BT_LF:
388 *nextTokPtr = ptr + MINBPC(enc);
389 return XML_TOK_DATA_NEWLINE;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700390 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000391 default:
392 ptr += MINBPC(enc);
393 break;
394 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200395 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000396 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700397# define LEAD_CASE(n) \
398 case BT_LEAD##n: \
399 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
400 *nextTokPtr = ptr; \
401 return XML_TOK_DATA_CHARS; \
402 } \
403 ptr += n; \
404 break;
405 LEAD_CASE(2)
406 LEAD_CASE(3)
407 LEAD_CASE(4)
408# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000409 case BT_NONXML:
410 case BT_MALFORM:
411 case BT_TRAIL:
412 case BT_CR:
413 case BT_LF:
414 case BT_RSQB:
415 *nextTokPtr = ptr;
416 return XML_TOK_DATA_CHARS;
417 default:
418 ptr += MINBPC(enc);
419 break;
420 }
421 }
422 *nextTokPtr = ptr;
423 return XML_TOK_DATA_CHARS;
424}
425
426/* ptr points to character following "</" */
427
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000428static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700429PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
430 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200431 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000432 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700433 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000434 default:
435 *nextTokPtr = ptr;
436 return XML_TOK_INVALID;
437 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200438 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000439 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700440 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
441 case BT_S:
442 case BT_CR:
443 case BT_LF:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200444 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000445 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700446 case BT_S:
447 case BT_CR:
448 case BT_LF:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000449 break;
450 case BT_GT:
451 *nextTokPtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000452 return XML_TOK_END_TAG;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000453 default:
454 *nextTokPtr = ptr;
455 return XML_TOK_INVALID;
456 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000457 }
458 return XML_TOK_PARTIAL;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700459# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000460 case BT_COLON:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000461 /* no need to check qname syntax here,
462 since end-tag must match exactly */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000463 ptr += MINBPC(enc);
464 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700465# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000466 case BT_GT:
467 *nextTokPtr = ptr + MINBPC(enc);
468 return XML_TOK_END_TAG;
469 default:
470 *nextTokPtr = ptr;
471 return XML_TOK_INVALID;
472 }
473 }
474 return XML_TOK_PARTIAL;
475}
476
477/* ptr points to character following "&#X" */
478
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000479static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700480PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
481 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200482 if (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000483 switch (BYTE_TYPE(enc, ptr)) {
484 case BT_DIGIT:
485 case BT_HEX:
486 break;
487 default:
488 *nextTokPtr = ptr;
489 return XML_TOK_INVALID;
490 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200491 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000492 switch (BYTE_TYPE(enc, ptr)) {
493 case BT_DIGIT:
494 case BT_HEX:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000495 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000496 case BT_SEMI:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000497 *nextTokPtr = ptr + MINBPC(enc);
498 return XML_TOK_CHAR_REF;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000499 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000500 *nextTokPtr = ptr;
501 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000502 }
503 }
504 }
505 return XML_TOK_PARTIAL;
506}
507
508/* ptr points to character following "&#" */
509
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000510static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700511PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
512 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200513 if (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000514 if (CHAR_MATCHES(enc, ptr, ASCII_x))
515 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
516 switch (BYTE_TYPE(enc, ptr)) {
517 case BT_DIGIT:
518 break;
519 default:
520 *nextTokPtr = ptr;
521 return XML_TOK_INVALID;
522 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200523 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000524 switch (BYTE_TYPE(enc, ptr)) {
525 case BT_DIGIT:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000526 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000527 case BT_SEMI:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000528 *nextTokPtr = ptr + MINBPC(enc);
529 return XML_TOK_CHAR_REF;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000530 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000531 *nextTokPtr = ptr;
532 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000533 }
534 }
535 }
536 return XML_TOK_PARTIAL;
537}
538
539/* ptr points to character following "&" */
540
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000541static int PTRCALL
542PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700543 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200544 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000545 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700546 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000547 case BT_NUM:
548 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
549 default:
550 *nextTokPtr = ptr;
551 return XML_TOK_INVALID;
552 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200553 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000554 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700555 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000556 case BT_SEMI:
557 *nextTokPtr = ptr + MINBPC(enc);
558 return XML_TOK_ENTITY_REF;
559 default:
560 *nextTokPtr = ptr;
561 return XML_TOK_INVALID;
562 }
563 }
564 return XML_TOK_PARTIAL;
565}
566
567/* ptr points to character following first character of attribute name */
568
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000569static int PTRCALL
570PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700571 const char **nextTokPtr) {
572# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000573 int hadColon = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700574# endif
Victor Stinner23ec4b52017-06-15 00:54:36 +0200575 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000576 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700577 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
578# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000579 case BT_COLON:
580 if (hadColon) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000581 *nextTokPtr = ptr;
582 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000583 }
584 hadColon = 1;
585 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200586 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000587 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700588 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000589 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000590 *nextTokPtr = ptr;
591 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000592 }
593 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700594# endif
595 case BT_S:
596 case BT_CR:
597 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000598 for (;;) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000599 int t;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000600
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000601 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200602 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000603 t = BYTE_TYPE(enc, ptr);
604 if (t == BT_EQUALS)
605 break;
606 switch (t) {
607 case BT_S:
608 case BT_LF:
609 case BT_CR:
610 break;
611 default:
612 *nextTokPtr = ptr;
613 return XML_TOK_INVALID;
614 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000615 }
Benjamin Peterson5033aa72018-09-10 21:04:00 -0700616 /* fall through */
Benjamin Peterson52b94082019-09-25 21:33:58 -0700617 case BT_EQUALS: {
618 int open;
619# ifdef XML_NS
620 hadColon = 0;
621# endif
622 for (;;) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000623 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200624 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700625 open = BYTE_TYPE(enc, ptr);
626 if (open == BT_QUOT || open == BT_APOS)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000627 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700628 switch (open) {
629 case BT_S:
630 case BT_LF:
631 case BT_CR:
632 break;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000633 default:
634 *nextTokPtr = ptr;
635 return XML_TOK_INVALID;
636 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700637 }
638 ptr += MINBPC(enc);
639 /* in attribute value */
640 for (;;) {
641 int t;
642 REQUIRE_CHAR(enc, ptr, end);
643 t = BYTE_TYPE(enc, ptr);
644 if (t == open)
645 break;
646 switch (t) {
647 INVALID_CASES(ptr, nextTokPtr)
648 case BT_AMP: {
649 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
650 if (tok <= 0) {
651 if (tok == XML_TOK_INVALID)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000652 *nextTokPtr = ptr;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700653 return tok;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000654 }
655 break;
656 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700657 case BT_LT:
658 *nextTokPtr = ptr;
659 return XML_TOK_INVALID;
660 default:
661 ptr += MINBPC(enc);
662 break;
663 }
664 }
665 ptr += MINBPC(enc);
666 REQUIRE_CHAR(enc, ptr, end);
667 switch (BYTE_TYPE(enc, ptr)) {
668 case BT_S:
669 case BT_CR:
670 case BT_LF:
671 break;
672 case BT_SOL:
673 goto sol;
674 case BT_GT:
675 goto gt;
676 default:
677 *nextTokPtr = ptr;
678 return XML_TOK_INVALID;
679 }
680 /* ptr points to closing quote */
681 for (;;) {
682 ptr += MINBPC(enc);
683 REQUIRE_CHAR(enc, ptr, end);
684 switch (BYTE_TYPE(enc, ptr)) {
685 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
686 case BT_S:
687 case BT_CR:
688 case BT_LF:
689 continue;
690 case BT_GT:
691 gt:
692 *nextTokPtr = ptr + MINBPC(enc);
693 return XML_TOK_START_TAG_WITH_ATTS;
694 case BT_SOL:
695 sol:
696 ptr += MINBPC(enc);
697 REQUIRE_CHAR(enc, ptr, end);
698 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
699 *nextTokPtr = ptr;
700 return XML_TOK_INVALID;
701 }
702 *nextTokPtr = ptr + MINBPC(enc);
703 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
704 default:
705 *nextTokPtr = ptr;
706 return XML_TOK_INVALID;
707 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000708 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000709 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700710 break;
711 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000712 default:
713 *nextTokPtr = ptr;
714 return XML_TOK_INVALID;
715 }
716 }
717 return XML_TOK_PARTIAL;
718}
719
720/* ptr points to character following "<" */
721
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000722static int PTRCALL
723PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700724 const char **nextTokPtr) {
725# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000726 int hadColon;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700727# endif
Victor Stinner23ec4b52017-06-15 00:54:36 +0200728 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000729 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700730 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000731 case BT_EXCL:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200732 ptr += MINBPC(enc);
733 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000734 switch (BYTE_TYPE(enc, ptr)) {
735 case BT_MINUS:
736 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
737 case BT_LSQB:
Benjamin Peterson52b94082019-09-25 21:33:58 -0700738 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000739 }
740 *nextTokPtr = ptr;
741 return XML_TOK_INVALID;
742 case BT_QUEST:
743 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
744 case BT_SOL:
745 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
746 default:
747 *nextTokPtr = ptr;
748 return XML_TOK_INVALID;
749 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700750# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000751 hadColon = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700752# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000753 /* we have a start-tag */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200754 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000755 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700756 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
757# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000758 case BT_COLON:
759 if (hadColon) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000760 *nextTokPtr = ptr;
761 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000762 }
763 hadColon = 1;
764 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200765 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000766 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700767 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000768 default:
769 *nextTokPtr = ptr;
770 return XML_TOK_INVALID;
771 }
772 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700773# endif
774 case BT_S:
775 case BT_CR:
776 case BT_LF: {
777 ptr += MINBPC(enc);
778 while (HAS_CHAR(enc, ptr, end)) {
779 switch (BYTE_TYPE(enc, ptr)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000780 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Benjamin Peterson52b94082019-09-25 21:33:58 -0700781 case BT_GT:
782 goto gt;
783 case BT_SOL:
784 goto sol;
785 case BT_S:
786 case BT_CR:
787 case BT_LF:
788 ptr += MINBPC(enc);
789 continue;
790 default:
791 *nextTokPtr = ptr;
792 return XML_TOK_INVALID;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000793 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700794 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000795 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700796 return XML_TOK_PARTIAL;
797 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000798 case BT_GT:
799 gt:
800 *nextTokPtr = ptr + MINBPC(enc);
801 return XML_TOK_START_TAG_NO_ATTS;
802 case BT_SOL:
803 sol:
804 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200805 REQUIRE_CHAR(enc, ptr, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -0700806 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000807 *nextTokPtr = ptr;
808 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000809 }
810 *nextTokPtr = ptr + MINBPC(enc);
811 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
812 default:
813 *nextTokPtr = ptr;
814 return XML_TOK_INVALID;
815 }
816 }
817 return XML_TOK_PARTIAL;
818}
819
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000820static int PTRCALL
821PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700822 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200823 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000824 return XML_TOK_NONE;
825 if (MINBPC(enc) > 1) {
826 size_t n = end - ptr;
827 if (n & (MINBPC(enc) - 1)) {
828 n &= ~(MINBPC(enc) - 1);
829 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000830 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000831 end = ptr + n;
832 }
833 }
834 switch (BYTE_TYPE(enc, ptr)) {
835 case BT_LT:
836 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
837 case BT_AMP:
838 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
839 case BT_CR:
840 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200841 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000842 return XML_TOK_TRAILING_CR;
843 if (BYTE_TYPE(enc, ptr) == BT_LF)
844 ptr += MINBPC(enc);
845 *nextTokPtr = ptr;
846 return XML_TOK_DATA_NEWLINE;
847 case BT_LF:
848 *nextTokPtr = ptr + MINBPC(enc);
849 return XML_TOK_DATA_NEWLINE;
850 case BT_RSQB:
851 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200852 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000853 return XML_TOK_TRAILING_RSQB;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700854 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000855 break;
856 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200857 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000858 return XML_TOK_TRAILING_RSQB;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700859 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000860 ptr -= MINBPC(enc);
861 break;
862 }
863 *nextTokPtr = ptr;
864 return XML_TOK_INVALID;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700865 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000866 default:
867 ptr += MINBPC(enc);
868 break;
869 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200870 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000871 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700872# define LEAD_CASE(n) \
873 case BT_LEAD##n: \
874 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
875 *nextTokPtr = ptr; \
876 return XML_TOK_DATA_CHARS; \
877 } \
878 ptr += n; \
879 break;
880 LEAD_CASE(2)
881 LEAD_CASE(3)
882 LEAD_CASE(4)
883# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000884 case BT_RSQB:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200885 if (HAS_CHARS(enc, ptr, end, 2)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700886 if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
887 ptr += MINBPC(enc);
888 break;
889 }
890 if (HAS_CHARS(enc, ptr, end, 3)) {
891 if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
892 ptr += MINBPC(enc);
893 break;
894 }
895 *nextTokPtr = ptr + 2 * MINBPC(enc);
896 return XML_TOK_INVALID;
897 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000898 }
899 /* fall through */
900 case BT_AMP:
901 case BT_LT:
902 case BT_NONXML:
903 case BT_MALFORM:
904 case BT_TRAIL:
905 case BT_CR:
906 case BT_LF:
907 *nextTokPtr = ptr;
908 return XML_TOK_DATA_CHARS;
909 default:
910 ptr += MINBPC(enc);
911 break;
912 }
913 }
914 *nextTokPtr = ptr;
915 return XML_TOK_DATA_CHARS;
916}
917
918/* ptr points to character following "%" */
919
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000920static int PTRCALL
921PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700922 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200923 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000924 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700925 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
926 case BT_S:
927 case BT_LF:
928 case BT_CR:
929 case BT_PERCNT:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000930 *nextTokPtr = ptr;
931 return XML_TOK_PERCENT;
932 default:
933 *nextTokPtr = ptr;
934 return XML_TOK_INVALID;
935 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200936 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000937 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700938 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000939 case BT_SEMI:
940 *nextTokPtr = ptr + MINBPC(enc);
941 return XML_TOK_PARAM_ENTITY_REF;
942 default:
943 *nextTokPtr = ptr;
944 return XML_TOK_INVALID;
945 }
946 }
947 return XML_TOK_PARTIAL;
948}
949
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000950static int PTRCALL
951PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -0700952 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200953 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000954 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700955 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000956 default:
957 *nextTokPtr = ptr;
958 return XML_TOK_INVALID;
959 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200960 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000961 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700962 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
963 case BT_CR:
964 case BT_LF:
965 case BT_S:
966 case BT_RPAR:
967 case BT_GT:
968 case BT_PERCNT:
969 case BT_VERBAR:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000970 *nextTokPtr = ptr;
971 return XML_TOK_POUND_NAME;
972 default:
973 *nextTokPtr = ptr;
974 return XML_TOK_INVALID;
975 }
976 }
977 return -XML_TOK_POUND_NAME;
978}
979
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000980static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700981PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
982 const char **nextTokPtr) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200983 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000984 int t = BYTE_TYPE(enc, ptr);
985 switch (t) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700986 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000987 case BT_QUOT:
988 case BT_APOS:
989 ptr += MINBPC(enc);
990 if (t != open)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000991 break;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200992 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000993 return -XML_TOK_LITERAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000994 *nextTokPtr = ptr;
995 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700996 case BT_S:
997 case BT_CR:
998 case BT_LF:
999 case BT_GT:
1000 case BT_PERCNT:
1001 case BT_LSQB:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001002 return XML_TOK_LITERAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001003 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001004 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001005 }
1006 default:
1007 ptr += MINBPC(enc);
1008 break;
1009 }
1010 }
1011 return XML_TOK_PARTIAL;
1012}
1013
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001014static int PTRCALL
1015PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -07001016 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001017 int tok;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001018 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001019 return XML_TOK_NONE;
1020 if (MINBPC(enc) > 1) {
1021 size_t n = end - ptr;
1022 if (n & (MINBPC(enc) - 1)) {
1023 n &= ~(MINBPC(enc) - 1);
1024 if (n == 0)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001025 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001026 end = ptr + n;
1027 }
1028 }
1029 switch (BYTE_TYPE(enc, ptr)) {
1030 case BT_QUOT:
1031 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1032 case BT_APOS:
1033 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
Benjamin Peterson52b94082019-09-25 21:33:58 -07001034 case BT_LT: {
1035 ptr += MINBPC(enc);
1036 REQUIRE_CHAR(enc, ptr, end);
1037 switch (BYTE_TYPE(enc, ptr)) {
1038 case BT_EXCL:
1039 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1040 case BT_QUEST:
1041 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1042 case BT_NMSTRT:
1043 case BT_HEX:
1044 case BT_NONASCII:
1045 case BT_LEAD2:
1046 case BT_LEAD3:
1047 case BT_LEAD4:
1048 *nextTokPtr = ptr - MINBPC(enc);
1049 return XML_TOK_INSTANCE_START;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001050 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001051 *nextTokPtr = ptr;
1052 return XML_TOK_INVALID;
1053 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001054 case BT_CR:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001055 if (ptr + MINBPC(enc) == end) {
1056 *nextTokPtr = end;
1057 /* indicate that this might be part of a CR/LF pair */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001058 return -XML_TOK_PROLOG_S;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001059 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001060 /* fall through */
Benjamin Peterson52b94082019-09-25 21:33:58 -07001061 case BT_S:
1062 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001063 for (;;) {
1064 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001065 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001066 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001067 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001068 case BT_S:
1069 case BT_LF:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001070 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001071 case BT_CR:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001072 /* don't split CR/LF pair */
1073 if (ptr + MINBPC(enc) != end)
1074 break;
1075 /* fall through */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001076 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001077 *nextTokPtr = ptr;
1078 return XML_TOK_PROLOG_S;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001079 }
1080 }
1081 *nextTokPtr = ptr;
1082 return XML_TOK_PROLOG_S;
1083 case BT_PERCNT:
1084 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1085 case BT_COMMA:
1086 *nextTokPtr = ptr + MINBPC(enc);
1087 return XML_TOK_COMMA;
1088 case BT_LSQB:
1089 *nextTokPtr = ptr + MINBPC(enc);
1090 return XML_TOK_OPEN_BRACKET;
1091 case BT_RSQB:
1092 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001093 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001094 return -XML_TOK_CLOSE_BRACKET;
1095 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001096 REQUIRE_CHARS(enc, ptr, end, 2);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001097 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001098 *nextTokPtr = ptr + 2 * MINBPC(enc);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001099 return XML_TOK_COND_SECT_CLOSE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001100 }
1101 }
1102 *nextTokPtr = ptr;
1103 return XML_TOK_CLOSE_BRACKET;
1104 case BT_LPAR:
1105 *nextTokPtr = ptr + MINBPC(enc);
1106 return XML_TOK_OPEN_PAREN;
1107 case BT_RPAR:
1108 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001109 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001110 return -XML_TOK_CLOSE_PAREN;
1111 switch (BYTE_TYPE(enc, ptr)) {
1112 case BT_AST:
1113 *nextTokPtr = ptr + MINBPC(enc);
1114 return XML_TOK_CLOSE_PAREN_ASTERISK;
1115 case BT_QUEST:
1116 *nextTokPtr = ptr + MINBPC(enc);
1117 return XML_TOK_CLOSE_PAREN_QUESTION;
1118 case BT_PLUS:
1119 *nextTokPtr = ptr + MINBPC(enc);
1120 return XML_TOK_CLOSE_PAREN_PLUS;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001121 case BT_CR:
1122 case BT_LF:
1123 case BT_S:
1124 case BT_GT:
1125 case BT_COMMA:
1126 case BT_VERBAR:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001127 case BT_RPAR:
1128 *nextTokPtr = ptr;
1129 return XML_TOK_CLOSE_PAREN;
1130 }
1131 *nextTokPtr = ptr;
1132 return XML_TOK_INVALID;
1133 case BT_VERBAR:
1134 *nextTokPtr = ptr + MINBPC(enc);
1135 return XML_TOK_OR;
1136 case BT_GT:
1137 *nextTokPtr = ptr + MINBPC(enc);
1138 return XML_TOK_DECL_CLOSE;
1139 case BT_NUM:
1140 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Benjamin Peterson52b94082019-09-25 21:33:58 -07001141# define LEAD_CASE(n) \
1142 case BT_LEAD##n: \
1143 if (end - ptr < n) \
1144 return XML_TOK_PARTIAL_CHAR; \
1145 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1146 ptr += n; \
1147 tok = XML_TOK_NAME; \
1148 break; \
1149 } \
1150 if (IS_NAME_CHAR(enc, ptr, n)) { \
1151 ptr += n; \
1152 tok = XML_TOK_NMTOKEN; \
1153 break; \
1154 } \
1155 *nextTokPtr = ptr; \
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001156 return XML_TOK_INVALID;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001157 LEAD_CASE(2)
1158 LEAD_CASE(3)
1159 LEAD_CASE(4)
1160# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001161 case BT_NMSTRT:
1162 case BT_HEX:
1163 tok = XML_TOK_NAME;
1164 ptr += MINBPC(enc);
1165 break;
1166 case BT_DIGIT:
1167 case BT_NAME:
1168 case BT_MINUS:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001169# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001170 case BT_COLON:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001171# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001172 tok = XML_TOK_NMTOKEN;
1173 ptr += MINBPC(enc);
1174 break;
1175 case BT_NONASCII:
1176 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1177 ptr += MINBPC(enc);
1178 tok = XML_TOK_NAME;
1179 break;
1180 }
1181 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1182 ptr += MINBPC(enc);
1183 tok = XML_TOK_NMTOKEN;
1184 break;
1185 }
1186 /* fall through */
1187 default:
1188 *nextTokPtr = ptr;
1189 return XML_TOK_INVALID;
1190 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001191 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001192 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001193 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1194 case BT_GT:
1195 case BT_RPAR:
1196 case BT_COMMA:
1197 case BT_VERBAR:
1198 case BT_LSQB:
1199 case BT_PERCNT:
1200 case BT_S:
1201 case BT_CR:
1202 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001203 *nextTokPtr = ptr;
1204 return tok;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001205# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001206 case BT_COLON:
1207 ptr += MINBPC(enc);
1208 switch (tok) {
1209 case XML_TOK_NAME:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001210 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001211 tok = XML_TOK_PREFIXED_NAME;
1212 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001213 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001214 default:
1215 tok = XML_TOK_NMTOKEN;
1216 break;
1217 }
1218 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001219 case XML_TOK_PREFIXED_NAME:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001220 tok = XML_TOK_NMTOKEN;
1221 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001222 }
1223 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001224# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001225 case BT_PLUS:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001226 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001227 *nextTokPtr = ptr;
1228 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001229 }
1230 *nextTokPtr = ptr + MINBPC(enc);
1231 return XML_TOK_NAME_PLUS;
1232 case BT_AST:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001233 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001234 *nextTokPtr = ptr;
1235 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001236 }
1237 *nextTokPtr = ptr + MINBPC(enc);
1238 return XML_TOK_NAME_ASTERISK;
1239 case BT_QUEST:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001240 if (tok == XML_TOK_NMTOKEN) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001241 *nextTokPtr = ptr;
1242 return XML_TOK_INVALID;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001243 }
1244 *nextTokPtr = ptr + MINBPC(enc);
1245 return XML_TOK_NAME_QUESTION;
1246 default:
1247 *nextTokPtr = ptr;
1248 return XML_TOK_INVALID;
1249 }
1250 }
1251 return -tok;
1252}
1253
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001254static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001255PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1256 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001257 const char *start;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001258 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001259 return XML_TOK_NONE;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001260 else if (! HAS_CHAR(enc, ptr, end)) {
1261 /* This line cannot be executed. The incoming data has already
1262 * been tokenized once, so incomplete characters like this have
1263 * already been eliminated from the input. Retaining the paranoia
1264 * check is still valuable, however.
1265 */
1266 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1267 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001268 start = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001269 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001270 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001271# define LEAD_CASE(n) \
1272 case BT_LEAD##n: \
1273 ptr += n; \
1274 break;
1275 LEAD_CASE(2)
1276 LEAD_CASE(3)
1277 LEAD_CASE(4)
1278# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001279 case BT_AMP:
1280 if (ptr == start)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001281 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001282 *nextTokPtr = ptr;
1283 return XML_TOK_DATA_CHARS;
1284 case BT_LT:
1285 /* this is for inside entity references */
1286 *nextTokPtr = ptr;
1287 return XML_TOK_INVALID;
1288 case BT_LF:
1289 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001290 *nextTokPtr = ptr + MINBPC(enc);
1291 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001292 }
1293 *nextTokPtr = ptr;
1294 return XML_TOK_DATA_CHARS;
1295 case BT_CR:
1296 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001297 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001298 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001299 return XML_TOK_TRAILING_CR;
1300 if (BYTE_TYPE(enc, ptr) == BT_LF)
1301 ptr += MINBPC(enc);
1302 *nextTokPtr = ptr;
1303 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001304 }
1305 *nextTokPtr = ptr;
1306 return XML_TOK_DATA_CHARS;
1307 case BT_S:
1308 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001309 *nextTokPtr = ptr + MINBPC(enc);
1310 return XML_TOK_ATTRIBUTE_VALUE_S;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001311 }
1312 *nextTokPtr = ptr;
1313 return XML_TOK_DATA_CHARS;
1314 default:
1315 ptr += MINBPC(enc);
1316 break;
1317 }
1318 }
1319 *nextTokPtr = ptr;
1320 return XML_TOK_DATA_CHARS;
1321}
1322
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001323static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001324PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1325 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001326 const char *start;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001327 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001328 return XML_TOK_NONE;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001329 else if (! HAS_CHAR(enc, ptr, end)) {
1330 /* This line cannot be executed. The incoming data has already
1331 * been tokenized once, so incomplete characters like this have
1332 * already been eliminated from the input. Retaining the paranoia
1333 * check is still valuable, however.
1334 */
1335 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1336 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001337 start = ptr;
Victor Stinner23ec4b52017-06-15 00:54:36 +02001338 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001339 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001340# define LEAD_CASE(n) \
1341 case BT_LEAD##n: \
1342 ptr += n; \
1343 break;
1344 LEAD_CASE(2)
1345 LEAD_CASE(3)
1346 LEAD_CASE(4)
1347# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001348 case BT_AMP:
1349 if (ptr == start)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001350 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001351 *nextTokPtr = ptr;
1352 return XML_TOK_DATA_CHARS;
1353 case BT_PERCNT:
1354 if (ptr == start) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001355 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001356 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001357 }
1358 *nextTokPtr = ptr;
1359 return XML_TOK_DATA_CHARS;
1360 case BT_LF:
1361 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001362 *nextTokPtr = ptr + MINBPC(enc);
1363 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001364 }
1365 *nextTokPtr = ptr;
1366 return XML_TOK_DATA_CHARS;
1367 case BT_CR:
1368 if (ptr == start) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001369 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001370 if (! HAS_CHAR(enc, ptr, end))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001371 return XML_TOK_TRAILING_CR;
1372 if (BYTE_TYPE(enc, ptr) == BT_LF)
1373 ptr += MINBPC(enc);
1374 *nextTokPtr = ptr;
1375 return XML_TOK_DATA_NEWLINE;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001376 }
1377 *nextTokPtr = ptr;
1378 return XML_TOK_DATA_CHARS;
1379 default:
1380 ptr += MINBPC(enc);
1381 break;
1382 }
1383 }
1384 *nextTokPtr = ptr;
1385 return XML_TOK_DATA_CHARS;
1386}
1387
Benjamin Peterson52b94082019-09-25 21:33:58 -07001388# ifdef XML_DTD
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001389
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001390static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001391PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1392 const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001393 int level = 0;
1394 if (MINBPC(enc) > 1) {
1395 size_t n = end - ptr;
1396 if (n & (MINBPC(enc) - 1)) {
1397 n &= ~(MINBPC(enc) - 1);
1398 end = ptr + n;
1399 }
1400 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001401 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001402 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001403 INVALID_CASES(ptr, nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001404 case BT_LT:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001405 ptr += MINBPC(enc);
1406 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001407 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001408 ptr += MINBPC(enc);
1409 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001410 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1411 ++level;
1412 ptr += MINBPC(enc);
1413 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001414 }
1415 break;
1416 case BT_RSQB:
Victor Stinner23ec4b52017-06-15 00:54:36 +02001417 ptr += MINBPC(enc);
1418 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001419 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001420 ptr += MINBPC(enc);
1421 REQUIRE_CHAR(enc, ptr, end);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001422 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1423 ptr += MINBPC(enc);
1424 if (level == 0) {
1425 *nextTokPtr = ptr;
1426 return XML_TOK_IGNORE_SECT;
1427 }
1428 --level;
1429 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001430 }
1431 break;
1432 default:
1433 ptr += MINBPC(enc);
1434 break;
1435 }
1436 }
1437 return XML_TOK_PARTIAL;
1438}
1439
Benjamin Peterson52b94082019-09-25 21:33:58 -07001440# endif /* XML_DTD */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001441
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001442static int PTRCALL
1443PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
Benjamin Peterson52b94082019-09-25 21:33:58 -07001444 const char **badPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001445 ptr += MINBPC(enc);
1446 end -= MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001447 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001448 switch (BYTE_TYPE(enc, ptr)) {
1449 case BT_DIGIT:
1450 case BT_HEX:
1451 case BT_MINUS:
1452 case BT_APOS:
1453 case BT_LPAR:
1454 case BT_RPAR:
1455 case BT_PLUS:
1456 case BT_COMMA:
1457 case BT_SOL:
1458 case BT_EQUALS:
1459 case BT_QUEST:
1460 case BT_CR:
1461 case BT_LF:
1462 case BT_SEMI:
1463 case BT_EXCL:
1464 case BT_AST:
1465 case BT_PERCNT:
1466 case BT_NUM:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001467# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001468 case BT_COLON:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001469# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001470 break;
1471 case BT_S:
1472 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001473 *badPtr = ptr;
1474 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001475 }
1476 break;
1477 case BT_NAME:
1478 case BT_NMSTRT:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001479 if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001480 break;
Benjamin Peterson5033aa72018-09-10 21:04:00 -07001481 /* fall through */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001482 default:
1483 switch (BYTE_TO_ASCII(enc, ptr)) {
1484 case 0x24: /* $ */
1485 case 0x40: /* @ */
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001486 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001487 default:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001488 *badPtr = ptr;
1489 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001490 }
1491 break;
1492 }
1493 }
1494 return 1;
1495}
1496
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001497/* This must only be called for a well-formed start-tag or empty
1498 element tag. Returns the number of attributes. Pointers to the
1499 first attsMax attributes are stored in atts.
1500*/
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001501
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001502static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001503PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1504 ATTRIBUTE *atts) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001505 enum { other, inName, inValue } state = inName;
1506 int nAtts = 0;
1507 int open = 0; /* defined when state == inValue;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001508 initialization just to shut up compilers */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001509
1510 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1511 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001512# define START_NAME \
1513 if (state == other) { \
1514 if (nAtts < attsMax) { \
1515 atts[nAtts].name = ptr; \
1516 atts[nAtts].normalized = 1; \
1517 } \
1518 state = inName; \
1519 }
1520# define LEAD_CASE(n) \
1521 case BT_LEAD##n: \
1522 START_NAME ptr += (n - MINBPC(enc)); \
1523 break;
1524 LEAD_CASE(2)
1525 LEAD_CASE(3)
1526 LEAD_CASE(4)
1527# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001528 case BT_NONASCII:
1529 case BT_NMSTRT:
1530 case BT_HEX:
1531 START_NAME
1532 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001533# undef START_NAME
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001534 case BT_QUOT:
1535 if (state != inValue) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001536 if (nAtts < attsMax)
1537 atts[nAtts].valuePtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001538 state = inValue;
1539 open = BT_QUOT;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001540 } else if (open == BT_QUOT) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001541 state = other;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001542 if (nAtts < attsMax)
1543 atts[nAtts].valueEnd = ptr;
1544 nAtts++;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001545 }
1546 break;
1547 case BT_APOS:
1548 if (state != inValue) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001549 if (nAtts < attsMax)
1550 atts[nAtts].valuePtr = ptr + MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001551 state = inValue;
1552 open = BT_APOS;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001553 } else if (open == BT_APOS) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001554 state = other;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001555 if (nAtts < attsMax)
1556 atts[nAtts].valueEnd = ptr;
1557 nAtts++;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001558 }
1559 break;
1560 case BT_AMP:
1561 if (nAtts < attsMax)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001562 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001563 break;
1564 case BT_S:
1565 if (state == inName)
1566 state = other;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001567 else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001568 && (ptr == atts[nAtts].valuePtr
1569 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1570 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1571 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1572 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001573 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001574 case BT_CR:
1575 case BT_LF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001576 /* This case ensures that the first attribute name is counted
1577 Apart from that we could just change state on the quote. */
1578 if (state == inName)
1579 state = other;
1580 else if (state == inValue && nAtts < attsMax)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001581 atts[nAtts].normalized = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001582 break;
1583 case BT_GT:
1584 case BT_SOL:
1585 if (state != inValue)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001586 return nAtts;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001587 break;
1588 default:
1589 break;
1590 }
1591 }
1592 /* not reached */
1593}
1594
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001595static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001596PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001597 int result = 0;
1598 /* skip &# */
Benjamin Peterson52b94082019-09-25 21:33:58 -07001599 UNUSED_P(enc);
1600 ptr += 2 * MINBPC(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001601 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001602 for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001603 ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001604 int c = BYTE_TO_ASCII(enc, ptr);
1605 switch (c) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001606 case ASCII_0:
1607 case ASCII_1:
1608 case ASCII_2:
1609 case ASCII_3:
1610 case ASCII_4:
1611 case ASCII_5:
1612 case ASCII_6:
1613 case ASCII_7:
1614 case ASCII_8:
1615 case ASCII_9:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001616 result <<= 4;
1617 result |= (c - ASCII_0);
1618 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001619 case ASCII_A:
1620 case ASCII_B:
1621 case ASCII_C:
1622 case ASCII_D:
1623 case ASCII_E:
1624 case ASCII_F:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001625 result <<= 4;
1626 result += 10 + (c - ASCII_A);
1627 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001628 case ASCII_a:
1629 case ASCII_b:
1630 case ASCII_c:
1631 case ASCII_d:
1632 case ASCII_e:
1633 case ASCII_f:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001634 result <<= 4;
1635 result += 10 + (c - ASCII_a);
1636 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001637 }
1638 if (result >= 0x110000)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001639 return -1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001640 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001641 } else {
1642 for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001643 int c = BYTE_TO_ASCII(enc, ptr);
1644 result *= 10;
1645 result += (c - ASCII_0);
1646 if (result >= 0x110000)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001647 return -1;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001648 }
1649 }
1650 return checkCharRefNumber(result);
1651}
1652
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001653static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001654PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1655 const char *end) {
1656 UNUSED_P(enc);
1657 switch ((end - ptr) / MINBPC(enc)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001658 case 2:
1659 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1660 switch (BYTE_TO_ASCII(enc, ptr)) {
1661 case ASCII_l:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001662 return ASCII_LT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001663 case ASCII_g:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001664 return ASCII_GT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001665 }
1666 }
1667 break;
1668 case 3:
1669 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1670 ptr += MINBPC(enc);
1671 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001672 ptr += MINBPC(enc);
1673 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1674 return ASCII_AMP;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001675 }
1676 }
1677 break;
1678 case 4:
1679 switch (BYTE_TO_ASCII(enc, ptr)) {
1680 case ASCII_q:
1681 ptr += MINBPC(enc);
1682 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001683 ptr += MINBPC(enc);
1684 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1685 ptr += MINBPC(enc);
1686 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1687 return ASCII_QUOT;
1688 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001689 }
1690 break;
1691 case ASCII_a:
1692 ptr += MINBPC(enc);
1693 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001694 ptr += MINBPC(enc);
1695 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1696 ptr += MINBPC(enc);
1697 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1698 return ASCII_APOS;
1699 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001700 }
1701 break;
1702 }
1703 }
1704 return 0;
1705}
1706
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001707static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001708PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1709 const char *end1, const char *ptr2) {
1710 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001711 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
Victor Stinner93d0cb52017-08-18 23:43:54 +02001712 if (end1 - ptr1 < MINBPC(enc)) {
Benjamin Peterson5033aa72018-09-10 21:04:00 -07001713 /* This line cannot be executed. The incoming data has already
1714 * been tokenized once, so incomplete characters like this have
Victor Stinner93d0cb52017-08-18 23:43:54 +02001715 * already been eliminated from the input. Retaining the
1716 * paranoia check is still valuable, however.
1717 */
1718 return 0; /* LCOV_EXCL_LINE */
1719 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001720 if (! CHAR_MATCHES(enc, ptr1, *ptr2))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001721 return 0;
1722 }
1723 return ptr1 == end1;
1724}
1725
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001726static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001727PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001728 const char *start = ptr;
1729 for (;;) {
1730 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001731# define LEAD_CASE(n) \
1732 case BT_LEAD##n: \
1733 ptr += n; \
1734 break;
1735 LEAD_CASE(2)
1736 LEAD_CASE(3)
1737 LEAD_CASE(4)
1738# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001739 case BT_NONASCII:
1740 case BT_NMSTRT:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001741# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001742 case BT_COLON:
Benjamin Peterson52b94082019-09-25 21:33:58 -07001743# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001744 case BT_HEX:
1745 case BT_DIGIT:
1746 case BT_NAME:
1747 case BT_MINUS:
1748 ptr += MINBPC(enc);
1749 break;
1750 default:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001751 return (int)(ptr - start);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001752 }
1753 }
1754}
1755
Benjamin Peterson52b94082019-09-25 21:33:58 -07001756static const char *PTRFASTCALL
1757PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001758 for (;;) {
1759 switch (BYTE_TYPE(enc, ptr)) {
1760 case BT_LF:
1761 case BT_CR:
1762 case BT_S:
1763 ptr += MINBPC(enc);
1764 break;
1765 default:
1766 return ptr;
1767 }
1768 }
1769}
1770
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001771static void PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001772PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1773 POSITION *pos) {
Victor Stinner23ec4b52017-06-15 00:54:36 +02001774 while (HAS_CHAR(enc, ptr, end)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001775 switch (BYTE_TYPE(enc, ptr)) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001776# define LEAD_CASE(n) \
1777 case BT_LEAD##n: \
1778 ptr += n; \
Miss Islington (bot)27067852021-08-29 07:32:50 -07001779 pos->columnNumber++; \
Benjamin Peterson52b94082019-09-25 21:33:58 -07001780 break;
1781 LEAD_CASE(2)
1782 LEAD_CASE(3)
1783 LEAD_CASE(4)
1784# undef LEAD_CASE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001785 case BT_LF:
Miss Islington (bot)27067852021-08-29 07:32:50 -07001786 pos->columnNumber = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001787 pos->lineNumber++;
1788 ptr += MINBPC(enc);
1789 break;
1790 case BT_CR:
1791 pos->lineNumber++;
1792 ptr += MINBPC(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001793 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001794 ptr += MINBPC(enc);
Miss Islington (bot)27067852021-08-29 07:32:50 -07001795 pos->columnNumber = 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001796 break;
1797 default:
1798 ptr += MINBPC(enc);
Miss Islington (bot)27067852021-08-29 07:32:50 -07001799 pos->columnNumber++;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001800 break;
1801 }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001802 }
1803}
1804
Benjamin Peterson52b94082019-09-25 21:33:58 -07001805# undef DO_LEAD_CASE
1806# undef MULTIBYTE_CASES
1807# undef INVALID_CASES
1808# undef CHECK_NAME_CASE
1809# undef CHECK_NAME_CASES
1810# undef CHECK_NMSTRT_CASE
1811# undef CHECK_NMSTRT_CASES
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001812
Gregory P. Smith7c6309c2012-07-14 14:12:35 -07001813#endif /* XML_TOK_IMPL_C */