blob: 54cfedb85c28c441d7d6809635faf201d427f413 [file] [log] [blame]
Victor Stinner759e30e2017-09-05 01:58:08 +02001/*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
12
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
19 following conditions:
20
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000031*/
32
Pablo Galindo81774042019-10-12 20:14:11 +010033#ifdef _WIN32
34# include "winconfig.h"
35#else
36# ifdef HAVE_EXPAT_CONFIG_H
37# include <expat_config.h>
38# endif
39#endif /* ndef _WIN32 */
40
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070041#include <stddef.h>
Benjamin Peterson52b94082019-09-25 21:33:58 -070042#include <string.h> /* memcpy */
Benjamin Peterson4e211002018-06-26 19:25:45 -070043
44#if defined(_MSC_VER) && (_MSC_VER <= 1700)
Benjamin Peterson52b94082019-09-25 21:33:58 -070045/* for vs2012/11.0/1700 and earlier Visual Studio compilers */
46# define bool int
47# define false 0
48# define true 1
Benjamin Peterson4e211002018-06-26 19:25:45 -070049#else
Benjamin Peterson52b94082019-09-25 21:33:58 -070050# include <stdbool.h>
Benjamin Peterson4e211002018-06-26 19:25:45 -070051#endif
52
Fred Drake31d485c2004-08-03 07:06:22 +000053#include "expat_external.h"
Martin v. Löwisfc03a942003-01-25 22:41:29 +000054#include "internal.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000055#include "xmltok.h"
56#include "nametab.h"
57
/* Optional extra entry for the first (scanner) group of VTABLE1: the
   ignoreSectionTok function is only listed when XML_DTD is defined. */
#ifndef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#else
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#endif

/* Initializer fragment listing the per-encoding functions generated under
   the current PREFIX().  The converters are deliberately left out so that an
   encoding can supply its own (see VTABLE for the default pairing). */
#define VTABLE1                                                                \
  {PREFIX(prologTok),                                                          \
   PREFIX(contentTok),                                                         \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii),                                                \
      PREFIX(nameLength),                                                      \
      PREFIX(skipS),                                                           \
      PREFIX(getAtts),                                                         \
      PREFIX(charRefNumber),                                                   \
      PREFIX(predefinedEntityName),                                            \
      PREFIX(updatePosition),                                                  \
      PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-generated toUtf8 / toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
73
/* Look up one 16-bit character (high byte `hi`, low byte `lo`) in the
   naming bitmap.  pages[hi] selects a group of eight 32-bit words in
   namingBitmap, the top three bits of `lo` pick the word within the group,
   and the low five bits of `lo` pick the bit.  Yields non-zero when the bit
   is set.  Every argument is parenthesized so that expression arguments
   (e.g. `table + 1`) expand correctly. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000076
/* Naming-bitmap lookup for a 2-byte UTF-8 sequence.  The 11 character bits
   are spread over the low 5 bits of byte[0] and the low 6 bits of byte[1]:
   the page index comes from bits 2..4 of byte[0], the 3-bit word offset
   from the low 2 bits of byte[0] plus bit 5 of byte[1], and the 5-bit mask
   position from the low 5 bits of byte[1].
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((((byte)[1]) >> 5) & 1) + ((((byte)[0]) & 3) << 1)            \
                + ((pages)[(((byte)[0]) >> 2) & 7] << 3)]                      \
   & (1u << (((byte)[1]) & 0x1F)))

/* Naming-bitmap lookup for a 3-byte UTF-8 sequence.  The 16 character bits
   are spread over the low 4, 6 and 6 bits of the three bytes: 8 bits form
   the page index, 3 bits the word offset and 5 bits the mask position.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap[((((byte)[2]) >> 5) & 1) + ((((byte)[1]) & 3) << 1)            \
                + ((pages)[((((byte)[0]) & 0xF) << 4)                          \
                           + ((((byte)[1]) >> 2) & 0xF)]                       \
                   << 3)]                                                      \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 3- and 2-byte sequences are looked up
   in the bitmap, every other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n)                                           \
  ((n) == 3                                                                    \
       ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p))                   \
       : ((n) == 2 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) : 0))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000102
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to unsigned bytes positioned on the lead byte
   and yields non-zero when the sequence is malformed.  The argument is
   parenthesized at every use (including the dereference of the lead byte)
   so that pointer expressions such as `buf + 1` are handled correctly.
*/

/* 2-byte sequence: lead byte below 0xC2 (overlong) or a second byte that
   is not a continuation byte (0x80..0xBF). */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte sequence: rejects bad continuation bytes, overlong forms after
   0xE0 (second byte < 0xA0), second bytes above 0x9F after 0xED, and
   EF,BF,BE / EF,BF,BF. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte sequence: rejects bad continuation bytes, overlong forms after
   0xF0 (second byte < 0x90) and second bytes above 0x8F after 0xF4. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000132
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000133static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700134isNever(const ENCODING *enc, const char *p) {
135 UNUSED_P(enc);
136 UNUSED_P(p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000137 return 0;
138}
139
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000140static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700141utf8_isName2(const ENCODING *enc, const char *p) {
142 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000143 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
144}
145
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000146static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700147utf8_isName3(const ENCODING *enc, const char *p) {
148 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000149 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
150}
151
152#define utf8_isName4 isNever
153
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000154static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700155utf8_isNmstrt2(const ENCODING *enc, const char *p) {
156 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000157 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
158}
159
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000160static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700161utf8_isNmstrt3(const ENCODING *enc, const char *p) {
162 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000163 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
164}
165
166#define utf8_isNmstrt4 isNever
167
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000168static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700169utf8_isInvalid2(const ENCODING *enc, const char *p) {
170 UNUSED_P(enc);
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000171 return UTF8_INVALID2((const unsigned char *)p);
172}
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000173
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000174static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700175utf8_isInvalid3(const ENCODING *enc, const char *p) {
176 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000177 return UTF8_INVALID3((const unsigned char *)p);
178}
179
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000180static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700181utf8_isInvalid4(const ENCODING *enc, const char *p) {
182 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000183 return UTF8_INVALID4((const unsigned char *)p);
184}
185
186struct normal_encoding {
187 ENCODING enc;
188 unsigned char type[256];
189#ifdef XML_MIN_SIZE
Benjamin Peterson52b94082019-09-25 21:33:58 -0700190 int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
191 int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
192 int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
193 int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
194 int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000195#endif /* XML_MIN_SIZE */
Benjamin Peterson52b94082019-09-25 21:33:58 -0700196 int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
197 int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
198 int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
199 int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
200 int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
201 int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
202 int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
203 int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
204 int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000205};
206
Benjamin Peterson52b94082019-09-25 21:33:58 -0700207#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000208
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000209#ifdef XML_MIN_SIZE
210
Benjamin Peterson52b94082019-09-25 21:33:58 -0700211# define STANDARD_VTABLE(E) \
212 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000213
214#else
215
Benjamin Peterson52b94082019-09-25 21:33:58 -0700216# define STANDARD_VTABLE(E) /* as nothing */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000217
218#endif
219
Benjamin Peterson52b94082019-09-25 21:33:58 -0700220#define NORMAL_VTABLE(E) \
221 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
222 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000223
Benjamin Peterson52b94082019-09-25 21:33:58 -0700224#define NULL_VTABLE \
225 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
226 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
227 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200228
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000229static int FASTCALL checkCharRefNumber(int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000230
231#include "xmltok_impl.h"
232#include "ascii.h"
233
/* In XML_MIN_SIZE builds the sb_ MinBPC name predicates are aliases for
   isNever, i.e. they always return 0. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, read from the encoding's 256-entry type
   table.  The byte is converted to unsigned char before indexing so bytes
   >= 0x80 cannot produce a negative index.  The cast keeps the const
   qualifier of enc, matching AS_NORMAL_ENCODING. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, reached through the byteType pointer. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
258
/* BYTE_TO_ASCII: the byte at p as a value to compare against ASCII.  The
   default build reads the byte directly; XML_MIN_SIZE builds go through the
   encoding's byteToAscii pointer, for which sb_byteToAscii is the
   single-byte implementation. */
#ifndef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (*(p))
#else
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
#endif

/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name, so
   each call goes through the matching function pointer of the encoding. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#define IS_INVALID_CHAR(enc, p, n)                                             \
  (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))

/* Name predicates for a character of MINBPC width: constant 0 in the
   default build, a call through the encoding in XML_MIN_SIZE builds. */
#ifndef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#else
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#endif
284
/* CHAR_MATCHES(enc, p, c): non-zero when the character at p equals the
   ASCII character c.  XML_MIN_SIZE builds call through the encoding's
   charMatches pointer (sb_charMatches for single-byte encodings); the
   default build compares the byte directly.  c is parenthesized so that an
   expression argument (e.g. a conditional) is compared as a whole. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
297
Benjamin Peterson52b94082019-09-25 21:33:58 -0700298#define PREFIX(ident) normal_##ident
Gregory P. Smith64359d22012-07-14 14:12:35 -0700299#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000300#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700301#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000302
303#undef MINBPC
304#undef BYTE_TYPE
305#undef BYTE_TO_ASCII
306#undef CHAR_MATCHES
307#undef IS_NAME_CHAR
308#undef IS_NAME_CHAR_MINBPC
309#undef IS_NMSTRT_CHAR
310#undef IS_NMSTRT_CHAR_MINBPC
311#undef IS_INVALID_CHAR
312
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence, i.e. the marker bits OR-ed into the lead byte when encoding. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
319
/* Move *fromLimRef backwards, if necessary, so that [from, *fromLimRef)
   does not end in the middle of a multi-byte UTF-8 character.

   The buffer is scanned from the end towards `from`:
     - a 1-byte character (0xxxxxxx) ends the scan at the current position;
     - a lead byte of an N-byte character (N = 2, 3, 4) ends the scan if at
       least N - 1 bytes were stepped over since the last reset, in which
       case the limit is placed right after that N-byte character;
       otherwise the character is incomplete, the step counter is reset and
       the scan continues in front of it;
     - any other byte is simply stepped over.
   If the scan reaches `from`, the limit becomes `from`.

   from        start of the buffer
   fromLimRef  in: end of the buffer; out: trimmed end
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t stepped = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen = 0; /* stays 0 unless `last` is a multi-byte lead */

    if ((last & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((last & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead by 0b110xxxxx byte */
    } else if ((last & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead by 0b1110xxxx byte */
    } else if ((last & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead by 0b11110xxx byte */
    }

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* Complete character: keep all of its bytes. */
        end += seqLen - 1;
        break;
      }
      /* Incomplete character: it gets cut off, start counting afresh. */
      stepped = 0;
    }

    end--;
    stepped++;
  }

  *fromLimRef = end;
}
358
359static enum XML_Convert_Result PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700360utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
361 char **toP, const char *toLim) {
Victor Stinner759e30e2017-09-05 01:58:08 +0200362 bool input_incomplete = false;
363 bool output_exhausted = false;
Victor Stinner5ff71322017-06-21 14:39:22 +0200364
Victor Stinner759e30e2017-09-05 01:58:08 +0200365 /* Avoid copying partial characters (due to limited space). */
366 const ptrdiff_t bytesAvailable = fromLim - *fromP;
367 const ptrdiff_t bytesStorable = toLim - *toP;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700368 UNUSED_P(enc);
Victor Stinner759e30e2017-09-05 01:58:08 +0200369 if (bytesAvailable > bytesStorable) {
370 fromLim = *fromP + bytesStorable;
371 output_exhausted = true;
372 }
373
374 /* Avoid copying partial characters (from incomplete input). */
Benjamin Peterson4e211002018-06-26 19:25:45 -0700375 {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700376 const char *const fromLimBefore = fromLim;
Benjamin Peterson4e211002018-06-26 19:25:45 -0700377 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
378 if (fromLim < fromLimBefore) {
379 input_incomplete = true;
380 }
Victor Stinner759e30e2017-09-05 01:58:08 +0200381 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200382
Benjamin Peterson4e211002018-06-26 19:25:45 -0700383 {
384 const ptrdiff_t bytesToCopy = fromLim - *fromP;
385 memcpy(*toP, *fromP, bytesToCopy);
386 *fromP += bytesToCopy;
387 *toP += bytesToCopy;
388 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200389
Benjamin Peterson52b94082019-09-25 21:33:58 -0700390 if (output_exhausted) /* needs to go first */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200391 return XML_CONVERT_OUTPUT_EXHAUSTED;
Victor Stinner759e30e2017-09-05 01:58:08 +0200392 else if (input_incomplete)
393 return XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200394 else
Victor Stinner5ff71322017-06-21 14:39:22 +0200395 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000396}
397
Victor Stinner23ec4b52017-06-15 00:54:36 +0200398static enum XML_Convert_Result PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700399utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
400 unsigned short **toP, const unsigned short *toLim) {
Victor Stinner23ec4b52017-06-15 00:54:36 +0200401 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000402 unsigned short *to = *toP;
403 const char *from = *fromP;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200404 while (from < fromLim && to < toLim) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000405 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
406 case BT_LEAD2:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200407 if (fromLim - from < 2) {
408 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200409 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200410 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000411 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000412 from += 2;
413 break;
414 case BT_LEAD3:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200415 if (fromLim - from < 3) {
416 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200417 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200418 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700419 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
420 | (from[2] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000421 from += 3;
422 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700423 case BT_LEAD4: {
424 unsigned long n;
425 if (toLim - to < 2) {
426 res = XML_CONVERT_OUTPUT_EXHAUSTED;
427 goto after;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000428 }
Benjamin Peterson52b94082019-09-25 21:33:58 -0700429 if (fromLim - from < 4) {
430 res = XML_CONVERT_INPUT_INCOMPLETE;
431 goto after;
432 }
433 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
434 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
435 n -= 0x10000;
436 to[0] = (unsigned short)((n >> 10) | 0xD800);
437 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
438 to += 2;
439 from += 4;
440 } break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000441 default:
442 *to++ = *from++;
443 break;
444 }
445 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200446 if (from < fromLim)
447 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000448after:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000449 *fromP = from;
450 *toP = to;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200451 return res;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000452}
453
454#ifdef XML_NS
Benjamin Peterson52b94082019-09-25 21:33:58 -0700455static const struct normal_encoding utf8_encoding_ns
456 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
457 {
458# include "asciitab.h"
459# include "utf8tab.h"
460 },
461 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000462#endif
463
Benjamin Peterson52b94082019-09-25 21:33:58 -0700464static const struct normal_encoding utf8_encoding
465 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
466 {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000467#define BT_COLON BT_NMSTRT
468#include "asciitab.h"
469#undef BT_COLON
470#include "utf8tab.h"
Benjamin Peterson52b94082019-09-25 21:33:58 -0700471 },
472 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000473
474#ifdef XML_NS
475
Benjamin Peterson52b94082019-09-25 21:33:58 -0700476static const struct normal_encoding internal_utf8_encoding_ns
477 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
478 {
479# include "iasciitab.h"
480# include "utf8tab.h"
481 },
482 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000483
484#endif
485
Benjamin Peterson52b94082019-09-25 21:33:58 -0700486static const struct normal_encoding internal_utf8_encoding
487 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
488 {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000489#define BT_COLON BT_NMSTRT
490#include "iasciitab.h"
491#undef BT_COLON
492#include "utf8tab.h"
Benjamin Peterson52b94082019-09-25 21:33:58 -0700493 },
494 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000495
Victor Stinner23ec4b52017-06-15 00:54:36 +0200496static enum XML_Convert_Result PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700497latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
498 char **toP, const char *toLim) {
499 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000500 for (;;) {
501 unsigned char c;
502 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200503 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000504 c = (unsigned char)**fromP;
505 if (c & 0x80) {
506 if (toLim - *toP < 2)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200507 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000508 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
509 *(*toP)++ = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000510 (*fromP)++;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700511 } else {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000512 if (*toP == toLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200513 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000514 *(*toP)++ = *(*fromP)++;
515 }
516 }
517}
518
Victor Stinner23ec4b52017-06-15 00:54:36 +0200519static enum XML_Convert_Result PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700520latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
521 unsigned short **toP, const unsigned short *toLim) {
522 UNUSED_P(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200523 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000524 *(*toP)++ = (unsigned char)*(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200525
526 if ((*toP == toLim) && (*fromP < fromLim))
527 return XML_CONVERT_OUTPUT_EXHAUSTED;
528 else
529 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000530}
531
532#ifdef XML_NS
533
Benjamin Peterson52b94082019-09-25 21:33:58 -0700534static const struct normal_encoding latin1_encoding_ns
535 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
536 {
537# include "asciitab.h"
538# include "latin1tab.h"
539 },
540 STANDARD_VTABLE(sb_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000541
542#endif
543
Benjamin Peterson52b94082019-09-25 21:33:58 -0700544static const struct normal_encoding latin1_encoding
545 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
546 {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000547#define BT_COLON BT_NMSTRT
548#include "asciitab.h"
549#undef BT_COLON
550#include "latin1tab.h"
Benjamin Peterson52b94082019-09-25 21:33:58 -0700551 },
552 STANDARD_VTABLE(sb_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000553
Victor Stinner23ec4b52017-06-15 00:54:36 +0200554static enum XML_Convert_Result PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700555ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
556 char **toP, const char *toLim) {
557 UNUSED_P(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +0200558 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000559 *(*toP)++ = *(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200560
561 if ((*toP == toLim) && (*fromP < fromLim))
562 return XML_CONVERT_OUTPUT_EXHAUSTED;
563 else
564 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000565}
566
567#ifdef XML_NS
568
Benjamin Peterson52b94082019-09-25 21:33:58 -0700569static const struct normal_encoding ascii_encoding_ns
570 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
571 {
572# include "asciitab.h"
573 /* BT_NONXML == 0 */
574 },
575 STANDARD_VTABLE(sb_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000576
577#endif
578
Benjamin Peterson52b94082019-09-25 21:33:58 -0700579static const struct normal_encoding ascii_encoding
580 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
581 {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000582#define BT_COLON BT_NMSTRT
583#include "asciitab.h"
584#undef BT_COLON
Benjamin Peterson52b94082019-09-25 21:33:58 -0700585 /* BT_NONXML == 0 */
586 },
587 STANDARD_VTABLE(sb_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000588
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000589static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700590unicode_byte_type(char hi, char lo) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000591 switch ((unsigned char)hi) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700592 /* 0xD800–0xDBFF first 16-bit code unit or high surrogate (W1) */
593 case 0xD8:
594 case 0xD9:
595 case 0xDA:
596 case 0xDB:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000597 return BT_LEAD4;
Benjamin Peterson52b94082019-09-25 21:33:58 -0700598 /* 0xDC00–0xDFFF second 16-bit code unit or low surrogate (W2) */
599 case 0xDC:
600 case 0xDD:
601 case 0xDE:
602 case 0xDF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000603 return BT_TRAIL;
604 case 0xFF:
605 switch ((unsigned char)lo) {
Benjamin Peterson52b94082019-09-25 21:33:58 -0700606 case 0xFF: /* noncharacter-FFFF */
607 case 0xFE: /* noncharacter-FFFE */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000608 return BT_NONXML;
609 }
610 break;
611 }
612 return BT_NONASCII;
613}
614
/* Define E##toUtf8: convert 16-bit code units to UTF-8.  GET_LO / GET_HI
   must be defined at the point of instantiation and select the byte order.

   Units below 0x80 produce one byte, units up to 0x7FF two bytes, high
   surrogates (0xD800-0xDBFF) consume the following unit as well and
   produce four bytes, everything else three bytes.

   The generated function returns XML_CONVERT_OUTPUT_EXHAUSTED when the
   next character does not fit, XML_CONVERT_INPUT_INCOMPLETE when a high
   surrogate is the last unit of the input, and XML_CONVERT_COMPLETED
   otherwise; *fromP is left at the first unconverted unit. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    /* only whole 16-bit units are converted: shrink to even */               \
    fromLim = from + (((fromLim - from) >> 1) << 1);                           \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          /* plain ASCII: one byte */                                          \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        /* up to 11 bits: two bytes */                                         \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        /* high surrogate: four bytes, built from this unit and the next */   \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    return XML_CONVERT_COMPLETED;                                              \
  }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000691
/* Define E##toUtf16: copy 16-bit code units from a byte buffer into an
   unsigned short buffer, assembling each unit with GET_HI / GET_LO (which
   must be defined at the point of instantiation).

   When the input holds more units than the output can take and the last
   input unit is a high surrogate (0xD800-0xDBFF), that unit is excluded
   and the result becomes XML_CONVERT_INPUT_INCOMPLETE, unless the output
   fills up first, in which case XML_CONVERT_OUTPUT_EXHAUSTED is returned.
   Otherwise the result is XML_CONVERT_COMPLETED. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    /* only whole 16-bit units are copied: shrink to even */                   \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1);                       \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)) {                            \
      if ((GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                              \
        fromLim -= 2;                                                          \
        res = XML_CONVERT_INPUT_INCOMPLETE;                                    \
      }                                                                        \
    }                                                                          \
    while (*fromP < fromLim && *toP < toLim) {                                 \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
      *fromP += 2;                                                             \
    }                                                                          \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    return res;                                                                \
  }
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000712
Benjamin Peterson52b94082019-09-25 21:33:58 -0700713#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000714#define GET_LO(ptr) ((unsigned char)(ptr)[0])
715#define GET_HI(ptr) ((unsigned char)(ptr)[1])
716
717DEFINE_UTF16_TO_UTF8(little2_)
718DEFINE_UTF16_TO_UTF16(little2_)
719
720#undef SET2
721#undef GET_LO
722#undef GET_HI
723
Benjamin Peterson52b94082019-09-25 21:33:58 -0700724#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000725#define GET_LO(ptr) ((unsigned char)(ptr)[1])
726#define GET_HI(ptr) ((unsigned char)(ptr)[0])
727
728DEFINE_UTF16_TO_UTF8(big2_)
729DEFINE_UTF16_TO_UTF16(big2_)
730
731#undef SET2
732#undef GET_LO
733#undef GET_HI
734
/* Character classification helpers for little-endian 2-byte input.
   p points at a 2-byte unit: p[0] is the low byte, p[1] the high byte.
   Every macro parameter is parenthesized so that an argument which is itself
   an expression (e.g. "ptr + 2" or "x ? a : b") expands correctly. */

/* Byte type of the unit at p: looked up in the encoding's own type table
   when the high byte is zero, otherwise computed by unicode_byte_type().
   The encoding is only read, so the cast keeps it const. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0                                                                 \
       ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)]    \
       : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the unit at p has a zero high byte and low byte equal to c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name / name-start tests, indexed by (high byte, low byte). */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
744
745#ifdef XML_MIN_SIZE
746
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000747static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700748little2_byteType(const ENCODING *enc, const char *p) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000749 return LITTLE2_BYTE_TYPE(enc, p);
750}
751
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000752static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700753little2_byteToAscii(const ENCODING *enc, const char *p) {
754 UNUSED_P(enc);
755 return LITTLE2_BYTE_TO_ASCII(p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000756}
757
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000758static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700759little2_charMatches(const ENCODING *enc, const char *p, int c) {
760 UNUSED_P(enc);
761 return LITTLE2_CHAR_MATCHES(p, c);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000762}
763
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000764static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700765little2_isNameMin(const ENCODING *enc, const char *p) {
766 UNUSED_P(enc);
767 return LITTLE2_IS_NAME_CHAR_MINBPC(p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000768}
769
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000770static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700771little2_isNmstrtMin(const ENCODING *enc, const char *p) {
772 UNUSED_P(enc);
773 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000774}
775
Benjamin Peterson52b94082019-09-25 21:33:58 -0700776# undef VTABLE
777# define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000778
779#else /* not XML_MIN_SIZE */
780
Benjamin Peterson52b94082019-09-25 21:33:58 -0700781# undef PREFIX
782# define PREFIX(ident) little2_##ident
783# define MINBPC(enc) 2
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000784/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
Benjamin Peterson52b94082019-09-25 21:33:58 -0700785# define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
786# define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
787# define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
788# define IS_NAME_CHAR(enc, p, n) 0
789# define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
790# define IS_NMSTRT_CHAR(enc, p, n) (0)
791# define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000792
Benjamin Peterson52b94082019-09-25 21:33:58 -0700793# define XML_TOK_IMPL_C
794# include "xmltok_impl.c"
795# undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000796
Benjamin Peterson52b94082019-09-25 21:33:58 -0700797# undef MINBPC
798# undef BYTE_TYPE
799# undef BYTE_TO_ASCII
800# undef CHAR_MATCHES
801# undef IS_NAME_CHAR
802# undef IS_NAME_CHAR_MINBPC
803# undef IS_NMSTRT_CHAR
804# undef IS_NMSTRT_CHAR_MINBPC
805# undef IS_INVALID_CHAR
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000806
807#endif /* not XML_MIN_SIZE */
808
809#ifdef XML_NS
810
Benjamin Peterson52b94082019-09-25 21:33:58 -0700811static const struct normal_encoding little2_encoding_ns
812 = {{VTABLE, 2, 0,
813# if BYTEORDER == 1234
814 1
815# else
816 0
817# endif
818 },
819 {
820# include "asciitab.h"
821# include "latin1tab.h"
822 },
823 STANDARD_VTABLE(little2_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000824
825#endif
826
Benjamin Peterson52b94082019-09-25 21:33:58 -0700827static const struct normal_encoding little2_encoding
828 = {{VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000829#if BYTEORDER == 1234
Benjamin Peterson52b94082019-09-25 21:33:58 -0700830 1
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000831#else
Benjamin Peterson52b94082019-09-25 21:33:58 -0700832 0
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000833#endif
Benjamin Peterson52b94082019-09-25 21:33:58 -0700834 },
835 {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000836#define BT_COLON BT_NMSTRT
837#include "asciitab.h"
838#undef BT_COLON
839#include "latin1tab.h"
Benjamin Peterson52b94082019-09-25 21:33:58 -0700840 },
841 STANDARD_VTABLE(little2_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000842
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000843#if BYTEORDER != 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000844
Benjamin Peterson52b94082019-09-25 21:33:58 -0700845# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000846
Benjamin Peterson52b94082019-09-25 21:33:58 -0700847static const struct normal_encoding internal_little2_encoding_ns
848 = {{VTABLE, 2, 0, 1},
849 {
850# include "iasciitab.h"
851# include "latin1tab.h"
852 },
853 STANDARD_VTABLE(little2_) NULL_VTABLE};
854
855# endif
856
857static const struct normal_encoding internal_little2_encoding
858 = {{VTABLE, 2, 0, 1},
859 {
860# define BT_COLON BT_NMSTRT
861# include "iasciitab.h"
862# undef BT_COLON
863# include "latin1tab.h"
864 },
865 STANDARD_VTABLE(little2_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000866
867#endif
868
/* Character classification helpers for big-endian 2-byte input.
   p points at a 2-byte unit: p[0] is the high byte, p[1] the low byte.
   Every macro parameter is parenthesized so that an argument which is itself
   an expression (e.g. "ptr + 2" or "x ? a : b") expands correctly. */

/* Byte type of the unit at p: looked up in the encoding's own type table
   when the high byte is zero, otherwise computed by unicode_byte_type().
   The encoding is only read, so the cast keeps it const. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]  \
       : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p has a zero high byte and low byte equal to c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name / name-start tests, indexed by (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
879
880#ifdef XML_MIN_SIZE
881
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000882static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700883big2_byteType(const ENCODING *enc, const char *p) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000884 return BIG2_BYTE_TYPE(enc, p);
885}
886
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000887static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700888big2_byteToAscii(const ENCODING *enc, const char *p) {
889 UNUSED_P(enc);
890 return BIG2_BYTE_TO_ASCII(p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000891}
892
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000893static int PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700894big2_charMatches(const ENCODING *enc, const char *p, int c) {
895 UNUSED_P(enc);
896 return BIG2_CHAR_MATCHES(p, c);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000897}
898
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000899static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700900big2_isNameMin(const ENCODING *enc, const char *p) {
901 UNUSED_P(enc);
902 return BIG2_IS_NAME_CHAR_MINBPC(p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000903}
904
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000905static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -0700906big2_isNmstrtMin(const ENCODING *enc, const char *p) {
907 UNUSED_P(enc);
908 return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000909}
910
Benjamin Peterson52b94082019-09-25 21:33:58 -0700911# undef VTABLE
912# define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000913
914#else /* not XML_MIN_SIZE */
915
Benjamin Peterson52b94082019-09-25 21:33:58 -0700916# undef PREFIX
917# define PREFIX(ident) big2_##ident
918# define MINBPC(enc) 2
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000919/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
Benjamin Peterson52b94082019-09-25 21:33:58 -0700920# define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
921# define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
922# define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
923# define IS_NAME_CHAR(enc, p, n) 0
924# define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
925# define IS_NMSTRT_CHAR(enc, p, n) (0)
926# define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000927
Benjamin Peterson52b94082019-09-25 21:33:58 -0700928# define XML_TOK_IMPL_C
929# include "xmltok_impl.c"
930# undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000931
Benjamin Peterson52b94082019-09-25 21:33:58 -0700932# undef MINBPC
933# undef BYTE_TYPE
934# undef BYTE_TO_ASCII
935# undef CHAR_MATCHES
936# undef IS_NAME_CHAR
937# undef IS_NAME_CHAR_MINBPC
938# undef IS_NMSTRT_CHAR
939# undef IS_NMSTRT_CHAR_MINBPC
940# undef IS_INVALID_CHAR
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000941
942#endif /* not XML_MIN_SIZE */
943
944#ifdef XML_NS
945
Benjamin Peterson52b94082019-09-25 21:33:58 -0700946static const struct normal_encoding big2_encoding_ns
947 = {{VTABLE, 2, 0,
948# if BYTEORDER == 4321
949 1
950# else
951 0
952# endif
953 },
954 {
955# include "asciitab.h"
956# include "latin1tab.h"
957 },
958 STANDARD_VTABLE(big2_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000959
960#endif
961
Benjamin Peterson52b94082019-09-25 21:33:58 -0700962static const struct normal_encoding big2_encoding
963 = {{VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000964#if BYTEORDER == 4321
Benjamin Peterson52b94082019-09-25 21:33:58 -0700965 1
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000966#else
Benjamin Peterson52b94082019-09-25 21:33:58 -0700967 0
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000968#endif
Benjamin Peterson52b94082019-09-25 21:33:58 -0700969 },
970 {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000971#define BT_COLON BT_NMSTRT
972#include "asciitab.h"
973#undef BT_COLON
974#include "latin1tab.h"
Benjamin Peterson52b94082019-09-25 21:33:58 -0700975 },
976 STANDARD_VTABLE(big2_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000977
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000978#if BYTEORDER != 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000979
Benjamin Peterson52b94082019-09-25 21:33:58 -0700980# ifdef XML_NS
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000981
Benjamin Peterson52b94082019-09-25 21:33:58 -0700982static const struct normal_encoding internal_big2_encoding_ns
983 = {{VTABLE, 2, 0, 1},
984 {
985# include "iasciitab.h"
986# include "latin1tab.h"
987 },
988 STANDARD_VTABLE(big2_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000989
Benjamin Peterson52b94082019-09-25 21:33:58 -0700990# endif
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000991
Benjamin Peterson52b94082019-09-25 21:33:58 -0700992static const struct normal_encoding internal_big2_encoding
993 = {{VTABLE, 2, 0, 1},
994 {
995# define BT_COLON BT_NMSTRT
996# include "iasciitab.h"
997# undef BT_COLON
998# include "latin1tab.h"
999 },
1000 STANDARD_VTABLE(big2_) NULL_VTABLE};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001001
1002#endif
1003
1004#undef PREFIX
1005
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001006static int FASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001007streqci(const char *s1, const char *s2) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001008 for (;;) {
1009 char c1 = *s1++;
1010 char c2 = *s2++;
1011 if (ASCII_a <= c1 && c1 <= ASCII_z)
1012 c1 += ASCII_A - ASCII_a;
1013 if (ASCII_a <= c2 && c2 <= ASCII_z)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001014 /* The following line will never get executed. streqci() is
1015 * only called from two places, both of which guarantee to put
1016 * upper-case strings into s2.
1017 */
1018 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001019 if (c1 != c2)
1020 return 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001021 if (! c1)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001022 break;
1023 }
1024 return 1;
1025}
1026
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001027static void PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001028initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1029 POSITION *pos) {
1030 UNUSED_P(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001031 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1032}
1033
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001034static int
Benjamin Peterson52b94082019-09-25 21:33:58 -07001035toAscii(const ENCODING *enc, const char *ptr, const char *end) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001036 char buf[1];
1037 char *p = buf;
1038 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1039 if (p == buf)
1040 return -1;
1041 else
1042 return buf[0];
1043}
1044
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001045static int FASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001046isSpace(int c) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001047 switch (c) {
1048 case 0x20:
1049 case 0xD:
1050 case 0xA:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001051 case 0x9:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001052 return 1;
1053 }
1054 return 0;
1055}
1056
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001057/* Return 1 if there's just optional white space or there's an S
1058 followed by name=val.
1059*/
1060static int
Benjamin Peterson52b94082019-09-25 21:33:58 -07001061parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062 const char **namePtr, const char **nameEndPtr,
1063 const char **valPtr, const char **nextTokPtr) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001064 int c;
1065 char open;
1066 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001067 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001068 return 1;
1069 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001070 if (! isSpace(toAscii(enc, ptr, end))) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001071 *nextTokPtr = ptr;
1072 return 0;
1073 }
1074 do {
1075 ptr += enc->minBytesPerChar;
1076 } while (isSpace(toAscii(enc, ptr, end)));
1077 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001078 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001079 return 1;
1080 }
1081 *namePtr = ptr;
1082 for (;;) {
1083 c = toAscii(enc, ptr, end);
1084 if (c == -1) {
1085 *nextTokPtr = ptr;
1086 return 0;
1087 }
1088 if (c == ASCII_EQUALS) {
1089 *nameEndPtr = ptr;
1090 break;
1091 }
1092 if (isSpace(c)) {
1093 *nameEndPtr = ptr;
1094 do {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001095 ptr += enc->minBytesPerChar;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001096 } while (isSpace(c = toAscii(enc, ptr, end)));
1097 if (c != ASCII_EQUALS) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001098 *nextTokPtr = ptr;
1099 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001100 }
1101 break;
1102 }
1103 ptr += enc->minBytesPerChar;
1104 }
1105 if (ptr == *namePtr) {
1106 *nextTokPtr = ptr;
1107 return 0;
1108 }
1109 ptr += enc->minBytesPerChar;
1110 c = toAscii(enc, ptr, end);
1111 while (isSpace(c)) {
1112 ptr += enc->minBytesPerChar;
1113 c = toAscii(enc, ptr, end);
1114 }
1115 if (c != ASCII_QUOT && c != ASCII_APOS) {
1116 *nextTokPtr = ptr;
1117 return 0;
1118 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001119 open = (char)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001120 ptr += enc->minBytesPerChar;
1121 *valPtr = ptr;
1122 for (;; ptr += enc->minBytesPerChar) {
1123 c = toAscii(enc, ptr, end);
1124 if (c == open)
1125 break;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001126 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001129 *nextTokPtr = ptr;
1130 return 0;
1131 }
1132 }
1133 *nextTokPtr = ptr + enc->minBytesPerChar;
1134 return 1;
1135}
1136
Benjamin Peterson52b94082019-09-25 21:33:58 -07001137static const char KW_version[]
1138 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001139
Benjamin Peterson52b94082019-09-25 21:33:58 -07001140static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1141 ASCII_i, ASCII_n, ASCII_g, '\0'};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001142
Benjamin Peterson52b94082019-09-25 21:33:58 -07001143static const char KW_standalone[]
1144 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1145 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001146
Benjamin Peterson52b94082019-09-25 21:33:58 -07001147static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001148
Benjamin Peterson52b94082019-09-25 21:33:58 -07001149static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001150
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001151static int
Benjamin Peterson52b94082019-09-25 21:33:58 -07001152doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001153 const char *),
Benjamin Peterson52b94082019-09-25 21:33:58 -07001154 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1155 const char *end, const char **badPtr, const char **versionPtr,
1156 const char **versionEndPtr, const char **encodingName,
1157 const ENCODING **encoding, int *standalone) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001158 const char *val = NULL;
1159 const char *name = NULL;
1160 const char *nameEnd = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001161 ptr += 5 * enc->minBytesPerChar;
1162 end -= 2 * enc->minBytesPerChar;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001163 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1164 || ! name) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001165 *badPtr = ptr;
1166 return 0;
1167 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001168 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1169 if (! isGeneralTextEntity) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001170 *badPtr = name;
1171 return 0;
1172 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001173 } else {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001174 if (versionPtr)
1175 *versionPtr = val;
1176 if (versionEndPtr)
1177 *versionEndPtr = ptr;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001178 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001179 *badPtr = ptr;
1180 return 0;
1181 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001182 if (! name) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001183 if (isGeneralTextEntity) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001184 /* a TextDecl must have an EncodingDecl */
1185 *badPtr = ptr;
1186 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001187 }
1188 return 1;
1189 }
1190 }
1191 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1192 int c = toAscii(enc, val, end);
Benjamin Peterson52b94082019-09-25 21:33:58 -07001193 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001194 *badPtr = val;
1195 return 0;
1196 }
1197 if (encodingName)
1198 *encodingName = val;
1199 if (encoding)
1200 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
Benjamin Peterson52b94082019-09-25 21:33:58 -07001201 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001202 *badPtr = ptr;
1203 return 0;
1204 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001205 if (! name)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001206 return 1;
1207 }
Benjamin Peterson52b94082019-09-25 21:33:58 -07001208 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001209 || isGeneralTextEntity) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001210 *badPtr = name;
1211 return 0;
1212 }
1213 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1214 if (standalone)
1215 *standalone = 1;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001216 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001217 if (standalone)
1218 *standalone = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001219 } else {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001220 *badPtr = val;
1221 return 0;
1222 }
1223 while (isSpace(toAscii(enc, ptr, end)))
1224 ptr += enc->minBytesPerChar;
1225 if (ptr != end) {
1226 *badPtr = ptr;
1227 return 0;
1228 }
1229 return 1;
1230}
1231
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001232static int FASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001233checkCharRefNumber(int result) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001234 switch (result >> 8) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001235 case 0xD8:
1236 case 0xD9:
1237 case 0xDA:
1238 case 0xDB:
1239 case 0xDC:
1240 case 0xDD:
1241 case 0xDE:
1242 case 0xDF:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001243 return -1;
1244 case 0:
1245 if (latin1_encoding.type[result] == BT_NONXML)
1246 return -1;
1247 break;
1248 case 0xFF:
1249 if (result == 0xFFFE || result == 0xFFFF)
1250 return -1;
1251 break;
1252 }
1253 return result;
1254}
1255
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001256int FASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001257XmlUtf8Encode(int c, char *buf) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001258 enum {
1259 /* minN is minimum legal resulting value for N byte sequence */
1260 min2 = 0x80,
1261 min3 = 0x800,
1262 min4 = 0x10000
1263 };
1264
1265 if (c < 0)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001266 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001267 if (c < min2) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001268 buf[0] = (char)(c | UTF8_cval1);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001269 return 1;
1270 }
1271 if (c < min3) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001272 buf[0] = (char)((c >> 6) | UTF8_cval2);
1273 buf[1] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001274 return 2;
1275 }
1276 if (c < min4) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001277 buf[0] = (char)((c >> 12) | UTF8_cval3);
1278 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279 buf[2] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001280 return 3;
1281 }
1282 if (c < 0x110000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001283 buf[0] = (char)((c >> 18) | UTF8_cval4);
1284 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286 buf[3] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001287 return 4;
1288 }
Victor Stinner93d0cb52017-08-18 23:43:54 +02001289 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001290}
1291
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001292int FASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001293XmlUtf16Encode(int charNum, unsigned short *buf) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001294 if (charNum < 0)
1295 return 0;
1296 if (charNum < 0x10000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001297 buf[0] = (unsigned short)charNum;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001298 return 1;
1299 }
1300 if (charNum < 0x110000) {
1301 charNum -= 0x10000;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001302 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001304 return 2;
1305 }
1306 return 0;
1307}
1308
1309struct unknown_encoding {
1310 struct normal_encoding normal;
Fred Drake31d485c2004-08-03 07:06:22 +00001311 CONVERTER convert;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001312 void *userData;
1313 unsigned short utf16[256];
1314 char utf8[256][4];
1315};
1316
Benjamin Peterson52b94082019-09-25 21:33:58 -07001317#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001318
1319int
Benjamin Peterson52b94082019-09-25 21:33:58 -07001320XmlSizeOfUnknownEncoding(void) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001321 return sizeof(struct unknown_encoding);
1322}
1323
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001324static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001325unknown_isName(const ENCODING *enc, const char *p) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001326 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001328 if (c & ~0xFFFF)
1329 return 0;
1330 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331}
1332
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001333static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001334unknown_isNmstrt(const ENCODING *enc, const char *p) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001335 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001337 if (c & ~0xFFFF)
1338 return 0;
1339 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340}
1341
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001342static int PTRFASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001343unknown_isInvalid(const ENCODING *enc, const char *p) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001344 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001346 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347}
1348
Victor Stinner23ec4b52017-06-15 00:54:36 +02001349static enum XML_Convert_Result PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001350unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351 char **toP, const char *toLim) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001352 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001353 char buf[XML_UTF8_ENCODE_MAX];
1354 for (;;) {
1355 const char *utf8;
1356 int n;
1357 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001358 return XML_CONVERT_COMPLETED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001359 utf8 = uenc->utf8[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001360 n = *utf8++;
1361 if (n == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001362 int c = uenc->convert(uenc->userData, *fromP);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001363 n = XmlUtf8Encode(c, buf);
1364 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001365 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001366 utf8 = buf;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001367 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368 - (BT_LEAD2 - 2));
Benjamin Peterson52b94082019-09-25 21:33:58 -07001369 } else {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001370 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001371 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001372 (*fromP)++;
1373 }
Benjamin Peterson4e211002018-06-26 19:25:45 -07001374 memcpy(*toP, utf8, n);
1375 *toP += n;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001376 }
1377}
1378
Victor Stinner23ec4b52017-06-15 00:54:36 +02001379static enum XML_Convert_Result PTRCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001380unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381 unsigned short **toP, const unsigned short *toLim) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001382 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001383 while (*fromP < fromLim && *toP < toLim) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001384 unsigned short c = uenc->utf16[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001385 if (c == 0) {
Benjamin Peterson52b94082019-09-25 21:33:58 -07001386 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001387 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388 - (BT_LEAD2 - 2));
Benjamin Peterson52b94082019-09-25 21:33:58 -07001389 } else
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001390 (*fromP)++;
1391 *(*toP)++ = c;
1392 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001393
1394 if ((*toP == toLim) && (*fromP < fromLim))
1395 return XML_CONVERT_OUTPUT_EXHAUSTED;
1396 else
1397 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001398}
1399
1400ENCODING *
Benjamin Peterson52b94082019-09-25 21:33:58 -07001401XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1402 void *userData) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001403 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001404 struct unknown_encoding *e = (struct unknown_encoding *)mem;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001405 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001406 for (i = 0; i < 128; i++)
1407 if (latin1_encoding.type[i] != BT_OTHER
Benjamin Peterson52b94082019-09-25 21:33:58 -07001408 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001409 return 0;
1410 for (i = 0; i < 256; i++) {
1411 int c = table[i];
1412 if (c == -1) {
1413 e->normal.type[i] = BT_MALFORM;
1414 /* This shouldn't really get used. */
1415 e->utf16[i] = 0xFFFF;
1416 e->utf8[i][0] = 1;
1417 e->utf8[i][1] = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001418 } else if (c < 0) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001419 if (c < -4)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001420 return 0;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001421 /* Multi-byte sequences need a converter function */
Benjamin Peterson52b94082019-09-25 21:33:58 -07001422 if (! convert)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001423 return 0;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001424 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001425 e->utf8[i][0] = 0;
1426 e->utf16[i] = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001427 } else if (c < 0x80) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001428 if (latin1_encoding.type[c] != BT_OTHER
Benjamin Peterson52b94082019-09-25 21:33:58 -07001429 && latin1_encoding.type[c] != BT_NONXML && c != i)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001430 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001431 e->normal.type[i] = latin1_encoding.type[c];
1432 e->utf8[i][0] = 1;
1433 e->utf8[i][1] = (char)c;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001434 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
Benjamin Peterson52b94082019-09-25 21:33:58 -07001435 } else if (checkCharRefNumber(c) < 0) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001436 e->normal.type[i] = BT_NONXML;
1437 /* This shouldn't really get used. */
1438 e->utf16[i] = 0xFFFF;
1439 e->utf8[i][0] = 1;
1440 e->utf8[i][1] = 0;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001441 } else {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001442 if (c > 0xFFFF)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001443 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001444 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001445 e->normal.type[i] = BT_NMSTRT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001446 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001447 e->normal.type[i] = BT_NAME;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001448 else
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001449 e->normal.type[i] = BT_OTHER;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001450 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001451 e->utf16[i] = (unsigned short)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001452 }
1453 }
1454 e->userData = userData;
1455 e->convert = convert;
1456 if (convert) {
1457 e->normal.isName2 = unknown_isName;
1458 e->normal.isName3 = unknown_isName;
1459 e->normal.isName4 = unknown_isName;
1460 e->normal.isNmstrt2 = unknown_isNmstrt;
1461 e->normal.isNmstrt3 = unknown_isNmstrt;
1462 e->normal.isNmstrt4 = unknown_isNmstrt;
1463 e->normal.isInvalid2 = unknown_isInvalid;
1464 e->normal.isInvalid3 = unknown_isInvalid;
1465 e->normal.isInvalid4 = unknown_isInvalid;
1466 }
1467 e->normal.enc.utf8Convert = unknown_toUtf8;
1468 e->normal.enc.utf16Convert = unknown_toUtf16;
1469 return &(e->normal.enc);
1470}
1471
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1485
Benjamin Peterson52b94082019-09-25 21:33:58 -07001486static const char KW_ISO_8859_1[]
1487 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
1488 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'};
1489static const char KW_US_ASCII[]
1490 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1491 ASCII_C, ASCII_I, ASCII_I, '\0'};
1492static const char KW_UTF_8[]
1493 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1494static const char KW_UTF_16[]
1495 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1496static const char KW_UTF_16BE[]
1497 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1498 ASCII_6, ASCII_B, ASCII_E, '\0'};
1499static const char KW_UTF_16LE[]
1500 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1501 ASCII_6, ASCII_L, ASCII_E, '\0'};
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001502
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001503static int FASTCALL
Benjamin Peterson52b94082019-09-25 21:33:58 -07001504getEncodingIndex(const char *name) {
1505 static const char *const encodingNames[] = {
1506 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001507 };
1508 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001509 if (name == NULL)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001510 return NO_ENC;
Benjamin Peterson52b94082019-09-25 21:33:58 -07001511 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001512 if (streqci(name, encodingNames[i]))
1513 return i;
1514 return UNKNOWN_ENC;
1515}
1516
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the initEnc.isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores index i, narrowed to char.  The
   argument i is parenthesized so that the cast applies to the whole
   expression passed in, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   ptr and end delimit the input that is available so far.  Returns:
     XML_TOK_NONE     when there is no input at all (ptr >= end);
     XML_TOK_PARTIAL  when more bytes are needed before a decision can
                      be made;
     XML_TOK_BOM      when a byte order mark was recognised; *nextTokPtr
                      is set just past it and the matching encoding is
                      stored through enc->encPtr;
   and otherwise the result of XmlTok() for the encoding that was chosen
   and stored through enc->encPtr.
*/

static int
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    /* FE and FF are the possible first bytes of the two-byte marks
       (FE FF / FF FE) checked for in the two-byte case below */
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* with ISO-8859-1 externally specified for an external text entity,
         skip the wait and go to the default selection at the bottom */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    /* 00 and 3C may be the first half of a 00 3C / 3C 00 pair, which the
       two-byte case below uses to pick a UTF-16 byte order */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  } else {
    /* at least two bytes: examine the first two as one 16-bit value,
       first byte in the high half */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* FE FF: taken as a big-endian UTF-16 byte order mark, except for
         an external text entity externally labelled ISO-8859-1 */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* 3C 00: taken as UTF-16LE, except for an external text entity
         externally labelled UTF-16BE or plain UTF-16 */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* FF FE: taken as a little-endian UTF-16 byte order mark, with the
         same ISO-8859-1 exception as for FE FF */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
            || e == UTF_16_ENC)
          break;
      }
      /* the third byte of the mark has not arrived yet */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      } else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* nothing was auto-detected: use the encoding selected by
     INIT_ENC_INDEX(enc) */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1642
/* Include xmltok_ns.c with NS() and ns() expanding to their argument
   unchanged.  XML_TOK_NS_C is defined only for the duration of the
   #include (presumably xmltok_ns.c tests for it -- confirm there), and
   all three macros are removed again afterwards. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1650
1651#ifdef XML_NS
1652
/* Include xmltok_ns.c a second time (this part is compiled only when
   XML_NS is defined), now with NS(x) pasting an "NS" suffix and ns(x)
   pasting an "_ns" suffix onto x. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001662
1663ENCODING *
Benjamin Peterson52b94082019-09-25 21:33:58 -07001664XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665 void *userData) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001666 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667 if (enc)
1668 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669 return enc;
1670}
1671
1672#endif /* XML_NS */