/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000-2017 Expat development team
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
32
Miss Islington (bot)ef1fc0d2018-12-10 07:25:31 -080033#if !defined(_WIN32) && defined(HAVE_EXPAT_CONFIG_H)
34# include <pyconfig.h>
35#endif
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070036#include <stddef.h>
Miss Islington (bot)fb17b812018-06-26 19:44:32 -070037#include <string.h> /* memcpy */
38
39#if defined(_MSC_VER) && (_MSC_VER <= 1700)
40 /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
41# define bool int
42# define false 0
43# define true 1
44#else
45# include <stdbool.h>
46#endif
47
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070048
Victor Stinner5ff71322017-06-21 14:39:22 +020049#ifdef _WIN32
Martin v. Löwisfc03a942003-01-25 22:41:29 +000050#include "winconfig.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000051#else
Fred Drake08317ae2003-10-21 15:38:55 +000052#ifdef HAVE_EXPAT_CONFIG_H
Martin v. Löwisfc03a942003-01-25 22:41:29 +000053#include <expat_config.h>
Fred Drake08317ae2003-10-21 15:38:55 +000054#endif
Victor Stinner5ff71322017-06-21 14:39:22 +020055#endif /* ndef _WIN32 */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000056
Fred Drake31d485c2004-08-03 07:06:22 +000057#include "expat_external.h"
Martin v. Löwisfc03a942003-01-25 22:41:29 +000058#include "internal.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000059#include "xmltok.h"
60#include "nametab.h"
61
/* Extra entry appended to the first scanner table when DTD support is
   compiled in; expands to nothing otherwise. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX()-generated tokenizer functions.
   It is used as the leading part of the ENCODING initializers in this file
   and is followed there by the toUtf8/toUtf16 converters. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 plus the PREFIX()-named converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
82
/* Look up the naming bit for the 16-bit character whose high byte is `hi`
   and low byte is `lo`: pages[hi] selects a group of 8 words in
   namingBitmap, the top 3 bits of `lo` select the word and the bottom 5
   bits select the bit.  Non-zero means the bit is set. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up,
   any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
115
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to unsigned char addressing the lead byte and
   evaluates to non-zero when the sequence is invalid.  The argument is
   evaluated several times, so it must have no side effects.
*/

/* 2-byte sequence: lead byte below 0xC2, or second byte outside 80..BF. */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte sequence: rejects a bad third byte (including EF,BF,BE and
   EF,BF,BF), E0 followed by a byte below A0, and ED followed by a byte
   above 9F. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((*(p)) == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((*(p)) == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte sequence: rejects bad third/fourth bytes, F0 followed by a byte
   below 90, and F4 followed by a byte above 8F. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((*(p)) == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000158
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000159static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200160isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000161{
162 return 0;
163}
164
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000165static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200166utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000167{
168 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
169}
170
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000171static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200172utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000173{
174 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
175}
176
177#define utf8_isName4 isNever
178
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000179static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200180utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000181{
182 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
183}
184
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000185static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200186utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000187{
188 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
189}
190
191#define utf8_isNmstrt4 isNever
192
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000193static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200194utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000195{
196 return UTF8_INVALID2((const unsigned char *)p);
197}
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000198
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000199static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200200utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000201{
202 return UTF8_INVALID3((const unsigned char *)p);
203}
204
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000205static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200206utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000207{
208 return UTF8_INVALID4((const unsigned char *)p);
209}
210
211struct normal_encoding {
212 ENCODING enc;
213 unsigned char type[256];
214#ifdef XML_MIN_SIZE
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000215 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
216 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
217 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
218 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
219 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000220#endif /* XML_MIN_SIZE */
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000221 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
222 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
223 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
224 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
225 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
226 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
227 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
228 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
229 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000230};
231
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000232#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
233
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000234#ifdef XML_MIN_SIZE
235
236#define STANDARD_VTABLE(E) \
237 E ## byteType, \
238 E ## isNameMin, \
239 E ## isNmstrtMin, \
240 E ## byteToAscii, \
241 E ## charMatches,
242
243#else
244
245#define STANDARD_VTABLE(E) /* as nothing */
246
247#endif
248
249#define NORMAL_VTABLE(E) \
250 E ## isName2, \
251 E ## isName3, \
252 E ## isName4, \
253 E ## isNmstrt2, \
254 E ## isNmstrt3, \
255 E ## isNmstrt4, \
256 E ## isInvalid2, \
257 E ## isInvalid3, \
258 E ## isInvalid4
259
Victor Stinner23ec4b52017-06-15 00:54:36 +0200260#define NULL_VTABLE \
261 /* isName2 */ NULL, \
262 /* isName3 */ NULL, \
263 /* isName4 */ NULL, \
264 /* isNmstrt2 */ NULL, \
265 /* isNmstrt3 */ NULL, \
266 /* isNmstrt4 */ NULL, \
267 /* isInvalid2 */ NULL, \
268 /* isInvalid3 */ NULL, \
269 /* isInvalid4 */ NULL
270
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000271static int FASTCALL checkCharRefNumber(int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000272
273#include "xmltok_impl.h"
274#include "ascii.h"
275
276#ifdef XML_MIN_SIZE
277#define sb_isNameMin isNever
278#define sb_isNmstrtMin isNever
279#endif
280
281#ifdef XML_MIN_SIZE
282#define MINBPC(enc) ((enc)->minBytesPerChar)
283#else
284/* minimum bytes per character */
285#define MINBPC(enc) 1
286#endif
287
288#define SB_BYTE_TYPE(enc, p) \
289 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
290
/* BYTE_TYPE: in XML_MIN_SIZE builds go through the encoding's byteType
   pointer; otherwise read the type table directly. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

/* BYTE_TO_ASCII: for single-byte encodings this is just the byte itself. */
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *UNUSED_P(enc), const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
314
/* Multi-byte predicates: n is the literal sequence length (2, 3 or 4) and
   selects the matching member of struct normal_encoding. */
#define IS_NAME_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

/* Name predicates for minimum-size characters: constant 0 here unless the
   XML_MIN_SIZE build routes them through the encoding's pointers. */
#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *UNUSED_P(enc), const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
344
345#define PREFIX(ident) normal_ ## ident
Gregory P. Smith64359d22012-07-14 14:12:35 -0700346#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000347#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700348#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000349
350#undef MINBPC
351#undef BYTE_TYPE
352#undef BYTE_TO_ASCII
353#undef CHAR_MATCHES
354#undef IS_NAME_CHAR
355#undef IS_NAME_CHAR_MINBPC
356#undef IS_NMSTRT_CHAR
357#undef IS_NMSTRT_CHAR_MINBPC
358#undef IS_INVALID_CHAR
359
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
366
/* Move *fromLimRef backwards so that [from, *fromLimRef) does not end in the
   middle of a UTF-8 character.

   The range is scanned from its end towards `from`.  On reaching a lead
   byte, the character is kept if at least as many bytes as its length were
   walked (lead byte included); otherwise the partial character is dropped
   and scanning continues.  Scanning stops at the first single-byte
   (0b0xxxxxxx) character.  Bytes that are neither a lead nor a single-byte
   character are simply stepped over.  The limit is never moved forward past
   its original value and never below `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char * from, const char ** fromLimRef)
{
  const char * fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    size_t seqLen;

    if ((prev & 0xf8u) == 0xf0u) {        /* 4-byte character, lead by 0b11110xxx byte */
      seqLen = 4;
    } else if ((prev & 0xf0u) == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
      seqLen = 3;
    } else if ((prev & 0xe0u) == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
      seqLen = 2;
    } else if ((prev & 0x80u) == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
      break;
    } else {
      continue;  /* not a lead byte: keep walking backwards */
    }

    if (walked + 1 >= seqLen) {
      /* Character is complete: put the limit just past its last byte. */
      fromLim += seqLen - 1;
      break;
    }
    /* Incomplete character: drop it (the loop step moves past the lead
       byte) and restart the count. */
    walked = 0;
  }
  *fromLimRef = fromLim;
}
401
402static enum XML_Convert_Result PTRCALL
403utf8_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000404 const char **fromP, const char *fromLim,
405 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000406{
Victor Stinner759e30e2017-09-05 01:58:08 +0200407 bool input_incomplete = false;
408 bool output_exhausted = false;
Victor Stinner5ff71322017-06-21 14:39:22 +0200409
Victor Stinner759e30e2017-09-05 01:58:08 +0200410 /* Avoid copying partial characters (due to limited space). */
411 const ptrdiff_t bytesAvailable = fromLim - *fromP;
412 const ptrdiff_t bytesStorable = toLim - *toP;
413 if (bytesAvailable > bytesStorable) {
414 fromLim = *fromP + bytesStorable;
415 output_exhausted = true;
416 }
417
418 /* Avoid copying partial characters (from incomplete input). */
Miss Islington (bot)fb17b812018-06-26 19:44:32 -0700419 {
420 const char * const fromLimBefore = fromLim;
421 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
422 if (fromLim < fromLimBefore) {
423 input_incomplete = true;
424 }
Victor Stinner759e30e2017-09-05 01:58:08 +0200425 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200426
Miss Islington (bot)fb17b812018-06-26 19:44:32 -0700427 {
428 const ptrdiff_t bytesToCopy = fromLim - *fromP;
429 memcpy(*toP, *fromP, bytesToCopy);
430 *fromP += bytesToCopy;
431 *toP += bytesToCopy;
432 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200433
Miss Islington (bot)fb17b812018-06-26 19:44:32 -0700434 if (output_exhausted) /* needs to go first */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200435 return XML_CONVERT_OUTPUT_EXHAUSTED;
Victor Stinner759e30e2017-09-05 01:58:08 +0200436 else if (input_incomplete)
437 return XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200438 else
Victor Stinner5ff71322017-06-21 14:39:22 +0200439 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000440}
441
Victor Stinner23ec4b52017-06-15 00:54:36 +0200442static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000443utf8_toUtf16(const ENCODING *enc,
444 const char **fromP, const char *fromLim,
445 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000446{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200447 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000448 unsigned short *to = *toP;
449 const char *from = *fromP;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200450 while (from < fromLim && to < toLim) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000451 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
452 case BT_LEAD2:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200453 if (fromLim - from < 2) {
454 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200455 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200456 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000457 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000458 from += 2;
459 break;
460 case BT_LEAD3:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200461 if (fromLim - from < 3) {
462 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200463 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200464 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000465 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
466 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000467 from += 3;
468 break;
469 case BT_LEAD4:
470 {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000471 unsigned long n;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200472 if (toLim - to < 2) {
473 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000474 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200475 }
476 if (fromLim - from < 4) {
477 res = XML_CONVERT_INPUT_INCOMPLETE;
478 goto after;
479 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000480 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
481 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
482 n -= 0x10000;
483 to[0] = (unsigned short)((n >> 10) | 0xD800);
484 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
485 to += 2;
486 from += 4;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000487 }
488 break;
489 default:
490 *to++ = *from++;
491 break;
492 }
493 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200494 if (from < fromLim)
495 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000496after:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000497 *fromP = from;
498 *toP = to;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200499 return res;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000500}
501
502#ifdef XML_NS
503static const struct normal_encoding utf8_encoding_ns = {
504 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
505 {
506#include "asciitab.h"
507#include "utf8tab.h"
508 },
509 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
510};
511#endif
512
513static const struct normal_encoding utf8_encoding = {
514 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
515 {
516#define BT_COLON BT_NMSTRT
517#include "asciitab.h"
518#undef BT_COLON
519#include "utf8tab.h"
520 },
521 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
522};
523
524#ifdef XML_NS
525
526static const struct normal_encoding internal_utf8_encoding_ns = {
527 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
528 {
529#include "iasciitab.h"
530#include "utf8tab.h"
531 },
532 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
533};
534
535#endif
536
537static const struct normal_encoding internal_utf8_encoding = {
538 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
539 {
540#define BT_COLON BT_NMSTRT
541#include "iasciitab.h"
542#undef BT_COLON
543#include "utf8tab.h"
544 },
545 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
546};
547
Victor Stinner23ec4b52017-06-15 00:54:36 +0200548static enum XML_Convert_Result PTRCALL
549latin1_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000550 const char **fromP, const char *fromLim,
551 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000552{
553 for (;;) {
554 unsigned char c;
555 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200556 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000557 c = (unsigned char)**fromP;
558 if (c & 0x80) {
559 if (toLim - *toP < 2)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200560 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000561 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
562 *(*toP)++ = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000563 (*fromP)++;
564 }
565 else {
566 if (*toP == toLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200567 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000568 *(*toP)++ = *(*fromP)++;
569 }
570 }
571}
572
Victor Stinner23ec4b52017-06-15 00:54:36 +0200573static enum XML_Convert_Result PTRCALL
574latin1_toUtf16(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000575 const char **fromP, const char *fromLim,
576 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000577{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200578 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000579 *(*toP)++ = (unsigned char)*(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200580
581 if ((*toP == toLim) && (*fromP < fromLim))
582 return XML_CONVERT_OUTPUT_EXHAUSTED;
583 else
584 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000585}
586
587#ifdef XML_NS
588
589static const struct normal_encoding latin1_encoding_ns = {
590 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
591 {
592#include "asciitab.h"
593#include "latin1tab.h"
594 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200595 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000596};
597
598#endif
599
600static const struct normal_encoding latin1_encoding = {
601 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
602 {
603#define BT_COLON BT_NMSTRT
604#include "asciitab.h"
605#undef BT_COLON
606#include "latin1tab.h"
607 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200608 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000609};
610
Victor Stinner23ec4b52017-06-15 00:54:36 +0200611static enum XML_Convert_Result PTRCALL
612ascii_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000613 const char **fromP, const char *fromLim,
614 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000615{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200616 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000617 *(*toP)++ = *(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200618
619 if ((*toP == toLim) && (*fromP < fromLim))
620 return XML_CONVERT_OUTPUT_EXHAUSTED;
621 else
622 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000623}
624
625#ifdef XML_NS
626
627static const struct normal_encoding ascii_encoding_ns = {
628 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
629 {
630#include "asciitab.h"
631/* BT_NONXML == 0 */
632 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200633 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000634};
635
636#endif
637
638static const struct normal_encoding ascii_encoding = {
639 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
640 {
641#define BT_COLON BT_NMSTRT
642#include "asciitab.h"
643#undef BT_COLON
644/* BT_NONXML == 0 */
645 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200646 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000647};
648
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000649static int PTRFASTCALL
650unicode_byte_type(char hi, char lo)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000651{
652 switch ((unsigned char)hi) {
653 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
654 return BT_LEAD4;
655 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
656 return BT_TRAIL;
657 case 0xFF:
658 switch ((unsigned char)lo) {
659 case 0xFF:
660 case 0xFE:
661 return BT_NONXML;
662 }
663 break;
664 }
665 return BT_NONASCII;
666}
667
/* Define E##toUtf8: convert 16-bit units read with GET_LO/GET_HI from
   [*fromP, fromLim) into UTF-8 in [*toP, toLim).  GET_LO and GET_HI must be
   defined at the point of expansion.

   An odd trailing input byte is ignored (the limit is rounded down to an
   even length).  Units below 0x80 yield 1 byte, units with a high byte up
   to 0x07 yield 2 bytes, high bytes D8..DB start a surrogate pair that
   yields 4 bytes, and everything else yields 3 bytes.

   On XML_CONVERT_OUTPUT_EXHAUSTED or XML_CONVERT_INPUT_INCOMPLETE, *fromP
   is left at the first unit that was not converted. */
#define DEFINE_UTF16_TO_UTF8(E) \
static enum XML_Convert_Result PTRCALL \
E ## toUtf8(const ENCODING *UNUSED_P(enc), \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  fromLim = from + (((fromLim - from) >> 1) << 1);  /* shrink to even */ \
  for (; from < fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = (char)lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      *(*toP)++ = (char)((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = (char)((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = (char)((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (char)(((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = (char)((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return XML_CONVERT_INPUT_INCOMPLETE; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (char)(((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2;  /* step onto the second unit of the pair */ \
      lo2 = GET_LO(from); \
      *(*toP)++ = (char)(((lo & 0x3) << 4) \
                         | ((GET_HI(from) & 0x3) << 2) \
                         | (lo2 >> 6) \
                         | 0x80); \
      *(*toP)++ = (char)((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
  if (from < fromLim) \
    return XML_CONVERT_INPUT_INCOMPLETE; \
  else \
    return XML_CONVERT_COMPLETED; \
}
739
/* Define E##toUtf16: copy 16-bit units read with GET_HI/GET_LO from
   [*fromP, fromLim) into native unsigned shorts in [*toP, toLim).  GET_HI
   and GET_LO must be defined at the point of expansion.

   An odd trailing input byte is ignored.  When the input holds more units
   than the output can take and the last input unit has a high byte in
   D8..DF, that unit is excluded and XML_CONVERT_INPUT_INCOMPLETE is
   reported unless the output fills up first. */
#define DEFINE_UTF16_TO_UTF16(E) \
static enum XML_Convert_Result PTRCALL \
E ## toUtf16(const ENCODING *UNUSED_P(enc), \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1);  /* shrink to even */ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
    fromLim -= 2; \
    res = XML_CONVERT_INPUT_INCOMPLETE; \
  } \
  for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
    *(*toP)++ = (unsigned short)((GET_HI(*fromP) << 8) | GET_LO(*fromP)); \
  if ((*toP == toLim) && (*fromP < fromLim)) \
    return XML_CONVERT_OUTPUT_EXHAUSTED; \
  else \
    return res; \
}
761
762#define SET2(ptr, ch) \
763 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
764#define GET_LO(ptr) ((unsigned char)(ptr)[0])
765#define GET_HI(ptr) ((unsigned char)(ptr)[1])
766
767DEFINE_UTF16_TO_UTF8(little2_)
768DEFINE_UTF16_TO_UTF16(little2_)
769
770#undef SET2
771#undef GET_LO
772#undef GET_HI
773
774#define SET2(ptr, ch) \
775 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
776#define GET_LO(ptr) ((unsigned char)(ptr)[1])
777#define GET_HI(ptr) ((unsigned char)(ptr)[0])
778
779DEFINE_UTF16_TO_UTF8(big2_)
780DEFINE_UTF16_TO_UTF16(big2_)
781
782#undef SET2
783#undef GET_LO
784#undef GET_HI
785
/* Access macros for little-endian 2-byte characters: p[0] is the low byte
   and p[1] the high byte.  Arguments are evaluated more than once. */

/* High byte zero: use the encoding's type table; otherwise classify the
   unit with unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, -1 otherwise. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the unit equals the character c (high byte must be zero). */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
796
797#ifdef XML_MIN_SIZE
798
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000799static int PTRFASTCALL
800little2_byteType(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000801{
802 return LITTLE2_BYTE_TYPE(enc, p);
803}
804
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000805static int PTRFASTCALL
806little2_byteToAscii(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000807{
808 return LITTLE2_BYTE_TO_ASCII(enc, p);
809}
810
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000811static int PTRCALL
812little2_charMatches(const ENCODING *enc, const char *p, int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000813{
814 return LITTLE2_CHAR_MATCHES(enc, p, c);
815}
816
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000817static int PTRFASTCALL
818little2_isNameMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000819{
820 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
821}
822
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000823static int PTRFASTCALL
824little2_isNmstrtMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000825{
826 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
827}
828
829#undef VTABLE
830#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
831
832#else /* not XML_MIN_SIZE */
833
834#undef PREFIX
835#define PREFIX(ident) little2_ ## ident
836#define MINBPC(enc) 2
837/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
838#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000839#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000840#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
841#define IS_NAME_CHAR(enc, p, n) 0
842#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
843#define IS_NMSTRT_CHAR(enc, p, n) (0)
844#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
845
Gregory P. Smith64359d22012-07-14 14:12:35 -0700846#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000847#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700848#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000849
850#undef MINBPC
851#undef BYTE_TYPE
852#undef BYTE_TO_ASCII
853#undef CHAR_MATCHES
854#undef IS_NAME_CHAR
855#undef IS_NAME_CHAR_MINBPC
856#undef IS_NMSTRT_CHAR
857#undef IS_NMSTRT_CHAR_MINBPC
858#undef IS_INVALID_CHAR
859
860#endif /* not XML_MIN_SIZE */
861
#ifdef XML_NS

/* Little-endian two-byte encoding table for namespace-aware builds; the
   ASCII range uses asciitab.h without remapping BT_COLON.  The last scalar
   of the first initializer is 1 only when BYTEORDER is 1234.
   NOTE(review): the scalars presumably fill minBytesPerChar, isUtf8 and
   isUtf16 - the struct layout is declared outside this section. */
static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif
880
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000881static const struct normal_encoding little2_encoding = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000882 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000883#if BYTEORDER == 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000884 1
885#else
886 0
887#endif
888 },
889 {
890#define BT_COLON BT_NMSTRT
891#include "asciitab.h"
892#undef BT_COLON
893#include "latin1tab.h"
894 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200895 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000896};
897
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000898#if BYTEORDER != 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000899
900#ifdef XML_NS
901
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000902static const struct normal_encoding internal_little2_encoding_ns = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000903 { VTABLE, 2, 0, 1 },
904 {
905#include "iasciitab.h"
906#include "latin1tab.h"
907 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200908 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000909};
910
911#endif
912
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000913static const struct normal_encoding internal_little2_encoding = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000914 { VTABLE, 2, 0, 1 },
915 {
916#define BT_COLON BT_NMSTRT
917#include "iasciitab.h"
918#undef BT_COLON
919#include "latin1tab.h"
920 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200921 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000922};
923
924#endif
925
926
/* Classification helpers for big-endian two-byte input.  p points at a
   code unit whose high-order byte is p[0] and whose low-order byte is p[1].

   BIG2_BYTE_TYPE      byte type: looked up in the encoding's type table
                       when the high byte is zero, otherwise computed by
                       unicode_byte_type().
   BIG2_BYTE_TO_ASCII  p[1] when the high byte is zero, otherwise -1.
   BIG2_CHAR_MATCHES   non-zero when the code unit equals c.
   BIG2_IS_*_MINBPC    naming-bitmap lookups for the code unit.

   Every use of a macro parameter is parenthesized so that arguments which
   are expressions (for example `ptr + 2` or `flag ? a : b`) expand as the
   caller intended. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
937
938#ifdef XML_MIN_SIZE
939
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000940static int PTRFASTCALL
941big2_byteType(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000942{
943 return BIG2_BYTE_TYPE(enc, p);
944}
945
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000946static int PTRFASTCALL
947big2_byteToAscii(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000948{
949 return BIG2_BYTE_TO_ASCII(enc, p);
950}
951
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000952static int PTRCALL
953big2_charMatches(const ENCODING *enc, const char *p, int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000954{
955 return BIG2_CHAR_MATCHES(enc, p, c);
956}
957
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000958static int PTRFASTCALL
959big2_isNameMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000960{
961 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
962}
963
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000964static int PTRFASTCALL
965big2_isNmstrtMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000966{
967 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
968}
969
970#undef VTABLE
971#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
972
973#else /* not XML_MIN_SIZE */
974
975#undef PREFIX
976#define PREFIX(ident) big2_ ## ident
977#define MINBPC(enc) 2
978/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
979#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000980#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000981#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
982#define IS_NAME_CHAR(enc, p, n) 0
983#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
984#define IS_NMSTRT_CHAR(enc, p, n) (0)
985#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
986
Gregory P. Smith64359d22012-07-14 14:12:35 -0700987#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000988#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700989#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000990
991#undef MINBPC
992#undef BYTE_TYPE
993#undef BYTE_TO_ASCII
994#undef CHAR_MATCHES
995#undef IS_NAME_CHAR
996#undef IS_NAME_CHAR_MINBPC
997#undef IS_NMSTRT_CHAR
998#undef IS_NMSTRT_CHAR_MINBPC
999#undef IS_INVALID_CHAR
1000
1001#endif /* not XML_MIN_SIZE */
1002
#ifdef XML_NS

/* Big-endian two-byte encoding table for namespace-aware builds; the ASCII
   range uses asciitab.h without remapping BT_COLON.  The last scalar of
   the first initializer is 1 only when BYTEORDER is 4321.
   NOTE(review): the scalars presumably fill minBytesPerChar, isUtf8 and
   isUtf16 - the struct layout is declared outside this section. */
static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif
1021
1022static const struct normal_encoding big2_encoding = {
1023 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001024#if BYTEORDER == 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001025 1
1026#else
1027 0
1028#endif
1029 },
1030 {
1031#define BT_COLON BT_NMSTRT
1032#include "asciitab.h"
1033#undef BT_COLON
1034#include "latin1tab.h"
1035 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001036 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001037};
1038
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001039#if BYTEORDER != 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001040
1041#ifdef XML_NS
1042
1043static const struct normal_encoding internal_big2_encoding_ns = {
1044 { VTABLE, 2, 0, 1 },
1045 {
1046#include "iasciitab.h"
1047#include "latin1tab.h"
1048 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001049 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001050};
1051
1052#endif
1053
1054static const struct normal_encoding internal_big2_encoding = {
1055 { VTABLE, 2, 0, 1 },
1056 {
1057#define BT_COLON BT_NMSTRT
1058#include "iasciitab.h"
1059#undef BT_COLON
1060#include "latin1tab.h"
1061 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001062 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001063};
1064
1065#endif
1066
1067#undef PREFIX
1068
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001069static int FASTCALL
1070streqci(const char *s1, const char *s2)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001071{
1072 for (;;) {
1073 char c1 = *s1++;
1074 char c2 = *s2++;
1075 if (ASCII_a <= c1 && c1 <= ASCII_z)
1076 c1 += ASCII_A - ASCII_a;
1077 if (ASCII_a <= c2 && c2 <= ASCII_z)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001078 /* The following line will never get executed. streqci() is
1079 * only called from two places, both of which guarantee to put
1080 * upper-case strings into s2.
1081 */
1082 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001083 if (c1 != c2)
1084 return 0;
1085 if (!c1)
1086 break;
1087 }
1088 return 1;
1089}
1090
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001091static void PTRCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +02001092initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001093 const char *end, POSITION *pos)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001094{
1095 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1096}
1097
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001098static int
1099toAscii(const ENCODING *enc, const char *ptr, const char *end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001100{
1101 char buf[1];
1102 char *p = buf;
1103 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1104 if (p == buf)
1105 return -1;
1106 else
1107 return buf[0];
1108}
1109
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001110static int FASTCALL
1111isSpace(int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001112{
1113 switch (c) {
1114 case 0x20:
1115 case 0xD:
1116 case 0xA:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001117 case 0x9:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001118 return 1;
1119 }
1120 return 0;
1121}
1122
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001123/* Return 1 if there's just optional white space or there's an S
1124 followed by name=val.
1125*/
1126static int
1127parsePseudoAttribute(const ENCODING *enc,
1128 const char *ptr,
1129 const char *end,
1130 const char **namePtr,
1131 const char **nameEndPtr,
1132 const char **valPtr,
1133 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001134{
1135 int c;
1136 char open;
1137 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001138 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001139 return 1;
1140 }
1141 if (!isSpace(toAscii(enc, ptr, end))) {
1142 *nextTokPtr = ptr;
1143 return 0;
1144 }
1145 do {
1146 ptr += enc->minBytesPerChar;
1147 } while (isSpace(toAscii(enc, ptr, end)));
1148 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001149 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001150 return 1;
1151 }
1152 *namePtr = ptr;
1153 for (;;) {
1154 c = toAscii(enc, ptr, end);
1155 if (c == -1) {
1156 *nextTokPtr = ptr;
1157 return 0;
1158 }
1159 if (c == ASCII_EQUALS) {
1160 *nameEndPtr = ptr;
1161 break;
1162 }
1163 if (isSpace(c)) {
1164 *nameEndPtr = ptr;
1165 do {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001166 ptr += enc->minBytesPerChar;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001167 } while (isSpace(c = toAscii(enc, ptr, end)));
1168 if (c != ASCII_EQUALS) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001169 *nextTokPtr = ptr;
1170 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001171 }
1172 break;
1173 }
1174 ptr += enc->minBytesPerChar;
1175 }
1176 if (ptr == *namePtr) {
1177 *nextTokPtr = ptr;
1178 return 0;
1179 }
1180 ptr += enc->minBytesPerChar;
1181 c = toAscii(enc, ptr, end);
1182 while (isSpace(c)) {
1183 ptr += enc->minBytesPerChar;
1184 c = toAscii(enc, ptr, end);
1185 }
1186 if (c != ASCII_QUOT && c != ASCII_APOS) {
1187 *nextTokPtr = ptr;
1188 return 0;
1189 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001190 open = (char)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001191 ptr += enc->minBytesPerChar;
1192 *valPtr = ptr;
1193 for (;; ptr += enc->minBytesPerChar) {
1194 c = toAscii(enc, ptr, end);
1195 if (c == open)
1196 break;
1197 if (!(ASCII_a <= c && c <= ASCII_z)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001198 && !(ASCII_A <= c && c <= ASCII_Z)
1199 && !(ASCII_0 <= c && c <= ASCII_9)
1200 && c != ASCII_PERIOD
1201 && c != ASCII_MINUS
1202 && c != ASCII_UNDERSCORE) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001203 *nextTokPtr = ptr;
1204 return 0;
1205 }
1206 }
1207 *nextTokPtr = ptr + enc->minBytesPerChar;
1208 return 1;
1209}
1210
1211static const char KW_version[] = {
1212 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1213};
1214
1215static const char KW_encoding[] = {
1216 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1217};
1218
1219static const char KW_standalone[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001220 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1221 ASCII_n, ASCII_e, '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001222};
1223
1224static const char KW_yes[] = {
1225 ASCII_y, ASCII_e, ASCII_s, '\0'
1226};
1227
1228static const char KW_no[] = {
1229 ASCII_n, ASCII_o, '\0'
1230};
1231
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001232static int
1233doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1234 const char *,
1235 const char *),
1236 int isGeneralTextEntity,
1237 const ENCODING *enc,
1238 const char *ptr,
1239 const char *end,
1240 const char **badPtr,
1241 const char **versionPtr,
1242 const char **versionEndPtr,
1243 const char **encodingName,
1244 const ENCODING **encoding,
1245 int *standalone)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001246{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001247 const char *val = NULL;
1248 const char *name = NULL;
1249 const char *nameEnd = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001250 ptr += 5 * enc->minBytesPerChar;
1251 end -= 2 * enc->minBytesPerChar;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001252 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1253 || !name) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001254 *badPtr = ptr;
1255 return 0;
1256 }
1257 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1258 if (!isGeneralTextEntity) {
1259 *badPtr = name;
1260 return 0;
1261 }
1262 }
1263 else {
1264 if (versionPtr)
1265 *versionPtr = val;
1266 if (versionEndPtr)
1267 *versionEndPtr = ptr;
1268 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1269 *badPtr = ptr;
1270 return 0;
1271 }
1272 if (!name) {
1273 if (isGeneralTextEntity) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001274 /* a TextDecl must have an EncodingDecl */
1275 *badPtr = ptr;
1276 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001277 }
1278 return 1;
1279 }
1280 }
1281 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1282 int c = toAscii(enc, val, end);
1283 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1284 *badPtr = val;
1285 return 0;
1286 }
1287 if (encodingName)
1288 *encodingName = val;
1289 if (encoding)
1290 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1291 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1292 *badPtr = ptr;
1293 return 0;
1294 }
1295 if (!name)
1296 return 1;
1297 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001298 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1299 || isGeneralTextEntity) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001300 *badPtr = name;
1301 return 0;
1302 }
1303 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1304 if (standalone)
1305 *standalone = 1;
1306 }
1307 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1308 if (standalone)
1309 *standalone = 0;
1310 }
1311 else {
1312 *badPtr = val;
1313 return 0;
1314 }
1315 while (isSpace(toAscii(enc, ptr, end)))
1316 ptr += enc->minBytesPerChar;
1317 if (ptr != end) {
1318 *badPtr = ptr;
1319 return 0;
1320 }
1321 return 1;
1322}
1323
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001324static int FASTCALL
1325checkCharRefNumber(int result)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001326{
1327 switch (result >> 8) {
1328 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1329 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1330 return -1;
1331 case 0:
1332 if (latin1_encoding.type[result] == BT_NONXML)
1333 return -1;
1334 break;
1335 case 0xFF:
1336 if (result == 0xFFFE || result == 0xFFFF)
1337 return -1;
1338 break;
1339 }
1340 return result;
1341}
1342
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001343int FASTCALL
1344XmlUtf8Encode(int c, char *buf)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001345{
1346 enum {
1347 /* minN is minimum legal resulting value for N byte sequence */
1348 min2 = 0x80,
1349 min3 = 0x800,
1350 min4 = 0x10000
1351 };
1352
1353 if (c < 0)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001354 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001355 if (c < min2) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001356 buf[0] = (char)(c | UTF8_cval1);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001357 return 1;
1358 }
1359 if (c < min3) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001360 buf[0] = (char)((c >> 6) | UTF8_cval2);
1361 buf[1] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001362 return 2;
1363 }
1364 if (c < min4) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001365 buf[0] = (char)((c >> 12) | UTF8_cval3);
1366 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1367 buf[2] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001368 return 3;
1369 }
1370 if (c < 0x110000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001371 buf[0] = (char)((c >> 18) | UTF8_cval4);
1372 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1373 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1374 buf[3] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001375 return 4;
1376 }
Victor Stinner93d0cb52017-08-18 23:43:54 +02001377 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001378}
1379
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001380int FASTCALL
1381XmlUtf16Encode(int charNum, unsigned short *buf)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001382{
1383 if (charNum < 0)
1384 return 0;
1385 if (charNum < 0x10000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001386 buf[0] = (unsigned short)charNum;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001387 return 1;
1388 }
1389 if (charNum < 0x110000) {
1390 charNum -= 0x10000;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001391 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1392 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001393 return 2;
1394 }
1395 return 0;
1396}
1397
1398struct unknown_encoding {
1399 struct normal_encoding normal;
Fred Drake31d485c2004-08-03 07:06:22 +00001400 CONVERTER convert;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001401 void *userData;
1402 unsigned short utf16[256];
1403 char utf8[256][4];
1404};
1405
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001406#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1407
1408int
1409XmlSizeOfUnknownEncoding(void)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001410{
1411 return sizeof(struct unknown_encoding);
1412}
1413
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001414static int PTRFASTCALL
1415unknown_isName(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001416{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001417 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1418 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001419 if (c & ~0xFFFF)
1420 return 0;
1421 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1422}
1423
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001424static int PTRFASTCALL
1425unknown_isNmstrt(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001426{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001427 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1428 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001429 if (c & ~0xFFFF)
1430 return 0;
1431 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1432}
1433
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001434static int PTRFASTCALL
1435unknown_isInvalid(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001436{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001437 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1438 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001439 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1440}
1441
Victor Stinner23ec4b52017-06-15 00:54:36 +02001442static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001443unknown_toUtf8(const ENCODING *enc,
1444 const char **fromP, const char *fromLim,
1445 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001446{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001447 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001448 char buf[XML_UTF8_ENCODE_MAX];
1449 for (;;) {
1450 const char *utf8;
1451 int n;
1452 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001453 return XML_CONVERT_COMPLETED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001454 utf8 = uenc->utf8[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001455 n = *utf8++;
1456 if (n == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001457 int c = uenc->convert(uenc->userData, *fromP);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001458 n = XmlUtf8Encode(c, buf);
1459 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001460 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001461 utf8 = buf;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001462 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1463 - (BT_LEAD2 - 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001464 }
1465 else {
1466 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001467 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001468 (*fromP)++;
1469 }
Miss Islington (bot)fb17b812018-06-26 19:44:32 -07001470 memcpy(*toP, utf8, n);
1471 *toP += n;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001472 }
1473}
1474
Victor Stinner23ec4b52017-06-15 00:54:36 +02001475static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001476unknown_toUtf16(const ENCODING *enc,
1477 const char **fromP, const char *fromLim,
1478 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001479{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001480 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001481 while (*fromP < fromLim && *toP < toLim) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001482 unsigned short c = uenc->utf16[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001483 if (c == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001484 c = (unsigned short)
1485 uenc->convert(uenc->userData, *fromP);
1486 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1487 - (BT_LEAD2 - 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001488 }
1489 else
1490 (*fromP)++;
1491 *(*toP)++ = c;
1492 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001493
1494 if ((*toP == toLim) && (*fromP < fromLim))
1495 return XML_CONVERT_OUTPUT_EXHAUSTED;
1496 else
1497 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001498}
1499
1500ENCODING *
1501XmlInitUnknownEncoding(void *mem,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001502 int *table,
Gregory P. Smith64359d22012-07-14 14:12:35 -07001503 CONVERTER convert,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001504 void *userData)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001505{
1506 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001507 struct unknown_encoding *e = (struct unknown_encoding *)mem;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001508 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1509 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1510 for (i = 0; i < 128; i++)
1511 if (latin1_encoding.type[i] != BT_OTHER
1512 && latin1_encoding.type[i] != BT_NONXML
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001513 && table[i] != i)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001514 return 0;
1515 for (i = 0; i < 256; i++) {
1516 int c = table[i];
1517 if (c == -1) {
1518 e->normal.type[i] = BT_MALFORM;
1519 /* This shouldn't really get used. */
1520 e->utf16[i] = 0xFFFF;
1521 e->utf8[i][0] = 1;
1522 e->utf8[i][1] = 0;
1523 }
1524 else if (c < 0) {
1525 if (c < -4)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001526 return 0;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001527 /* Multi-byte sequences need a converter function */
1528 if (!convert)
1529 return 0;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001530 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001531 e->utf8[i][0] = 0;
1532 e->utf16[i] = 0;
1533 }
1534 else if (c < 0x80) {
1535 if (latin1_encoding.type[c] != BT_OTHER
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001536 && latin1_encoding.type[c] != BT_NONXML
1537 && c != i)
1538 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001539 e->normal.type[i] = latin1_encoding.type[c];
1540 e->utf8[i][0] = 1;
1541 e->utf8[i][1] = (char)c;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001542 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001543 }
1544 else if (checkCharRefNumber(c) < 0) {
1545 e->normal.type[i] = BT_NONXML;
1546 /* This shouldn't really get used. */
1547 e->utf16[i] = 0xFFFF;
1548 e->utf8[i][0] = 1;
1549 e->utf8[i][1] = 0;
1550 }
1551 else {
1552 if (c > 0xFFFF)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001553 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001554 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001555 e->normal.type[i] = BT_NMSTRT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001556 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001557 e->normal.type[i] = BT_NAME;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001558 else
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001559 e->normal.type[i] = BT_OTHER;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001560 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001561 e->utf16[i] = (unsigned short)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001562 }
1563 }
1564 e->userData = userData;
1565 e->convert = convert;
1566 if (convert) {
1567 e->normal.isName2 = unknown_isName;
1568 e->normal.isName3 = unknown_isName;
1569 e->normal.isName4 = unknown_isName;
1570 e->normal.isNmstrt2 = unknown_isNmstrt;
1571 e->normal.isNmstrt3 = unknown_isNmstrt;
1572 e->normal.isNmstrt4 = unknown_isNmstrt;
1573 e->normal.isInvalid2 = unknown_isInvalid;
1574 e->normal.isInvalid3 = unknown_isInvalid;
1575 e->normal.isInvalid4 = unknown_isInvalid;
1576 }
1577 e->normal.enc.utf8Convert = unknown_toUtf8;
1578 e->normal.enc.utf16Convert = unknown_toUtf16;
1579 return &(e->normal.enc);
1580}
1581
/* If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
1595
1596static const char KW_ISO_8859_1[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001597 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1598 ASCII_MINUS, ASCII_1, '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001599};
1600static const char KW_US_ASCII[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001601 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1602 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001603};
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001604static const char KW_UTF_8[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001605 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1606};
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001607static const char KW_UTF_16[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001608 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1609};
1610static const char KW_UTF_16BE[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001611 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1612 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001613};
1614static const char KW_UTF_16LE[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001615 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
1616 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001617};
1618
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001619static int FASTCALL
1620getEncodingIndex(const char *name)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001621{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001622 static const char * const encodingNames[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001623 KW_ISO_8859_1,
1624 KW_US_ASCII,
1625 KW_UTF_8,
1626 KW_UTF_16,
1627 KW_UTF_16BE,
1628 KW_UTF_16LE,
1629 };
1630 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001631 if (name == NULL)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001632 return NO_ENC;
1633 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1634 if (streqci(name, encodingNames[i]))
1635 return i;
1636 return UNKNOWN_ENC;
1637}
1638
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
   stores it, narrowed to char.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1645
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001646/* This is what detects the encoding. encodingTable maps from
1647 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1648 the external (protocol) specified encoding; state is
1649 XML_CONTENT_STATE if we're parsing an external text entity, and
1650 XML_PROLOG_STATE otherwise.
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001651*/
1652
1653
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001654static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001655initScan(const ENCODING * const *encodingTable,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001656 const INIT_ENCODING *enc,
1657 int state,
1658 const char *ptr,
1659 const char *end,
1660 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001661{
1662 const ENCODING **encPtr;
1663
Victor Stinner23ec4b52017-06-15 00:54:36 +02001664 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001665 return XML_TOK_NONE;
1666 encPtr = enc->encPtr;
1667 if (ptr + 1 == end) {
1668 /* only a single byte available for auto-detection */
1669#ifndef XML_DTD /* FIXME */
1670 /* a well-formed document entity must have more than one byte */
1671 if (state != XML_CONTENT_STATE)
1672 return XML_TOK_PARTIAL;
1673#endif
1674 /* so we're parsing an external text entity... */
1675 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1676 switch (INIT_ENC_INDEX(enc)) {
1677 case UTF_16_ENC:
1678 case UTF_16LE_ENC:
1679 case UTF_16BE_ENC:
1680 return XML_TOK_PARTIAL;
1681 }
1682 switch ((unsigned char)*ptr) {
1683 case 0xFE:
1684 case 0xFF:
1685 case 0xEF: /* possibly first byte of UTF-8 BOM */
1686 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001687 && state == XML_CONTENT_STATE)
1688 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001689 /* fall through */
1690 case 0x00:
1691 case 0x3C:
1692 return XML_TOK_PARTIAL;
1693 }
1694 }
1695 else {
1696 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1697 case 0xFEFF:
1698 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001699 && state == XML_CONTENT_STATE)
1700 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001701 *nextTokPtr = ptr + 2;
1702 *encPtr = encodingTable[UTF_16BE_ENC];
1703 return XML_TOK_BOM;
1704 /* 00 3C is handled in the default case */
1705 case 0x3C00:
1706 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001707 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1708 && state == XML_CONTENT_STATE)
1709 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001710 *encPtr = encodingTable[UTF_16LE_ENC];
1711 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1712 case 0xFFFE:
1713 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001714 && state == XML_CONTENT_STATE)
1715 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001716 *nextTokPtr = ptr + 2;
1717 *encPtr = encodingTable[UTF_16LE_ENC];
1718 return XML_TOK_BOM;
1719 case 0xEFBB:
1720 /* Maybe a UTF-8 BOM (EF BB BF) */
1721 /* If there's an explicitly specified (external) encoding
1722 of ISO-8859-1 or some flavour of UTF-16
1723 and this is an external text entity,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001724 don't look for the BOM,
1725 because it might be a legal data.
1726 */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001727 if (state == XML_CONTENT_STATE) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001728 int e = INIT_ENC_INDEX(enc);
1729 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1730 || e == UTF_16LE_ENC || e == UTF_16_ENC)
1731 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001732 }
1733 if (ptr + 2 == end)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001734 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001735 if ((unsigned char)ptr[2] == 0xBF) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001736 *nextTokPtr = ptr + 3;
1737 *encPtr = encodingTable[UTF_8_ENC];
1738 return XML_TOK_BOM;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001739 }
1740 break;
1741 default:
1742 if (ptr[0] == '\0') {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001743 /* 0 isn't a legal data character. Furthermore a document
1744 entity can only start with ASCII characters. So the only
Benjamin Peterson196d7db2016-06-11 13:28:56 -07001745 way this can fail to be big-endian UTF-16 if it it's an
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001746 external parsed general entity that's labelled as
1747 UTF-16LE.
1748 */
1749 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1750 break;
1751 *encPtr = encodingTable[UTF_16BE_ENC];
1752 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001753 }
1754 else if (ptr[1] == '\0') {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001755 /* We could recover here in the case:
1756 - parsing an external entity
1757 - second byte is 0
1758 - no externally specified encoding
1759 - no encoding declaration
1760 by assuming UTF-16LE. But we don't, because this would mean when
1761 presented just with a single byte, we couldn't reliably determine
1762 whether we needed further bytes.
1763 */
1764 if (state == XML_CONTENT_STATE)
1765 break;
1766 *encPtr = encodingTable[UTF_16LE_ENC];
1767 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001768 }
1769 break;
1770 }
1771 }
1772 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1773 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1774}
1775
1776
1777#define NS(x) x
1778#define ns(x) x
Gregory P. Smith64359d22012-07-14 14:12:35 -07001779#define XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001780#include "xmltok_ns.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -07001781#undef XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001782#undef NS
1783#undef ns
1784
1785#ifdef XML_NS
1786
1787#define NS(x) x ## NS
1788#define ns(x) x ## _ns
1789
Gregory P. Smith64359d22012-07-14 14:12:35 -07001790#define XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001791#include "xmltok_ns.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -07001792#undef XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001793
1794#undef NS
1795#undef ns
1796
1797ENCODING *
1798XmlInitUnknownEncodingNS(void *mem,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001799 int *table,
Gregory P. Smith64359d22012-07-14 14:12:35 -07001800 CONVERTER convert,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001801 void *userData)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001802{
1803 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1804 if (enc)
1805 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1806 return enc;
1807}
1808
1809#endif /* XML_NS */