blob: fa35de71324cebea13f1cbf309bb8e2931bf3956 [file] [log] [blame]
Victor Stinner759e30e2017-09-05 01:58:08 +02001/*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
12
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
19 following conditions:
20
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000031*/
32
Miss Islington (bot)1467a3a2018-12-10 03:28:13 -080033#include <pyconfig.h>
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070034#include <stddef.h>
Miss Islington (bot)fb17b812018-06-26 19:44:32 -070035#include <string.h> /* memcpy */
36
37#if defined(_MSC_VER) && (_MSC_VER <= 1700)
38 /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
39# define bool int
40# define false 0
41# define true 1
42#else
43# include <stdbool.h>
44#endif
45
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070046
Victor Stinner5ff71322017-06-21 14:39:22 +020047#ifdef _WIN32
Martin v. Löwisfc03a942003-01-25 22:41:29 +000048#include "winconfig.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000049#else
Fred Drake08317ae2003-10-21 15:38:55 +000050#ifdef HAVE_EXPAT_CONFIG_H
Martin v. Löwisfc03a942003-01-25 22:41:29 +000051#include <expat_config.h>
Fred Drake08317ae2003-10-21 15:38:55 +000052#endif
Victor Stinner5ff71322017-06-21 14:39:22 +020053#endif /* ndef _WIN32 */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000054
Fred Drake31d485c2004-08-03 07:06:22 +000055#include "expat_external.h"
Martin v. Löwisfc03a942003-01-25 22:41:29 +000056#include "internal.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000057#include "xmltok.h"
58#include "nametab.h"
59
/* Optional trailing entry for the first brace group of VTABLE1: expands to
   ", PREFIX(ignoreSectionTok)" when XML_DTD is defined and to nothing
   otherwise. */
#if defined(XML_DTD)
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif
65
/* Initializer fragment naming the PREFIX()-qualified routines of one
   encoding implementation.  The first brace group holds the prolog,
   content and CDATA-section tokenizers (plus the ignore-section tokenizer
   when IGNORE_SECTION_TOK_VTABLE is non-empty); the second holds the
   attribute-value and entity-value tokenizers; the remaining entries are
   the per-encoding helper routines, in the order listed. */
#define VTABLE1                                             \
  { PREFIX(prologTok),                                      \
    PREFIX(contentTok),                                     \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE },    \
  { PREFIX(attributeValueTok),                              \
    PREFIX(entityValueTok) },                               \
  PREFIX(nameMatchesAscii),                                 \
  PREFIX(nameLength),                                       \
  PREFIX(skipS),                                            \
  PREFIX(getAtts),                                          \
  PREFIX(charRefNumber),                                    \
  PREFIX(predefinedEntityName),                             \
  PREFIX(updatePosition),                                   \
  PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-qualified UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
80
/* Look up the naming bit of a UCS-2 character given as high byte `hi` and
   low byte `lo`.  pages[hi] selects a group of eight words in namingBitmap,
   the top 3 bits of `lo` select the word inside that group and the low
   5 bits of `lo` select the bit.  Evaluates to non-zero when the bit is
   set.  Every parameter is parenthesized so that expression arguments
   (for example a pointer sum passed as `pages`) expand as a unit. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000083
/* A 2-byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 bits of the first byte and the bottom 6 bits of the second.
   The top 3 of those bits index into pages, the next 3 bits are added to
   the (shifted) page value to select a word of namingBitmap, and the last
   5 bits generate the mask.  `byte` must point at unsigned char.
*/
#define UTF8_GET_NAMING2(pages, byte)                        \
  (namingBitmap[((pages)[((byte)[0] >> 2) & 0x07] << 3)      \
                + (((byte)[0] & 0x03) << 1)                  \
                + (((byte)[1] >> 5) & 0x01)]                 \
   & (1u << ((byte)[1] & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000093
/* A 3-byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the three bytes.  The top 8 bits index
   into pages, the next 3 bits are added to the (shifted) page value to
   select a word of namingBitmap, and the last 5 bits generate the mask.
   `byte` must point at unsigned char.
*/
#define UTF8_GET_NAMING3(pages, byte)                            \
  (namingBitmap[((pages)[(((byte)[0] & 0x0F) << 4)               \
                         + (((byte)[1] >> 2) & 0x0F)]            \
                 << 3)                                           \
                + (((byte)[1] & 0x03) << 1)                      \
                + (((byte)[2] >> 5) & 0x01)]                     \
   & (1u << ((byte)[2] & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000106
/* Naming-bit lookup for the n-byte UTF-8 sequence at p: dispatches to
   UTF8_GET_NAMING2 for n == 2 and UTF8_GET_NAMING3 for n == 3; any other
   n yields 0. */
#define UTF8_GET_NAMING(pages, p, n)                                  \
  ((n) == 2                                                           \
     ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))            \
     : ((n) == 3                                                      \
          ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p))       \
          : 0))
113
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF
*/
123
/* Non-zero when the two bytes at p are not an acceptable 2-byte UTF-8
   sequence: the lead byte must be at least 0xC2 and the second byte must
   lie in 0x80..0xBF.  p must point at unsigned char.  The argument is
   parenthesized before being dereferenced so that pointer expressions
   such as `buf + 1` are tested as a whole. */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
126
/* Non-zero when the three bytes at p are not an acceptable 3-byte UTF-8
   sequence.  Checks, in order:
     - the third byte is in 0x80..0xBF, except that after EF,BF it must
       not exceed 0xBD (rejects EF,BF,BE and EF,BF,BF);
     - after lead 0xE0 the second byte is in 0xA0..0xBF;
     - after lead 0xED the second byte is in 0x80..0x9F;
     - otherwise the second byte is in 0x80..0xBF.
   p must point at unsigned char.  The argument is parenthesized before
   being dereferenced so that pointer expressions such as `buf + 1` are
   tested as a whole. */
#define UTF8_INVALID3(p)                                               \
  (((p)[2] & 0x80) == 0                                                \
   ||                                                                  \
   ((*(p)) == 0xEF && (p)[1] == 0xBF                                   \
      ?                                                                \
      (p)[2] > 0xBD                                                    \
      :                                                                \
      ((p)[2] & 0xC0) == 0xC0)                                         \
   ||                                                                  \
   ((*(p)) == 0xE0                                                     \
      ?                                                                \
      (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                         \
      :                                                                \
      ((p)[1] & 0x80) == 0                                             \
      ||                                                               \
      ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000143
/* Non-zero when the four bytes at p are not an acceptable 4-byte UTF-8
   sequence.  The third and fourth bytes must be in 0x80..0xBF; the second
   byte must be in 0x90..0xBF after lead 0xF0, in 0x80..0x8F after lead
   0xF4, and in 0x80..0xBF otherwise.  p must point at unsigned char.  The
   argument is parenthesized before being dereferenced so that pointer
   expressions such as `buf + 1` are tested as a whole. */
#define UTF8_INVALID4(p)                                               \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0                     \
   ||                                                                  \
   ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0                     \
   ||                                                                  \
   ((*(p)) == 0xF0                                                     \
      ?                                                                \
      (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                         \
      :                                                                \
      ((p)[1] & 0x80) == 0                                             \
      ||                                                               \
      ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000156
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000157static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200158isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000159{
160 return 0;
161}
162
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000163static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200164utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000165{
166 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
167}
168
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000169static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200170utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000171{
172 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
173}
174
175#define utf8_isName4 isNever
176
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000177static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200178utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000179{
180 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
181}
182
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000183static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200184utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000185{
186 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
187}
188
189#define utf8_isNmstrt4 isNever
190
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000191static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200192utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000193{
194 return UTF8_INVALID2((const unsigned char *)p);
195}
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000196
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000197static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200198utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000199{
200 return UTF8_INVALID3((const unsigned char *)p);
201}
202
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000203static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200204utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000205{
206 return UTF8_INVALID4((const unsigned char *)p);
207}
208
209struct normal_encoding {
210 ENCODING enc;
211 unsigned char type[256];
212#ifdef XML_MIN_SIZE
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000213 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
214 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
215 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
216 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
217 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000218#endif /* XML_MIN_SIZE */
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000219 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
220 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
221 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
222 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
223 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
224 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
225 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
226 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
227 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000228};
229
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000230#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
231
/* Initializer fragment for the XML_MIN_SIZE-only members of
   struct normal_encoding, built from the function-name prefix E.  Note the
   trailing comma: the fragment is meant to be followed directly by more
   initializers.  Without XML_MIN_SIZE it expands to nothing. */
#if defined(XML_MIN_SIZE)

#define STANDARD_VTABLE(E)  \
  E ## byteType,            \
  E ## isNameMin,           \
  E ## isNmstrtMin,         \
  E ## byteToAscii,         \
  E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif
246
/* Initializer fragment for the nine multi-byte predicate members of
   struct normal_encoding (isName2..4, isNmstrt2..4, isInvalid2..4), built
   from the function-name prefix E. */
#define NORMAL_VTABLE(E)  \
  E ## isName2,           \
  E ## isName3,           \
  E ## isName4,           \
  E ## isNmstrt2,         \
  E ## isNmstrt3,         \
  E ## isNmstrt4,         \
  E ## isInvalid2,        \
  E ## isInvalid3,        \
  E ## isInvalid4
257
/* Counterpart of NORMAL_VTABLE that sets all nine multi-byte predicate
   members of struct normal_encoding to NULL. */
#define NULL_VTABLE          \
  /* isName2    */ NULL,     \
  /* isName3    */ NULL,     \
  /* isName4    */ NULL,     \
  /* isNmstrt2  */ NULL,     \
  /* isNmstrt3  */ NULL,     \
  /* isNmstrt4  */ NULL,     \
  /* isInvalid2 */ NULL,     \
  /* isInvalid3 */ NULL,     \
  /* isInvalid4 */ NULL
268
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000269static int FASTCALL checkCharRefNumber(int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000270
271#include "xmltok_impl.h"
272#include "ascii.h"
273
/* Single-byte ("sb_") settings used while instantiating the tokenizer for
   the normal encodings.  In the XML_MIN_SIZE build the MINBPC name tests
   are the always-false predicate and MINBPC is read from the encoding;
   otherwise MINBPC is the constant 1. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
285
/* Byte type of the byte at p, read from the type table of the
   struct normal_encoding that enc points to.  The byte is converted to
   unsigned char before indexing so that values >= 0x80 index correctly.
   The cast keeps the const qualifier instead of casting it away; the table
   is only read. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
288
/* BYTE_TYPE(enc, p): byte type of the character at p.  The XML_MIN_SIZE
   build dispatches through the encoding's byteType pointer (sb_byteType
   being the single-byte implementation); otherwise the table lookup is
   expanded inline. */
#if defined(XML_MIN_SIZE)

static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}

#define BYTE_TYPE(enc, p) \
  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))

#else

#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)

#endif
300
/* BYTE_TO_ASCII(enc, p): the character at p as a plain value.  The
   XML_MIN_SIZE build dispatches through the encoding's byteToAscii pointer
   (sb_byteToAscii being the single-byte implementation, which returns *p);
   otherwise the macro is simply *p. */
#if defined(XML_MIN_SIZE)

static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}

#define BYTE_TO_ASCII(enc, p) \
  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))

#else

#define BYTE_TO_ASCII(enc, p) (*(p))

#endif
312
/* Multi-byte character predicates: each pastes the sequence length n onto
   the member name and calls the corresponding function pointer of the
   struct normal_encoding behind enc (isName2, isNmstrt3, isInvalid4, ...). */
#define IS_NAME_CHAR(enc, p, n)     (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n)   (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n)  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000319
/* Name / name-start tests for a character of MINBPC bytes.  The
   XML_MIN_SIZE build calls the encoding's isNameMin / isNmstrtMin
   pointers; otherwise both tests are the constant 0. */
#if defined(XML_MIN_SIZE)
#define IS_NAME_CHAR_MINBPC(enc, p)    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p)  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p)    (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p)  (0)
#endif
329
/* CHAR_MATCHES(enc, p, c): non-zero when the character at p equals the
   ASCII character c.  The XML_MIN_SIZE build dispatches through the
   encoding's charMatches pointer (sb_charMatches being the single-byte
   implementation); otherwise the comparison is expanded inline.  In the
   inline form c is parenthesized so that an expression argument such as
   `cond ? 'a' : 'b'` is compared as a whole. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
342
343#define PREFIX(ident) normal_ ## ident
Gregory P. Smith64359d22012-07-14 14:12:35 -0700344#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000345#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700346#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000347
348#undef MINBPC
349#undef BYTE_TYPE
350#undef BYTE_TO_ASCII
351#undef CHAR_MATCHES
352#undef IS_NAME_CHAR
353#undef IS_NAME_CHAR_MINBPC
354#undef IS_NMSTRT_CHAR
355#undef IS_NMSTRT_CHAR_MINBPC
356#undef IS_INVALID_CHAR
357
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence, i.e. the marker bits that are OR-ed into the lead byte. */
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xC0,
  UTF8_cval3 = 0xE0,
  UTF8_cval4 = 0xF0
};
364
/* Move *fromLimRef backwards (never before `from`) so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`.  Bytes that are
   neither ASCII nor a lead byte are stepped over and counted.  When a lead
   byte is reached, the sequence it starts is kept if enough bytes follow
   it (counting the lead itself); otherwise the lead byte is dropped as
   well and the scan continues.  An ASCII byte ends the scan with the limit
   placed right after it. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char * from, const char ** fromLimRef)
{
  const char *end = *fromLimRef;
  size_t stepped = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char byte = (unsigned char)end[-1];
    size_t seqLen;

    if ((byte & 0xf8u) == 0xf0u)
      seqLen = 4;             /* lead byte 0b11110xxx */
    else if ((byte & 0xf0u) == 0xe0u)
      seqLen = 3;             /* lead byte 0b1110xxxx */
    else if ((byte & 0xe0u) == 0xc0u)
      seqLen = 2;             /* lead byte 0b110xxxxx */
    else if ((byte & 0x80u) == 0x00u)
      break;                  /* 1-byte character 0b0xxxxxxx: keep it */
    else
      seqLen = 0;             /* not a lead byte: just step over it */

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* complete character: put the limit right after its last byte */
        end += seqLen - 1;
        break;
      }
      /* incomplete character: drop the lead byte too and start over */
      stepped = 0;
    }

    end--;
    stepped++;
  }

  *fromLimRef = end;
}
399
400static enum XML_Convert_Result PTRCALL
401utf8_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000402 const char **fromP, const char *fromLim,
403 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000404{
Victor Stinner759e30e2017-09-05 01:58:08 +0200405 bool input_incomplete = false;
406 bool output_exhausted = false;
Victor Stinner5ff71322017-06-21 14:39:22 +0200407
Victor Stinner759e30e2017-09-05 01:58:08 +0200408 /* Avoid copying partial characters (due to limited space). */
409 const ptrdiff_t bytesAvailable = fromLim - *fromP;
410 const ptrdiff_t bytesStorable = toLim - *toP;
411 if (bytesAvailable > bytesStorable) {
412 fromLim = *fromP + bytesStorable;
413 output_exhausted = true;
414 }
415
416 /* Avoid copying partial characters (from incomplete input). */
Miss Islington (bot)fb17b812018-06-26 19:44:32 -0700417 {
418 const char * const fromLimBefore = fromLim;
419 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
420 if (fromLim < fromLimBefore) {
421 input_incomplete = true;
422 }
Victor Stinner759e30e2017-09-05 01:58:08 +0200423 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200424
Miss Islington (bot)fb17b812018-06-26 19:44:32 -0700425 {
426 const ptrdiff_t bytesToCopy = fromLim - *fromP;
427 memcpy(*toP, *fromP, bytesToCopy);
428 *fromP += bytesToCopy;
429 *toP += bytesToCopy;
430 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200431
Miss Islington (bot)fb17b812018-06-26 19:44:32 -0700432 if (output_exhausted) /* needs to go first */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200433 return XML_CONVERT_OUTPUT_EXHAUSTED;
Victor Stinner759e30e2017-09-05 01:58:08 +0200434 else if (input_incomplete)
435 return XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200436 else
Victor Stinner5ff71322017-06-21 14:39:22 +0200437 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000438}
439
Victor Stinner23ec4b52017-06-15 00:54:36 +0200440static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000441utf8_toUtf16(const ENCODING *enc,
442 const char **fromP, const char *fromLim,
443 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000444{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200445 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000446 unsigned short *to = *toP;
447 const char *from = *fromP;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200448 while (from < fromLim && to < toLim) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000449 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
450 case BT_LEAD2:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200451 if (fromLim - from < 2) {
452 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200453 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200454 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000455 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000456 from += 2;
457 break;
458 case BT_LEAD3:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200459 if (fromLim - from < 3) {
460 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200461 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200462 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000463 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
464 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000465 from += 3;
466 break;
467 case BT_LEAD4:
468 {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000469 unsigned long n;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200470 if (toLim - to < 2) {
471 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000472 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200473 }
474 if (fromLim - from < 4) {
475 res = XML_CONVERT_INPUT_INCOMPLETE;
476 goto after;
477 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000478 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
479 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
480 n -= 0x10000;
481 to[0] = (unsigned short)((n >> 10) | 0xD800);
482 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
483 to += 2;
484 from += 4;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000485 }
486 break;
487 default:
488 *to++ = *from++;
489 break;
490 }
491 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200492 if (from < fromLim)
493 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000494after:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000495 *fromP = from;
496 *toP = to;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200497 return res;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000498}
499
500#ifdef XML_NS
501static const struct normal_encoding utf8_encoding_ns = {
502 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
503 {
504#include "asciitab.h"
505#include "utf8tab.h"
506 },
507 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
508};
509#endif
510
511static const struct normal_encoding utf8_encoding = {
512 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
513 {
514#define BT_COLON BT_NMSTRT
515#include "asciitab.h"
516#undef BT_COLON
517#include "utf8tab.h"
518 },
519 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
520};
521
522#ifdef XML_NS
523
524static const struct normal_encoding internal_utf8_encoding_ns = {
525 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
526 {
527#include "iasciitab.h"
528#include "utf8tab.h"
529 },
530 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
531};
532
533#endif
534
535static const struct normal_encoding internal_utf8_encoding = {
536 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
537 {
538#define BT_COLON BT_NMSTRT
539#include "iasciitab.h"
540#undef BT_COLON
541#include "utf8tab.h"
542 },
543 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
544};
545
Victor Stinner23ec4b52017-06-15 00:54:36 +0200546static enum XML_Convert_Result PTRCALL
547latin1_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000548 const char **fromP, const char *fromLim,
549 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000550{
551 for (;;) {
552 unsigned char c;
553 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200554 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000555 c = (unsigned char)**fromP;
556 if (c & 0x80) {
557 if (toLim - *toP < 2)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200558 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000559 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
560 *(*toP)++ = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000561 (*fromP)++;
562 }
563 else {
564 if (*toP == toLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200565 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000566 *(*toP)++ = *(*fromP)++;
567 }
568 }
569}
570
Victor Stinner23ec4b52017-06-15 00:54:36 +0200571static enum XML_Convert_Result PTRCALL
572latin1_toUtf16(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000573 const char **fromP, const char *fromLim,
574 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000575{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200576 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000577 *(*toP)++ = (unsigned char)*(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200578
579 if ((*toP == toLim) && (*fromP < fromLim))
580 return XML_CONVERT_OUTPUT_EXHAUSTED;
581 else
582 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000583}
584
585#ifdef XML_NS
586
587static const struct normal_encoding latin1_encoding_ns = {
588 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
589 {
590#include "asciitab.h"
591#include "latin1tab.h"
592 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200593 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000594};
595
596#endif
597
598static const struct normal_encoding latin1_encoding = {
599 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
600 {
601#define BT_COLON BT_NMSTRT
602#include "asciitab.h"
603#undef BT_COLON
604#include "latin1tab.h"
605 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200606 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000607};
608
Victor Stinner23ec4b52017-06-15 00:54:36 +0200609static enum XML_Convert_Result PTRCALL
610ascii_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000611 const char **fromP, const char *fromLim,
612 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000613{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200614 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000615 *(*toP)++ = *(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200616
617 if ((*toP == toLim) && (*fromP < fromLim))
618 return XML_CONVERT_OUTPUT_EXHAUSTED;
619 else
620 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000621}
622
623#ifdef XML_NS
624
625static const struct normal_encoding ascii_encoding_ns = {
626 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
627 {
628#include "asciitab.h"
629/* BT_NONXML == 0 */
630 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200631 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000632};
633
634#endif
635
636static const struct normal_encoding ascii_encoding = {
637 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
638 {
639#define BT_COLON BT_NMSTRT
640#include "asciitab.h"
641#undef BT_COLON
642/* BT_NONXML == 0 */
643 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200644 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000645};
646
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000647static int PTRFASTCALL
648unicode_byte_type(char hi, char lo)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000649{
650 switch ((unsigned char)hi) {
651 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
652 return BT_LEAD4;
653 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
654 return BT_TRAIL;
655 case 0xFF:
656 switch ((unsigned char)lo) {
657 case 0xFF:
658 case 0xFE:
659 return BT_NONXML;
660 }
661 break;
662 }
663 return BT_NONASCII;
664}
665
/* Generates E##toUtf8: convert 16-bit units from [*fromP, fromLim) to
   UTF-8 in [*toP, toLim).  The byte order comes from the GET_LO / GET_HI
   macros in effect where this macro is expanded.  A trailing odd byte of
   input is ignored.  On every return *fromP points at the first unit not
   converted.

   The generated function returns XML_CONVERT_OUTPUT_EXHAUSTED when the
   next character does not fit into the output, XML_CONVERT_INPUT_INCOMPLETE
   when a unit with high byte 0xD8..0xDB is not followed by a second unit,
   and XML_CONVERT_COMPLETED otherwise. */
#define DEFINE_UTF16_TO_UTF8(E) \
static enum XML_Convert_Result PTRCALL \
E ## toUtf8(const ENCODING *UNUSED_P(enc), \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *src = *fromP; \
  fromLim = src + (((fromLim - src) >> 1) << 1); /* shrink to even */ \
  while (src < fromLim) { \
    unsigned char lo = GET_LO(src); \
    unsigned char hi = GET_HI(src); \
    if (hi == 0 && lo < 0x80) { \
      /* below 0x80: one byte */ \
      if (*toP == toLim) { \
        *fromP = src; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      *(*toP)++ = lo; \
    } \
    else if (hi <= 0x7) { \
      /* up to 0x7FF: two bytes */ \
      if (toLim - *toP < 2) { \
        *fromP = src; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
    } \
    else if (hi >= 0xD8 && hi <= 0xDB) { \
      /* first unit of a pair: four bytes, consumes two units */ \
      int plane; \
      unsigned char lo2; \
      if (toLim - *toP < 4) { \
        *fromP = src; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      if (fromLim - src < 4) { \
        *fromP = src; \
        return XML_CONVERT_INPUT_INCOMPLETE; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      src += 2; \
      lo2 = GET_LO(src); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(src) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
    } \
    else { \
      /* everything else: three bytes */ \
      if (toLim - *toP < 3)  { \
        *fromP = src; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
    } \
    src += 2; \
  } \
  *fromP = src; \
  if (src < fromLim) \
    return XML_CONVERT_INPUT_INCOMPLETE; \
  else \
    return XML_CONVERT_COMPLETED; \
}
737
/* Generates E##toUtf16: copy 16-bit units from [*fromP, fromLim) into
   native unsigned short code units in [*toP, toLim), assembling each unit
   from the GET_HI / GET_LO macros in effect where this macro is expanded.
   A trailing odd byte of input is ignored.  *fromP and *toP are advanced
   past what was converted.

   When the input is larger than the output and its last unit has a high
   byte in 0xD8..0xDF, that unit is excluded and the pending result becomes
   XML_CONVERT_INPUT_INCOMPLETE.  The generated function returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input
   remains, and the pending result otherwise. */
#define DEFINE_UTF16_TO_UTF16(E) \
static enum XML_Convert_Result PTRCALL \
E ## toUtf16(const ENCODING *UNUSED_P(enc), \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  enum XML_Convert_Result status = XML_CONVERT_COMPLETED; \
  const char *src = *fromP; \
  unsigned short *dst = *toP; \
  fromLim = src + (((fromLim - src) >> 1) << 1); /* shrink to even */ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - src > ((toLim - dst) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
    fromLim -= 2; \
    status = XML_CONVERT_INPUT_INCOMPLETE; \
  } \
  while (src < fromLim && dst < toLim) { \
    *dst++ = (GET_HI(src) << 8) | GET_LO(src); \
    src += 2; \
  } \
  *fromP = src; \
  *toP = dst; \
  if ((dst == toLim) && (src < fromLim)) \
    return XML_CONVERT_OUTPUT_EXHAUSTED; \
  return status; \
}
759
760#define SET2(ptr, ch) \
761 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
762#define GET_LO(ptr) ((unsigned char)(ptr)[0])
763#define GET_HI(ptr) ((unsigned char)(ptr)[1])
764
765DEFINE_UTF16_TO_UTF8(little2_)
766DEFINE_UTF16_TO_UTF16(little2_)
767
768#undef SET2
769#undef GET_LO
770#undef GET_HI
771
772#define SET2(ptr, ch) \
773 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
774#define GET_LO(ptr) ((unsigned char)(ptr)[1])
775#define GET_HI(ptr) ((unsigned char)(ptr)[0])
776
777DEFINE_UTF16_TO_UTF8(big2_)
778DEFINE_UTF16_TO_UTF16(big2_)
779
780#undef SET2
781#undef GET_LO
782#undef GET_HI
783
/* Per-character helpers for little-endian 2-byte input, where p[0] is the
   low byte and p[1] the high byte of the character at p.

   LITTLE2_BYTE_TYPE: with a zero high byte the type comes from the type
   table of the struct normal_encoding behind enc, otherwise from
   unicode_byte_type().  The cast keeps the const qualifier of enc.
   LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is zero, else -1.
   LITTLE2_CHAR_MATCHES: non-zero when the character equals c.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC: naming-bit
   lookup in namePages / nmstrtPages.

   All parameters that are used are parenthesized so that expression
   arguments (pointer sums, conditional expressions) expand as a unit. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
794
795#ifdef XML_MIN_SIZE
796
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000797static int PTRFASTCALL
798little2_byteType(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000799{
800 return LITTLE2_BYTE_TYPE(enc, p);
801}
802
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000803static int PTRFASTCALL
804little2_byteToAscii(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000805{
806 return LITTLE2_BYTE_TO_ASCII(enc, p);
807}
808
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000809static int PTRCALL
810little2_charMatches(const ENCODING *enc, const char *p, int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000811{
812 return LITTLE2_CHAR_MATCHES(enc, p, c);
813}
814
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000815static int PTRFASTCALL
816little2_isNameMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000817{
818 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
819}
820
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000821static int PTRFASTCALL
822little2_isNmstrtMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000823{
824 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
825}
826
/* Rebuild VTABLE for the little2 encodings: VTABLE1 followed by the two
   little2 converter functions. */
#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
829
830#else /* not XML_MIN_SIZE */
831
/* Bind the generic scanner macros to their little2 implementations, then
   include xmltok_impl.c with those bindings in effect.
   NOTE(review): presumably the include emits the little2_-prefixed scanner
   functions through PREFIX -- confirm in xmltok_impl.c. */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
/* The n-byte variants are constant 0 for this encoding; only the MINBPC
   variants do a real lookup. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Remove the bindings so the next encoding family can install its own. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
857
858#endif /* not XML_MIN_SIZE */
859
860#ifdef XML_NS
861
/* Encoding descriptor for 2-byte little-endian input, XML_NS variant.
   Uses asciitab.h as-is (BT_COLON is not remapped here, unlike
   little2_encoding). */
static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
    /* Final field is 1 only when BYTEORDER == 1234.
       NOTE(review): presumably the three scalars are minBytesPerChar,
       isUtf8 and isUtf16 -- confirm against struct encoding. */
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
    /* Byte-type table contents come from the two included headers. */
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};
876
877#endif
878
/* Encoding descriptor for 2-byte little-endian input, non-namespace
   variant: identical to little2_encoding_ns except that BT_COLON is
   defined as BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
    /* Final field is 1 only when BYTEORDER == 1234.
       NOTE(review): presumably the three scalars are minBytesPerChar,
       isUtf8 and isUtf16 -- confirm against struct encoding. */
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
    /* Temporarily alias BT_COLON so the colon entry in asciitab.h is
       emitted as BT_NMSTRT. */
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};
895
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000896#if BYTEORDER != 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000897
898#ifdef XML_NS
899
/* Internal 2-byte little-endian encoding descriptor, XML_NS variant.
   Only compiled when BYTEORDER != 4321.  Differs from little2_encoding_ns
   in using iasciitab.h and in a final scalar that is always 1. */
static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};
908
909#endif
910
/* Internal 2-byte little-endian encoding descriptor, non-namespace
   variant.  Only compiled when BYTEORDER != 4321.  BT_COLON is defined as
   BT_NMSTRT while iasciitab.h is included. */
static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
    /* Temporarily alias BT_COLON so the colon entry is emitted as
       BT_NMSTRT. */
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};
921
922#endif
923
924
/* Character-classification helpers for 2-byte big-endian input.
   p points at a 2-byte unit: p[0] is the high byte, p[1] the low byte.
   Every macro argument is parenthesized in the expansion so that
   expressions such as `ptr + 2` or `a | b` can be passed safely. */

/* Byte type of the unit at p: looked up in the encoding's type table when
   the high byte is zero, otherwise computed by unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
/* Low byte of the unit when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p has a zero high byte and low byte equal to c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name / name-start tests via the namePages / nmstrtPages bitmaps, indexed
   by (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
935
936#ifdef XML_MIN_SIZE
937
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000938static int PTRFASTCALL
939big2_byteType(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000940{
941 return BIG2_BYTE_TYPE(enc, p);
942}
943
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000944static int PTRFASTCALL
945big2_byteToAscii(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000946{
947 return BIG2_BYTE_TO_ASCII(enc, p);
948}
949
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000950static int PTRCALL
951big2_charMatches(const ENCODING *enc, const char *p, int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000952{
953 return BIG2_CHAR_MATCHES(enc, p, c);
954}
955
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000956static int PTRFASTCALL
957big2_isNameMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000958{
959 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
960}
961
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000962static int PTRFASTCALL
963big2_isNmstrtMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000964{
965 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
966}
967
/* Rebuild VTABLE for the big2 encodings: VTABLE1 followed by the two big2
   converter functions. */
#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
970
971#else /* not XML_MIN_SIZE */
972
/* Bind the generic scanner macros to their big2 implementations, then
   include xmltok_impl.c with those bindings in effect.
   NOTE(review): presumably the include emits the big2_-prefixed scanner
   functions through PREFIX -- confirm in xmltok_impl.c. */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
/* The n-byte variants are constant 0 for this encoding; only the MINBPC
   variants do a real lookup. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Remove the bindings again now that the include is done. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
998
999#endif /* not XML_MIN_SIZE */
1000
1001#ifdef XML_NS
1002
/* Encoding descriptor for 2-byte big-endian input, XML_NS variant.
   Uses asciitab.h as-is (BT_COLON is not remapped here, unlike
   big2_encoding). */
static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
    /* Final field is 1 only when BYTEORDER == 4321.
       NOTE(review): presumably the three scalars are minBytesPerChar,
       isUtf8 and isUtf16 -- confirm against struct encoding. */
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
    /* Byte-type table contents come from the two included headers. */
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};
1017
1018#endif
1019
/* Encoding descriptor for 2-byte big-endian input, non-namespace variant:
   identical to big2_encoding_ns except that BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included. */
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
    /* Final field is 1 only when BYTEORDER == 4321.
       NOTE(review): presumably the three scalars are minBytesPerChar,
       isUtf8 and isUtf16 -- confirm against struct encoding. */
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
    /* Temporarily alias BT_COLON so the colon entry in asciitab.h is
       emitted as BT_NMSTRT. */
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};
1036
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001037#if BYTEORDER != 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001038
1039#ifdef XML_NS
1040
/* Internal 2-byte big-endian encoding descriptor, XML_NS variant.
   Only compiled when BYTEORDER != 1234.  Differs from big2_encoding_ns in
   using iasciitab.h and in a final scalar that is always 1. */
static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};
1049
1050#endif
1051
/* Internal 2-byte big-endian encoding descriptor, non-namespace variant.
   Only compiled when BYTEORDER != 1234.  BT_COLON is defined as BT_NMSTRT
   while iasciitab.h is included. */
static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
    /* Temporarily alias BT_COLON so the colon entry is emitted as
       BT_NMSTRT. */
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};
1062
1063#endif
1064
1065#undef PREFIX
1066
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001067static int FASTCALL
1068streqci(const char *s1, const char *s2)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001069{
1070 for (;;) {
1071 char c1 = *s1++;
1072 char c2 = *s2++;
1073 if (ASCII_a <= c1 && c1 <= ASCII_z)
1074 c1 += ASCII_A - ASCII_a;
1075 if (ASCII_a <= c2 && c2 <= ASCII_z)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001076 /* The following line will never get executed. streqci() is
1077 * only called from two places, both of which guarantee to put
1078 * upper-case strings into s2.
1079 */
1080 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001081 if (c1 != c2)
1082 return 0;
1083 if (!c1)
1084 break;
1085 }
1086 return 1;
1087}
1088
/* Position updater that ignores the encoding it is given and forwards
   ptr, end and pos to normal_updatePosition() with utf8_encoding. */
static void PTRCALL
initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr,
                   const char *end, POSITION *pos)
{
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
1095
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001096static int
1097toAscii(const ENCODING *enc, const char *ptr, const char *end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001098{
1099 char buf[1];
1100 char *p = buf;
1101 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1102 if (p == buf)
1103 return -1;
1104 else
1105 return buf[0];
1106}
1107
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001108static int FASTCALL
1109isSpace(int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001110{
1111 switch (c) {
1112 case 0x20:
1113 case 0xD:
1114 case 0xA:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001115 case 0x9:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001116 return 1;
1117 }
1118 return 0;
1119}
1120
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001121/* Return 1 if there's just optional white space or there's an S
1122 followed by name=val.
1123*/
1124static int
1125parsePseudoAttribute(const ENCODING *enc,
1126 const char *ptr,
1127 const char *end,
1128 const char **namePtr,
1129 const char **nameEndPtr,
1130 const char **valPtr,
1131 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001132{
1133 int c;
1134 char open;
1135 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001136 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001137 return 1;
1138 }
1139 if (!isSpace(toAscii(enc, ptr, end))) {
1140 *nextTokPtr = ptr;
1141 return 0;
1142 }
1143 do {
1144 ptr += enc->minBytesPerChar;
1145 } while (isSpace(toAscii(enc, ptr, end)));
1146 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001147 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001148 return 1;
1149 }
1150 *namePtr = ptr;
1151 for (;;) {
1152 c = toAscii(enc, ptr, end);
1153 if (c == -1) {
1154 *nextTokPtr = ptr;
1155 return 0;
1156 }
1157 if (c == ASCII_EQUALS) {
1158 *nameEndPtr = ptr;
1159 break;
1160 }
1161 if (isSpace(c)) {
1162 *nameEndPtr = ptr;
1163 do {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001164 ptr += enc->minBytesPerChar;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001165 } while (isSpace(c = toAscii(enc, ptr, end)));
1166 if (c != ASCII_EQUALS) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001167 *nextTokPtr = ptr;
1168 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001169 }
1170 break;
1171 }
1172 ptr += enc->minBytesPerChar;
1173 }
1174 if (ptr == *namePtr) {
1175 *nextTokPtr = ptr;
1176 return 0;
1177 }
1178 ptr += enc->minBytesPerChar;
1179 c = toAscii(enc, ptr, end);
1180 while (isSpace(c)) {
1181 ptr += enc->minBytesPerChar;
1182 c = toAscii(enc, ptr, end);
1183 }
1184 if (c != ASCII_QUOT && c != ASCII_APOS) {
1185 *nextTokPtr = ptr;
1186 return 0;
1187 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001188 open = (char)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001189 ptr += enc->minBytesPerChar;
1190 *valPtr = ptr;
1191 for (;; ptr += enc->minBytesPerChar) {
1192 c = toAscii(enc, ptr, end);
1193 if (c == open)
1194 break;
1195 if (!(ASCII_a <= c && c <= ASCII_z)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001196 && !(ASCII_A <= c && c <= ASCII_Z)
1197 && !(ASCII_0 <= c && c <= ASCII_9)
1198 && c != ASCII_PERIOD
1199 && c != ASCII_MINUS
1200 && c != ASCII_UNDERSCORE) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001201 *nextTokPtr = ptr;
1202 return 0;
1203 }
1204 }
1205 *nextTokPtr = ptr + enc->minBytesPerChar;
1206 return 1;
1207}
1208
/* NUL-terminated keyword strings compared against pseudo-attribute names
   and values in doParseXmlDecl().  They are spelled with ASCII_* constants
   rather than string literals.
   NOTE(review): presumably so the bytes are ASCII regardless of the
   compiler's execution character set -- confirm in ascii.h. */

/* "version" */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

/* "encoding" */
static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

/* "standalone" */
static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

/* "yes" */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s, '\0'
};

/* "no" */
static const char KW_no[] = {
  ASCII_n, ASCII_o, '\0'
};
1229
/* Parse the pseudo-attributes of an XML or text declaration.

   encodingFinder       callback mapping an encoding name (start, end) to
                        an ENCODING; only called when `encoding` is non-NULL
   isGeneralTextEntity  non-zero selects the text-declaration rules:
                        version becomes optional, encoding mandatory and
                        standalone forbidden
   enc, ptr, end        encoding and extent of the declaration text
   badPtr               receives the error position when 0 is returned
   versionPtr, versionEndPtr, encodingName, encoding, standalone
                        optional outputs; each is written only if non-NULL
                        and only if the matching pseudo-attribute is present

   Returns 1 on success, 0 on a malformed declaration.
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                 const char *,
                                                 const char *),
               int isGeneralTextEntity,
               const ENCODING *enc,
               const char *ptr,
               const char *end,
               const char **badPtr,
               const char **versionPtr,
               const char **versionEndPtr,
               const char **encodingName,
               const ENCODING **encoding,
               int *standalone)
{
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  /* NOTE(review): presumably skips the 5-character "<?xml" opener and
     trims the 2-character "?>" closer -- confirm against callers. */
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* The first pseudo-attribute must exist. */
  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a text declaration may omit the version. */
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    if (versionEndPtr)
      *versionEndPtr = ptr;
    /* Fetch the pseudo-attribute after version, if any. */
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name must start with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so the value ends one
       character earlier. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  /* Anything remaining must be standalone, which a text declaration
     does not allow. */
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone pseudo-attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
1321
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001322static int FASTCALL
1323checkCharRefNumber(int result)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001324{
1325 switch (result >> 8) {
1326 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1327 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1328 return -1;
1329 case 0:
1330 if (latin1_encoding.type[result] == BT_NONXML)
1331 return -1;
1332 break;
1333 case 0xFF:
1334 if (result == 0xFFFE || result == 0xFFFF)
1335 return -1;
1336 break;
1337 }
1338 return result;
1339}
1340
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001341int FASTCALL
1342XmlUtf8Encode(int c, char *buf)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001343{
1344 enum {
1345 /* minN is minimum legal resulting value for N byte sequence */
1346 min2 = 0x80,
1347 min3 = 0x800,
1348 min4 = 0x10000
1349 };
1350
1351 if (c < 0)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001352 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001353 if (c < min2) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001354 buf[0] = (char)(c | UTF8_cval1);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001355 return 1;
1356 }
1357 if (c < min3) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001358 buf[0] = (char)((c >> 6) | UTF8_cval2);
1359 buf[1] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001360 return 2;
1361 }
1362 if (c < min4) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001363 buf[0] = (char)((c >> 12) | UTF8_cval3);
1364 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1365 buf[2] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001366 return 3;
1367 }
1368 if (c < 0x110000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001369 buf[0] = (char)((c >> 18) | UTF8_cval4);
1370 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1371 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1372 buf[3] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001373 return 4;
1374 }
Victor Stinner93d0cb52017-08-18 23:43:54 +02001375 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001376}
1377
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001378int FASTCALL
1379XmlUtf16Encode(int charNum, unsigned short *buf)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001380{
1381 if (charNum < 0)
1382 return 0;
1383 if (charNum < 0x10000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001384 buf[0] = (unsigned short)charNum;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001385 return 1;
1386 }
1387 if (charNum < 0x110000) {
1388 charNum -= 0x10000;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001389 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1390 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001391 return 2;
1392 }
1393 return 0;
1394}
1395
/* Encoding object for a caller-supplied encoding, filled in by
   XmlInitUnknownEncoding().  It begins with a struct normal_encoding so a
   pointer to it can be cast from an ENCODING pointer (see
   AS_UNKNOWN_ENCODING). */
struct unknown_encoding {
  struct normal_encoding normal;
  /* Callback used for bytes whose table entry is 0, together with the
     opaque pointer passed to it as first argument. */
  CONVERTER convert;
  void *userData;
  /* Per-byte UTF-16 value; 0 means "call convert". */
  unsigned short utf16[256];
  /* Per-byte UTF-8 form: element [0] is the byte count (0 means "call
     convert"), the following elements are the encoded bytes. */
  char utf8[256][4];
};

#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1405
1406int
1407XmlSizeOfUnknownEncoding(void)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001408{
1409 return sizeof(struct unknown_encoding);
1410}
1411
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001412static int PTRFASTCALL
1413unknown_isName(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001414{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001415 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1416 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001417 if (c & ~0xFFFF)
1418 return 0;
1419 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1420}
1421
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001422static int PTRFASTCALL
1423unknown_isNmstrt(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001424{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001425 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1426 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001427 if (c & ~0xFFFF)
1428 return 0;
1429 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1430}
1431
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001432static int PTRFASTCALL
1433unknown_isInvalid(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001434{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001435 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1436 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001437 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1438}
1439
Victor Stinner23ec4b52017-06-15 00:54:36 +02001440static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001441unknown_toUtf8(const ENCODING *enc,
1442 const char **fromP, const char *fromLim,
1443 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001444{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001445 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001446 char buf[XML_UTF8_ENCODE_MAX];
1447 for (;;) {
1448 const char *utf8;
1449 int n;
1450 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001451 return XML_CONVERT_COMPLETED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001452 utf8 = uenc->utf8[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001453 n = *utf8++;
1454 if (n == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001455 int c = uenc->convert(uenc->userData, *fromP);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001456 n = XmlUtf8Encode(c, buf);
1457 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001458 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001459 utf8 = buf;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001460 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1461 - (BT_LEAD2 - 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001462 }
1463 else {
1464 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001465 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001466 (*fromP)++;
1467 }
Miss Islington (bot)fb17b812018-06-26 19:44:32 -07001468 memcpy(*toP, utf8, n);
1469 *toP += n;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001470 }
1471}
1472
Victor Stinner23ec4b52017-06-15 00:54:36 +02001473static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001474unknown_toUtf16(const ENCODING *enc,
1475 const char **fromP, const char *fromLim,
1476 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001477{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001478 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001479 while (*fromP < fromLim && *toP < toLim) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001480 unsigned short c = uenc->utf16[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001481 if (c == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001482 c = (unsigned short)
1483 uenc->convert(uenc->userData, *fromP);
1484 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1485 - (BT_LEAD2 - 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001486 }
1487 else
1488 (*fromP)++;
1489 *(*toP)++ = c;
1490 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001491
1492 if ((*toP == toLim) && (*fromP < fromLim))
1493 return XML_CONVERT_OUTPUT_EXHAUSTED;
1494 else
1495 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001496}
1497
1498ENCODING *
1499XmlInitUnknownEncoding(void *mem,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001500 int *table,
Gregory P. Smith64359d22012-07-14 14:12:35 -07001501 CONVERTER convert,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001502 void *userData)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001503{
1504 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001505 struct unknown_encoding *e = (struct unknown_encoding *)mem;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001506 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1507 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1508 for (i = 0; i < 128; i++)
1509 if (latin1_encoding.type[i] != BT_OTHER
1510 && latin1_encoding.type[i] != BT_NONXML
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001511 && table[i] != i)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001512 return 0;
1513 for (i = 0; i < 256; i++) {
1514 int c = table[i];
1515 if (c == -1) {
1516 e->normal.type[i] = BT_MALFORM;
1517 /* This shouldn't really get used. */
1518 e->utf16[i] = 0xFFFF;
1519 e->utf8[i][0] = 1;
1520 e->utf8[i][1] = 0;
1521 }
1522 else if (c < 0) {
1523 if (c < -4)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001524 return 0;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001525 /* Multi-byte sequences need a converter function */
1526 if (!convert)
1527 return 0;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001528 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001529 e->utf8[i][0] = 0;
1530 e->utf16[i] = 0;
1531 }
1532 else if (c < 0x80) {
1533 if (latin1_encoding.type[c] != BT_OTHER
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001534 && latin1_encoding.type[c] != BT_NONXML
1535 && c != i)
1536 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001537 e->normal.type[i] = latin1_encoding.type[c];
1538 e->utf8[i][0] = 1;
1539 e->utf8[i][1] = (char)c;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001540 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001541 }
1542 else if (checkCharRefNumber(c) < 0) {
1543 e->normal.type[i] = BT_NONXML;
1544 /* This shouldn't really get used. */
1545 e->utf16[i] = 0xFFFF;
1546 e->utf8[i][0] = 1;
1547 e->utf8[i][1] = 0;
1548 }
1549 else {
1550 if (c > 0xFFFF)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001551 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001552 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001553 e->normal.type[i] = BT_NMSTRT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001554 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001555 e->normal.type[i] = BT_NAME;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001556 else
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001557 e->normal.type[i] = BT_OTHER;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001558 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001559 e->utf16[i] = (unsigned short)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001560 }
1561 }
1562 e->userData = userData;
1563 e->convert = convert;
1564 if (convert) {
1565 e->normal.isName2 = unknown_isName;
1566 e->normal.isName3 = unknown_isName;
1567 e->normal.isName4 = unknown_isName;
1568 e->normal.isNmstrt2 = unknown_isNmstrt;
1569 e->normal.isNmstrt3 = unknown_isNmstrt;
1570 e->normal.isNmstrt4 = unknown_isNmstrt;
1571 e->normal.isInvalid2 = unknown_isInvalid;
1572 e->normal.isInvalid3 = unknown_isInvalid;
1573 e->normal.isInvalid4 = unknown_isInvalid;
1574 }
1575 e->normal.enc.utf8Convert = unknown_toUtf8;
1576 e->normal.enc.utf16Convert = unknown_toUtf16;
1577 return &(e->normal.enc);
1578}
1579
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  /* returned by getEncodingIndex() for a name it does not recognise */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex() when no name was given (NULL) */
  NO_ENC
};
1593
/* Upper-case, NUL-terminated names of the built-in encodings, spelled
   with ASCII_* constants.  getEncodingIndex() compares candidate names
   against these with streqci(). */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
/* "US-ASCII" */
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
/* "UTF-8" */
static const char KW_UTF_8[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
/* "UTF-16" */
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
/* "UTF-16BE" */
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
/* "UTF-16LE" */
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
1616
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001617static int FASTCALL
1618getEncodingIndex(const char *name)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001619{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001620 static const char * const encodingNames[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001621 KW_ISO_8859_1,
1622 KW_US_ASCII,
1623 KW_UTF_8,
1624 KW_UTF_16,
1625 KW_UTF_16BE,
1626 KW_UTF_16LE,
1627 };
1628 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001629 if (name == NULL)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001630 return NO_ENC;
1631 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1632 if (streqci(name, encodingNames[i]))
1633 return i;
1634 return UNKNOWN_ENC;
1635}
1636
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
   stores i, narrowed to char.  The argument i is parenthesized so the
   cast applies to the whole expression rather than its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1643
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001644/* This is what detects the encoding. encodingTable maps from
1645 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1646 the external (protocol) specified encoding; state is
1647 XML_CONTENT_STATE if we're parsing an external text entity, and
1648 XML_PROLOG_STATE otherwise.
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001649*/
1650
1651
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001652static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001653initScan(const ENCODING * const *encodingTable,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001654 const INIT_ENCODING *enc,
1655 int state,
1656 const char *ptr,
1657 const char *end,
1658 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001659{
1660 const ENCODING **encPtr;
1661
Victor Stinner23ec4b52017-06-15 00:54:36 +02001662 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001663 return XML_TOK_NONE;
1664 encPtr = enc->encPtr;
1665 if (ptr + 1 == end) {
1666 /* only a single byte available for auto-detection */
1667#ifndef XML_DTD /* FIXME */
1668 /* a well-formed document entity must have more than one byte */
1669 if (state != XML_CONTENT_STATE)
1670 return XML_TOK_PARTIAL;
1671#endif
1672 /* so we're parsing an external text entity... */
1673 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1674 switch (INIT_ENC_INDEX(enc)) {
1675 case UTF_16_ENC:
1676 case UTF_16LE_ENC:
1677 case UTF_16BE_ENC:
1678 return XML_TOK_PARTIAL;
1679 }
1680 switch ((unsigned char)*ptr) {
1681 case 0xFE:
1682 case 0xFF:
1683 case 0xEF: /* possibly first byte of UTF-8 BOM */
1684 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001685 && state == XML_CONTENT_STATE)
1686 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001687 /* fall through */
1688 case 0x00:
1689 case 0x3C:
1690 return XML_TOK_PARTIAL;
1691 }
1692 }
1693 else {
1694 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1695 case 0xFEFF:
1696 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001697 && state == XML_CONTENT_STATE)
1698 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001699 *nextTokPtr = ptr + 2;
1700 *encPtr = encodingTable[UTF_16BE_ENC];
1701 return XML_TOK_BOM;
1702 /* 00 3C is handled in the default case */
1703 case 0x3C00:
1704 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001705 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1706 && state == XML_CONTENT_STATE)
1707 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001708 *encPtr = encodingTable[UTF_16LE_ENC];
1709 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1710 case 0xFFFE:
1711 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001712 && state == XML_CONTENT_STATE)
1713 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001714 *nextTokPtr = ptr + 2;
1715 *encPtr = encodingTable[UTF_16LE_ENC];
1716 return XML_TOK_BOM;
1717 case 0xEFBB:
1718 /* Maybe a UTF-8 BOM (EF BB BF) */
1719 /* If there's an explicitly specified (external) encoding
1720 of ISO-8859-1 or some flavour of UTF-16
1721 and this is an external text entity,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001722 don't look for the BOM,
1723 because it might be a legal data.
1724 */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001725 if (state == XML_CONTENT_STATE) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001726 int e = INIT_ENC_INDEX(enc);
1727 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1728 || e == UTF_16LE_ENC || e == UTF_16_ENC)
1729 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001730 }
1731 if (ptr + 2 == end)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001732 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001733 if ((unsigned char)ptr[2] == 0xBF) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001734 *nextTokPtr = ptr + 3;
1735 *encPtr = encodingTable[UTF_8_ENC];
1736 return XML_TOK_BOM;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001737 }
1738 break;
1739 default:
1740 if (ptr[0] == '\0') {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001741 /* 0 isn't a legal data character. Furthermore a document
1742 entity can only start with ASCII characters. So the only
Benjamin Peterson196d7db2016-06-11 13:28:56 -07001743 way this can fail to be big-endian UTF-16 if it it's an
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001744 external parsed general entity that's labelled as
1745 UTF-16LE.
1746 */
1747 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1748 break;
1749 *encPtr = encodingTable[UTF_16BE_ENC];
1750 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001751 }
1752 else if (ptr[1] == '\0') {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001753 /* We could recover here in the case:
1754 - parsing an external entity
1755 - second byte is 0
1756 - no externally specified encoding
1757 - no encoding declaration
1758 by assuming UTF-16LE. But we don't, because this would mean when
1759 presented just with a single byte, we couldn't reliably determine
1760 whether we needed further bytes.
1761 */
1762 if (state == XML_CONTENT_STATE)
1763 break;
1764 *encPtr = encodingTable[UTF_16LE_ENC];
1765 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001766 }
1767 break;
1768 }
1769 }
1770 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1771 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1772}
1773
1774
1775#define NS(x) x
1776#define ns(x) x
Gregory P. Smith64359d22012-07-14 14:12:35 -07001777#define XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001778#include "xmltok_ns.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -07001779#undef XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001780#undef NS
1781#undef ns
1782
1783#ifdef XML_NS
1784
1785#define NS(x) x ## NS
1786#define ns(x) x ## _ns
1787
Gregory P. Smith64359d22012-07-14 14:12:35 -07001788#define XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001789#include "xmltok_ns.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -07001790#undef XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001791
1792#undef NS
1793#undef ns
1794
1795ENCODING *
1796XmlInitUnknownEncodingNS(void *mem,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001797 int *table,
Gregory P. Smith64359d22012-07-14 14:12:35 -07001798 CONVERTER convert,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001799 void *userData)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001800{
1801 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1802 if (enc)
1803 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1804 return enc;
1805}
1806
1807#endif /* XML_NS */