/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000-2017 Expat development team
   Licensed under the MIT license:

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons to whom the Software is furnished to do so, subject to the
   following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
32
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070033#include <stddef.h>
Benjamin Peterson4e211002018-06-26 19:25:45 -070034#include <string.h> /* memcpy */
35
36#if defined(_MSC_VER) && (_MSC_VER <= 1700)
37 /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
38# define bool int
39# define false 0
40# define true 1
41#else
42# include <stdbool.h>
43#endif
44
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070045
Victor Stinner5ff71322017-06-21 14:39:22 +020046#ifdef _WIN32
Martin v. Löwisfc03a942003-01-25 22:41:29 +000047#include "winconfig.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000048#else
Fred Drake08317ae2003-10-21 15:38:55 +000049#ifdef HAVE_EXPAT_CONFIG_H
Martin v. Löwisfc03a942003-01-25 22:41:29 +000050#include <expat_config.h>
Fred Drake08317ae2003-10-21 15:38:55 +000051#endif
Victor Stinner5ff71322017-06-21 14:39:22 +020052#endif /* ndef _WIN32 */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000053
Fred Drake31d485c2004-08-03 07:06:22 +000054#include "expat_external.h"
Martin v. Löwisfc03a942003-01-25 22:41:29 +000055#include "internal.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000056#include "xmltok.h"
57#include "nametab.h"
58
/* Extra entry appended to the first function group of VTABLE1; it is only
   present when DTD support (XML_DTD) is compiled in. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Leading ENCODING initializers for the function family selected by the
   current PREFIX: two brace-enclosed groups of tokenizer functions followed
   by the individual helper functions. */
#define VTABLE1 \
  { \
    PREFIX(prologTok), \
    PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE \
  }, \
  { \
    PREFIX(attributeValueTok), \
    PREFIX(entityValueTok) \
  }, \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the two transcoders of the same PREFIX family. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
79
/* Test one bit of namingBitmap for a 16-bit character given as its high
   byte `hi` and low byte `lo`.  (pages)[hi] selects a group of eight bitmap
   words, the top three bits of `lo` select the word inside that group and
   the low five bits of `lo` select the bit.  Evaluates to nonzero when the
   bit is set.  `lo` is expanded twice, so it must not have side effects.
   Every parameter is parenthesized so that pointer expressions such as
   `table + 1` can be passed for `pages`. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000082
/* Naming-bitmap lookup for a 2-byte UTF-8 sequence.

   The two bytes carry 11 payload bits: 5 in the first byte and 6 in the
   second.  Of those, the top 3 bits index `pages`, the next 3 bits pick a
   word within the selected group of eight, and the low 5 bits pick the bit
   inside that word.  `byte` must point to unsigned char.
*/
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[((byte)[0] >> 2) & 7] << 3) \
                + (((byte)[0] & 3) << 1) \
                + (((byte)[1] >> 5) & 1)] \
   & (1u << ((byte)[1] & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000092
/* Naming-bitmap lookup for a 3-byte UTF-8 sequence.

   The three bytes carry 16 payload bits: 4, 6 and 6.  The top 8 bits index
   `pages`, the next 3 bits pick a word within the selected group of eight,
   and the low 5 bits pick the bit inside that word.  `byte` must point to
   unsigned char.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0] & 0xF) << 4) \
                         + (((byte)[1] >> 2) & 0xF)] \
                 << 3) \
                + (((byte)[1] & 3) << 1) \
                + (((byte)[2] >> 5) & 1)] \
   & (1u << ((byte)[2] & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000105
/* Dispatch on the sequence length n: 2 and 3 go to the matching lookup
   macro with p viewed as unsigned bytes; every other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : (n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
   : 0)
112
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF
*/

/* Nonzero when the two bytes at p are not a valid 2-byte sequence: the lead
   byte is below 0xC2, or the second byte lies outside 0x80..0xBF.
   p must point to unsigned char.  The lead byte is read as (p)[0] so that
   pointer expressions such as `buf + 1` are handled correctly. */
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
125
/* Nonzero when the three bytes at p are not an acceptable 3-byte sequence.
   p must point to unsigned char.  Checks, in order:
     - third byte must be 0x80..0xBF, except that after EF,BF it must not
       exceed 0xBD (this rejects the code points 0xFFFE and 0xFFFF);
     - second byte must be 0xA0..0xBF after lead E0, 0x80..0x9F after lead
       ED, and 0x80..0xBF after any other lead byte.
   The lead byte is read as (p)[0] so that pointer expressions such as
   `buf + 1` are handled correctly. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF \
         ? (p)[2] > 0xBD \
         : ((p)[2] & 0xC0) == 0xC0) \
   || ((p)[0] == 0xE0 \
         ? ((p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0) \
         : (((p)[1] & 0x80) == 0 \
            || ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000142
/* Nonzero when the four bytes at p are not an acceptable 4-byte sequence.
   p must point to unsigned char.  The third and fourth bytes must be
   0x80..0xBF; the second byte must be 0x90..0xBF after lead F0,
   0x80..0x8F after lead F4, and 0x80..0xBF after any other lead byte.
   The lead byte itself is not range-checked here.
   The lead byte is read as (p)[0] so that pointer expressions such as
   `buf + 1` are handled correctly. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
   || ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
   || ((p)[0] == 0xF0 \
         ? ((p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0) \
         : (((p)[1] & 0x80) == 0 \
            || ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000155
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000156static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200157isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000158{
159 return 0;
160}
161
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000162static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200163utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000164{
165 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
166}
167
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000168static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200169utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000170{
171 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
172}
173
174#define utf8_isName4 isNever
175
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000176static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200177utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000178{
179 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
180}
181
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000182static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200183utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000184{
185 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
186}
187
188#define utf8_isNmstrt4 isNever
189
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000190static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200191utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000192{
193 return UTF8_INVALID2((const unsigned char *)p);
194}
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000195
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000196static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200197utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000198{
199 return UTF8_INVALID3((const unsigned char *)p);
200}
201
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000202static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200203utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000204{
205 return UTF8_INVALID4((const unsigned char *)p);
206}
207
208struct normal_encoding {
209 ENCODING enc;
210 unsigned char type[256];
211#ifdef XML_MIN_SIZE
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000212 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
213 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
214 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
215 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
216 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000217#endif /* XML_MIN_SIZE */
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000218 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
219 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
220 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
221 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
222 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
223 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
224 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
225 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
226 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000227};
228
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000229#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
230
/* STANDARD_VTABLE(E): initializers for the struct normal_encoding members
   that exist only in XML_MIN_SIZE builds, taken from the E-prefixed
   functions.  Note the trailing comma: it is meant to be followed directly
   by NORMAL_VTABLE or NULL_VTABLE.  Expands to nothing otherwise. */
#ifndef XML_MIN_SIZE

#define STANDARD_VTABLE(E) /* as nothing */

#else

#define STANDARD_VTABLE(E) \
  E ## byteType, \
  E ## isNameMin, \
  E ## isNmstrtMin, \
  E ## byteToAscii, \
  E ## charMatches,

#endif

/* NORMAL_VTABLE(E): initializers for the nine multi-byte predicates, taken
   from the E-prefixed functions. */
#define NORMAL_VTABLE(E) \
  E ## isName2, \
  E ## isName3, \
  E ## isName4, \
  E ## isNmstrt2, \
  E ## isNmstrt3, \
  E ## isNmstrt4, \
  E ## isInvalid2, \
  E ## isInvalid3, \
  E ## isInvalid4

/* NULL_VTABLE: the same nine slots, all left NULL. */
#define NULL_VTABLE \
  /* isName2 */ NULL, \
  /* isName3 */ NULL, \
  /* isName4 */ NULL, \
  /* isNmstrt2 */ NULL, \
  /* isNmstrt3 */ NULL, \
  /* isNmstrt4 */ NULL, \
  /* isInvalid2 */ NULL, \
  /* isInvalid3 */ NULL, \
  /* isInvalid4 */ NULL
267
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000268static int FASTCALL checkCharRefNumber(int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000269
270#include "xmltok_impl.h"
271#include "ascii.h"
272
273#ifdef XML_MIN_SIZE
274#define sb_isNameMin isNever
275#define sb_isNmstrtMin isNever
276#endif
277
278#ifdef XML_MIN_SIZE
279#define MINBPC(enc) ((enc)->minBytesPerChar)
280#else
281/* minimum bytes per character */
282#define MINBPC(enc) 1
283#endif
284
/* Byte type of the byte at p, read from the encoding's type[] table.
   The byte is converted to unsigned char before it is used as an index. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifndef XML_MIN_SIZE
/* Direct table lookup. */
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#else
/* Out-of-line lookup, reached through the encoding's byteType pointer. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#endif
299
/* BYTE_TO_ASCII(enc, p): the character at p as an ASCII code.  For this
   family that is simply the byte itself. */
#ifndef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) (*(p))
#else
#define BYTE_TO_ASCII(enc, p) \
  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#endif
311
/* Predicates for n-byte sequences (n is 2, 3 or 4): each one calls the
   matching function pointer stored in the normal_encoding. */
#define IS_NAME_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

/* Predicates applied to a minimum-size character: constant 0 unless the
   XML_MIN_SIZE build routes them through the encoding. */
#ifndef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#else
#define IS_NAME_CHAR_MINBPC(enc, p) \
  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#endif
328
/* CHAR_MATCHES(enc, p, c): nonzero when the character at p equals c.
   c is an ASCII character.  In the macro form `c` is parenthesized so that
   an expression argument (for example a conditional expression) is compared
   as a whole rather than being split by operator precedence. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
341
342#define PREFIX(ident) normal_ ## ident
Gregory P. Smith64359d22012-07-14 14:12:35 -0700343#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000344#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700345#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000346
347#undef MINBPC
348#undef BYTE_TYPE
349#undef BYTE_TO_ASCII
350#undef CHAR_MATCHES
351#undef IS_NAME_CHAR
352#undef IS_NAME_CHAR_MINBPC
353#undef IS_NMSTRT_CHAR
354#undef IS_NMSTRT_CHAR_MINBPC
355#undef IS_INVALID_CHAR
356
/* UTF8_cvalN is the marker bits of the first byte of an N-byte UTF-8
   sequence (the value of that byte with its payload bits cleared). */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xc0, /* 110xxxxx */
  UTF8_cval3 = 0xe0, /* 1110xxxx */
  UTF8_cval4 = 0xf0  /* 11110xxx */
};
363
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`.  Bytes that are neither
   ASCII nor a lead byte are stepped over and counted.  At the first ASCII or
   lead byte the scan stops if enough bytes have been seen to complete that
   character, and the limit is placed right after it; an incomplete lead
   byte is dropped and the scan continues.  If nothing complete is found the
   limit ends up equal to `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char * from, const char ** fromLimRef)
{
  const char * end = *fromLimRef;
  size_t seen = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char byte = (unsigned char)end[-1];
    size_t seqLen;

    if ((byte & 0x80u) == 0x00u)
      seqLen = 1; /* 0b0xxxxxxx */
    else if ((byte & 0xe0u) == 0xc0u)
      seqLen = 2; /* 0b110xxxxx */
    else if ((byte & 0xf0u) == 0xe0u)
      seqLen = 3; /* 0b1110xxxx */
    else if ((byte & 0xf8u) == 0xf0u)
      seqLen = 4; /* 0b11110xxx */
    else
      seqLen = 0; /* not the start of a character */

    if (seqLen != 0) {
      if (seen + 1 >= seqLen) {
        /* The character starting at end[-1] is complete: keep all of it. */
        end += seqLen - 1;
        break;
      }
      /* Incomplete character: forget the bytes counted so far. */
      seen = 0;
    }

    end--;
    seen++;
  }

  *fromLimRef = end;
}
398
399static enum XML_Convert_Result PTRCALL
400utf8_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000401 const char **fromP, const char *fromLim,
402 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000403{
Victor Stinner759e30e2017-09-05 01:58:08 +0200404 bool input_incomplete = false;
405 bool output_exhausted = false;
Victor Stinner5ff71322017-06-21 14:39:22 +0200406
Victor Stinner759e30e2017-09-05 01:58:08 +0200407 /* Avoid copying partial characters (due to limited space). */
408 const ptrdiff_t bytesAvailable = fromLim - *fromP;
409 const ptrdiff_t bytesStorable = toLim - *toP;
410 if (bytesAvailable > bytesStorable) {
411 fromLim = *fromP + bytesStorable;
412 output_exhausted = true;
413 }
414
415 /* Avoid copying partial characters (from incomplete input). */
Benjamin Peterson4e211002018-06-26 19:25:45 -0700416 {
417 const char * const fromLimBefore = fromLim;
418 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
419 if (fromLim < fromLimBefore) {
420 input_incomplete = true;
421 }
Victor Stinner759e30e2017-09-05 01:58:08 +0200422 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200423
Benjamin Peterson4e211002018-06-26 19:25:45 -0700424 {
425 const ptrdiff_t bytesToCopy = fromLim - *fromP;
426 memcpy(*toP, *fromP, bytesToCopy);
427 *fromP += bytesToCopy;
428 *toP += bytesToCopy;
429 }
Victor Stinner23ec4b52017-06-15 00:54:36 +0200430
Benjamin Peterson4e211002018-06-26 19:25:45 -0700431 if (output_exhausted) /* needs to go first */
Victor Stinner23ec4b52017-06-15 00:54:36 +0200432 return XML_CONVERT_OUTPUT_EXHAUSTED;
Victor Stinner759e30e2017-09-05 01:58:08 +0200433 else if (input_incomplete)
434 return XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200435 else
Victor Stinner5ff71322017-06-21 14:39:22 +0200436 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000437}
438
Victor Stinner23ec4b52017-06-15 00:54:36 +0200439static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000440utf8_toUtf16(const ENCODING *enc,
441 const char **fromP, const char *fromLim,
442 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000443{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200444 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000445 unsigned short *to = *toP;
446 const char *from = *fromP;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200447 while (from < fromLim && to < toLim) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000448 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
449 case BT_LEAD2:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200450 if (fromLim - from < 2) {
451 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200452 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200453 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000454 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000455 from += 2;
456 break;
457 case BT_LEAD3:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200458 if (fromLim - from < 3) {
459 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200460 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200461 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000462 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
463 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000464 from += 3;
465 break;
466 case BT_LEAD4:
467 {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000468 unsigned long n;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200469 if (toLim - to < 2) {
470 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000471 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200472 }
473 if (fromLim - from < 4) {
474 res = XML_CONVERT_INPUT_INCOMPLETE;
475 goto after;
476 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000477 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
478 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
479 n -= 0x10000;
480 to[0] = (unsigned short)((n >> 10) | 0xD800);
481 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
482 to += 2;
483 from += 4;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000484 }
485 break;
486 default:
487 *to++ = *from++;
488 break;
489 }
490 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200491 if (from < fromLim)
492 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000493after:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000494 *fromP = from;
495 *toP = to;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200496 return res;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000497}
498
#ifdef XML_NS
/* UTF-8 encoding object, only built with XML_NS.  Unlike utf8_encoding, the
   byte-type table is taken from asciitab.h without remapping BT_COLON. */
static const struct normal_encoding utf8_encoding_ns = {
  /* enc: normal_ tokenizer functions plus the UTF-8 transcoders.
     NOTE(review): the trailing 1, 1, 0 presumably fill minBytesPerChar,
     isUtf8 and isUtf16 - confirm against xmltok.h. */
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  /* type[] */
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif /* XML_NS */
509
510static const struct normal_encoding utf8_encoding = {
511 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
512 {
513#define BT_COLON BT_NMSTRT
514#include "asciitab.h"
515#undef BT_COLON
516#include "utf8tab.h"
517 },
518 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
519};
520
#ifdef XML_NS

/* Internal UTF-8 encoding object, only built with XML_NS.  It matches
   utf8_encoding_ns except that the first part of the byte-type table comes
   from iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  /* enc */
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  /* type[] */
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif /* XML_NS */
533
534static const struct normal_encoding internal_utf8_encoding = {
535 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
536 {
537#define BT_COLON BT_NMSTRT
538#include "iasciitab.h"
539#undef BT_COLON
540#include "utf8tab.h"
541 },
542 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
543};
544
Victor Stinner23ec4b52017-06-15 00:54:36 +0200545static enum XML_Convert_Result PTRCALL
546latin1_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000547 const char **fromP, const char *fromLim,
548 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000549{
550 for (;;) {
551 unsigned char c;
552 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200553 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000554 c = (unsigned char)**fromP;
555 if (c & 0x80) {
556 if (toLim - *toP < 2)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200557 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000558 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
559 *(*toP)++ = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000560 (*fromP)++;
561 }
562 else {
563 if (*toP == toLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200564 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000565 *(*toP)++ = *(*fromP)++;
566 }
567 }
568}
569
Victor Stinner23ec4b52017-06-15 00:54:36 +0200570static enum XML_Convert_Result PTRCALL
571latin1_toUtf16(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000572 const char **fromP, const char *fromLim,
573 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000574{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200575 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000576 *(*toP)++ = (unsigned char)*(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200577
578 if ((*toP == toLim) && (*fromP < fromLim))
579 return XML_CONVERT_OUTPUT_EXHAUSTED;
580 else
581 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000582}
583
#ifdef XML_NS

/* Latin-1 encoding object, only built with XML_NS.  The multi-byte
   predicate slots are all NULL. */
static const struct normal_encoding latin1_encoding_ns = {
  /* enc: normal_ tokenizer functions plus the Latin-1 transcoders */
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  /* type[] */
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

#endif /* XML_NS */
596
597static const struct normal_encoding latin1_encoding = {
598 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
599 {
600#define BT_COLON BT_NMSTRT
601#include "asciitab.h"
602#undef BT_COLON
603#include "latin1tab.h"
604 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200605 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000606};
607
Victor Stinner23ec4b52017-06-15 00:54:36 +0200608static enum XML_Convert_Result PTRCALL
609ascii_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000610 const char **fromP, const char *fromLim,
611 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000612{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200613 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000614 *(*toP)++ = *(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200615
616 if ((*toP == toLim) && (*fromP < fromLim))
617 return XML_CONVERT_OUTPUT_EXHAUSTED;
618 else
619 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000620}
621
#ifdef XML_NS

/* ASCII encoding object, only built with XML_NS.  Only asciitab.h fills the
   byte-type table; the entries it does not cover stay zero-initialized,
   which is BT_NONXML. */
static const struct normal_encoding ascii_encoding_ns = {
  /* enc: ASCII copy to UTF-8, Latin-1 widening to UTF-16 */
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  /* type[] */
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

#endif /* XML_NS */
634
635static const struct normal_encoding ascii_encoding = {
636 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
637 {
638#define BT_COLON BT_NMSTRT
639#include "asciitab.h"
640#undef BT_COLON
641/* BT_NONXML == 0 */
642 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200643 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000644};
645
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000646static int PTRFASTCALL
647unicode_byte_type(char hi, char lo)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000648{
649 switch ((unsigned char)hi) {
650 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
651 return BT_LEAD4;
652 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
653 return BT_TRAIL;
654 case 0xFF:
655 switch ((unsigned char)lo) {
656 case 0xFF:
657 case 0xFE:
658 return BT_NONXML;
659 }
660 break;
661 }
662 return BT_NONASCII;
663}
664
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   16-bit units in [*fromP, fromLim) to UTF-8 in [*toP, toLim).  The byte
   order comes from the GET_LO / GET_HI macros in effect where this macro is
   expanded.

   A trailing odd byte is ignored.  Before each character the remaining
   output space is checked; when it is too small, *fromP is set to that
   character and XML_CONVERT_OUTPUT_EXHAUSTED is returned.  A high surrogate
   without a following unit yields XML_CONVERT_INPUT_INCOMPLETE.  *toP
   always points past the last byte written. */
#define DEFINE_UTF16_TO_UTF8(E) \
static enum XML_Convert_Result PTRCALL \
E ## toUtf8(const ENCODING *UNUSED_P(enc), \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  /* shrink to even */ \
  fromLim = from + (((fromLim - from) >> 1) << 1); \
  for (; from < fromLim; from += 2) { \
    const unsigned char lo = GET_LO(from); \
    const unsigned char hi = GET_HI(from); \
    unsigned char lo2; \
    int plane; \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        /* one output byte */ \
        if (*toP == toLim) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      /* two output bytes */ \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      /* surrogate pair: four output bytes from two input units */ \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return XML_CONVERT_INPUT_INCOMPLETE; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    default: \
      /* three output bytes */ \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
  return (from < fromLim) ? XML_CONVERT_INPUT_INCOMPLETE \
                          : XML_CONVERT_COMPLETED; \
}
736
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   16-bit units from the byte stream [*fromP, fromLim) into native
   unsigned shorts in [*toP, toLim), assembling each unit with the GET_HI /
   GET_LO macros in effect where this macro is expanded.

   A trailing odd byte is ignored.  When the input holds more units than the
   output can take and the last input unit has a high byte in 0xD8..0xDF,
   that unit is left out and the default result becomes
   XML_CONVERT_INPUT_INCOMPLETE.  XML_CONVERT_OUTPUT_EXHAUSTED is returned
   when the output is full while input remains. */
#define DEFINE_UTF16_TO_UTF16(E) \
static enum XML_Convert_Result PTRCALL \
E ## toUtf16(const ENCODING *UNUSED_P(enc), \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
  /* shrink to even */ \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
    fromLim -= 2; \
    res = XML_CONVERT_INPUT_INCOMPLETE; \
  } \
  while (*fromP < fromLim && *toP < toLim) { \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    *fromP += 2; \
  } \
  return ((*toP == toLim) && (*fromP < fromLim)) \
           ? XML_CONVERT_OUTPUT_EXHAUSTED \
           : res; \
}
758
759#define SET2(ptr, ch) \
760 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
761#define GET_LO(ptr) ((unsigned char)(ptr)[0])
762#define GET_HI(ptr) ((unsigned char)(ptr)[1])
763
764DEFINE_UTF16_TO_UTF8(little2_)
765DEFINE_UTF16_TO_UTF16(little2_)
766
767#undef SET2
768#undef GET_LO
769#undef GET_HI
770
771#define SET2(ptr, ch) \
772 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
773#define GET_LO(ptr) ((unsigned char)(ptr)[1])
774#define GET_HI(ptr) ((unsigned char)(ptr)[0])
775
776DEFINE_UTF16_TO_UTF8(big2_)
777DEFINE_UTF16_TO_UTF16(big2_)
778
779#undef SET2
780#undef GET_LO
781#undef GET_HI
782
/* Helpers for little-endian 16-bit units: p[0] is the low byte and p[1] the
   high byte.  All parameters are parenthesized so that expression arguments
   (pointer arithmetic for p, conditional expressions for c) are handled
   correctly. */

/* Byte type of the unit at p: the encoding's type[] table when the high
   byte is zero, otherwise the result of unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 \
   ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
   : unicode_byte_type((p)[1], (p)[0]))

/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)

/* Nonzero when the high byte is zero and the low byte equals c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))

/* Naming-bitmap lookups for the unit at p. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
793
794#ifdef XML_MIN_SIZE
795
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000796static int PTRFASTCALL
797little2_byteType(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000798{
799 return LITTLE2_BYTE_TYPE(enc, p);
800}
801
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000802static int PTRFASTCALL
803little2_byteToAscii(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000804{
805 return LITTLE2_BYTE_TO_ASCII(enc, p);
806}
807
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000808static int PTRCALL
809little2_charMatches(const ENCODING *enc, const char *p, int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000810{
811 return LITTLE2_CHAR_MATCHES(enc, p, c);
812}
813
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000814static int PTRFASTCALL
815little2_isNameMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000816{
817 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
818}
819
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000820static int PTRFASTCALL
821little2_isNmstrtMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000822{
823 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
824}
825
826#undef VTABLE
827#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
828
829#else /* not XML_MIN_SIZE */
830
831#undef PREFIX
832#define PREFIX(ident) little2_ ## ident
833#define MINBPC(enc) 2
834/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
835#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000836#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000837#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
838#define IS_NAME_CHAR(enc, p, n) 0
839#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
840#define IS_NMSTRT_CHAR(enc, p, n) (0)
841#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
842
Gregory P. Smith64359d22012-07-14 14:12:35 -0700843#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000844#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700845#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000846
847#undef MINBPC
848#undef BYTE_TYPE
849#undef BYTE_TO_ASCII
850#undef CHAR_MATCHES
851#undef IS_NAME_CHAR
852#undef IS_NAME_CHAR_MINBPC
853#undef IS_NMSTRT_CHAR
854#undef IS_NMSTRT_CHAR_MINBPC
855#undef IS_INVALID_CHAR
856
857#endif /* not XML_MIN_SIZE */
858
859#ifdef XML_NS
860
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000861static const struct normal_encoding little2_encoding_ns = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000862 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000863#if BYTEORDER == 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000864 1
865#else
866 0
867#endif
868 },
869 {
870#include "asciitab.h"
871#include "latin1tab.h"
872 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200873 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000874};
875
876#endif
877
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000878static const struct normal_encoding little2_encoding = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000879 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000880#if BYTEORDER == 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000881 1
882#else
883 0
884#endif
885 },
886 {
887#define BT_COLON BT_NMSTRT
888#include "asciitab.h"
889#undef BT_COLON
890#include "latin1tab.h"
891 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200892 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000893};
894
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000895#if BYTEORDER != 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000896
897#ifdef XML_NS
898
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000899static const struct normal_encoding internal_little2_encoding_ns = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000900 { VTABLE, 2, 0, 1 },
901 {
902#include "iasciitab.h"
903#include "latin1tab.h"
904 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200905 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000906};
907
908#endif
909
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000910static const struct normal_encoding internal_little2_encoding = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000911 { VTABLE, 2, 0, 1 },
912 {
913#define BT_COLON BT_NMSTRT
914#include "iasciitab.h"
915#undef BT_COLON
916#include "latin1tab.h"
917 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200918 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000919};
920
921#endif
922
923
/* Classification helpers for one 2-byte big-endian unit at p
   (p[0] = high byte, p[1] = low byte).

   Every macro argument is parenthesized so that expression arguments
   (e.g. `q + 2`, `a | b`) expand with the intended precedence, and the
   encoding is only read, so the cast keeps it const. */

/* Byte-type class: units with a zero high byte use the encoding's 256-entry
   type table, all others go through unicode_byte_type(hi, lo). */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit has a zero high byte and its low byte equals c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Bitmap lookups for name / name-start characters, indexed (hi, lo). */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
934
935#ifdef XML_MIN_SIZE
936
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000937static int PTRFASTCALL
938big2_byteType(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000939{
940 return BIG2_BYTE_TYPE(enc, p);
941}
942
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000943static int PTRFASTCALL
944big2_byteToAscii(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000945{
946 return BIG2_BYTE_TO_ASCII(enc, p);
947}
948
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000949static int PTRCALL
950big2_charMatches(const ENCODING *enc, const char *p, int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000951{
952 return BIG2_CHAR_MATCHES(enc, p, c);
953}
954
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000955static int PTRFASTCALL
956big2_isNameMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000957{
958 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
959}
960
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000961static int PTRFASTCALL
962big2_isNmstrtMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000963{
964 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
965}
966
967#undef VTABLE
968#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
969
970#else /* not XML_MIN_SIZE */
971
972#undef PREFIX
973#define PREFIX(ident) big2_ ## ident
974#define MINBPC(enc) 2
975/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
976#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000977#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000978#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
979#define IS_NAME_CHAR(enc, p, n) 0
980#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
981#define IS_NMSTRT_CHAR(enc, p, n) (0)
982#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
983
Gregory P. Smith64359d22012-07-14 14:12:35 -0700984#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000985#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700986#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000987
988#undef MINBPC
989#undef BYTE_TYPE
990#undef BYTE_TO_ASCII
991#undef CHAR_MATCHES
992#undef IS_NAME_CHAR
993#undef IS_NAME_CHAR_MINBPC
994#undef IS_NMSTRT_CHAR
995#undef IS_NMSTRT_CHAR_MINBPC
996#undef IS_INVALID_CHAR
997
998#endif /* not XML_MIN_SIZE */
999
1000#ifdef XML_NS
1001
1002static const struct normal_encoding big2_encoding_ns = {
1003 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001004#if BYTEORDER == 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001005 1
1006#else
1007 0
1008#endif
1009 },
1010 {
1011#include "asciitab.h"
1012#include "latin1tab.h"
1013 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001014 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001015};
1016
1017#endif
1018
1019static const struct normal_encoding big2_encoding = {
1020 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001021#if BYTEORDER == 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001022 1
1023#else
1024 0
1025#endif
1026 },
1027 {
1028#define BT_COLON BT_NMSTRT
1029#include "asciitab.h"
1030#undef BT_COLON
1031#include "latin1tab.h"
1032 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001033 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001034};
1035
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001036#if BYTEORDER != 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001037
1038#ifdef XML_NS
1039
1040static const struct normal_encoding internal_big2_encoding_ns = {
1041 { VTABLE, 2, 0, 1 },
1042 {
1043#include "iasciitab.h"
1044#include "latin1tab.h"
1045 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001046 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001047};
1048
1049#endif
1050
1051static const struct normal_encoding internal_big2_encoding = {
1052 { VTABLE, 2, 0, 1 },
1053 {
1054#define BT_COLON BT_NMSTRT
1055#include "iasciitab.h"
1056#undef BT_COLON
1057#include "latin1tab.h"
1058 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001059 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001060};
1061
1062#endif
1063
1064#undef PREFIX
1065
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001066static int FASTCALL
1067streqci(const char *s1, const char *s2)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001068{
1069 for (;;) {
1070 char c1 = *s1++;
1071 char c2 = *s2++;
1072 if (ASCII_a <= c1 && c1 <= ASCII_z)
1073 c1 += ASCII_A - ASCII_a;
1074 if (ASCII_a <= c2 && c2 <= ASCII_z)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001075 /* The following line will never get executed. streqci() is
1076 * only called from two places, both of which guarantee to put
1077 * upper-case strings into s2.
1078 */
1079 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001080 if (c1 != c2)
1081 return 0;
1082 if (!c1)
1083 break;
1084 }
1085 return 1;
1086}
1087
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001088static void PTRCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +02001089initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001090 const char *end, POSITION *pos)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001091{
1092 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1093}
1094
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001095static int
1096toAscii(const ENCODING *enc, const char *ptr, const char *end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001097{
1098 char buf[1];
1099 char *p = buf;
1100 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1101 if (p == buf)
1102 return -1;
1103 else
1104 return buf[0];
1105}
1106
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001107static int FASTCALL
1108isSpace(int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001109{
1110 switch (c) {
1111 case 0x20:
1112 case 0xD:
1113 case 0xA:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001114 case 0x9:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001115 return 1;
1116 }
1117 return 0;
1118}
1119
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001120/* Return 1 if there's just optional white space or there's an S
1121 followed by name=val.
1122*/
1123static int
1124parsePseudoAttribute(const ENCODING *enc,
1125 const char *ptr,
1126 const char *end,
1127 const char **namePtr,
1128 const char **nameEndPtr,
1129 const char **valPtr,
1130 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001131{
1132 int c;
1133 char open;
1134 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001135 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001136 return 1;
1137 }
1138 if (!isSpace(toAscii(enc, ptr, end))) {
1139 *nextTokPtr = ptr;
1140 return 0;
1141 }
1142 do {
1143 ptr += enc->minBytesPerChar;
1144 } while (isSpace(toAscii(enc, ptr, end)));
1145 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001146 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001147 return 1;
1148 }
1149 *namePtr = ptr;
1150 for (;;) {
1151 c = toAscii(enc, ptr, end);
1152 if (c == -1) {
1153 *nextTokPtr = ptr;
1154 return 0;
1155 }
1156 if (c == ASCII_EQUALS) {
1157 *nameEndPtr = ptr;
1158 break;
1159 }
1160 if (isSpace(c)) {
1161 *nameEndPtr = ptr;
1162 do {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001163 ptr += enc->minBytesPerChar;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001164 } while (isSpace(c = toAscii(enc, ptr, end)));
1165 if (c != ASCII_EQUALS) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001166 *nextTokPtr = ptr;
1167 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001168 }
1169 break;
1170 }
1171 ptr += enc->minBytesPerChar;
1172 }
1173 if (ptr == *namePtr) {
1174 *nextTokPtr = ptr;
1175 return 0;
1176 }
1177 ptr += enc->minBytesPerChar;
1178 c = toAscii(enc, ptr, end);
1179 while (isSpace(c)) {
1180 ptr += enc->minBytesPerChar;
1181 c = toAscii(enc, ptr, end);
1182 }
1183 if (c != ASCII_QUOT && c != ASCII_APOS) {
1184 *nextTokPtr = ptr;
1185 return 0;
1186 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001187 open = (char)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001188 ptr += enc->minBytesPerChar;
1189 *valPtr = ptr;
1190 for (;; ptr += enc->minBytesPerChar) {
1191 c = toAscii(enc, ptr, end);
1192 if (c == open)
1193 break;
1194 if (!(ASCII_a <= c && c <= ASCII_z)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001195 && !(ASCII_A <= c && c <= ASCII_Z)
1196 && !(ASCII_0 <= c && c <= ASCII_9)
1197 && c != ASCII_PERIOD
1198 && c != ASCII_MINUS
1199 && c != ASCII_UNDERSCORE) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001200 *nextTokPtr = ptr;
1201 return 0;
1202 }
1203 }
1204 *nextTokPtr = ptr + enc->minBytesPerChar;
1205 return 1;
1206}
1207
1208static const char KW_version[] = {
1209 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1210};
1211
1212static const char KW_encoding[] = {
1213 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1214};
1215
1216static const char KW_standalone[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001217 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1218 ASCII_n, ASCII_e, '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001219};
1220
1221static const char KW_yes[] = {
1222 ASCII_y, ASCII_e, ASCII_s, '\0'
1223};
1224
1225static const char KW_no[] = {
1226 ASCII_n, ASCII_o, '\0'
1227};
1228
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001229static int
1230doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1231 const char *,
1232 const char *),
1233 int isGeneralTextEntity,
1234 const ENCODING *enc,
1235 const char *ptr,
1236 const char *end,
1237 const char **badPtr,
1238 const char **versionPtr,
1239 const char **versionEndPtr,
1240 const char **encodingName,
1241 const ENCODING **encoding,
1242 int *standalone)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001243{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001244 const char *val = NULL;
1245 const char *name = NULL;
1246 const char *nameEnd = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001247 ptr += 5 * enc->minBytesPerChar;
1248 end -= 2 * enc->minBytesPerChar;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001249 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1250 || !name) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001251 *badPtr = ptr;
1252 return 0;
1253 }
1254 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1255 if (!isGeneralTextEntity) {
1256 *badPtr = name;
1257 return 0;
1258 }
1259 }
1260 else {
1261 if (versionPtr)
1262 *versionPtr = val;
1263 if (versionEndPtr)
1264 *versionEndPtr = ptr;
1265 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1266 *badPtr = ptr;
1267 return 0;
1268 }
1269 if (!name) {
1270 if (isGeneralTextEntity) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001271 /* a TextDecl must have an EncodingDecl */
1272 *badPtr = ptr;
1273 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001274 }
1275 return 1;
1276 }
1277 }
1278 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1279 int c = toAscii(enc, val, end);
1280 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1281 *badPtr = val;
1282 return 0;
1283 }
1284 if (encodingName)
1285 *encodingName = val;
1286 if (encoding)
1287 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1288 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1289 *badPtr = ptr;
1290 return 0;
1291 }
1292 if (!name)
1293 return 1;
1294 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001295 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1296 || isGeneralTextEntity) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001297 *badPtr = name;
1298 return 0;
1299 }
1300 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1301 if (standalone)
1302 *standalone = 1;
1303 }
1304 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1305 if (standalone)
1306 *standalone = 0;
1307 }
1308 else {
1309 *badPtr = val;
1310 return 0;
1311 }
1312 while (isSpace(toAscii(enc, ptr, end)))
1313 ptr += enc->minBytesPerChar;
1314 if (ptr != end) {
1315 *badPtr = ptr;
1316 return 0;
1317 }
1318 return 1;
1319}
1320
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001321static int FASTCALL
1322checkCharRefNumber(int result)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001323{
1324 switch (result >> 8) {
1325 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1326 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1327 return -1;
1328 case 0:
1329 if (latin1_encoding.type[result] == BT_NONXML)
1330 return -1;
1331 break;
1332 case 0xFF:
1333 if (result == 0xFFFE || result == 0xFFFF)
1334 return -1;
1335 break;
1336 }
1337 return result;
1338}
1339
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001340int FASTCALL
1341XmlUtf8Encode(int c, char *buf)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001342{
1343 enum {
1344 /* minN is minimum legal resulting value for N byte sequence */
1345 min2 = 0x80,
1346 min3 = 0x800,
1347 min4 = 0x10000
1348 };
1349
1350 if (c < 0)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001351 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001352 if (c < min2) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001353 buf[0] = (char)(c | UTF8_cval1);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001354 return 1;
1355 }
1356 if (c < min3) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001357 buf[0] = (char)((c >> 6) | UTF8_cval2);
1358 buf[1] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001359 return 2;
1360 }
1361 if (c < min4) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001362 buf[0] = (char)((c >> 12) | UTF8_cval3);
1363 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1364 buf[2] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001365 return 3;
1366 }
1367 if (c < 0x110000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001368 buf[0] = (char)((c >> 18) | UTF8_cval4);
1369 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1370 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1371 buf[3] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001372 return 4;
1373 }
Victor Stinner93d0cb52017-08-18 23:43:54 +02001374 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001375}
1376
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001377int FASTCALL
1378XmlUtf16Encode(int charNum, unsigned short *buf)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001379{
1380 if (charNum < 0)
1381 return 0;
1382 if (charNum < 0x10000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001383 buf[0] = (unsigned short)charNum;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001384 return 1;
1385 }
1386 if (charNum < 0x110000) {
1387 charNum -= 0x10000;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001388 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1389 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001390 return 2;
1391 }
1392 return 0;
1393}
1394
1395struct unknown_encoding {
1396 struct normal_encoding normal;
Fred Drake31d485c2004-08-03 07:06:22 +00001397 CONVERTER convert;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001398 void *userData;
1399 unsigned short utf16[256];
1400 char utf8[256][4];
1401};
1402
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001403#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1404
1405int
1406XmlSizeOfUnknownEncoding(void)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001407{
1408 return sizeof(struct unknown_encoding);
1409}
1410
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001411static int PTRFASTCALL
1412unknown_isName(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001413{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001414 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1415 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001416 if (c & ~0xFFFF)
1417 return 0;
1418 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1419}
1420
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001421static int PTRFASTCALL
1422unknown_isNmstrt(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001423{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001424 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1425 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001426 if (c & ~0xFFFF)
1427 return 0;
1428 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1429}
1430
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001431static int PTRFASTCALL
1432unknown_isInvalid(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001433{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001434 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1435 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001436 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1437}
1438
Victor Stinner23ec4b52017-06-15 00:54:36 +02001439static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001440unknown_toUtf8(const ENCODING *enc,
1441 const char **fromP, const char *fromLim,
1442 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001443{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001444 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001445 char buf[XML_UTF8_ENCODE_MAX];
1446 for (;;) {
1447 const char *utf8;
1448 int n;
1449 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001450 return XML_CONVERT_COMPLETED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001451 utf8 = uenc->utf8[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001452 n = *utf8++;
1453 if (n == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001454 int c = uenc->convert(uenc->userData, *fromP);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001455 n = XmlUtf8Encode(c, buf);
1456 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001457 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001458 utf8 = buf;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001459 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1460 - (BT_LEAD2 - 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001461 }
1462 else {
1463 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001464 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001465 (*fromP)++;
1466 }
Benjamin Peterson4e211002018-06-26 19:25:45 -07001467 memcpy(*toP, utf8, n);
1468 *toP += n;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001469 }
1470}
1471
Victor Stinner23ec4b52017-06-15 00:54:36 +02001472static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001473unknown_toUtf16(const ENCODING *enc,
1474 const char **fromP, const char *fromLim,
1475 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001476{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001477 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001478 while (*fromP < fromLim && *toP < toLim) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001479 unsigned short c = uenc->utf16[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001480 if (c == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001481 c = (unsigned short)
1482 uenc->convert(uenc->userData, *fromP);
1483 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1484 - (BT_LEAD2 - 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001485 }
1486 else
1487 (*fromP)++;
1488 *(*toP)++ = c;
1489 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001490
1491 if ((*toP == toLim) && (*fromP < fromLim))
1492 return XML_CONVERT_OUTPUT_EXHAUSTED;
1493 else
1494 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001495}
1496
1497ENCODING *
1498XmlInitUnknownEncoding(void *mem,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001499 int *table,
Gregory P. Smith64359d22012-07-14 14:12:35 -07001500 CONVERTER convert,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001501 void *userData)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001502{
1503 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001504 struct unknown_encoding *e = (struct unknown_encoding *)mem;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001505 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1506 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1507 for (i = 0; i < 128; i++)
1508 if (latin1_encoding.type[i] != BT_OTHER
1509 && latin1_encoding.type[i] != BT_NONXML
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001510 && table[i] != i)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001511 return 0;
1512 for (i = 0; i < 256; i++) {
1513 int c = table[i];
1514 if (c == -1) {
1515 e->normal.type[i] = BT_MALFORM;
1516 /* This shouldn't really get used. */
1517 e->utf16[i] = 0xFFFF;
1518 e->utf8[i][0] = 1;
1519 e->utf8[i][1] = 0;
1520 }
1521 else if (c < 0) {
1522 if (c < -4)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001523 return 0;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001524 /* Multi-byte sequences need a converter function */
1525 if (!convert)
1526 return 0;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001527 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001528 e->utf8[i][0] = 0;
1529 e->utf16[i] = 0;
1530 }
1531 else if (c < 0x80) {
1532 if (latin1_encoding.type[c] != BT_OTHER
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001533 && latin1_encoding.type[c] != BT_NONXML
1534 && c != i)
1535 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001536 e->normal.type[i] = latin1_encoding.type[c];
1537 e->utf8[i][0] = 1;
1538 e->utf8[i][1] = (char)c;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001539 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001540 }
1541 else if (checkCharRefNumber(c) < 0) {
1542 e->normal.type[i] = BT_NONXML;
1543 /* This shouldn't really get used. */
1544 e->utf16[i] = 0xFFFF;
1545 e->utf8[i][0] = 1;
1546 e->utf8[i][1] = 0;
1547 }
1548 else {
1549 if (c > 0xFFFF)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001550 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001551 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001552 e->normal.type[i] = BT_NMSTRT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001553 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001554 e->normal.type[i] = BT_NAME;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001555 else
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001556 e->normal.type[i] = BT_OTHER;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001557 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001558 e->utf16[i] = (unsigned short)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001559 }
1560 }
1561 e->userData = userData;
1562 e->convert = convert;
1563 if (convert) {
1564 e->normal.isName2 = unknown_isName;
1565 e->normal.isName3 = unknown_isName;
1566 e->normal.isName4 = unknown_isName;
1567 e->normal.isNmstrt2 = unknown_isNmstrt;
1568 e->normal.isNmstrt3 = unknown_isNmstrt;
1569 e->normal.isNmstrt4 = unknown_isNmstrt;
1570 e->normal.isInvalid2 = unknown_isInvalid;
1571 e->normal.isInvalid3 = unknown_isInvalid;
1572 e->normal.isInvalid4 = unknown_isInvalid;
1573 }
1574 e->normal.enc.utf8Convert = unknown_toUtf8;
1575 e->normal.enc.utf16Convert = unknown_toUtf16;
1576 return &(e->normal.enc);
1577}
1578
/* Indices of the built-in encodings. If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  UNKNOWN_ENC = -1, /* name given but not recognized */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC            /* no name given */
};
1592
1593static const char KW_ISO_8859_1[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001594 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1595 ASCII_MINUS, ASCII_1, '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001596};
1597static const char KW_US_ASCII[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001598 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1599 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001600};
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001601static const char KW_UTF_8[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001602 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1603};
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001604static const char KW_UTF_16[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001605 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1606};
1607static const char KW_UTF_16BE[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001608 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1609 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001610};
1611static const char KW_UTF_16LE[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001612 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
1613 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001614};
1615
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001616static int FASTCALL
1617getEncodingIndex(const char *name)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001618{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001619 static const char * const encodingNames[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001620 KW_ISO_8859_1,
1621 KW_US_ASCII,
1622 KW_UTF_8,
1623 KW_UTF_16,
1624 KW_UTF_16BE,
1625 KW_UTF_16LE,
1626 };
1627 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001628 if (name == NULL)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001629 return NO_ENC;
1630 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1631 if (streqci(name, encodingNames[i]))
1632 return i;
1633 return UNKNOWN_ENC;
1634}
1635
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
   stores it. The index argument is parenthesized so that the (char) cast
   applies to the whole expression, not just its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1642
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001643/* This is what detects the encoding. encodingTable maps from
1644 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1645 the external (protocol) specified encoding; state is
1646 XML_CONTENT_STATE if we're parsing an external text entity, and
1647 XML_PROLOG_STATE otherwise.
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001648*/
1649
1650
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001651static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001652initScan(const ENCODING * const *encodingTable,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001653 const INIT_ENCODING *enc,
1654 int state,
1655 const char *ptr,
1656 const char *end,
1657 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001658{
1659 const ENCODING **encPtr;
1660
Victor Stinner23ec4b52017-06-15 00:54:36 +02001661 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001662 return XML_TOK_NONE;
1663 encPtr = enc->encPtr;
1664 if (ptr + 1 == end) {
1665 /* only a single byte available for auto-detection */
1666#ifndef XML_DTD /* FIXME */
1667 /* a well-formed document entity must have more than one byte */
1668 if (state != XML_CONTENT_STATE)
1669 return XML_TOK_PARTIAL;
1670#endif
1671 /* so we're parsing an external text entity... */
1672 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1673 switch (INIT_ENC_INDEX(enc)) {
1674 case UTF_16_ENC:
1675 case UTF_16LE_ENC:
1676 case UTF_16BE_ENC:
1677 return XML_TOK_PARTIAL;
1678 }
1679 switch ((unsigned char)*ptr) {
1680 case 0xFE:
1681 case 0xFF:
1682 case 0xEF: /* possibly first byte of UTF-8 BOM */
1683 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001684 && state == XML_CONTENT_STATE)
1685 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001686 /* fall through */
1687 case 0x00:
1688 case 0x3C:
1689 return XML_TOK_PARTIAL;
1690 }
1691 }
1692 else {
1693 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1694 case 0xFEFF:
1695 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001696 && state == XML_CONTENT_STATE)
1697 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001698 *nextTokPtr = ptr + 2;
1699 *encPtr = encodingTable[UTF_16BE_ENC];
1700 return XML_TOK_BOM;
1701 /* 00 3C is handled in the default case */
1702 case 0x3C00:
1703 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001704 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1705 && state == XML_CONTENT_STATE)
1706 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001707 *encPtr = encodingTable[UTF_16LE_ENC];
1708 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1709 case 0xFFFE:
1710 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001711 && state == XML_CONTENT_STATE)
1712 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001713 *nextTokPtr = ptr + 2;
1714 *encPtr = encodingTable[UTF_16LE_ENC];
1715 return XML_TOK_BOM;
1716 case 0xEFBB:
1717 /* Maybe a UTF-8 BOM (EF BB BF) */
1718 /* If there's an explicitly specified (external) encoding
1719 of ISO-8859-1 or some flavour of UTF-16
1720 and this is an external text entity,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001721 don't look for the BOM,
1722 because it might be a legal data.
1723 */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001724 if (state == XML_CONTENT_STATE) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001725 int e = INIT_ENC_INDEX(enc);
1726 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1727 || e == UTF_16LE_ENC || e == UTF_16_ENC)
1728 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001729 }
1730 if (ptr + 2 == end)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001731 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001732 if ((unsigned char)ptr[2] == 0xBF) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001733 *nextTokPtr = ptr + 3;
1734 *encPtr = encodingTable[UTF_8_ENC];
1735 return XML_TOK_BOM;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001736 }
1737 break;
1738 default:
1739 if (ptr[0] == '\0') {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001740 /* 0 isn't a legal data character. Furthermore a document
1741 entity can only start with ASCII characters. So the only
Benjamin Peterson196d7db2016-06-11 13:28:56 -07001742 way this can fail to be big-endian UTF-16 if it it's an
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001743 external parsed general entity that's labelled as
1744 UTF-16LE.
1745 */
1746 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1747 break;
1748 *encPtr = encodingTable[UTF_16BE_ENC];
1749 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001750 }
1751 else if (ptr[1] == '\0') {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001752 /* We could recover here in the case:
1753 - parsing an external entity
1754 - second byte is 0
1755 - no externally specified encoding
1756 - no encoding declaration
1757 by assuming UTF-16LE. But we don't, because this would mean when
1758 presented just with a single byte, we couldn't reliably determine
1759 whether we needed further bytes.
1760 */
1761 if (state == XML_CONTENT_STATE)
1762 break;
1763 *encPtr = encodingTable[UTF_16LE_ENC];
1764 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001765 }
1766 break;
1767 }
1768 }
1769 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1770 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1771}
1772
1773
1774#define NS(x) x
1775#define ns(x) x
Gregory P. Smith64359d22012-07-14 14:12:35 -07001776#define XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001777#include "xmltok_ns.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -07001778#undef XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001779#undef NS
1780#undef ns
1781
1782#ifdef XML_NS
1783
1784#define NS(x) x ## NS
1785#define ns(x) x ## _ns
1786
Gregory P. Smith64359d22012-07-14 14:12:35 -07001787#define XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001788#include "xmltok_ns.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -07001789#undef XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001790
1791#undef NS
1792#undef ns
1793
1794ENCODING *
1795XmlInitUnknownEncodingNS(void *mem,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001796 int *table,
Gregory P. Smith64359d22012-07-14 14:12:35 -07001797 CONVERTER convert,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001798 void *userData)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001799{
1800 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1801 if (enc)
1802 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1803 return enc;
1804}
1805
1806#endif /* XML_NS */