/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000-2017 Expat development team
   Licensed under the MIT license:

   Permission is  hereby granted,  free of charge,  to any  person obtaining
   a  copy  of  this  software   and  associated  documentation  files  (the
   "Software"),  to  deal in  the  Software  without restriction,  including
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
   following conditions:

   The above copyright  notice and this permission notice  shall be included
   in all copies or substantial portions of the Software.

   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
32
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070033#include <stddef.h>
Victor Stinner759e30e2017-09-05 01:58:08 +020034#include <stdbool.h>
35#include <string.h> // memcpy
Benjamin Peterson06d49bb2016-06-13 23:41:19 -070036
Victor Stinner5ff71322017-06-21 14:39:22 +020037#ifdef _WIN32
Martin v. Löwisfc03a942003-01-25 22:41:29 +000038#include "winconfig.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000039#else
Fred Drake08317ae2003-10-21 15:38:55 +000040#ifdef HAVE_EXPAT_CONFIG_H
Martin v. Löwisfc03a942003-01-25 22:41:29 +000041#include <expat_config.h>
Fred Drake08317ae2003-10-21 15:38:55 +000042#endif
Victor Stinner5ff71322017-06-21 14:39:22 +020043#endif /* ndef _WIN32 */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000044
Fred Drake31d485c2004-08-03 07:06:22 +000045#include "expat_external.h"
Martin v. Löwisfc03a942003-01-25 22:41:29 +000046#include "internal.h"
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000047#include "xmltok.h"
48#include "nametab.h"
49
/* Optional trailing entry of the scanner list in VTABLE1: with XML_DTD
   defined it appends PREFIX(ignoreSectionTok), otherwise it expands to
   nothing. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif
55
/* Initializer fragment listing, in order, the PREFIX()-named functions of
   one tokenizer instantiation: a braced group of scanners, a braced group
   of literal scanners, then nine individual entries.  The two conversion
   functions are not included so that encodings can supply their own. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-named toUtf8/toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
71
/* Look up a two-byte character (high byte hi, low byte lo) in namingBitmap.
   pages[hi] selects a group of eight bitmap words, the top three bits of
   lo pick the word within the group and the low five bits pick the bit.
   Evaluates to nonzero when the bit is set.  All arguments are
   parenthesized so that expressions such as `table + 1` can be passed. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000074
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   `byte` must be indexable as unsigned bytes; the result is nonzero when
   the corresponding bit of namingBitmap is set.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000084
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  `byte` must be indexable as unsigned bytes; the result is
   nonzero when the corresponding bit of namingBitmap is set.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +000097
/* Dispatch on the sequence length n: 2- and 3-byte sequences are looked up
   through UTF8_GET_NAMING2/UTF8_GET_NAMING3 (with p viewed as unsigned
   bytes); every other length evaluates to 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
104
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF
*/
114
/* Nonzero when the two unsigned bytes at p are not an acceptable 2-byte
   sequence: the lead byte must be at least 0xC2 and the second byte must
   lie in 0x80..0xBF.  The lead byte is read as (p)[0] so that pointer
   expressions such as `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
117
/* Nonzero when the three unsigned bytes at p are not an acceptable 3-byte
   sequence.  Checks, in order:
     - the third byte is >= 0x80, and <= 0xBF (or <= 0xBD after EF,BF so
       that EF,BF,BE and EF,BF,BF are rejected);
     - the second byte is in A0..BF after E0, in 80..9F after ED, and in
       80..BF otherwise.
   The lead byte is read as (p)[0] so that pointer expressions such as
   `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((p)[0] == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((p)[0] == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000134
/* Nonzero when the four unsigned bytes at p are not an acceptable 4-byte
   sequence.  The third and fourth bytes must lie in 80..BF; the second
   byte must lie in 90..BF after F0, in 80..8F after F4, and in 80..BF
   otherwise.  The lead byte is read as (p)[0] so that pointer expressions
   such as `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((p)[0] == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000147
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000148static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200149isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000150{
151 return 0;
152}
153
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000154static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200155utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000156{
157 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
158}
159
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000160static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200161utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000162{
163 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
164}
165
166#define utf8_isName4 isNever
167
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000168static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200169utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000170{
171 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
172}
173
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000174static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200175utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000176{
177 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
178}
179
180#define utf8_isNmstrt4 isNever
181
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000182static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200183utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000184{
185 return UTF8_INVALID2((const unsigned char *)p);
186}
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000187
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000188static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200189utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000190{
191 return UTF8_INVALID3((const unsigned char *)p);
192}
193
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000194static int PTRFASTCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +0200195utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000196{
197 return UTF8_INVALID4((const unsigned char *)p);
198}
199
200struct normal_encoding {
201 ENCODING enc;
202 unsigned char type[256];
203#ifdef XML_MIN_SIZE
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000204 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
205 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
206 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
207 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
208 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000209#endif /* XML_MIN_SIZE */
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000210 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
211 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
212 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
213 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
214 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
215 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
216 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
217 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
218 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000219};
220
/* View an encoding pointer as a pointer to const struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
222
/* Initializer fragment for the XML_MIN_SIZE-only members of
   struct normal_encoding, built from the name prefix E.  Note the trailing
   comma: it is meant to be followed directly by NORMAL_VTABLE or
   NULL_VTABLE.  Without XML_MIN_SIZE it expands to nothing. */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif
237
/* Initializer fragment for the nine multi-byte predicate members of
   struct normal_encoding, in declaration order, built from the name
   prefix E (for example utf8_). */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
248
/* Initializer fragment that sets all nine multi-byte predicate members of
   struct normal_encoding to NULL; used by the single-byte encodings in
   this file. */
#define NULL_VTABLE \
 /* isName2 */ NULL, \
 /* isName3 */ NULL, \
 /* isName4 */ NULL, \
 /* isNmstrt2 */ NULL, \
 /* isNmstrt3 */ NULL, \
 /* isNmstrt4 */ NULL, \
 /* isInvalid2 */ NULL, \
 /* isInvalid3 */ NULL, \
 /* isInvalid4 */ NULL
259
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000260static int FASTCALL checkCharRefNumber(int);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000261
262#include "xmltok_impl.h"
263#include "ascii.h"
264
/* In XML_MIN_SIZE builds the single-byte encodings use isNever for the
   minimum-width name predicates. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

/* Minimum bytes per character for the tokenizer instantiated below: read
   from the encoding in XML_MIN_SIZE builds, otherwise the constant 1. */
#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
276
/* Byte type of the single byte at p, read from the encoding's type table.
   The byte is converted to unsigned char before indexing, and the cast
   keeps the encoding const since the table is only read. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
279
/* BYTE_TYPE for the tokenizer instantiated below.  XML_MIN_SIZE builds
   call through the encoding's byteType pointer (sb_byteType is the
   single-byte implementation); other builds expand SB_BYTE_TYPE inline. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
291
/* BYTE_TO_ASCII for the tokenizer instantiated below: the byte at p
   itself.  XML_MIN_SIZE builds call through the encoding's byteToAscii
   pointer; sb_byteToAscii is the single-byte implementation and does not
   use its encoding argument. */
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *UNUSED_P(enc), const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
303
/* Multi-byte character predicates for the tokenizer instantiated below.
   n is the sequence length (2, 3 or 4) and is pasted onto the member name,
   so each macro calls the matching function pointer of the encoding. */
#define IS_NAME_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000310
/* Name predicates for minimum-width characters.  XML_MIN_SIZE builds call
   through the encoding; otherwise both are the constant 0 for the
   tokenizer instantiated below. */
#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
320
/* CHAR_MATCHES for the tokenizer instantiated below: true when the byte
   at p equals the ASCII character c.  XML_MIN_SIZE builds call through the
   encoding's charMatches pointer; sb_charMatches is the single-byte
   implementation and does not use its encoding argument. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *UNUSED_P(enc), const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character; it is parenthesized so that an expression
   argument is compared as a whole */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
333
334#define PREFIX(ident) normal_ ## ident
Gregory P. Smith64359d22012-07-14 14:12:35 -0700335#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000336#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700337#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000338
339#undef MINBPC
340#undef BYTE_TYPE
341#undef BYTE_TO_ASCII
342#undef CHAR_MATCHES
343#undef IS_NAME_CHAR
344#undef IS_NAME_CHAR_MINBPC
345#undef IS_NMSTRT_CHAR
346#undef IS_NMSTRT_CHAR_MINBPC
347#undef IS_INVALID_CHAR
348
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
355
/* Move *fromLimRef backwards (never before from) so that the range
   [from, *fromLimRef) does not end in the middle of a UTF-8 character.

   The range is scanned from its end towards from.  `walked` counts the
   bytes already stepped over.  On reaching a lead byte of an N-byte
   character, the character is complete if the lead byte plus the walked
   bytes amount to at least N; the limit is then placed right after that
   character.  Otherwise the lead byte is dropped and scanning continues.
   An ASCII byte ends the scan with the limit just after it.

   NOTE(review): after an incomplete lead byte is dropped, the loop
   increment leaves `walked` at 1 rather than 0, so a malformed tail of two
   adjacent lead bytes can be kept.  Input that is well-formed up to the
   truncation point is not affected. */
void
align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef)
{
  const char * fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    if ((prev & 0xf8u) == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
      if (walked + 1 >= 4) {
        fromLim += 4 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xf0u) == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
      if (walked + 1 >= 3) {
        fromLim += 3 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xe0u) == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
      if (walked + 1 >= 2) {
        fromLim += 2 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0x80u) == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
      break;
    }
  }
  *fromLimRef = fromLim;
}
390
391static enum XML_Convert_Result PTRCALL
392utf8_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000393 const char **fromP, const char *fromLim,
394 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000395{
Victor Stinner759e30e2017-09-05 01:58:08 +0200396 bool input_incomplete = false;
397 bool output_exhausted = false;
Victor Stinner5ff71322017-06-21 14:39:22 +0200398
Victor Stinner759e30e2017-09-05 01:58:08 +0200399 /* Avoid copying partial characters (due to limited space). */
400 const ptrdiff_t bytesAvailable = fromLim - *fromP;
401 const ptrdiff_t bytesStorable = toLim - *toP;
402 if (bytesAvailable > bytesStorable) {
403 fromLim = *fromP + bytesStorable;
404 output_exhausted = true;
405 }
406
407 /* Avoid copying partial characters (from incomplete input). */
408 const char * const fromLimBefore = fromLim;
Victor Stinner5ff71322017-06-21 14:39:22 +0200409 align_limit_to_full_utf8_characters(*fromP, &fromLim);
Victor Stinner759e30e2017-09-05 01:58:08 +0200410 if (fromLim < fromLimBefore) {
411 input_incomplete = true;
412 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200413
Victor Stinner759e30e2017-09-05 01:58:08 +0200414 const ptrdiff_t bytesToCopy = fromLim - *fromP;
415 memcpy((void *)*toP, (const void *)*fromP, (size_t)bytesToCopy);
416 *fromP += bytesToCopy;
417 *toP += bytesToCopy;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200418
Victor Stinner759e30e2017-09-05 01:58:08 +0200419 if (output_exhausted) // needs to go first
Victor Stinner23ec4b52017-06-15 00:54:36 +0200420 return XML_CONVERT_OUTPUT_EXHAUSTED;
Victor Stinner759e30e2017-09-05 01:58:08 +0200421 else if (input_incomplete)
422 return XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200423 else
Victor Stinner5ff71322017-06-21 14:39:22 +0200424 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000425}
426
Victor Stinner23ec4b52017-06-15 00:54:36 +0200427static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000428utf8_toUtf16(const ENCODING *enc,
429 const char **fromP, const char *fromLim,
430 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000431{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200432 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000433 unsigned short *to = *toP;
434 const char *from = *fromP;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200435 while (from < fromLim && to < toLim) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000436 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
437 case BT_LEAD2:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200438 if (fromLim - from < 2) {
439 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200440 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200441 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000442 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000443 from += 2;
444 break;
445 case BT_LEAD3:
Victor Stinner23ec4b52017-06-15 00:54:36 +0200446 if (fromLim - from < 3) {
447 res = XML_CONVERT_INPUT_INCOMPLETE;
Victor Stinner5ff71322017-06-21 14:39:22 +0200448 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200449 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000450 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
451 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000452 from += 3;
453 break;
454 case BT_LEAD4:
455 {
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000456 unsigned long n;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200457 if (toLim - to < 2) {
458 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000459 goto after;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200460 }
461 if (fromLim - from < 4) {
462 res = XML_CONVERT_INPUT_INCOMPLETE;
463 goto after;
464 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000465 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
466 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
467 n -= 0x10000;
468 to[0] = (unsigned short)((n >> 10) | 0xD800);
469 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
470 to += 2;
471 from += 4;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000472 }
473 break;
474 default:
475 *to++ = *from++;
476 break;
477 }
478 }
Victor Stinner5ff71322017-06-21 14:39:22 +0200479 if (from < fromLim)
480 res = XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000481after:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000482 *fromP = from;
483 *toP = to;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200484 return res;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000485}
486
487#ifdef XML_NS
488static const struct normal_encoding utf8_encoding_ns = {
489 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
490 {
491#include "asciitab.h"
492#include "utf8tab.h"
493 },
494 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
495};
496#endif
497
498static const struct normal_encoding utf8_encoding = {
499 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
500 {
501#define BT_COLON BT_NMSTRT
502#include "asciitab.h"
503#undef BT_COLON
504#include "utf8tab.h"
505 },
506 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
507};
508
509#ifdef XML_NS
510
511static const struct normal_encoding internal_utf8_encoding_ns = {
512 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
513 {
514#include "iasciitab.h"
515#include "utf8tab.h"
516 },
517 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
518};
519
520#endif
521
522static const struct normal_encoding internal_utf8_encoding = {
523 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
524 {
525#define BT_COLON BT_NMSTRT
526#include "iasciitab.h"
527#undef BT_COLON
528#include "utf8tab.h"
529 },
530 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
531};
532
Victor Stinner23ec4b52017-06-15 00:54:36 +0200533static enum XML_Convert_Result PTRCALL
534latin1_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000535 const char **fromP, const char *fromLim,
536 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000537{
538 for (;;) {
539 unsigned char c;
540 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200541 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000542 c = (unsigned char)**fromP;
543 if (c & 0x80) {
544 if (toLim - *toP < 2)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200545 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000546 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
547 *(*toP)++ = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000548 (*fromP)++;
549 }
550 else {
551 if (*toP == toLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +0200552 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000553 *(*toP)++ = *(*fromP)++;
554 }
555 }
556}
557
Victor Stinner23ec4b52017-06-15 00:54:36 +0200558static enum XML_Convert_Result PTRCALL
559latin1_toUtf16(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000560 const char **fromP, const char *fromLim,
561 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000562{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200563 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000564 *(*toP)++ = (unsigned char)*(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200565
566 if ((*toP == toLim) && (*fromP < fromLim))
567 return XML_CONVERT_OUTPUT_EXHAUSTED;
568 else
569 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000570}
571
572#ifdef XML_NS
573
574static const struct normal_encoding latin1_encoding_ns = {
575 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
576 {
577#include "asciitab.h"
578#include "latin1tab.h"
579 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200580 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000581};
582
583#endif
584
585static const struct normal_encoding latin1_encoding = {
586 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
587 {
588#define BT_COLON BT_NMSTRT
589#include "asciitab.h"
590#undef BT_COLON
591#include "latin1tab.h"
592 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200593 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000594};
595
Victor Stinner23ec4b52017-06-15 00:54:36 +0200596static enum XML_Convert_Result PTRCALL
597ascii_toUtf8(const ENCODING *UNUSED_P(enc),
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000598 const char **fromP, const char *fromLim,
599 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000600{
Victor Stinner23ec4b52017-06-15 00:54:36 +0200601 while (*fromP < fromLim && *toP < toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000602 *(*toP)++ = *(*fromP)++;
Victor Stinner23ec4b52017-06-15 00:54:36 +0200603
604 if ((*toP == toLim) && (*fromP < fromLim))
605 return XML_CONVERT_OUTPUT_EXHAUSTED;
606 else
607 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000608}
609
610#ifdef XML_NS
611
612static const struct normal_encoding ascii_encoding_ns = {
613 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
614 {
615#include "asciitab.h"
616/* BT_NONXML == 0 */
617 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200618 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000619};
620
621#endif
622
623static const struct normal_encoding ascii_encoding = {
624 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
625 {
626#define BT_COLON BT_NMSTRT
627#include "asciitab.h"
628#undef BT_COLON
629/* BT_NONXML == 0 */
630 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200631 STANDARD_VTABLE(sb_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000632};
633
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000634static int PTRFASTCALL
635unicode_byte_type(char hi, char lo)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000636{
637 switch ((unsigned char)hi) {
638 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
639 return BT_LEAD4;
640 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
641 return BT_TRAIL;
642 case 0xFF:
643 switch ((unsigned char)lo) {
644 case 0xFF:
645 case 0xFE:
646 return BT_NONXML;
647 }
648 break;
649 }
650 return BT_NONASCII;
651}
652
/* Define E##toUtf8: convert UTF-16 code units from [*fromP, fromLim) to
   UTF-8 in [*toP, toLim).  Byte order comes from the GET_LO/GET_HI macros
   in effect where this macro is expanded.  The input length is first
   rounded down to a whole number of code units.

   The generated function returns XML_CONVERT_OUTPUT_EXHAUSTED when the
   next character does not fit in the output,
   XML_CONVERT_INPUT_INCOMPLETE when a leading surrogate is not followed by
   another code unit, and XML_CONVERT_COMPLETED otherwise.  On the early
   returns *fromP is left at the character that was not converted. */
#define DEFINE_UTF16_TO_UTF8(E) \
static enum XML_Convert_Result  PTRCALL \
E ## toUtf8(const ENCODING *UNUSED_P(enc), \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  fromLim = from + (((fromLim - from) >> 1) << 1);  /* shrink to even */ \
  for (; from < fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return XML_CONVERT_INPUT_INCOMPLETE; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
  if (from < fromLim) \
    return XML_CONVERT_INPUT_INCOMPLETE; \
  else \
    return XML_CONVERT_COMPLETED; \
}
724
/* Define E##toUtf16: copy UTF-16 code units from [*fromP, fromLim) into
   [*toP, toLim), assembling each unit from GET_HI/GET_LO so the result is
   in host order.  The input length is first rounded down to a whole number
   of code units.  If the input is longer than the output and its last unit
   is a leading surrogate, that unit is excluded and the pending result
   becomes XML_CONVERT_INPUT_INCOMPLETE.

   The generated function returns XML_CONVERT_OUTPUT_EXHAUSTED when the
   output is full while input remains, otherwise the pending result. */
#define DEFINE_UTF16_TO_UTF16(E) \
static enum XML_Convert_Result  PTRCALL \
E ## toUtf16(const ENCODING *UNUSED_P(enc), \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1);  /* shrink to even */ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
    fromLim -= 2; \
    res = XML_CONVERT_INPUT_INCOMPLETE; \
  } \
  for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  if ((*toP == toLim) && (*fromP < fromLim)) \
    return XML_CONVERT_OUTPUT_EXHAUSTED; \
  else \
    return res; \
}
746
747#define SET2(ptr, ch) \
748 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
749#define GET_LO(ptr) ((unsigned char)(ptr)[0])
750#define GET_HI(ptr) ((unsigned char)(ptr)[1])
751
752DEFINE_UTF16_TO_UTF8(little2_)
753DEFINE_UTF16_TO_UTF16(little2_)
754
755#undef SET2
756#undef GET_LO
757#undef GET_HI
758
759#define SET2(ptr, ch) \
760 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
761#define GET_LO(ptr) ((unsigned char)(ptr)[1])
762#define GET_HI(ptr) ((unsigned char)(ptr)[0])
763
764DEFINE_UTF16_TO_UTF8(big2_)
765DEFINE_UTF16_TO_UTF16(big2_)
766
767#undef SET2
768#undef GET_LO
769#undef GET_HI
770
/* Character helpers for little-endian two-byte input; (p)[0] is the low
   byte and (p)[1] the high byte.  When the high byte is zero the character
   is classified through the encoding's byte type table, otherwise through
   unicode_byte_type().  All macro arguments are parenthesized so that
   expression arguments are evaluated as a whole. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the character at p is exactly the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
781
782#ifdef XML_MIN_SIZE
783
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000784static int PTRFASTCALL
785little2_byteType(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000786{
787 return LITTLE2_BYTE_TYPE(enc, p);
788}
789
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000790static int PTRFASTCALL
791little2_byteToAscii(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000792{
793 return LITTLE2_BYTE_TO_ASCII(enc, p);
794}
795
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000796static int PTRCALL
797little2_charMatches(const ENCODING *enc, const char *p, int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000798{
799 return LITTLE2_CHAR_MATCHES(enc, p, c);
800}
801
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000802static int PTRFASTCALL
803little2_isNameMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000804{
805 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
806}
807
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000808static int PTRFASTCALL
809little2_isNmstrtMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000810{
811 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
812}
813
814#undef VTABLE
815#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
816
817#else /* not XML_MIN_SIZE */
818
819#undef PREFIX
820#define PREFIX(ident) little2_ ## ident
821#define MINBPC(enc) 2
822/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
823#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000824#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000825#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
826#define IS_NAME_CHAR(enc, p, n) 0
827#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
828#define IS_NMSTRT_CHAR(enc, p, n) (0)
829#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
830
Gregory P. Smith64359d22012-07-14 14:12:35 -0700831#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000832#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700833#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000834
835#undef MINBPC
836#undef BYTE_TYPE
837#undef BYTE_TO_ASCII
838#undef CHAR_MATCHES
839#undef IS_NAME_CHAR
840#undef IS_NAME_CHAR_MINBPC
841#undef IS_NMSTRT_CHAR
842#undef IS_NMSTRT_CHAR_MINBPC
843#undef IS_INVALID_CHAR
844
845#endif /* not XML_MIN_SIZE */
846
847#ifdef XML_NS
848
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000849static const struct normal_encoding little2_encoding_ns = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000850 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000851#if BYTEORDER == 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000852 1
853#else
854 0
855#endif
856 },
857 {
858#include "asciitab.h"
859#include "latin1tab.h"
860 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200861 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000862};
863
864#endif
865
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000866static const struct normal_encoding little2_encoding = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000867 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000868#if BYTEORDER == 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000869 1
870#else
871 0
872#endif
873 },
874 {
875#define BT_COLON BT_NMSTRT
876#include "asciitab.h"
877#undef BT_COLON
878#include "latin1tab.h"
879 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200880 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000881};
882
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000883#if BYTEORDER != 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000884
885#ifdef XML_NS
886
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000887static const struct normal_encoding internal_little2_encoding_ns = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000888 { VTABLE, 2, 0, 1 },
889 {
890#include "iasciitab.h"
891#include "latin1tab.h"
892 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200893 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000894};
895
896#endif
897
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000898static const struct normal_encoding internal_little2_encoding = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000899 { VTABLE, 2, 0, 1 },
900 {
901#define BT_COLON BT_NMSTRT
902#include "iasciitab.h"
903#undef BT_COLON
904#include "latin1tab.h"
905 },
Victor Stinner23ec4b52017-06-15 00:54:36 +0200906 STANDARD_VTABLE(little2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000907};
908
909#endif
910
911
/* Classification helpers for two-byte big-endian input: (p)[0] is the
   high-order byte and (p)[1] the low-order byte of the unit at p.

   Every macro argument is parenthesized so that expression arguments
   (e.g. ptr + 2, or a conditional expression for c) are evaluated as a
   unit.  Note that p is still evaluated more than once. */

/* Byte type: table lookup when the high byte is zero, otherwise
   unicode_byte_type(hi, lo). */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p has a zero high byte and low byte equal to c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name-character / name-start tests via the naming bitmap pages. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
922
923#ifdef XML_MIN_SIZE
924
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000925static int PTRFASTCALL
926big2_byteType(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000927{
928 return BIG2_BYTE_TYPE(enc, p);
929}
930
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000931static int PTRFASTCALL
932big2_byteToAscii(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000933{
934 return BIG2_BYTE_TO_ASCII(enc, p);
935}
936
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000937static int PTRCALL
938big2_charMatches(const ENCODING *enc, const char *p, int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000939{
940 return BIG2_CHAR_MATCHES(enc, p, c);
941}
942
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000943static int PTRFASTCALL
944big2_isNameMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000945{
946 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
947}
948
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000949static int PTRFASTCALL
950big2_isNmstrtMin(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000951{
952 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
953}
954
955#undef VTABLE
956#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
957
958#else /* not XML_MIN_SIZE */
959
960#undef PREFIX
961#define PREFIX(ident) big2_ ## ident
962#define MINBPC(enc) 2
963/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
964#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000965#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000966#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
967#define IS_NAME_CHAR(enc, p, n) 0
968#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
969#define IS_NMSTRT_CHAR(enc, p, n) (0)
970#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
971
Gregory P. Smith64359d22012-07-14 14:12:35 -0700972#define XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000973#include "xmltok_impl.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -0700974#undef XML_TOK_IMPL_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000975
976#undef MINBPC
977#undef BYTE_TYPE
978#undef BYTE_TO_ASCII
979#undef CHAR_MATCHES
980#undef IS_NAME_CHAR
981#undef IS_NAME_CHAR_MINBPC
982#undef IS_NMSTRT_CHAR
983#undef IS_NMSTRT_CHAR_MINBPC
984#undef IS_INVALID_CHAR
985
986#endif /* not XML_MIN_SIZE */
987
988#ifdef XML_NS
989
990static const struct normal_encoding big2_encoding_ns = {
991 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +0000992#if BYTEORDER == 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +0000993 1
994#else
995 0
996#endif
997 },
998 {
999#include "asciitab.h"
1000#include "latin1tab.h"
1001 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001002 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001003};
1004
1005#endif
1006
1007static const struct normal_encoding big2_encoding = {
1008 { VTABLE, 2, 0,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001009#if BYTEORDER == 4321
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001010 1
1011#else
1012 0
1013#endif
1014 },
1015 {
1016#define BT_COLON BT_NMSTRT
1017#include "asciitab.h"
1018#undef BT_COLON
1019#include "latin1tab.h"
1020 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001021 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001022};
1023
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001024#if BYTEORDER != 1234
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001025
1026#ifdef XML_NS
1027
1028static const struct normal_encoding internal_big2_encoding_ns = {
1029 { VTABLE, 2, 0, 1 },
1030 {
1031#include "iasciitab.h"
1032#include "latin1tab.h"
1033 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001034 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001035};
1036
1037#endif
1038
1039static const struct normal_encoding internal_big2_encoding = {
1040 { VTABLE, 2, 0, 1 },
1041 {
1042#define BT_COLON BT_NMSTRT
1043#include "iasciitab.h"
1044#undef BT_COLON
1045#include "latin1tab.h"
1046 },
Victor Stinner23ec4b52017-06-15 00:54:36 +02001047 STANDARD_VTABLE(big2_) NULL_VTABLE
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001048};
1049
1050#endif
1051
1052#undef PREFIX
1053
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001054static int FASTCALL
1055streqci(const char *s1, const char *s2)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001056{
1057 for (;;) {
1058 char c1 = *s1++;
1059 char c2 = *s2++;
1060 if (ASCII_a <= c1 && c1 <= ASCII_z)
1061 c1 += ASCII_A - ASCII_a;
1062 if (ASCII_a <= c2 && c2 <= ASCII_z)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001063 /* The following line will never get executed. streqci() is
1064 * only called from two places, both of which guarantee to put
1065 * upper-case strings into s2.
1066 */
1067 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001068 if (c1 != c2)
1069 return 0;
1070 if (!c1)
1071 break;
1072 }
1073 return 1;
1074}
1075
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001076static void PTRCALL
Victor Stinner23ec4b52017-06-15 00:54:36 +02001077initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001078 const char *end, POSITION *pos)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001079{
1080 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1081}
1082
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001083static int
1084toAscii(const ENCODING *enc, const char *ptr, const char *end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001085{
1086 char buf[1];
1087 char *p = buf;
1088 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1089 if (p == buf)
1090 return -1;
1091 else
1092 return buf[0];
1093}
1094
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001095static int FASTCALL
1096isSpace(int c)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001097{
1098 switch (c) {
1099 case 0x20:
1100 case 0xD:
1101 case 0xA:
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001102 case 0x9:
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001103 return 1;
1104 }
1105 return 0;
1106}
1107
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001108/* Return 1 if there's just optional white space or there's an S
1109 followed by name=val.
1110*/
1111static int
1112parsePseudoAttribute(const ENCODING *enc,
1113 const char *ptr,
1114 const char *end,
1115 const char **namePtr,
1116 const char **nameEndPtr,
1117 const char **valPtr,
1118 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001119{
1120 int c;
1121 char open;
1122 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001123 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001124 return 1;
1125 }
1126 if (!isSpace(toAscii(enc, ptr, end))) {
1127 *nextTokPtr = ptr;
1128 return 0;
1129 }
1130 do {
1131 ptr += enc->minBytesPerChar;
1132 } while (isSpace(toAscii(enc, ptr, end)));
1133 if (ptr == end) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001134 *namePtr = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001135 return 1;
1136 }
1137 *namePtr = ptr;
1138 for (;;) {
1139 c = toAscii(enc, ptr, end);
1140 if (c == -1) {
1141 *nextTokPtr = ptr;
1142 return 0;
1143 }
1144 if (c == ASCII_EQUALS) {
1145 *nameEndPtr = ptr;
1146 break;
1147 }
1148 if (isSpace(c)) {
1149 *nameEndPtr = ptr;
1150 do {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001151 ptr += enc->minBytesPerChar;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001152 } while (isSpace(c = toAscii(enc, ptr, end)));
1153 if (c != ASCII_EQUALS) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001154 *nextTokPtr = ptr;
1155 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001156 }
1157 break;
1158 }
1159 ptr += enc->minBytesPerChar;
1160 }
1161 if (ptr == *namePtr) {
1162 *nextTokPtr = ptr;
1163 return 0;
1164 }
1165 ptr += enc->minBytesPerChar;
1166 c = toAscii(enc, ptr, end);
1167 while (isSpace(c)) {
1168 ptr += enc->minBytesPerChar;
1169 c = toAscii(enc, ptr, end);
1170 }
1171 if (c != ASCII_QUOT && c != ASCII_APOS) {
1172 *nextTokPtr = ptr;
1173 return 0;
1174 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001175 open = (char)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001176 ptr += enc->minBytesPerChar;
1177 *valPtr = ptr;
1178 for (;; ptr += enc->minBytesPerChar) {
1179 c = toAscii(enc, ptr, end);
1180 if (c == open)
1181 break;
1182 if (!(ASCII_a <= c && c <= ASCII_z)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001183 && !(ASCII_A <= c && c <= ASCII_Z)
1184 && !(ASCII_0 <= c && c <= ASCII_9)
1185 && c != ASCII_PERIOD
1186 && c != ASCII_MINUS
1187 && c != ASCII_UNDERSCORE) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001188 *nextTokPtr = ptr;
1189 return 0;
1190 }
1191 }
1192 *nextTokPtr = ptr + enc->minBytesPerChar;
1193 return 1;
1194}
1195
1196static const char KW_version[] = {
1197 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1198};
1199
1200static const char KW_encoding[] = {
1201 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1202};
1203
1204static const char KW_standalone[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001205 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1206 ASCII_n, ASCII_e, '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001207};
1208
1209static const char KW_yes[] = {
1210 ASCII_y, ASCII_e, ASCII_s, '\0'
1211};
1212
1213static const char KW_no[] = {
1214 ASCII_n, ASCII_o, '\0'
1215};
1216
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001217static int
1218doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1219 const char *,
1220 const char *),
1221 int isGeneralTextEntity,
1222 const ENCODING *enc,
1223 const char *ptr,
1224 const char *end,
1225 const char **badPtr,
1226 const char **versionPtr,
1227 const char **versionEndPtr,
1228 const char **encodingName,
1229 const ENCODING **encoding,
1230 int *standalone)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001231{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001232 const char *val = NULL;
1233 const char *name = NULL;
1234 const char *nameEnd = NULL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001235 ptr += 5 * enc->minBytesPerChar;
1236 end -= 2 * enc->minBytesPerChar;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001237 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1238 || !name) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001239 *badPtr = ptr;
1240 return 0;
1241 }
1242 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1243 if (!isGeneralTextEntity) {
1244 *badPtr = name;
1245 return 0;
1246 }
1247 }
1248 else {
1249 if (versionPtr)
1250 *versionPtr = val;
1251 if (versionEndPtr)
1252 *versionEndPtr = ptr;
1253 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1254 *badPtr = ptr;
1255 return 0;
1256 }
1257 if (!name) {
1258 if (isGeneralTextEntity) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001259 /* a TextDecl must have an EncodingDecl */
1260 *badPtr = ptr;
1261 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001262 }
1263 return 1;
1264 }
1265 }
1266 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1267 int c = toAscii(enc, val, end);
1268 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1269 *badPtr = val;
1270 return 0;
1271 }
1272 if (encodingName)
1273 *encodingName = val;
1274 if (encoding)
1275 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1276 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1277 *badPtr = ptr;
1278 return 0;
1279 }
1280 if (!name)
1281 return 1;
1282 }
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001283 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1284 || isGeneralTextEntity) {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001285 *badPtr = name;
1286 return 0;
1287 }
1288 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1289 if (standalone)
1290 *standalone = 1;
1291 }
1292 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1293 if (standalone)
1294 *standalone = 0;
1295 }
1296 else {
1297 *badPtr = val;
1298 return 0;
1299 }
1300 while (isSpace(toAscii(enc, ptr, end)))
1301 ptr += enc->minBytesPerChar;
1302 if (ptr != end) {
1303 *badPtr = ptr;
1304 return 0;
1305 }
1306 return 1;
1307}
1308
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001309static int FASTCALL
1310checkCharRefNumber(int result)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001311{
1312 switch (result >> 8) {
1313 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1314 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1315 return -1;
1316 case 0:
1317 if (latin1_encoding.type[result] == BT_NONXML)
1318 return -1;
1319 break;
1320 case 0xFF:
1321 if (result == 0xFFFE || result == 0xFFFF)
1322 return -1;
1323 break;
1324 }
1325 return result;
1326}
1327
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001328int FASTCALL
1329XmlUtf8Encode(int c, char *buf)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001330{
1331 enum {
1332 /* minN is minimum legal resulting value for N byte sequence */
1333 min2 = 0x80,
1334 min3 = 0x800,
1335 min4 = 0x10000
1336 };
1337
1338 if (c < 0)
Victor Stinner93d0cb52017-08-18 23:43:54 +02001339 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001340 if (c < min2) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001341 buf[0] = (char)(c | UTF8_cval1);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001342 return 1;
1343 }
1344 if (c < min3) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001345 buf[0] = (char)((c >> 6) | UTF8_cval2);
1346 buf[1] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001347 return 2;
1348 }
1349 if (c < min4) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001350 buf[0] = (char)((c >> 12) | UTF8_cval3);
1351 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1352 buf[2] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001353 return 3;
1354 }
1355 if (c < 0x110000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001356 buf[0] = (char)((c >> 18) | UTF8_cval4);
1357 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1358 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1359 buf[3] = (char)((c & 0x3f) | 0x80);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001360 return 4;
1361 }
Victor Stinner93d0cb52017-08-18 23:43:54 +02001362 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001363}
1364
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001365int FASTCALL
1366XmlUtf16Encode(int charNum, unsigned short *buf)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001367{
1368 if (charNum < 0)
1369 return 0;
1370 if (charNum < 0x10000) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001371 buf[0] = (unsigned short)charNum;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001372 return 1;
1373 }
1374 if (charNum < 0x110000) {
1375 charNum -= 0x10000;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001376 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1377 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001378 return 2;
1379 }
1380 return 0;
1381}
1382
1383struct unknown_encoding {
1384 struct normal_encoding normal;
Fred Drake31d485c2004-08-03 07:06:22 +00001385 CONVERTER convert;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001386 void *userData;
1387 unsigned short utf16[256];
1388 char utf8[256][4];
1389};
1390
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001391#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1392
1393int
1394XmlSizeOfUnknownEncoding(void)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001395{
1396 return sizeof(struct unknown_encoding);
1397}
1398
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001399static int PTRFASTCALL
1400unknown_isName(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001401{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001402 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1403 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001404 if (c & ~0xFFFF)
1405 return 0;
1406 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1407}
1408
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001409static int PTRFASTCALL
1410unknown_isNmstrt(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001411{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001412 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1413 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001414 if (c & ~0xFFFF)
1415 return 0;
1416 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1417}
1418
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001419static int PTRFASTCALL
1420unknown_isInvalid(const ENCODING *enc, const char *p)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001421{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001422 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1423 int c = uenc->convert(uenc->userData, p);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001424 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1425}
1426
Victor Stinner23ec4b52017-06-15 00:54:36 +02001427static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001428unknown_toUtf8(const ENCODING *enc,
1429 const char **fromP, const char *fromLim,
1430 char **toP, const char *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001431{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001432 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001433 char buf[XML_UTF8_ENCODE_MAX];
1434 for (;;) {
1435 const char *utf8;
1436 int n;
1437 if (*fromP == fromLim)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001438 return XML_CONVERT_COMPLETED;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001439 utf8 = uenc->utf8[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001440 n = *utf8++;
1441 if (n == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001442 int c = uenc->convert(uenc->userData, *fromP);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001443 n = XmlUtf8Encode(c, buf);
1444 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001445 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001446 utf8 = buf;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001447 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1448 - (BT_LEAD2 - 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001449 }
1450 else {
1451 if (n > toLim - *toP)
Victor Stinner23ec4b52017-06-15 00:54:36 +02001452 return XML_CONVERT_OUTPUT_EXHAUSTED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001453 (*fromP)++;
1454 }
1455 do {
1456 *(*toP)++ = *utf8++;
1457 } while (--n != 0);
1458 }
1459}
1460
Victor Stinner23ec4b52017-06-15 00:54:36 +02001461static enum XML_Convert_Result PTRCALL
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001462unknown_toUtf16(const ENCODING *enc,
1463 const char **fromP, const char *fromLim,
1464 unsigned short **toP, const unsigned short *toLim)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001465{
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001466 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
Victor Stinner23ec4b52017-06-15 00:54:36 +02001467 while (*fromP < fromLim && *toP < toLim) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001468 unsigned short c = uenc->utf16[(unsigned char)**fromP];
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001469 if (c == 0) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001470 c = (unsigned short)
1471 uenc->convert(uenc->userData, *fromP);
1472 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1473 - (BT_LEAD2 - 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001474 }
1475 else
1476 (*fromP)++;
1477 *(*toP)++ = c;
1478 }
Victor Stinner23ec4b52017-06-15 00:54:36 +02001479
1480 if ((*toP == toLim) && (*fromP < fromLim))
1481 return XML_CONVERT_OUTPUT_EXHAUSTED;
1482 else
1483 return XML_CONVERT_COMPLETED;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001484}
1485
1486ENCODING *
1487XmlInitUnknownEncoding(void *mem,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001488 int *table,
Gregory P. Smith64359d22012-07-14 14:12:35 -07001489 CONVERTER convert,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001490 void *userData)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001491{
1492 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001493 struct unknown_encoding *e = (struct unknown_encoding *)mem;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001494 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1495 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1496 for (i = 0; i < 128; i++)
1497 if (latin1_encoding.type[i] != BT_OTHER
1498 && latin1_encoding.type[i] != BT_NONXML
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001499 && table[i] != i)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001500 return 0;
1501 for (i = 0; i < 256; i++) {
1502 int c = table[i];
1503 if (c == -1) {
1504 e->normal.type[i] = BT_MALFORM;
1505 /* This shouldn't really get used. */
1506 e->utf16[i] = 0xFFFF;
1507 e->utf8[i][0] = 1;
1508 e->utf8[i][1] = 0;
1509 }
1510 else if (c < 0) {
1511 if (c < -4)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001512 return 0;
Victor Stinner93d0cb52017-08-18 23:43:54 +02001513 /* Multi-byte sequences need a converter function */
1514 if (!convert)
1515 return 0;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001516 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001517 e->utf8[i][0] = 0;
1518 e->utf16[i] = 0;
1519 }
1520 else if (c < 0x80) {
1521 if (latin1_encoding.type[c] != BT_OTHER
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001522 && latin1_encoding.type[c] != BT_NONXML
1523 && c != i)
1524 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001525 e->normal.type[i] = latin1_encoding.type[c];
1526 e->utf8[i][0] = 1;
1527 e->utf8[i][1] = (char)c;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001528 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001529 }
1530 else if (checkCharRefNumber(c) < 0) {
1531 e->normal.type[i] = BT_NONXML;
1532 /* This shouldn't really get used. */
1533 e->utf16[i] = 0xFFFF;
1534 e->utf8[i][0] = 1;
1535 e->utf8[i][1] = 0;
1536 }
1537 else {
1538 if (c > 0xFFFF)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001539 return 0;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001540 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001541 e->normal.type[i] = BT_NMSTRT;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001542 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001543 e->normal.type[i] = BT_NAME;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001544 else
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001545 e->normal.type[i] = BT_OTHER;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001546 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001547 e->utf16[i] = (unsigned short)c;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001548 }
1549 }
1550 e->userData = userData;
1551 e->convert = convert;
1552 if (convert) {
1553 e->normal.isName2 = unknown_isName;
1554 e->normal.isName3 = unknown_isName;
1555 e->normal.isName4 = unknown_isName;
1556 e->normal.isNmstrt2 = unknown_isNmstrt;
1557 e->normal.isNmstrt3 = unknown_isNmstrt;
1558 e->normal.isNmstrt4 = unknown_isNmstrt;
1559 e->normal.isInvalid2 = unknown_isInvalid;
1560 e->normal.isInvalid3 = unknown_isInvalid;
1561 e->normal.isInvalid4 = unknown_isInvalid;
1562 }
1563 e->normal.enc.utf8Convert = unknown_toUtf8;
1564 e->normal.enc.utf16Convert = unknown_toUtf16;
1565 return &(e->normal.enc);
1566}
1567
/* If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1581
1582static const char KW_ISO_8859_1[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001583 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1584 ASCII_MINUS, ASCII_1, '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001585};
1586static const char KW_US_ASCII[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001587 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1588 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001589};
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001590static const char KW_UTF_8[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001591 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1592};
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001593static const char KW_UTF_16[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001594 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1595};
1596static const char KW_UTF_16BE[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001597 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1598 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001599};
1600static const char KW_UTF_16LE[] = {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001601 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
1602 '\0'
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001603};
1604
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001605static int FASTCALL
1606getEncodingIndex(const char *name)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001607{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001608 static const char * const encodingNames[] = {
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001609 KW_ISO_8859_1,
1610 KW_US_ASCII,
1611 KW_UTF_8,
1612 KW_UTF_16,
1613 KW_UTF_16BE,
1614 KW_UTF_16LE,
1615 };
1616 int i;
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001617 if (name == NULL)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001618 return NO_ENC;
1619 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1620 if (streqci(name, encodingNames[i]))
1621 return i;
1622 return UNKNOWN_ENC;
1623}
1624
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

/* Read the stored index back as an int. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store index i; the argument is parenthesized so that the cast applies
   to the whole expression passed in. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1631
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001632/* This is what detects the encoding. encodingTable maps from
1633 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1634 the external (protocol) specified encoding; state is
1635 XML_CONTENT_STATE if we're parsing an external text entity, and
1636 XML_PROLOG_STATE otherwise.
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001637*/
1638
1639
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001640static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001641initScan(const ENCODING * const *encodingTable,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001642 const INIT_ENCODING *enc,
1643 int state,
1644 const char *ptr,
1645 const char *end,
1646 const char **nextTokPtr)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001647{
1648 const ENCODING **encPtr;
1649
Victor Stinner23ec4b52017-06-15 00:54:36 +02001650 if (ptr >= end)
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001651 return XML_TOK_NONE;
1652 encPtr = enc->encPtr;
1653 if (ptr + 1 == end) {
1654 /* only a single byte available for auto-detection */
1655#ifndef XML_DTD /* FIXME */
1656 /* a well-formed document entity must have more than one byte */
1657 if (state != XML_CONTENT_STATE)
1658 return XML_TOK_PARTIAL;
1659#endif
1660 /* so we're parsing an external text entity... */
1661 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1662 switch (INIT_ENC_INDEX(enc)) {
1663 case UTF_16_ENC:
1664 case UTF_16LE_ENC:
1665 case UTF_16BE_ENC:
1666 return XML_TOK_PARTIAL;
1667 }
1668 switch ((unsigned char)*ptr) {
1669 case 0xFE:
1670 case 0xFF:
1671 case 0xEF: /* possibly first byte of UTF-8 BOM */
1672 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001673 && state == XML_CONTENT_STATE)
1674 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001675 /* fall through */
1676 case 0x00:
1677 case 0x3C:
1678 return XML_TOK_PARTIAL;
1679 }
1680 }
1681 else {
1682 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1683 case 0xFEFF:
1684 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001685 && state == XML_CONTENT_STATE)
1686 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001687 *nextTokPtr = ptr + 2;
1688 *encPtr = encodingTable[UTF_16BE_ENC];
1689 return XML_TOK_BOM;
1690 /* 00 3C is handled in the default case */
1691 case 0x3C00:
1692 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001693 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1694 && state == XML_CONTENT_STATE)
1695 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001696 *encPtr = encodingTable[UTF_16LE_ENC];
1697 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1698 case 0xFFFE:
1699 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001700 && state == XML_CONTENT_STATE)
1701 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001702 *nextTokPtr = ptr + 2;
1703 *encPtr = encodingTable[UTF_16LE_ENC];
1704 return XML_TOK_BOM;
1705 case 0xEFBB:
1706 /* Maybe a UTF-8 BOM (EF BB BF) */
1707 /* If there's an explicitly specified (external) encoding
1708 of ISO-8859-1 or some flavour of UTF-16
1709 and this is an external text entity,
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001710 don't look for the BOM,
1711 because it might be a legal data.
1712 */
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001713 if (state == XML_CONTENT_STATE) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001714 int e = INIT_ENC_INDEX(enc);
1715 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1716 || e == UTF_16LE_ENC || e == UTF_16_ENC)
1717 break;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001718 }
1719 if (ptr + 2 == end)
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001720 return XML_TOK_PARTIAL;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001721 if ((unsigned char)ptr[2] == 0xBF) {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001722 *nextTokPtr = ptr + 3;
1723 *encPtr = encodingTable[UTF_8_ENC];
1724 return XML_TOK_BOM;
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001725 }
1726 break;
1727 default:
1728 if (ptr[0] == '\0') {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001729 /* 0 isn't a legal data character. Furthermore a document
1730 entity can only start with ASCII characters. So the only
Benjamin Peterson196d7db2016-06-11 13:28:56 -07001731 way this can fail to be big-endian UTF-16 if it it's an
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001732 external parsed general entity that's labelled as
1733 UTF-16LE.
1734 */
1735 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1736 break;
1737 *encPtr = encodingTable[UTF_16BE_ENC];
1738 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001739 }
1740 else if (ptr[1] == '\0') {
Martin v. Löwisfc03a942003-01-25 22:41:29 +00001741 /* We could recover here in the case:
1742 - parsing an external entity
1743 - second byte is 0
1744 - no externally specified encoding
1745 - no encoding declaration
1746 by assuming UTF-16LE. But we don't, because this would mean when
1747 presented just with a single byte, we couldn't reliably determine
1748 whether we needed further bytes.
1749 */
1750 if (state == XML_CONTENT_STATE)
1751 break;
1752 *encPtr = encodingTable[UTF_16LE_ENC];
1753 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001754 }
1755 break;
1756 }
1757 }
1758 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1759 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1760}
1761
1762
1763#define NS(x) x
1764#define ns(x) x
Gregory P. Smith64359d22012-07-14 14:12:35 -07001765#define XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001766#include "xmltok_ns.c"
Gregory P. Smith64359d22012-07-14 14:12:35 -07001767#undef XML_TOK_NS_C
Martin v. Löwis1dbb1ca2002-02-11 23:13:04 +00001768#undef NS
1769#undef ns
1770
#ifdef XML_NS

/* Second inclusion of xmltok_ns.c: NS() appends "NS" and ns() appends
   "_ns" to the name passed to them. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C

#undef NS
#undef ns

/* Variant of XmlInitUnknownEncoding() for XML_NS builds.

   Forwards all arguments to XmlInitUnknownEncoding().  When that call
   yields a non-NULL encoding, the byte type recorded for ASCII_COLON is
   set to BT_COLON.  Returns whatever XmlInitUnknownEncoding() returned.
*/
ENCODING *
XmlInitUnknownEncodingNS(void *mem,
                         int *table,
                         CONVERTER convert,
                         void *userData)
{
  ENCODING *created = XmlInitUnknownEncoding(mem, table, convert, userData);

  if (created != NULL) {
    struct normal_encoding *normal = (struct normal_encoding *)created;

    normal->type[ASCII_COLON] = BT_COLON;
  }
  return created;
}

#endif /* XML_NS */