/**
 * Test the UTF-8 decoding routines
 *
 * author: Daniel Veillard
 * copy: see Copyright for the status of this software.
 */
7
8#include <stdio.h>
9#include <string.h>
10#include <libxml/parser.h>
11#include <libxml/parserInternals.h>
12
13int lastError;
14
15static void errorHandler(void *unused, xmlErrorPtr err) {
16 if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
17 lastError = err->code;
18 }
19}
20
/*
 * Document templates used by testDocumentRanges(). The "XXXX" run is a
 * 4 byte scratch area that gets overwritten with the bytes under test:
 * it sits in element content in document1 (offset 5) and inside an
 * attribute value in document2 (offset 10).
 */
char document1[100] = "<doc>XXXX</doc>";
char document2[100] = "<doc foo='XXXX'/>";
23
24static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
25 int len, char *data, int forbid1, int forbid2) {
26 int i;
27 xmlDocPtr res;
28
29 for (i = 0;i <= 0xFF;i++) {
30 lastError = 0;
31 xmlCtxtReset(ctxt);
32
33 data[0] = i;
34
35 res = xmlReadMemory(document, len, "test", NULL, 0);
36
37 if ((i == forbid1) || (i == forbid2)) {
38 if ((lastError == 0) || (res != NULL))
39 fprintf(stderr,
40 "Failed to detect invalid char for Byte 0x%02X: %c\n",
41 i, i);
42 }
43
44 else if ((i == '<') || (i == '&')) {
45 if ((lastError == 0) || (res != NULL))
46 fprintf(stderr,
47 "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
48 }
49 else if (((i < 0x20) || (i >= 0x80)) &&
50 (i != 0x9) && (i != 0xA) && (i != 0xD)) {
51 if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL))
52 fprintf(stderr,
53 "Failed to detect invalid char for Byte 0x%02X\n", i);
54 }
55 else if (res == NULL) {
56 fprintf(stderr,
57 "Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
58 }
59 if (res != NULL)
60 xmlFreeDoc(res);
61 }
62}
63
64static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
65 int len, char *data) {
66 int i, j;
67 xmlDocPtr res;
68
69 for (i = 0x80;i <= 0xFF;i++) {
70 for (j = 0;j <= 0xFF;j++) {
71 lastError = 0;
72 xmlCtxtReset(ctxt);
73
74 data[0] = i;
75 data[1] = j;
76
77 res = xmlReadMemory(document, len, "test", NULL, 0);
78
79 /* if first bit of first char is set, then second bit must too */
80 if ((i & 0x80) && ((i & 0x40) == 0)) {
81 if ((lastError == 0) || (res != NULL))
82 fprintf(stderr,
83 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
84 i, j);
85 }
86
87 /*
88 * if first bit of first char is set, then second char first
89 * bits must be 10
90 */
91 else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
92 if ((lastError == 0) || (res != NULL))
93 fprintf(stderr,
94 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
95 i, j);
96 }
97
98 /*
99 * if using a 2 byte encoding then the value must be greater
100 * than 0x80, i.e. one of bits 5 to 1 of i must be set
101 */
102 else if ((i & 0x80) && ((i & 0x1E) == 0)) {
103 if ((lastError == 0) || (res != NULL))
104 fprintf(stderr,
105 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
106 i, j);
107 }
108
109 /*
110 * if third bit of first char is set, then the sequence would need
111 * at least 3 bytes, but we give only 2 !
112 */
113 else if ((i & 0xE0) == 0xE0) {
114 if ((lastError == 0) || (res != NULL))
115 fprintf(stderr,
116 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
117 i, j);
118 }
119
120 /*
121 * We should see no error in remaning cases
122 */
123 else if ((lastError != 0) || (res == NULL)) {
124 fprintf(stderr,
125 "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
126 }
127 if (res != NULL)
128 xmlFreeDoc(res);
129 }
130 }
131}
132
133/**
134 * testDocumentRanges:
135 *
136 * Test the correct UTF8 character parsing in context of XML documents
137 * Those are in-context injection tests checking the parser behaviour on
138 * edge case values at different point in content, beginning and end of
139 * CDATA in text or in attribute values.
140 */
141
142static void testDocumentRanges(void) {
143 xmlParserCtxtPtr ctxt;
144 char *data;
145
146 /*
147 * Set up a parsing context using the first document as
148 * the current input source.
149 */
150 ctxt = xmlNewParserCtxt();
151 if (ctxt == NULL) {
152 fprintf(stderr, "Failed to allocate parser context\n");
153 return;
154 }
155
156 printf("testing 1 byte char in document: 1");
157 fflush(stdout);
158 data = &document1[5];
159 data[0] = ' ';
160 data[1] = ' ';
161 data[2] = ' ';
162 data[3] = ' ';
163 /* test 1 byte injection at beginning of area */
164 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
165 data, -1, -1);
166 printf(" 2");
167 fflush(stdout);
168 data[0] = ' ';
169 data[1] = ' ';
170 data[2] = ' ';
171 data[3] = ' ';
172 /* test 1 byte injection at end of area */
173 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
174 data + 3, -1, -1);
175
176 printf(" 3");
177 fflush(stdout);
178 data = &document2[10];
179 data[0] = ' ';
180 data[1] = ' ';
181 data[2] = ' ';
182 data[3] = ' ';
183 /* test 1 byte injection at beginning of area */
184 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
185 data, '\'', -1);
186 printf(" 4");
187 fflush(stdout);
188 data[0] = ' ';
189 data[1] = ' ';
190 data[2] = ' ';
191 data[3] = ' ';
192 /* test 1 byte injection at end of area */
193 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
194 data + 3, '\'', -1);
195 printf(" done\n");
196
197 printf("testing 2 byte char in document: 1");
198 fflush(stdout);
199 data = &document1[5];
200 data[0] = ' ';
201 data[1] = ' ';
202 data[2] = ' ';
203 data[3] = ' ';
204 /* test 2 byte injection at beginning of area */
205 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
206 data);
207 printf(" 2");
208 fflush(stdout);
209 data[0] = ' ';
210 data[1] = ' ';
211 data[2] = ' ';
212 data[3] = ' ';
213 /* test 2 byte injection at end of area */
214 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
215 data + 2);
216
217 printf(" 3");
218 fflush(stdout);
219 data = &document2[10];
220 data[0] = ' ';
221 data[1] = ' ';
222 data[2] = ' ';
223 data[3] = ' ';
224 /* test 2 byte injection at beginning of area */
225 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
226 data);
227 printf(" 4");
228 fflush(stdout);
229 data[0] = ' ';
230 data[1] = ' ';
231 data[2] = ' ';
232 data[3] = ' ';
233 /* test 2 byte injection at end of area */
234 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
235 data + 2);
236 printf(" done\n");
237
238 xmlFreeParserCtxt(ctxt);
239}
240
241static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
242 int i = 0;
243 int len, c;
244
245 data[1] = 0;
246 data[2] = 0;
247 data[3] = 0;
248 for (i = 0;i <= 0xFF;i++) {
249 data[0] = i;
250 ctxt->charset = XML_CHAR_ENCODING_UTF8;
251
252 lastError = 0;
253 c = xmlCurrentChar(ctxt, &len);
254 if ((i == 0) || (i >= 0x80)) {
255 /* we must see an error there */
256 if (lastError != XML_ERR_INVALID_CHAR)
257 fprintf(stderr,
258 "Failed to detect invalid char for Byte 0x%02X\n", i);
259 } else if (i == 0xD) {
260 if ((c != 0xA) || (len != 1))
261 fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i);
262 } else if ((c != i) || (len != 1)) {
263 fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i);
264 }
265 }
266}
267
268static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
269 int i, j;
270 int len, c;
271
272 data[2] = 0;
273 data[3] = 0;
274 for (i = 0x80;i <= 0xFF;i++) {
275 for (j = 0;j <= 0xFF;j++) {
276 data[0] = i;
277 data[1] = j;
278 ctxt->charset = XML_CHAR_ENCODING_UTF8;
279
280 lastError = 0;
281 c = xmlCurrentChar(ctxt, &len);
282
283 /* if first bit of first char is set, then second bit must too */
284 if ((i & 0x80) && ((i & 0x40) == 0)) {
285 if (lastError != XML_ERR_INVALID_CHAR)
286 fprintf(stderr,
287 "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
288 i, j);
289 }
290
291 /*
292 * if first bit of first char is set, then second char first
293 * bits must be 10
294 */
295 else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
296 if (lastError != XML_ERR_INVALID_CHAR)
297 fprintf(stderr,
298 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
299 i, j, c);
300 }
301
302 /*
303 * if using a 2 byte encoding then the value must be greater
304 * than 0x80, i.e. one of bits 5 to 1 of i must be set
305 */
306 else if ((i & 0x80) && ((i & 0x1E) == 0)) {
307 if (lastError != XML_ERR_INVALID_CHAR)
308 fprintf(stderr,
309 "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
310 i, j, c);
311 }
312
313 /*
314 * if third bit of first char is set, then the sequence would need
315 * at least 3 bytes, but we give only 2 !
316 */
317 else if ((i & 0xE0) == 0xE0) {
318 if (lastError != XML_ERR_INVALID_CHAR)
319 fprintf(stderr,
320 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
321 i, j);
322 }
323
324 /*
325 * We should see no error in remaning cases
326 */
327 else if ((lastError != 0) || (len != 2)) {
328 fprintf(stderr,
329 "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j);
330 }
331
332 /*
333 * Finally check the value is right
334 */
335 else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) {
336 fprintf(stderr,
337 "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
338 i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c);
339 }
340 }
341 }
342}
343
344static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
345 int i, j, k, K;
346 int len, c;
347 unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
348 int value;
349
350 data[3] = 0;
351 for (i = 0xE0;i <= 0xFF;i++) {
352 for (j = 0;j <= 0xFF;j++) {
353 for (k = 0;k < 6;k++) {
354 data[0] = i;
355 data[1] = j;
356 K = lows[k];
357 data[2] = (char) K;
358 value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
359 ctxt->charset = XML_CHAR_ENCODING_UTF8;
360
361 lastError = 0;
362 c = xmlCurrentChar(ctxt, &len);
363
364 /*
365 * if fourth bit of first char is set, then the sequence would need
366 * at least 4 bytes, but we give only 3 !
367 */
368 if ((i & 0xF0) == 0xF0) {
369 if (lastError != XML_ERR_INVALID_CHAR)
370 fprintf(stderr,
371 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
372 i, j, K, data[3]);
373 }
374
375 /*
376 * The second and the third bytes must start with 10
377 */
378 else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
379 if (lastError != XML_ERR_INVALID_CHAR)
380 fprintf(stderr,
381 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
382 i, j, K);
383 }
384
385 /*
386 * if using a 3 byte encoding then the value must be greater
387 * than 0x800, i.e. one of bits 4 to 0 of i must be set or
388 * the 6th byte of data[1] must be set
389 */
390 else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
391 if (lastError != XML_ERR_INVALID_CHAR)
392 fprintf(stderr,
393 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
394 i, j, K);
395 }
396
397 /*
398 * There are values in that range that are not allowed in XML-1.0
399 */
400 else if (((value > 0xD7FF) && (value <0xE000)) ||
401 ((value > 0xFFFD) && (value <0x10000))) {
402 if (lastError != XML_ERR_INVALID_CHAR)
403 fprintf(stderr,
404 "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
405 value, i, j, K);
406 }
407
408 /*
409 * We should see no error in remaining cases
410 */
411 else if ((lastError != 0) || (len != 3)) {
412 fprintf(stderr,
413 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
414 i, j, K);
415 }
416
417 /*
418 * Finally check the value is right
419 */
420 else if (c != value) {
421 fprintf(stderr,
422 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
423 i, j, data[2], value, c);
424 }
425 }
426 }
427 }
428}
429
430static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
431 int i, j, k, K, l, L;
432 int len, c;
433 unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
434 int value;
435
436 data[4] = 0;
437 for (i = 0xF0;i <= 0xFF;i++) {
438 for (j = 0;j <= 0xFF;j++) {
439 for (k = 0;k < 6;k++) {
440 for (l = 0;l < 6;l++) {
441 data[0] = i;
442 data[1] = j;
443 K = lows[k];
444 data[2] = (char) K;
445 L = lows[l];
446 data[3] = (char) L;
447 value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
448 ((i & 0x7) << 18);
449 ctxt->charset = XML_CHAR_ENCODING_UTF8;
450
451 lastError = 0;
452 c = xmlCurrentChar(ctxt, &len);
453
454 /*
455 * if fifth bit of first char is set, then the sequence would need
456 * at least 5 bytes, but we give only 4 !
457 */
458 if ((i & 0xF8) == 0xF8) {
459 if (lastError != XML_ERR_INVALID_CHAR)
460 fprintf(stderr,
461 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
462 i, j, K, data[3]);
463 }
464
465 /*
466 * The second, third and fourth bytes must start with 10
467 */
468 else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
469 ((L & 0xC0) != 0x80)) {
470 if (lastError != XML_ERR_INVALID_CHAR)
471 fprintf(stderr,
472 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
473 i, j, K, L);
474 }
475
476 /*
477 * if using a 3 byte encoding then the value must be greater
478 * than 0x10000, i.e. one of bits 3 to 0 of i must be set or
479 * the 6 or 5th byte of j must be set
480 */
481 else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
482 if (lastError != XML_ERR_INVALID_CHAR)
483 fprintf(stderr,
484 "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
485 i, j, K, L);
486 }
487
488 /*
489 * There are values in that range that are not allowed in XML-1.0
490 */
491 else if (((value > 0xD7FF) && (value <0xE000)) ||
492 ((value > 0xFFFD) && (value <0x10000)) ||
493 (value > 0x10FFFF)) {
494 if (lastError != XML_ERR_INVALID_CHAR)
495 fprintf(stderr,
496"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
497 value, i, j, K, L);
498 }
499
500 /*
501 * We should see no error in remaining cases
502 */
503 else if ((lastError != 0) || (len != 4)) {
504 fprintf(stderr,
505 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
506 i, j, K);
507 }
508
509 /*
510 * Finally check the value is right
511 */
512 else if (c != value) {
513 fprintf(stderr,
514 "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
515 i, j, data[2], value, c);
516 }
517 }
518 }
519 }
520 }
521}
522
523/**
524 * testCharRanges:
525 *
526 * Test the correct UTF8 character parsing in isolation i.e.
527 * not when parsing a full document, this is less expensive and we can
528 * cover the full range of UTF-8 chars accepted by XML-1.0
529 */
530
531static void testCharRanges(void) {
532 char data[5];
533 xmlParserCtxtPtr ctxt;
534 xmlParserInputBufferPtr buf;
535 xmlParserInputPtr input;
536
537 memset(data, 0, 5);
538
539 /*
540 * Set up a parsing context using the above data buffer as
541 * the current input source.
542 */
543 ctxt = xmlNewParserCtxt();
544 if (ctxt == NULL) {
545 fprintf(stderr, "Failed to allocate parser context\n");
546 return;
547 }
548 buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
549 XML_CHAR_ENCODING_NONE);
550 if (buf == NULL) {
551 fprintf(stderr, "Failed to allocate input buffer\n");
552 goto error;
553 }
554 input = xmlNewInputStream(ctxt);
555 if (input == NULL) {
556 xmlFreeParserInputBuffer(buf);
557 goto error;
558 }
559 input->filename = NULL;
560 input->buf = buf;
561 input->base = input->buf->buffer->content;
562 input->cur = input->buf->buffer->content;
563 input->end = &input->buf->buffer->content[4];
564 inputPush(ctxt, input);
565
566 printf("testing char range: 1");
567 fflush(stdout);
568 testCharRangeByte1(ctxt, data);
569 printf(" 2");
570 fflush(stdout);
571 testCharRangeByte2(ctxt, data);
572 printf(" 3");
573 fflush(stdout);
574 testCharRangeByte3(ctxt, data);
575 printf(" 4");
576 fflush(stdout);
577 testCharRangeByte4(ctxt, data);
578 printf(" done\n");
579 fflush(stdout);
580
581error:
582 xmlFreeParserCtxt(ctxt);
583}
584
585int main(void) {
586
587 /*
588 * this initialize the library and check potential ABI mismatches
589 * between the version it was compiled for and the actual shared
590 * library used.
591 */
592 LIBXML_TEST_VERSION
593
594 /*
595 * Catch errors separately
596 */
597
598 xmlSetStructuredErrorFunc(NULL, errorHandler);
599
600 /*
601 * Run the tests
602 */
603 testCharRanges();
604 testDocumentRanges();
605
606 /*
607 * Cleanup function for the XML library.
608 */
609 xmlCleanupParser();
610 /*
611 * this is to debug memory for regression tests
612 */
613 xmlMemoryDump();
614 return(0);
615}