blob: 4f3b373918f4327aa992cd9df8a5e8a1e1199f28 [file] [log] [blame]
William M. Bracka2e844a2004-01-06 11:52:13 +00001/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 * *
27 * Commodity functions to handle xmlChars *
28 * *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
43
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
88
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
Daniel Veillard5ea30d72004-11-08 11:54:28 +000095 for (i = 0;i < len;i++) {
William M. Bracka2e844a2004-01-06 11:52:13 +000096 ret[i] = (xmlChar) cur[i];
Daniel Veillard5ea30d72004-11-08 11:54:28 +000097 if (ret[i] == 0) return(ret);
98 }
William M. Bracka2e844a2004-01-06 11:52:13 +000099 ret[len] = 0;
100 return(ret);
101}
102
103/**
104 * xmlCharStrdup:
105 * @cur: the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112xmlChar *
113xmlCharStrdup(const char *cur) {
114 const char *p = cur;
115
116 if (cur == NULL) return(NULL);
117 while (*p != '\0') p++; /* non input consuming */
118 return(xmlCharStrndup(cur, p - cur));
119}
120
121/**
122 * xmlStrcmp:
123 * @str1: the first xmlChar *
124 * @str2: the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131int
132xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133 register int tmp;
134
135 if (str1 == str2) return(0);
136 if (str1 == NULL) return(-1);
137 if (str2 == NULL) return(1);
138 do {
139 tmp = *str1++ - *str2;
140 if (tmp != 0) return(tmp);
141 } while (*str2++ != 0);
142 return 0;
143}
144
145/**
146 * xmlStrEqual:
147 * @str1: the first xmlChar *
148 * @str2: the second xmlChar *
149 *
Daniel Veillardd95ecf02005-12-22 14:58:32 +0000150 * Check if both strings are equal of have same content.
Daniel Veillard6a0baa02005-12-10 11:11:12 +0000151 * Should be a bit more readable and faster than xmlStrcmp()
William M. Bracka2e844a2004-01-06 11:52:13 +0000152 *
153 * Returns 1 if they are equal, 0 if they are different
154 */
155
156int
157xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158 if (str1 == str2) return(1);
159 if (str1 == NULL) return(0);
160 if (str2 == NULL) return(0);
161 do {
162 if (*str1++ != *str2) return(0);
163 } while (*str2++);
164 return(1);
165}
166
167/**
168 * xmlStrQEqual:
169 * @pref: the prefix of the QName
170 * @name: the localname of the QName
171 * @str: the second xmlChar *
172 *
173 * Check if a QName is Equal to a given string
174 *
175 * Returns 1 if they are equal, 0 if they are different
176 */
177
178int
179xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180 if (pref == NULL) return(xmlStrEqual(name, str));
181 if (name == NULL) return(0);
182 if (str == NULL) return(0);
183
184 do {
185 if (*pref++ != *str) return(0);
186 } while ((*str++) && (*pref));
187 if (*str++ != ':') return(0);
188 do {
189 if (*name++ != *str) return(0);
190 } while (*str++);
191 return(1);
192}
193
194/**
195 * xmlStrncmp:
196 * @str1: the first xmlChar *
197 * @str2: the second xmlChar *
198 * @len: the max comparison length
199 *
200 * a strncmp for xmlChar's
201 *
202 * Returns the integer result of the comparison
203 */
204
205int
206xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207 register int tmp;
208
209 if (len <= 0) return(0);
210 if (str1 == str2) return(0);
211 if (str1 == NULL) return(-1);
212 if (str2 == NULL) return(1);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000213#ifdef __GNUC__
William M. Brackb7b54de2004-10-06 16:38:01 +0000214 tmp = strncmp((const char *)str1, (const char *)str2, len);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000215 return tmp;
216#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000217 do {
218 tmp = *str1++ - *str2;
219 if (tmp != 0 || --len == 0) return(tmp);
220 } while (*str2++ != 0);
221 return 0;
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000222#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000223}
224
225static const xmlChar casemap[256] = {
226 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
258};
259
260/**
261 * xmlStrcasecmp:
262 * @str1: the first xmlChar *
263 * @str2: the second xmlChar *
264 *
265 * a strcasecmp for xmlChar's
266 *
267 * Returns the integer result of the comparison
268 */
269
270int
271xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272 register int tmp;
273
274 if (str1 == str2) return(0);
275 if (str1 == NULL) return(-1);
276 if (str2 == NULL) return(1);
277 do {
278 tmp = casemap[*str1++] - casemap[*str2];
279 if (tmp != 0) return(tmp);
280 } while (*str2++ != 0);
281 return 0;
282}
283
284/**
285 * xmlStrncasecmp:
286 * @str1: the first xmlChar *
287 * @str2: the second xmlChar *
288 * @len: the max comparison length
289 *
290 * a strncasecmp for xmlChar's
291 *
292 * Returns the integer result of the comparison
293 */
294
295int
296xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297 register int tmp;
298
299 if (len <= 0) return(0);
300 if (str1 == str2) return(0);
301 if (str1 == NULL) return(-1);
302 if (str2 == NULL) return(1);
303 do {
304 tmp = casemap[*str1++] - casemap[*str2];
305 if (tmp != 0 || --len == 0) return(tmp);
306 } while (*str2++ != 0);
307 return 0;
308}
309
310/**
311 * xmlStrchr:
312 * @str: the xmlChar * array
313 * @val: the xmlChar to search
314 *
315 * a strchr for xmlChar's
316 *
317 * Returns the xmlChar * for the first occurrence or NULL.
318 */
319
320const xmlChar *
321xmlStrchr(const xmlChar *str, xmlChar val) {
322 if (str == NULL) return(NULL);
323 while (*str != 0) { /* non input consuming */
324 if (*str == val) return((xmlChar *) str);
325 str++;
326 }
327 return(NULL);
328}
329
330/**
331 * xmlStrstr:
332 * @str: the xmlChar * array (haystack)
333 * @val: the xmlChar to search (needle)
334 *
335 * a strstr for xmlChar's
336 *
337 * Returns the xmlChar * for the first occurrence or NULL.
338 */
339
340const xmlChar *
341xmlStrstr(const xmlChar *str, const xmlChar *val) {
342 int n;
343
344 if (str == NULL) return(NULL);
345 if (val == NULL) return(NULL);
346 n = xmlStrlen(val);
347
348 if (n == 0) return(str);
349 while (*str != 0) { /* non input consuming */
350 if (*str == *val) {
351 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352 }
353 str++;
354 }
355 return(NULL);
356}
357
358/**
359 * xmlStrcasestr:
360 * @str: the xmlChar * array (haystack)
361 * @val: the xmlChar to search (needle)
362 *
363 * a case-ignoring strstr for xmlChar's
364 *
365 * Returns the xmlChar * for the first occurrence or NULL.
366 */
367
368const xmlChar *
369xmlStrcasestr(const xmlChar *str, xmlChar *val) {
370 int n;
371
372 if (str == NULL) return(NULL);
373 if (val == NULL) return(NULL);
374 n = xmlStrlen(val);
375
376 if (n == 0) return(str);
377 while (*str != 0) { /* non input consuming */
378 if (casemap[*str] == casemap[*val])
379 if (!xmlStrncasecmp(str, val, n)) return(str);
380 str++;
381 }
382 return(NULL);
383}
384
385/**
386 * xmlStrsub:
387 * @str: the xmlChar * array (haystack)
388 * @start: the index of the first char (zero based)
389 * @len: the length of the substring
390 *
391 * Extract a substring of a given string
392 *
393 * Returns the xmlChar * for the first occurrence or NULL.
394 */
395
396xmlChar *
397xmlStrsub(const xmlChar *str, int start, int len) {
398 int i;
399
400 if (str == NULL) return(NULL);
401 if (start < 0) return(NULL);
402 if (len < 0) return(NULL);
403
404 for (i = 0;i < start;i++) {
405 if (*str == 0) return(NULL);
406 str++;
407 }
408 if (*str == 0) return(NULL);
409 return(xmlStrndup(str, len));
410}
411
412/**
413 * xmlStrlen:
414 * @str: the xmlChar * array
415 *
416 * length of a xmlChar's string
417 *
418 * Returns the number of xmlChar contained in the ARRAY.
419 */
420
421int
422xmlStrlen(const xmlChar *str) {
423 int len = 0;
424
425 if (str == NULL) return(0);
426 while (*str != 0) { /* non input consuming */
427 str++;
428 len++;
429 }
430 return(len);
431}
432
433/**
434 * xmlStrncat:
435 * @cur: the original xmlChar * array
436 * @add: the xmlChar * array added
437 * @len: the length of @add
438 *
439 * a strncat for array of xmlChar's, it will extend @cur with the len
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000440 * first bytes of @add. Note that if @len < 0 then this is an API error
441 * and NULL will be returned.
William M. Bracka2e844a2004-01-06 11:52:13 +0000442 *
443 * Returns a new xmlChar *, the original @cur is reallocated if needed
444 * and should not be freed
445 */
446
447xmlChar *
448xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449 int size;
450 xmlChar *ret;
451
452 if ((add == NULL) || (len == 0))
453 return(cur);
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000454 if (len < 0)
455 return(NULL);
William M. Bracka2e844a2004-01-06 11:52:13 +0000456 if (cur == NULL)
457 return(xmlStrndup(add, len));
458
459 size = xmlStrlen(cur);
460 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
461 if (ret == NULL) {
462 xmlErrMemory(NULL, NULL);
463 return(cur);
464 }
465 memcpy(&ret[size], add, len * sizeof(xmlChar));
466 ret[size + len] = 0;
467 return(ret);
468}
469
470/**
471 * xmlStrncatNew:
472 * @str1: first xmlChar string
473 * @str2: second xmlChar string
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000474 * @len: the len of @str2 or < 0
William M. Bracka2e844a2004-01-06 11:52:13 +0000475 *
476 * same as xmlStrncat, but creates a new string. The original
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000477 * two strings are not freed. If @len is < 0 then the length
478 * will be calculated automatically.
William M. Bracka2e844a2004-01-06 11:52:13 +0000479 *
480 * Returns a new xmlChar * or NULL
481 */
482xmlChar *
483xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
484 int size;
485 xmlChar *ret;
486
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000487 if (len < 0)
488 len = xmlStrlen(str2);
William M. Bracka2e844a2004-01-06 11:52:13 +0000489 if ((str2 == NULL) || (len == 0))
490 return(xmlStrdup(str1));
491 if (str1 == NULL)
492 return(xmlStrndup(str2, len));
493
494 size = xmlStrlen(str1);
495 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
496 if (ret == NULL) {
497 xmlErrMemory(NULL, NULL);
498 return(xmlStrndup(str1, size));
499 }
500 memcpy(ret, str1, size * sizeof(xmlChar));
501 memcpy(&ret[size], str2, len * sizeof(xmlChar));
502 ret[size + len] = 0;
503 return(ret);
504}
505
506/**
507 * xmlStrcat:
508 * @cur: the original xmlChar * array
509 * @add: the xmlChar * array added
510 *
511 * a strcat for array of xmlChar's. Since they are supposed to be
512 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
513 * a termination mark of '0'.
514 *
515 * Returns a new xmlChar * containing the concatenated string.
516 */
517xmlChar *
518xmlStrcat(xmlChar *cur, const xmlChar *add) {
519 const xmlChar *p = add;
520
521 if (add == NULL) return(cur);
522 if (cur == NULL)
523 return(xmlStrdup(add));
524
525 while (*p != 0) p++; /* non input consuming */
526 return(xmlStrncat(cur, add, p - add));
527}
528
529/**
530 * xmlStrPrintf:
531 * @buf: the result buffer.
532 * @len: the result buffer length.
533 * @msg: the message with printf formatting.
534 * @...: extra parameters for the message.
535 *
536 * Formats @msg and places result into @buf.
537 *
538 * Returns the number of characters written to @buf or -1 if an error occurs.
539 */
Daniel Veillardffa3c742005-07-21 13:24:09 +0000540int XMLCDECL
William M. Bracka2e844a2004-01-06 11:52:13 +0000541xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
542 va_list args;
543 int ret;
544
545 if((buf == NULL) || (msg == NULL)) {
546 return(-1);
547 }
548
549 va_start(args, msg);
550 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
551 va_end(args);
552 buf[len - 1] = 0; /* be safe ! */
553
554 return(ret);
555}
556
557/**
558 * xmlStrVPrintf:
559 * @buf: the result buffer.
560 * @len: the result buffer length.
561 * @msg: the message with printf formatting.
562 * @ap: extra parameters for the message.
563 *
564 * Formats @msg and places result into @buf.
565 *
566 * Returns the number of characters written to @buf or -1 if an error occurs.
567 */
568int
569xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
570 int ret;
571
572 if((buf == NULL) || (msg == NULL)) {
573 return(-1);
574 }
575
576 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
577 buf[len - 1] = 0; /* be safe ! */
578
579 return(ret);
580}
581
582/************************************************************************
583 * *
584 * Generic UTF8 handling routines *
585 * *
586 * From rfc2044: encoding of the Unicode values on UTF-8: *
587 * *
588 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
589 * 0000 0000-0000 007F 0xxxxxxx *
590 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
591 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
592 * *
593 * I hope we won't use values > 0xFFFF anytime soon ! *
594 * *
595 ************************************************************************/
596
597
598/**
599 * xmlUTF8Size:
600 * @utf: pointer to the UTF8 character
601 *
602 * calculates the internal size of a UTF8 character
603 *
604 * returns the numbers of bytes in the character, -1 on format error
605 */
606int
607xmlUTF8Size(const xmlChar *utf) {
608 xmlChar mask;
609 int len;
610
611 if (utf == NULL)
612 return -1;
613 if (*utf < 0x80)
614 return 1;
615 /* check valid UTF8 character */
616 if (!(*utf & 0x40))
617 return -1;
618 /* determine number of bytes in char */
619 len = 2;
620 for (mask=0x20; mask != 0; mask>>=1) {
621 if (!(*utf & mask))
622 return len;
623 len++;
624 }
625 return -1;
626}
627
628/**
629 * xmlUTF8Charcmp:
630 * @utf1: pointer to first UTF8 char
631 * @utf2: pointer to second UTF8 char
632 *
633 * compares the two UCS4 values
634 *
635 * returns result of the compare as with xmlStrncmp
636 */
637int
638xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
639
640 if (utf1 == NULL ) {
641 if (utf2 == NULL)
642 return 0;
643 return -1;
644 }
645 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
646}
647
648/**
649 * xmlUTF8Strlen:
650 * @utf: a sequence of UTF-8 encoded bytes
651 *
652 * compute the length of an UTF8 string, it doesn't do a full UTF8
653 * checking of the content of the string.
654 *
655 * Returns the number of characters in the string or -1 in case of error
656 */
657int
658xmlUTF8Strlen(const xmlChar *utf) {
659 int ret = 0;
660
661 if (utf == NULL)
662 return(-1);
663
664 while (*utf != 0) {
665 if (utf[0] & 0x80) {
666 if ((utf[1] & 0xc0) != 0x80)
667 return(-1);
668 if ((utf[0] & 0xe0) == 0xe0) {
669 if ((utf[2] & 0xc0) != 0x80)
670 return(-1);
671 if ((utf[0] & 0xf0) == 0xf0) {
672 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
673 return(-1);
674 utf += 4;
675 } else {
676 utf += 3;
677 }
678 } else {
679 utf += 2;
680 }
681 } else {
682 utf++;
683 }
684 ret++;
685 }
686 return(ret);
687}
688
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Sequences of 1 to 4 bytes
 * are accepted; a first byte of the form 10xxxxxx (a continuation
 * byte) or announcing more than 4 bytes is an error, as is a missing
 * or malformed trailing byte. Overlong encodings are not rejected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a continuation byte cannot start a character */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 4 bytes is the maximum: the lead must be 11110xxx */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
758
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;

    if (utf == NULL)
        return(0);
    /*
     * Each character is made of 1, 2, 3 or 4 bytes. The valid layouts
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cur = utf;
    while (*cur != 0) {                 /* string is 0-terminated */
        unsigned char lead = *cur;
        int trailing;
        int k;

        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trailing = 3;
        else                            /* unknown encoding */
            return 0;

        /*
         * Check the trailing bytes in order and stop at the first bad
         * one, so nothing is read beyond the terminating 0.
         */
        for (k = 1; k <= trailing; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return 0;
        }
        cur += trailing + 1;
    }
    return(1);
}
811
812/**
813 * xmlUTF8Strsize:
814 * @utf: a sequence of UTF-8 encoded bytes
815 * @len: the number of characters in the array
816 *
817 * storage size of an UTF8 string
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000818 * the behaviour is not garanteed if the input string is not UTF-8
William M. Bracka2e844a2004-01-06 11:52:13 +0000819 *
820 * Returns the storage size of
821 * the first 'len' characters of ARRAY
William M. Bracka2e844a2004-01-06 11:52:13 +0000822 */
823
824int
825xmlUTF8Strsize(const xmlChar *utf, int len) {
826 const xmlChar *ptr=utf;
827 xmlChar ch;
828
Daniel Veillard36e5cd52004-11-02 14:52:23 +0000829 if (utf == NULL)
830 return(0);
831
William M. Bracka2e844a2004-01-06 11:52:13 +0000832 if (len <= 0)
833 return(0);
834
835 while ( len-- > 0) {
836 if ( !*ptr )
837 break;
838 if ( (ch = *ptr++) & 0x80)
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000839 while ((ch<<=1) & 0x80 ) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000840 ptr++;
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000841 if (*ptr == 0) break;
842 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000843 }
844 return (ptr - utf);
845}
846
847
848/**
849 * xmlUTF8Strndup:
850 * @utf: the input UTF8 *
851 * @len: the len of @utf (in chars)
852 *
853 * a strndup for array of UTF8's
854 *
855 * Returns a new UTF8 * or NULL
856 */
857xmlChar *
858xmlUTF8Strndup(const xmlChar *utf, int len) {
859 xmlChar *ret;
860 int i;
861
862 if ((utf == NULL) || (len < 0)) return(NULL);
863 i = xmlUTF8Strsize(utf, len);
864 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
865 if (ret == NULL) {
866 xmlGenericError(xmlGenericErrorContext,
867 "malloc of %ld byte failed\n",
868 (len + 1) * (long)sizeof(xmlChar));
869 return(NULL);
870 }
871 memcpy(ret, utf, i * sizeof(xmlChar));
872 ret[i] = 0;
873 return(ret);
874}
875
876/**
877 * xmlUTF8Strpos:
878 * @utf: the input UTF8 *
879 * @pos: the position of the desired UTF8 char (in chars)
880 *
881 * a function to provide the equivalent of fetching a
882 * character from a string array
883 *
884 * Returns a pointer to the UTF8 character or NULL
885 */
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000886const xmlChar *
William M. Bracka2e844a2004-01-06 11:52:13 +0000887xmlUTF8Strpos(const xmlChar *utf, int pos) {
888 xmlChar ch;
889
890 if (utf == NULL) return(NULL);
William M. Brack230c5502004-12-20 16:18:49 +0000891 if (pos < 0)
William M. Bracka2e844a2004-01-06 11:52:13 +0000892 return(NULL);
893 while (pos--) {
894 if ((ch=*utf++) == 0) return(NULL);
895 if ( ch & 0x80 ) {
896 /* if not simple ascii, verify proper format */
897 if ( (ch & 0xc0) != 0xc0 )
898 return(NULL);
899 /* then skip over remaining bytes for this char */
900 while ( (ch <<= 1) & 0x80 )
901 if ( (*utf++ & 0xc0) != 0x80 )
902 return(NULL);
903 }
904 }
905 return((xmlChar *)utf);
906}
907
908/**
909 * xmlUTF8Strloc:
910 * @utf: the input UTF8 *
911 * @utfchar: the UTF8 character to be found
912 *
913 * a function to provide the relative location of a UTF8 char
914 *
915 * Returns the relative character position of the desired char
916 * or -1 if not found
917 */
918int
919xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
920 int i, size;
921 xmlChar ch;
922
923 if (utf==NULL || utfchar==NULL) return -1;
924 size = xmlUTF8Strsize(utfchar, 1);
925 for(i=0; (ch=*utf) != 0; i++) {
926 if (xmlStrncmp(utf, utfchar, size)==0)
927 return(i);
928 utf++;
929 if ( ch & 0x80 ) {
930 /* if not simple ascii, verify proper format */
931 if ( (ch & 0xc0) != 0xc0 )
932 return(-1);
933 /* then skip over remaining bytes for this char */
934 while ( (ch <<= 1) & 0x80 )
935 if ( (*utf++ & 0xc0) != 0x80 )
936 return(-1);
937 }
938 }
939
940 return(-1);
941}
942/**
943 * xmlUTF8Strsub:
944 * @utf: a sequence of UTF-8 encoded bytes
945 * @start: relative pos of first char
946 * @len: total number to copy
947 *
948 * Create a substring from a given UTF-8 string
949 * Note: positions are given in units of UTF-8 chars
950 *
951 * Returns a pointer to a newly created string
952 * or NULL if any problem
953 */
954
955xmlChar *
956xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
957 int i;
958 xmlChar ch;
959
960 if (utf == NULL) return(NULL);
961 if (start < 0) return(NULL);
962 if (len < 0) return(NULL);
963
964 /*
965 * Skip over any leading chars
966 */
967 for (i = 0;i < start;i++) {
968 if ((ch=*utf++) == 0) return(NULL);
969 if ( ch & 0x80 ) {
970 /* if not simple ascii, verify proper format */
971 if ( (ch & 0xc0) != 0xc0 )
972 return(NULL);
973 /* then skip over remaining bytes for this char */
974 while ( (ch <<= 1) & 0x80 )
975 if ( (*utf++ & 0xc0) != 0x80 )
976 return(NULL);
977 }
978 }
979
980 return(xmlUTF8Strndup(utf, len));
981}
Daniel Veillard5d4644e2005-04-01 13:11:58 +0000982
983#define bottom_xmlstring
984#include "elfgcchack.h"