blob: 3b1b2dfc19413f0be8d3db44f9366f2a0b612fa6 [file] [log] [blame]
William M. Bracka2e844a2004-01-06 11:52:13 +00001/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 * *
27 * Commodity functions to handle xmlChars *
28 * *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
43
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
88
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
95 for (i = 0;i < len;i++)
96 ret[i] = (xmlChar) cur[i];
97 ret[len] = 0;
98 return(ret);
99}
100
101/**
102 * xmlCharStrdup:
103 * @cur: the input char *
104 *
105 * a strdup for char's to xmlChar's
106 *
107 * Returns a new xmlChar * or NULL
108 */
109
110xmlChar *
111xmlCharStrdup(const char *cur) {
112 const char *p = cur;
113
114 if (cur == NULL) return(NULL);
115 while (*p != '\0') p++; /* non input consuming */
116 return(xmlCharStrndup(cur, p - cur));
117}
118
119/**
120 * xmlStrcmp:
121 * @str1: the first xmlChar *
122 * @str2: the second xmlChar *
123 *
124 * a strcmp for xmlChar's
125 *
126 * Returns the integer result of the comparison
127 */
128
129int
130xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
131 register int tmp;
132
133 if (str1 == str2) return(0);
134 if (str1 == NULL) return(-1);
135 if (str2 == NULL) return(1);
136 do {
137 tmp = *str1++ - *str2;
138 if (tmp != 0) return(tmp);
139 } while (*str2++ != 0);
140 return 0;
141}
142
143/**
144 * xmlStrEqual:
145 * @str1: the first xmlChar *
146 * @str2: the second xmlChar *
147 *
148 * Check if both string are equal of have same content
149 * Should be a bit more readable and faster than xmlStrEqual()
150 *
151 * Returns 1 if they are equal, 0 if they are different
152 */
153
154int
155xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
156 if (str1 == str2) return(1);
157 if (str1 == NULL) return(0);
158 if (str2 == NULL) return(0);
159 do {
160 if (*str1++ != *str2) return(0);
161 } while (*str2++);
162 return(1);
163}
164
165/**
166 * xmlStrQEqual:
167 * @pref: the prefix of the QName
168 * @name: the localname of the QName
169 * @str: the second xmlChar *
170 *
171 * Check if a QName is Equal to a given string
172 *
173 * Returns 1 if they are equal, 0 if they are different
174 */
175
176int
177xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
178 if (pref == NULL) return(xmlStrEqual(name, str));
179 if (name == NULL) return(0);
180 if (str == NULL) return(0);
181
182 do {
183 if (*pref++ != *str) return(0);
184 } while ((*str++) && (*pref));
185 if (*str++ != ':') return(0);
186 do {
187 if (*name++ != *str) return(0);
188 } while (*str++);
189 return(1);
190}
191
192/**
193 * xmlStrncmp:
194 * @str1: the first xmlChar *
195 * @str2: the second xmlChar *
196 * @len: the max comparison length
197 *
198 * a strncmp for xmlChar's
199 *
200 * Returns the integer result of the comparison
201 */
202
203int
204xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
205 register int tmp;
206
207 if (len <= 0) return(0);
208 if (str1 == str2) return(0);
209 if (str1 == NULL) return(-1);
210 if (str2 == NULL) return(1);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000211#ifdef __GNUC__
212 tmp = strncmp(str1, str2, len);
213 return tmp;
214#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000215 do {
216 tmp = *str1++ - *str2;
217 if (tmp != 0 || --len == 0) return(tmp);
218 } while (*str2++ != 0);
219 return 0;
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000220#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000221}
222
223static const xmlChar casemap[256] = {
224 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
225 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
226 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
227 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
228 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
229 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
230 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
231 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
232 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
233 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
234 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
235 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
236 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
237 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
238 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
239 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
240 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
241 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
242 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
243 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
244 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
245 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
246 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
247 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
248 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
249 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
250 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
251 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
252 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
253 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
254 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
255 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
256};
257
258/**
259 * xmlStrcasecmp:
260 * @str1: the first xmlChar *
261 * @str2: the second xmlChar *
262 *
263 * a strcasecmp for xmlChar's
264 *
265 * Returns the integer result of the comparison
266 */
267
268int
269xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
270 register int tmp;
271
272 if (str1 == str2) return(0);
273 if (str1 == NULL) return(-1);
274 if (str2 == NULL) return(1);
275 do {
276 tmp = casemap[*str1++] - casemap[*str2];
277 if (tmp != 0) return(tmp);
278 } while (*str2++ != 0);
279 return 0;
280}
281
282/**
283 * xmlStrncasecmp:
284 * @str1: the first xmlChar *
285 * @str2: the second xmlChar *
286 * @len: the max comparison length
287 *
288 * a strncasecmp for xmlChar's
289 *
290 * Returns the integer result of the comparison
291 */
292
293int
294xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
295 register int tmp;
296
297 if (len <= 0) return(0);
298 if (str1 == str2) return(0);
299 if (str1 == NULL) return(-1);
300 if (str2 == NULL) return(1);
301 do {
302 tmp = casemap[*str1++] - casemap[*str2];
303 if (tmp != 0 || --len == 0) return(tmp);
304 } while (*str2++ != 0);
305 return 0;
306}
307
308/**
309 * xmlStrchr:
310 * @str: the xmlChar * array
311 * @val: the xmlChar to search
312 *
313 * a strchr for xmlChar's
314 *
315 * Returns the xmlChar * for the first occurrence or NULL.
316 */
317
318const xmlChar *
319xmlStrchr(const xmlChar *str, xmlChar val) {
320 if (str == NULL) return(NULL);
321 while (*str != 0) { /* non input consuming */
322 if (*str == val) return((xmlChar *) str);
323 str++;
324 }
325 return(NULL);
326}
327
328/**
329 * xmlStrstr:
330 * @str: the xmlChar * array (haystack)
331 * @val: the xmlChar to search (needle)
332 *
333 * a strstr for xmlChar's
334 *
335 * Returns the xmlChar * for the first occurrence or NULL.
336 */
337
338const xmlChar *
339xmlStrstr(const xmlChar *str, const xmlChar *val) {
340 int n;
341
342 if (str == NULL) return(NULL);
343 if (val == NULL) return(NULL);
344 n = xmlStrlen(val);
345
346 if (n == 0) return(str);
347 while (*str != 0) { /* non input consuming */
348 if (*str == *val) {
349 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
350 }
351 str++;
352 }
353 return(NULL);
354}
355
356/**
357 * xmlStrcasestr:
358 * @str: the xmlChar * array (haystack)
359 * @val: the xmlChar to search (needle)
360 *
361 * a case-ignoring strstr for xmlChar's
362 *
363 * Returns the xmlChar * for the first occurrence or NULL.
364 */
365
366const xmlChar *
367xmlStrcasestr(const xmlChar *str, xmlChar *val) {
368 int n;
369
370 if (str == NULL) return(NULL);
371 if (val == NULL) return(NULL);
372 n = xmlStrlen(val);
373
374 if (n == 0) return(str);
375 while (*str != 0) { /* non input consuming */
376 if (casemap[*str] == casemap[*val])
377 if (!xmlStrncasecmp(str, val, n)) return(str);
378 str++;
379 }
380 return(NULL);
381}
382
383/**
384 * xmlStrsub:
385 * @str: the xmlChar * array (haystack)
386 * @start: the index of the first char (zero based)
387 * @len: the length of the substring
388 *
389 * Extract a substring of a given string
390 *
391 * Returns the xmlChar * for the first occurrence or NULL.
392 */
393
394xmlChar *
395xmlStrsub(const xmlChar *str, int start, int len) {
396 int i;
397
398 if (str == NULL) return(NULL);
399 if (start < 0) return(NULL);
400 if (len < 0) return(NULL);
401
402 for (i = 0;i < start;i++) {
403 if (*str == 0) return(NULL);
404 str++;
405 }
406 if (*str == 0) return(NULL);
407 return(xmlStrndup(str, len));
408}
409
410/**
411 * xmlStrlen:
412 * @str: the xmlChar * array
413 *
414 * length of a xmlChar's string
415 *
416 * Returns the number of xmlChar contained in the ARRAY.
417 */
418
419int
420xmlStrlen(const xmlChar *str) {
421 int len = 0;
422
423 if (str == NULL) return(0);
424 while (*str != 0) { /* non input consuming */
425 str++;
426 len++;
427 }
428 return(len);
429}
430
431/**
432 * xmlStrncat:
433 * @cur: the original xmlChar * array
434 * @add: the xmlChar * array added
435 * @len: the length of @add
436 *
437 * a strncat for array of xmlChar's, it will extend @cur with the len
438 * first bytes of @add.
439 *
440 * Returns a new xmlChar *, the original @cur is reallocated if needed
441 * and should not be freed
442 */
443
444xmlChar *
445xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
446 int size;
447 xmlChar *ret;
448
449 if ((add == NULL) || (len == 0))
450 return(cur);
451 if (cur == NULL)
452 return(xmlStrndup(add, len));
453
454 size = xmlStrlen(cur);
455 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
456 if (ret == NULL) {
457 xmlErrMemory(NULL, NULL);
458 return(cur);
459 }
460 memcpy(&ret[size], add, len * sizeof(xmlChar));
461 ret[size + len] = 0;
462 return(ret);
463}
464
465/**
466 * xmlStrncatNew:
467 * @str1: first xmlChar string
468 * @str2: second xmlChar string
469 * @len: the len of @str2
470 *
471 * same as xmlStrncat, but creates a new string. The original
472 * two strings are not freed.
473 *
474 * Returns a new xmlChar * or NULL
475 */
476xmlChar *
477xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
478 int size;
479 xmlChar *ret;
480
481 if ((str2 == NULL) || (len == 0))
482 return(xmlStrdup(str1));
483 if (str1 == NULL)
484 return(xmlStrndup(str2, len));
485
486 size = xmlStrlen(str1);
487 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
488 if (ret == NULL) {
489 xmlErrMemory(NULL, NULL);
490 return(xmlStrndup(str1, size));
491 }
492 memcpy(ret, str1, size * sizeof(xmlChar));
493 memcpy(&ret[size], str2, len * sizeof(xmlChar));
494 ret[size + len] = 0;
495 return(ret);
496}
497
498/**
499 * xmlStrcat:
500 * @cur: the original xmlChar * array
501 * @add: the xmlChar * array added
502 *
503 * a strcat for array of xmlChar's. Since they are supposed to be
504 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
505 * a termination mark of '0'.
506 *
507 * Returns a new xmlChar * containing the concatenated string.
508 */
509xmlChar *
510xmlStrcat(xmlChar *cur, const xmlChar *add) {
511 const xmlChar *p = add;
512
513 if (add == NULL) return(cur);
514 if (cur == NULL)
515 return(xmlStrdup(add));
516
517 while (*p != 0) p++; /* non input consuming */
518 return(xmlStrncat(cur, add, p - add));
519}
520
521/**
522 * xmlStrPrintf:
523 * @buf: the result buffer.
524 * @len: the result buffer length.
525 * @msg: the message with printf formatting.
526 * @...: extra parameters for the message.
527 *
528 * Formats @msg and places result into @buf.
529 *
530 * Returns the number of characters written to @buf or -1 if an error occurs.
531 */
532int
533xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
534 va_list args;
535 int ret;
536
537 if((buf == NULL) || (msg == NULL)) {
538 return(-1);
539 }
540
541 va_start(args, msg);
542 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
543 va_end(args);
544 buf[len - 1] = 0; /* be safe ! */
545
546 return(ret);
547}
548
549/**
550 * xmlStrVPrintf:
551 * @buf: the result buffer.
552 * @len: the result buffer length.
553 * @msg: the message with printf formatting.
554 * @ap: extra parameters for the message.
555 *
556 * Formats @msg and places result into @buf.
557 *
558 * Returns the number of characters written to @buf or -1 if an error occurs.
559 */
560int
561xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
562 int ret;
563
564 if((buf == NULL) || (msg == NULL)) {
565 return(-1);
566 }
567
568 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
569 buf[len - 1] = 0; /* be safe ! */
570
571 return(ret);
572}
573
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 * (Several routines below nevertheless accept the 4-byte form.)        *
 *                                                                      *
 ************************************************************************/
588
589
590/**
591 * xmlUTF8Size:
592 * @utf: pointer to the UTF8 character
593 *
594 * calculates the internal size of a UTF8 character
595 *
596 * returns the numbers of bytes in the character, -1 on format error
597 */
598int
599xmlUTF8Size(const xmlChar *utf) {
600 xmlChar mask;
601 int len;
602
603 if (utf == NULL)
604 return -1;
605 if (*utf < 0x80)
606 return 1;
607 /* check valid UTF8 character */
608 if (!(*utf & 0x40))
609 return -1;
610 /* determine number of bytes in char */
611 len = 2;
612 for (mask=0x20; mask != 0; mask>>=1) {
613 if (!(*utf & mask))
614 return len;
615 len++;
616 }
617 return -1;
618}
619
620/**
621 * xmlUTF8Charcmp:
622 * @utf1: pointer to first UTF8 char
623 * @utf2: pointer to second UTF8 char
624 *
625 * compares the two UCS4 values
626 *
627 * returns result of the compare as with xmlStrncmp
628 */
629int
630xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
631
632 if (utf1 == NULL ) {
633 if (utf2 == NULL)
634 return 0;
635 return -1;
636 }
637 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
638}
639
640/**
641 * xmlUTF8Strlen:
642 * @utf: a sequence of UTF-8 encoded bytes
643 *
644 * compute the length of an UTF8 string, it doesn't do a full UTF8
645 * checking of the content of the string.
646 *
647 * Returns the number of characters in the string or -1 in case of error
648 */
649int
650xmlUTF8Strlen(const xmlChar *utf) {
651 int ret = 0;
652
653 if (utf == NULL)
654 return(-1);
655
656 while (*utf != 0) {
657 if (utf[0] & 0x80) {
658 if ((utf[1] & 0xc0) != 0x80)
659 return(-1);
660 if ((utf[0] & 0xe0) == 0xe0) {
661 if ((utf[2] & 0xc0) != 0x80)
662 return(-1);
663 if ((utf[0] & 0xf0) == 0xf0) {
664 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
665 return(-1);
666 utf += 4;
667 } else {
668 utf += 3;
669 }
670 } else {
671 utf += 2;
672 }
673 } else {
674 utf++;
675 }
676 ret++;
677 }
678 return(ret);
679}
680
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: the number of bytes available at @utf;
 *        out: the number of bytes consumed (0 on error)
 *
 * Reads one UTF8 character from @utf, looking at no more than *@len
 * bytes. Sequences of 1 to 4 bytes are decoded; the decoded value is
 * not checked for overlong encodings or range.
 *
 * Returns the character value, or -1 in case of error (NULL argument,
 *         *@len too small, invalid lead or continuation byte). On error
 *         *@len is set to 0 unless @len itself is NULL.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    /* the error path stores into *len, so it cannot be used without it */
    if (len == NULL)
        return(-1);
    if (utf == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a lead byte is 11xxxxxx; 10xxxxxx is a continuation byte */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 4 bytes is the maximum: lead must be 11110xxx */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    *len = 0;
    return(-1);
}
747
/**
 * xmlCheckUTF8:
 * @utf:  Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: 1 if @utf is valid, 0 if it is not or if @utf is NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    ix = 0;
    while ((c = utf[ix]) != 0) {        /* string is 0-terminated */
        if ((c & 0x80) == 0) {
            /* 1-byte code */
            ix++;
            continue;
        }
        /*
         * The first byte starts with '1': it must start with 1 1 and
         * the second byte must start with 1 0, otherwise it's an error.
         */
        if (((c & 0xc0) != 0xc0) || ((utf[ix + 1] & 0xc0) != 0x80))
            return(0);
        if ((c & 0xe0) != 0xe0) {
            /* 2-byte code */
            ix += 2;
            continue;
        }
        /*
         * The first three bits are set, so the 3rd byte *must* start
         * with 1 0.
         */
        if ((utf[ix + 2] & 0xc0) != 0x80)
            return(0);
        if ((c & 0xf0) != 0xf0) {
            /* 3-byte code */
            ix += 3;
            continue;
        }
        /*
         * The first four bits are set, so the fifth bit must not be
         * set, and the 4th byte *must* start with 1 0.
         */
        if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
            return(0);
        /* 4-byte code */
        ix += 4;
    }
    return(1);
}
814
815/**
816 * xmlUTF8Strsize:
817 * @utf: a sequence of UTF-8 encoded bytes
818 * @len: the number of characters in the array
819 *
820 * storage size of an UTF8 string
821 *
822 * Returns the storage size of
823 * the first 'len' characters of ARRAY
824 *
825 */
826
827int
828xmlUTF8Strsize(const xmlChar *utf, int len) {
829 const xmlChar *ptr=utf;
830 xmlChar ch;
831
832 if (len <= 0)
833 return(0);
834
835 while ( len-- > 0) {
836 if ( !*ptr )
837 break;
838 if ( (ch = *ptr++) & 0x80)
839 while ( (ch<<=1) & 0x80 )
840 ptr++;
841 }
842 return (ptr - utf);
843}
844
845
846/**
847 * xmlUTF8Strndup:
848 * @utf: the input UTF8 *
849 * @len: the len of @utf (in chars)
850 *
851 * a strndup for array of UTF8's
852 *
853 * Returns a new UTF8 * or NULL
854 */
855xmlChar *
856xmlUTF8Strndup(const xmlChar *utf, int len) {
857 xmlChar *ret;
858 int i;
859
860 if ((utf == NULL) || (len < 0)) return(NULL);
861 i = xmlUTF8Strsize(utf, len);
862 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
863 if (ret == NULL) {
864 xmlGenericError(xmlGenericErrorContext,
865 "malloc of %ld byte failed\n",
866 (len + 1) * (long)sizeof(xmlChar));
867 return(NULL);
868 }
869 memcpy(ret, utf, i * sizeof(xmlChar));
870 ret[i] = 0;
871 return(ret);
872}
873
874/**
875 * xmlUTF8Strpos:
876 * @utf: the input UTF8 *
877 * @pos: the position of the desired UTF8 char (in chars)
878 *
879 * a function to provide the equivalent of fetching a
880 * character from a string array
881 *
882 * Returns a pointer to the UTF8 character or NULL
883 */
884xmlChar *
885xmlUTF8Strpos(const xmlChar *utf, int pos) {
886 xmlChar ch;
887
888 if (utf == NULL) return(NULL);
889 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
890 return(NULL);
891 while (pos--) {
892 if ((ch=*utf++) == 0) return(NULL);
893 if ( ch & 0x80 ) {
894 /* if not simple ascii, verify proper format */
895 if ( (ch & 0xc0) != 0xc0 )
896 return(NULL);
897 /* then skip over remaining bytes for this char */
898 while ( (ch <<= 1) & 0x80 )
899 if ( (*utf++ & 0xc0) != 0x80 )
900 return(NULL);
901 }
902 }
903 return((xmlChar *)utf);
904}
905
906/**
907 * xmlUTF8Strloc:
908 * @utf: the input UTF8 *
909 * @utfchar: the UTF8 character to be found
910 *
911 * a function to provide the relative location of a UTF8 char
912 *
913 * Returns the relative character position of the desired char
914 * or -1 if not found
915 */
916int
917xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
918 int i, size;
919 xmlChar ch;
920
921 if (utf==NULL || utfchar==NULL) return -1;
922 size = xmlUTF8Strsize(utfchar, 1);
923 for(i=0; (ch=*utf) != 0; i++) {
924 if (xmlStrncmp(utf, utfchar, size)==0)
925 return(i);
926 utf++;
927 if ( ch & 0x80 ) {
928 /* if not simple ascii, verify proper format */
929 if ( (ch & 0xc0) != 0xc0 )
930 return(-1);
931 /* then skip over remaining bytes for this char */
932 while ( (ch <<= 1) & 0x80 )
933 if ( (*utf++ & 0xc0) != 0x80 )
934 return(-1);
935 }
936 }
937
938 return(-1);
939}
940/**
941 * xmlUTF8Strsub:
942 * @utf: a sequence of UTF-8 encoded bytes
943 * @start: relative pos of first char
944 * @len: total number to copy
945 *
946 * Create a substring from a given UTF-8 string
947 * Note: positions are given in units of UTF-8 chars
948 *
949 * Returns a pointer to a newly created string
950 * or NULL if any problem
951 */
952
953xmlChar *
954xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
955 int i;
956 xmlChar ch;
957
958 if (utf == NULL) return(NULL);
959 if (start < 0) return(NULL);
960 if (len < 0) return(NULL);
961
962 /*
963 * Skip over any leading chars
964 */
965 for (i = 0;i < start;i++) {
966 if ((ch=*utf++) == 0) return(NULL);
967 if ( ch & 0x80 ) {
968 /* if not simple ascii, verify proper format */
969 if ( (ch & 0xc0) != 0xc0 )
970 return(NULL);
971 /* then skip over remaining bytes for this char */
972 while ( (ch <<= 1) & 0x80 )
973 if ( (*utf++ & 0xc0) != 0x80 )
974 return(NULL);
975 }
976 }
977
978 return(xmlUTF8Strndup(utf, len));
979}