blob: 5c6405342d1595ec3084056266dba08e95c3ca97 [file] [log] [blame]
William M. Bracka2e844a2004-01-06 11:52:13 +00001/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 * *
27 * Commodity functions to handle xmlChars *
28 * *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
43
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
88
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
95 for (i = 0;i < len;i++)
96 ret[i] = (xmlChar) cur[i];
97 ret[len] = 0;
98 return(ret);
99}
100
101/**
102 * xmlCharStrdup:
103 * @cur: the input char *
104 *
105 * a strdup for char's to xmlChar's
106 *
107 * Returns a new xmlChar * or NULL
108 */
109
110xmlChar *
111xmlCharStrdup(const char *cur) {
112 const char *p = cur;
113
114 if (cur == NULL) return(NULL);
115 while (*p != '\0') p++; /* non input consuming */
116 return(xmlCharStrndup(cur, p - cur));
117}
118
119/**
120 * xmlStrcmp:
121 * @str1: the first xmlChar *
122 * @str2: the second xmlChar *
123 *
124 * a strcmp for xmlChar's
125 *
126 * Returns the integer result of the comparison
127 */
128
129int
130xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
131 register int tmp;
132
133 if (str1 == str2) return(0);
134 if (str1 == NULL) return(-1);
135 if (str2 == NULL) return(1);
136 do {
137 tmp = *str1++ - *str2;
138 if (tmp != 0) return(tmp);
139 } while (*str2++ != 0);
140 return 0;
141}
142
143/**
144 * xmlStrEqual:
145 * @str1: the first xmlChar *
146 * @str2: the second xmlChar *
147 *
148 * Check if both string are equal of have same content
149 * Should be a bit more readable and faster than xmlStrEqual()
150 *
151 * Returns 1 if they are equal, 0 if they are different
152 */
153
154int
155xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
156 if (str1 == str2) return(1);
157 if (str1 == NULL) return(0);
158 if (str2 == NULL) return(0);
159 do {
160 if (*str1++ != *str2) return(0);
161 } while (*str2++);
162 return(1);
163}
164
165/**
166 * xmlStrQEqual:
167 * @pref: the prefix of the QName
168 * @name: the localname of the QName
169 * @str: the second xmlChar *
170 *
171 * Check if a QName is Equal to a given string
172 *
173 * Returns 1 if they are equal, 0 if they are different
174 */
175
176int
177xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
178 if (pref == NULL) return(xmlStrEqual(name, str));
179 if (name == NULL) return(0);
180 if (str == NULL) return(0);
181
182 do {
183 if (*pref++ != *str) return(0);
184 } while ((*str++) && (*pref));
185 if (*str++ != ':') return(0);
186 do {
187 if (*name++ != *str) return(0);
188 } while (*str++);
189 return(1);
190}
191
192/**
193 * xmlStrncmp:
194 * @str1: the first xmlChar *
195 * @str2: the second xmlChar *
196 * @len: the max comparison length
197 *
198 * a strncmp for xmlChar's
199 *
200 * Returns the integer result of the comparison
201 */
202
203int
204xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
205 register int tmp;
206
207 if (len <= 0) return(0);
208 if (str1 == str2) return(0);
209 if (str1 == NULL) return(-1);
210 if (str2 == NULL) return(1);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000211#ifdef __GNUC__
212 tmp = strncmp(str1, str2, len);
213 return tmp;
214#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000215 do {
216 tmp = *str1++ - *str2;
217 if (tmp != 0 || --len == 0) return(tmp);
218 } while (*str2++ != 0);
219 return 0;
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000220#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000221}
222
223static const xmlChar casemap[256] = {
224 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
225 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
226 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
227 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
228 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
229 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
230 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
231 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
232 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
233 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
234 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
235 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
236 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
237 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
238 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
239 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
240 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
241 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
242 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
243 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
244 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
245 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
246 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
247 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
248 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
249 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
250 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
251 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
252 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
253 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
254 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
255 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
256};
257
258/**
259 * xmlStrcasecmp:
260 * @str1: the first xmlChar *
261 * @str2: the second xmlChar *
262 *
263 * a strcasecmp for xmlChar's
264 *
265 * Returns the integer result of the comparison
266 */
267
268int
269xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
270 register int tmp;
271
272 if (str1 == str2) return(0);
273 if (str1 == NULL) return(-1);
274 if (str2 == NULL) return(1);
275 do {
276 tmp = casemap[*str1++] - casemap[*str2];
277 if (tmp != 0) return(tmp);
278 } while (*str2++ != 0);
279 return 0;
280}
281
282/**
283 * xmlStrncasecmp:
284 * @str1: the first xmlChar *
285 * @str2: the second xmlChar *
286 * @len: the max comparison length
287 *
288 * a strncasecmp for xmlChar's
289 *
290 * Returns the integer result of the comparison
291 */
292
293int
294xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
295 register int tmp;
296
297 if (len <= 0) return(0);
298 if (str1 == str2) return(0);
299 if (str1 == NULL) return(-1);
300 if (str2 == NULL) return(1);
301 do {
302 tmp = casemap[*str1++] - casemap[*str2];
303 if (tmp != 0 || --len == 0) return(tmp);
304 } while (*str2++ != 0);
305 return 0;
306}
307
308/**
309 * xmlStrchr:
310 * @str: the xmlChar * array
311 * @val: the xmlChar to search
312 *
313 * a strchr for xmlChar's
314 *
315 * Returns the xmlChar * for the first occurrence or NULL.
316 */
317
318const xmlChar *
319xmlStrchr(const xmlChar *str, xmlChar val) {
320 if (str == NULL) return(NULL);
321 while (*str != 0) { /* non input consuming */
322 if (*str == val) return((xmlChar *) str);
323 str++;
324 }
325 return(NULL);
326}
327
328/**
329 * xmlStrstr:
330 * @str: the xmlChar * array (haystack)
331 * @val: the xmlChar to search (needle)
332 *
333 * a strstr for xmlChar's
334 *
335 * Returns the xmlChar * for the first occurrence or NULL.
336 */
337
338const xmlChar *
339xmlStrstr(const xmlChar *str, const xmlChar *val) {
340 int n;
341
342 if (str == NULL) return(NULL);
343 if (val == NULL) return(NULL);
344 n = xmlStrlen(val);
345
346 if (n == 0) return(str);
347 while (*str != 0) { /* non input consuming */
348 if (*str == *val) {
349 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
350 }
351 str++;
352 }
353 return(NULL);
354}
355
356/**
357 * xmlStrcasestr:
358 * @str: the xmlChar * array (haystack)
359 * @val: the xmlChar to search (needle)
360 *
361 * a case-ignoring strstr for xmlChar's
362 *
363 * Returns the xmlChar * for the first occurrence or NULL.
364 */
365
366const xmlChar *
367xmlStrcasestr(const xmlChar *str, xmlChar *val) {
368 int n;
369
370 if (str == NULL) return(NULL);
371 if (val == NULL) return(NULL);
372 n = xmlStrlen(val);
373
374 if (n == 0) return(str);
375 while (*str != 0) { /* non input consuming */
376 if (casemap[*str] == casemap[*val])
377 if (!xmlStrncasecmp(str, val, n)) return(str);
378 str++;
379 }
380 return(NULL);
381}
382
383/**
384 * xmlStrsub:
385 * @str: the xmlChar * array (haystack)
386 * @start: the index of the first char (zero based)
387 * @len: the length of the substring
388 *
389 * Extract a substring of a given string
390 *
391 * Returns the xmlChar * for the first occurrence or NULL.
392 */
393
394xmlChar *
395xmlStrsub(const xmlChar *str, int start, int len) {
396 int i;
397
398 if (str == NULL) return(NULL);
399 if (start < 0) return(NULL);
400 if (len < 0) return(NULL);
401
402 for (i = 0;i < start;i++) {
403 if (*str == 0) return(NULL);
404 str++;
405 }
406 if (*str == 0) return(NULL);
407 return(xmlStrndup(str, len));
408}
409
410/**
411 * xmlStrlen:
412 * @str: the xmlChar * array
413 *
414 * length of a xmlChar's string
415 *
416 * Returns the number of xmlChar contained in the ARRAY.
417 */
418
419int
420xmlStrlen(const xmlChar *str) {
421 int len = 0;
422
423 if (str == NULL) return(0);
424 while (*str != 0) { /* non input consuming */
425 str++;
426 len++;
427 }
428 return(len);
429}
430
431/**
432 * xmlStrncat:
433 * @cur: the original xmlChar * array
434 * @add: the xmlChar * array added
435 * @len: the length of @add
436 *
437 * a strncat for array of xmlChar's, it will extend @cur with the len
438 * first bytes of @add.
439 *
440 * Returns a new xmlChar *, the original @cur is reallocated if needed
441 * and should not be freed
442 */
443
444xmlChar *
445xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
446 int size;
447 xmlChar *ret;
448
449 if ((add == NULL) || (len == 0))
450 return(cur);
451 if (cur == NULL)
452 return(xmlStrndup(add, len));
453
454 size = xmlStrlen(cur);
455 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
456 if (ret == NULL) {
457 xmlErrMemory(NULL, NULL);
458 return(cur);
459 }
460 memcpy(&ret[size], add, len * sizeof(xmlChar));
461 ret[size + len] = 0;
462 return(ret);
463}
464
465/**
466 * xmlStrncatNew:
467 * @str1: first xmlChar string
468 * @str2: second xmlChar string
469 * @len: the len of @str2
470 *
471 * same as xmlStrncat, but creates a new string. The original
472 * two strings are not freed.
473 *
474 * Returns a new xmlChar * or NULL
475 */
476xmlChar *
477xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
478 int size;
479 xmlChar *ret;
480
481 if ((str2 == NULL) || (len == 0))
482 return(xmlStrdup(str1));
483 if (str1 == NULL)
484 return(xmlStrndup(str2, len));
485
486 size = xmlStrlen(str1);
487 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
488 if (ret == NULL) {
489 xmlErrMemory(NULL, NULL);
490 return(xmlStrndup(str1, size));
491 }
492 memcpy(ret, str1, size * sizeof(xmlChar));
493 memcpy(&ret[size], str2, len * sizeof(xmlChar));
494 ret[size + len] = 0;
495 return(ret);
496}
497
498/**
499 * xmlStrcat:
500 * @cur: the original xmlChar * array
501 * @add: the xmlChar * array added
502 *
503 * a strcat for array of xmlChar's. Since they are supposed to be
504 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
505 * a termination mark of '0'.
506 *
507 * Returns a new xmlChar * containing the concatenated string.
508 */
509xmlChar *
510xmlStrcat(xmlChar *cur, const xmlChar *add) {
511 const xmlChar *p = add;
512
513 if (add == NULL) return(cur);
514 if (cur == NULL)
515 return(xmlStrdup(add));
516
517 while (*p != 0) p++; /* non input consuming */
518 return(xmlStrncat(cur, add, p - add));
519}
520
521/**
522 * xmlStrPrintf:
523 * @buf: the result buffer.
524 * @len: the result buffer length.
525 * @msg: the message with printf formatting.
526 * @...: extra parameters for the message.
527 *
528 * Formats @msg and places result into @buf.
529 *
530 * Returns the number of characters written to @buf or -1 if an error occurs.
531 */
532int
533xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
534 va_list args;
535 int ret;
536
537 if((buf == NULL) || (msg == NULL)) {
538 return(-1);
539 }
540
541 va_start(args, msg);
542 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
543 va_end(args);
544 buf[len - 1] = 0; /* be safe ! */
545
546 return(ret);
547}
548
549/**
550 * xmlStrVPrintf:
551 * @buf: the result buffer.
552 * @len: the result buffer length.
553 * @msg: the message with printf formatting.
554 * @ap: extra parameters for the message.
555 *
556 * Formats @msg and places result into @buf.
557 *
558 * Returns the number of characters written to @buf or -1 if an error occurs.
559 */
560int
561xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
562 int ret;
563
564 if((buf == NULL) || (msg == NULL)) {
565 return(-1);
566 }
567
568 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
569 buf[len - 1] = 0; /* be safe ! */
570
571 return(ret);
572}
573
574/************************************************************************
575 * *
576 * Generic UTF8 handling routines *
577 * *
578 * From rfc2044: encoding of the Unicode values on UTF-8: *
579 * *
580 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
581 * 0000 0000-0000 007F 0xxxxxxx *
582 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
583 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
584 * *
585 * I hope we won't use values > 0xFFFF anytime soon ! *
586 * *
587 ************************************************************************/
588
589
590/**
591 * xmlUTF8Size:
592 * @utf: pointer to the UTF8 character
593 *
594 * calculates the internal size of a UTF8 character
595 *
596 * returns the numbers of bytes in the character, -1 on format error
597 */
598int
599xmlUTF8Size(const xmlChar *utf) {
600 xmlChar mask;
601 int len;
602
603 if (utf == NULL)
604 return -1;
605 if (*utf < 0x80)
606 return 1;
607 /* check valid UTF8 character */
608 if (!(*utf & 0x40))
609 return -1;
610 /* determine number of bytes in char */
611 len = 2;
612 for (mask=0x20; mask != 0; mask>>=1) {
613 if (!(*utf & mask))
614 return len;
615 len++;
616 }
617 return -1;
618}
619
620/**
621 * xmlUTF8Charcmp:
622 * @utf1: pointer to first UTF8 char
623 * @utf2: pointer to second UTF8 char
624 *
625 * compares the two UCS4 values
626 *
627 * returns result of the compare as with xmlStrncmp
628 */
629int
630xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
631
632 if (utf1 == NULL ) {
633 if (utf2 == NULL)
634 return 0;
635 return -1;
636 }
637 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
638}
639
640/**
641 * xmlUTF8Strlen:
642 * @utf: a sequence of UTF-8 encoded bytes
643 *
644 * compute the length of an UTF8 string, it doesn't do a full UTF8
645 * checking of the content of the string.
646 *
647 * Returns the number of characters in the string or -1 in case of error
648 */
649int
650xmlUTF8Strlen(const xmlChar *utf) {
651 int ret = 0;
652
653 if (utf == NULL)
654 return(-1);
655
656 while (*utf != 0) {
657 if (utf[0] & 0x80) {
658 if ((utf[1] & 0xc0) != 0x80)
659 return(-1);
660 if ((utf[0] & 0xe0) == 0xe0) {
661 if ((utf[2] & 0xc0) != 0x80)
662 return(-1);
663 if ((utf[0] & 0xf0) == 0xf0) {
664 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
665 return(-1);
666 utf += 4;
667 } else {
668 utf += 3;
669 }
670 } else {
671 utf += 2;
672 }
673 } else {
674 utf++;
675 }
676 ret++;
677 }
678 return(ret);
679}
680
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the number of bytes available at @utf
 *
 * Read one UTF8 Char from @utf. On input *@len gives the number of
 * bytes that may be read; sequences of 1 to 4 bytes are decoded.
 * Over-long encodings are not rejected.
 *
 * Returns the char value or -1 in case of error, and updates *len with the
 *        number of bytes consumed (0 on error). A NULL @len is an error.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a continuation byte (10xxxxxx) cannot start a character */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 11111xxx lead bytes would need more than 4 bytes */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
              /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
          /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when @len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
747
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid, false otherwise or if @utf
 * is NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if (c & 0x80) {
            /* a continuation byte (10xxxxxx) cannot start a character */
            if ((c & 0xc0) != 0xc0)
                return(0);
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((c & 0xe0) == 0xe0) {
                if ((utf[ix + 2] & 0xc0) != 0x80)
                    return(0);
                if ((c & 0xf0) == 0xf0) {
                    /* 11111xxx lead bytes would need more than 4 bytes */
                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                        return(0);
                    /* 4-byte code */
                    ix += 4;
                } else {
                    /* 3-byte code */
                    ix += 3;
                }
            } else {
                /* 2-byte code */
                ix += 2;
            }
        } else {
            /* 1-byte code */
            ix++;
        }
    }
    return(1);
}
791
792/**
793 * xmlUTF8Strsize:
794 * @utf: a sequence of UTF-8 encoded bytes
795 * @len: the number of characters in the array
796 *
797 * storage size of an UTF8 string
798 *
799 * Returns the storage size of
800 * the first 'len' characters of ARRAY
801 *
802 */
803
804int
805xmlUTF8Strsize(const xmlChar *utf, int len) {
806 const xmlChar *ptr=utf;
807 xmlChar ch;
808
809 if (len <= 0)
810 return(0);
811
812 while ( len-- > 0) {
813 if ( !*ptr )
814 break;
815 if ( (ch = *ptr++) & 0x80)
816 while ( (ch<<=1) & 0x80 )
817 ptr++;
818 }
819 return (ptr - utf);
820}
821
822
823/**
824 * xmlUTF8Strndup:
825 * @utf: the input UTF8 *
826 * @len: the len of @utf (in chars)
827 *
828 * a strndup for array of UTF8's
829 *
830 * Returns a new UTF8 * or NULL
831 */
832xmlChar *
833xmlUTF8Strndup(const xmlChar *utf, int len) {
834 xmlChar *ret;
835 int i;
836
837 if ((utf == NULL) || (len < 0)) return(NULL);
838 i = xmlUTF8Strsize(utf, len);
839 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
840 if (ret == NULL) {
841 xmlGenericError(xmlGenericErrorContext,
842 "malloc of %ld byte failed\n",
843 (len + 1) * (long)sizeof(xmlChar));
844 return(NULL);
845 }
846 memcpy(ret, utf, i * sizeof(xmlChar));
847 ret[i] = 0;
848 return(ret);
849}
850
851/**
852 * xmlUTF8Strpos:
853 * @utf: the input UTF8 *
854 * @pos: the position of the desired UTF8 char (in chars)
855 *
856 * a function to provide the equivalent of fetching a
857 * character from a string array
858 *
859 * Returns a pointer to the UTF8 character or NULL
860 */
861xmlChar *
862xmlUTF8Strpos(const xmlChar *utf, int pos) {
863 xmlChar ch;
864
865 if (utf == NULL) return(NULL);
866 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
867 return(NULL);
868 while (pos--) {
869 if ((ch=*utf++) == 0) return(NULL);
870 if ( ch & 0x80 ) {
871 /* if not simple ascii, verify proper format */
872 if ( (ch & 0xc0) != 0xc0 )
873 return(NULL);
874 /* then skip over remaining bytes for this char */
875 while ( (ch <<= 1) & 0x80 )
876 if ( (*utf++ & 0xc0) != 0x80 )
877 return(NULL);
878 }
879 }
880 return((xmlChar *)utf);
881}
882
883/**
884 * xmlUTF8Strloc:
885 * @utf: the input UTF8 *
886 * @utfchar: the UTF8 character to be found
887 *
888 * a function to provide the relative location of a UTF8 char
889 *
890 * Returns the relative character position of the desired char
891 * or -1 if not found
892 */
893int
894xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
895 int i, size;
896 xmlChar ch;
897
898 if (utf==NULL || utfchar==NULL) return -1;
899 size = xmlUTF8Strsize(utfchar, 1);
900 for(i=0; (ch=*utf) != 0; i++) {
901 if (xmlStrncmp(utf, utfchar, size)==0)
902 return(i);
903 utf++;
904 if ( ch & 0x80 ) {
905 /* if not simple ascii, verify proper format */
906 if ( (ch & 0xc0) != 0xc0 )
907 return(-1);
908 /* then skip over remaining bytes for this char */
909 while ( (ch <<= 1) & 0x80 )
910 if ( (*utf++ & 0xc0) != 0x80 )
911 return(-1);
912 }
913 }
914
915 return(-1);
916}
917/**
918 * xmlUTF8Strsub:
919 * @utf: a sequence of UTF-8 encoded bytes
920 * @start: relative pos of first char
921 * @len: total number to copy
922 *
923 * Create a substring from a given UTF-8 string
924 * Note: positions are given in units of UTF-8 chars
925 *
926 * Returns a pointer to a newly created string
927 * or NULL if any problem
928 */
929
930xmlChar *
931xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
932 int i;
933 xmlChar ch;
934
935 if (utf == NULL) return(NULL);
936 if (start < 0) return(NULL);
937 if (len < 0) return(NULL);
938
939 /*
940 * Skip over any leading chars
941 */
942 for (i = 0;i < start;i++) {
943 if ((ch=*utf++) == 0) return(NULL);
944 if ( ch & 0x80 ) {
945 /* if not simple ascii, verify proper format */
946 if ( (ch & 0xc0) != 0xc0 )
947 return(NULL);
948 /* then skip over remaining bytes for this char */
949 while ( (ch <<= 1) & 0x80 )
950 if ( (*utf++ & 0xc0) != 0x80 )
951 return(NULL);
952 }
953 }
954
955 return(xmlUTF8Strndup(utf, len));
956}