/*
 * string.c : an XML string utilities module
 *
 * This module provides various utility functions for manipulating
 * the xmlChar* type. All functions named xmlStr* have been moved here
 * from the parser.c file (their original home).
 *
 * See Copyright for the status of this software.
 *
 * UTF8 string routines from:
 * William Brack <wbrack@mmm.com.hk>
 *
 * daniel@veillard.com
 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
/************************************************************************
 *                                                                      *
 *        Commodity functions to handle xmlChars                        *
 *                                                                      *
 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
43
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
88
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
95 for (i = 0;i < len;i++)
96 ret[i] = (xmlChar) cur[i];
97 ret[len] = 0;
98 return(ret);
99}
100
101/**
102 * xmlCharStrdup:
103 * @cur: the input char *
104 *
105 * a strdup for char's to xmlChar's
106 *
107 * Returns a new xmlChar * or NULL
108 */
109
110xmlChar *
111xmlCharStrdup(const char *cur) {
112 const char *p = cur;
113
114 if (cur == NULL) return(NULL);
115 while (*p != '\0') p++; /* non input consuming */
116 return(xmlCharStrndup(cur, p - cur));
117}
118
119/**
120 * xmlStrcmp:
121 * @str1: the first xmlChar *
122 * @str2: the second xmlChar *
123 *
124 * a strcmp for xmlChar's
125 *
126 * Returns the integer result of the comparison
127 */
128
129int
130xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
131 register int tmp;
132
133 if (str1 == str2) return(0);
134 if (str1 == NULL) return(-1);
135 if (str2 == NULL) return(1);
136 do {
137 tmp = *str1++ - *str2;
138 if (tmp != 0) return(tmp);
139 } while (*str2++ != 0);
140 return 0;
141}
142
143/**
144 * xmlStrEqual:
145 * @str1: the first xmlChar *
146 * @str2: the second xmlChar *
147 *
148 * Check if both string are equal of have same content
149 * Should be a bit more readable and faster than xmlStrEqual()
150 *
151 * Returns 1 if they are equal, 0 if they are different
152 */
153
154int
155xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
156 if (str1 == str2) return(1);
157 if (str1 == NULL) return(0);
158 if (str2 == NULL) return(0);
159 do {
160 if (*str1++ != *str2) return(0);
161 } while (*str2++);
162 return(1);
163}
164
165/**
166 * xmlStrQEqual:
167 * @pref: the prefix of the QName
168 * @name: the localname of the QName
169 * @str: the second xmlChar *
170 *
171 * Check if a QName is Equal to a given string
172 *
173 * Returns 1 if they are equal, 0 if they are different
174 */
175
176int
177xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
178 if (pref == NULL) return(xmlStrEqual(name, str));
179 if (name == NULL) return(0);
180 if (str == NULL) return(0);
181
182 do {
183 if (*pref++ != *str) return(0);
184 } while ((*str++) && (*pref));
185 if (*str++ != ':') return(0);
186 do {
187 if (*name++ != *str) return(0);
188 } while (*str++);
189 return(1);
190}
191
192/**
193 * xmlStrncmp:
194 * @str1: the first xmlChar *
195 * @str2: the second xmlChar *
196 * @len: the max comparison length
197 *
198 * a strncmp for xmlChar's
199 *
200 * Returns the integer result of the comparison
201 */
202
203int
204xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
205 register int tmp;
206
207 if (len <= 0) return(0);
208 if (str1 == str2) return(0);
209 if (str1 == NULL) return(-1);
210 if (str2 == NULL) return(1);
211 do {
212 tmp = *str1++ - *str2;
213 if (tmp != 0 || --len == 0) return(tmp);
214 } while (*str2++ != 0);
215 return 0;
216}
217
218static const xmlChar casemap[256] = {
219 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
220 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
221 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
222 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
223 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
224 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
225 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
226 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
227 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
228 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
229 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
230 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
231 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
232 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
233 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
234 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
235 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
236 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
237 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
238 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
239 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
240 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
241 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
242 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
243 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
244 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
245 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
246 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
247 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
248 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
249 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
250 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
251};
252
253/**
254 * xmlStrcasecmp:
255 * @str1: the first xmlChar *
256 * @str2: the second xmlChar *
257 *
258 * a strcasecmp for xmlChar's
259 *
260 * Returns the integer result of the comparison
261 */
262
263int
264xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
265 register int tmp;
266
267 if (str1 == str2) return(0);
268 if (str1 == NULL) return(-1);
269 if (str2 == NULL) return(1);
270 do {
271 tmp = casemap[*str1++] - casemap[*str2];
272 if (tmp != 0) return(tmp);
273 } while (*str2++ != 0);
274 return 0;
275}
276
277/**
278 * xmlStrncasecmp:
279 * @str1: the first xmlChar *
280 * @str2: the second xmlChar *
281 * @len: the max comparison length
282 *
283 * a strncasecmp for xmlChar's
284 *
285 * Returns the integer result of the comparison
286 */
287
288int
289xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
290 register int tmp;
291
292 if (len <= 0) return(0);
293 if (str1 == str2) return(0);
294 if (str1 == NULL) return(-1);
295 if (str2 == NULL) return(1);
296 do {
297 tmp = casemap[*str1++] - casemap[*str2];
298 if (tmp != 0 || --len == 0) return(tmp);
299 } while (*str2++ != 0);
300 return 0;
301}
302
303/**
304 * xmlStrchr:
305 * @str: the xmlChar * array
306 * @val: the xmlChar to search
307 *
308 * a strchr for xmlChar's
309 *
310 * Returns the xmlChar * for the first occurrence or NULL.
311 */
312
313const xmlChar *
314xmlStrchr(const xmlChar *str, xmlChar val) {
315 if (str == NULL) return(NULL);
316 while (*str != 0) { /* non input consuming */
317 if (*str == val) return((xmlChar *) str);
318 str++;
319 }
320 return(NULL);
321}
322
323/**
324 * xmlStrstr:
325 * @str: the xmlChar * array (haystack)
326 * @val: the xmlChar to search (needle)
327 *
328 * a strstr for xmlChar's
329 *
330 * Returns the xmlChar * for the first occurrence or NULL.
331 */
332
333const xmlChar *
334xmlStrstr(const xmlChar *str, const xmlChar *val) {
335 int n;
336
337 if (str == NULL) return(NULL);
338 if (val == NULL) return(NULL);
339 n = xmlStrlen(val);
340
341 if (n == 0) return(str);
342 while (*str != 0) { /* non input consuming */
343 if (*str == *val) {
344 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
345 }
346 str++;
347 }
348 return(NULL);
349}
350
351/**
352 * xmlStrcasestr:
353 * @str: the xmlChar * array (haystack)
354 * @val: the xmlChar to search (needle)
355 *
356 * a case-ignoring strstr for xmlChar's
357 *
358 * Returns the xmlChar * for the first occurrence or NULL.
359 */
360
361const xmlChar *
362xmlStrcasestr(const xmlChar *str, xmlChar *val) {
363 int n;
364
365 if (str == NULL) return(NULL);
366 if (val == NULL) return(NULL);
367 n = xmlStrlen(val);
368
369 if (n == 0) return(str);
370 while (*str != 0) { /* non input consuming */
371 if (casemap[*str] == casemap[*val])
372 if (!xmlStrncasecmp(str, val, n)) return(str);
373 str++;
374 }
375 return(NULL);
376}
377
378/**
379 * xmlStrsub:
380 * @str: the xmlChar * array (haystack)
381 * @start: the index of the first char (zero based)
382 * @len: the length of the substring
383 *
384 * Extract a substring of a given string
385 *
386 * Returns the xmlChar * for the first occurrence or NULL.
387 */
388
389xmlChar *
390xmlStrsub(const xmlChar *str, int start, int len) {
391 int i;
392
393 if (str == NULL) return(NULL);
394 if (start < 0) return(NULL);
395 if (len < 0) return(NULL);
396
397 for (i = 0;i < start;i++) {
398 if (*str == 0) return(NULL);
399 str++;
400 }
401 if (*str == 0) return(NULL);
402 return(xmlStrndup(str, len));
403}
404
405/**
406 * xmlStrlen:
407 * @str: the xmlChar * array
408 *
409 * length of a xmlChar's string
410 *
411 * Returns the number of xmlChar contained in the ARRAY.
412 */
413
414int
415xmlStrlen(const xmlChar *str) {
416 int len = 0;
417
418 if (str == NULL) return(0);
419 while (*str != 0) { /* non input consuming */
420 str++;
421 len++;
422 }
423 return(len);
424}
425
426/**
427 * xmlStrncat:
428 * @cur: the original xmlChar * array
429 * @add: the xmlChar * array added
430 * @len: the length of @add
431 *
432 * a strncat for array of xmlChar's, it will extend @cur with the len
433 * first bytes of @add.
434 *
435 * Returns a new xmlChar *, the original @cur is reallocated if needed
436 * and should not be freed
437 */
438
439xmlChar *
440xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
441 int size;
442 xmlChar *ret;
443
444 if ((add == NULL) || (len == 0))
445 return(cur);
446 if (cur == NULL)
447 return(xmlStrndup(add, len));
448
449 size = xmlStrlen(cur);
450 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
451 if (ret == NULL) {
452 xmlErrMemory(NULL, NULL);
453 return(cur);
454 }
455 memcpy(&ret[size], add, len * sizeof(xmlChar));
456 ret[size + len] = 0;
457 return(ret);
458}
459
460/**
461 * xmlStrncatNew:
462 * @str1: first xmlChar string
463 * @str2: second xmlChar string
464 * @len: the len of @str2
465 *
466 * same as xmlStrncat, but creates a new string. The original
467 * two strings are not freed.
468 *
469 * Returns a new xmlChar * or NULL
470 */
471xmlChar *
472xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
473 int size;
474 xmlChar *ret;
475
476 if ((str2 == NULL) || (len == 0))
477 return(xmlStrdup(str1));
478 if (str1 == NULL)
479 return(xmlStrndup(str2, len));
480
481 size = xmlStrlen(str1);
482 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
483 if (ret == NULL) {
484 xmlErrMemory(NULL, NULL);
485 return(xmlStrndup(str1, size));
486 }
487 memcpy(ret, str1, size * sizeof(xmlChar));
488 memcpy(&ret[size], str2, len * sizeof(xmlChar));
489 ret[size + len] = 0;
490 return(ret);
491}
492
493/**
494 * xmlStrcat:
495 * @cur: the original xmlChar * array
496 * @add: the xmlChar * array added
497 *
498 * a strcat for array of xmlChar's. Since they are supposed to be
499 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
500 * a termination mark of '0'.
501 *
502 * Returns a new xmlChar * containing the concatenated string.
503 */
504xmlChar *
505xmlStrcat(xmlChar *cur, const xmlChar *add) {
506 const xmlChar *p = add;
507
508 if (add == NULL) return(cur);
509 if (cur == NULL)
510 return(xmlStrdup(add));
511
512 while (*p != 0) p++; /* non input consuming */
513 return(xmlStrncat(cur, add, p - add));
514}
515
516/**
517 * xmlStrPrintf:
518 * @buf: the result buffer.
519 * @len: the result buffer length.
520 * @msg: the message with printf formatting.
521 * @...: extra parameters for the message.
522 *
523 * Formats @msg and places result into @buf.
524 *
525 * Returns the number of characters written to @buf or -1 if an error occurs.
526 */
527int
528xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
529 va_list args;
530 int ret;
531
532 if((buf == NULL) || (msg == NULL)) {
533 return(-1);
534 }
535
536 va_start(args, msg);
537 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
538 va_end(args);
539 buf[len - 1] = 0; /* be safe ! */
540
541 return(ret);
542}
543
544/**
545 * xmlStrVPrintf:
546 * @buf: the result buffer.
547 * @len: the result buffer length.
548 * @msg: the message with printf formatting.
549 * @ap: extra parameters for the message.
550 *
551 * Formats @msg and places result into @buf.
552 *
553 * Returns the number of characters written to @buf or -1 if an error occurs.
554 */
555int
556xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
557 int ret;
558
559 if((buf == NULL) || (msg == NULL)) {
560 return(-1);
561 }
562
563 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
564 buf[len - 1] = 0; /* be safe ! */
565
566 return(ret);
567}
568
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 *                                                                      *
 ************************************************************************/
583
584
585/**
586 * xmlUTF8Size:
587 * @utf: pointer to the UTF8 character
588 *
589 * calculates the internal size of a UTF8 character
590 *
591 * returns the numbers of bytes in the character, -1 on format error
592 */
593int
594xmlUTF8Size(const xmlChar *utf) {
595 xmlChar mask;
596 int len;
597
598 if (utf == NULL)
599 return -1;
600 if (*utf < 0x80)
601 return 1;
602 /* check valid UTF8 character */
603 if (!(*utf & 0x40))
604 return -1;
605 /* determine number of bytes in char */
606 len = 2;
607 for (mask=0x20; mask != 0; mask>>=1) {
608 if (!(*utf & mask))
609 return len;
610 len++;
611 }
612 return -1;
613}
614
615/**
616 * xmlUTF8Charcmp:
617 * @utf1: pointer to first UTF8 char
618 * @utf2: pointer to second UTF8 char
619 *
620 * compares the two UCS4 values
621 *
622 * returns result of the compare as with xmlStrncmp
623 */
624int
625xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
626
627 if (utf1 == NULL ) {
628 if (utf2 == NULL)
629 return 0;
630 return -1;
631 }
632 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
633}
634
635/**
636 * xmlUTF8Strlen:
637 * @utf: a sequence of UTF-8 encoded bytes
638 *
639 * compute the length of an UTF8 string, it doesn't do a full UTF8
640 * checking of the content of the string.
641 *
642 * Returns the number of characters in the string or -1 in case of error
643 */
644int
645xmlUTF8Strlen(const xmlChar *utf) {
646 int ret = 0;
647
648 if (utf == NULL)
649 return(-1);
650
651 while (*utf != 0) {
652 if (utf[0] & 0x80) {
653 if ((utf[1] & 0xc0) != 0x80)
654 return(-1);
655 if ((utf[0] & 0xe0) == 0xe0) {
656 if ((utf[2] & 0xc0) != 0x80)
657 return(-1);
658 if ((utf[0] & 0xf0) == 0xf0) {
659 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
660 return(-1);
661 utf += 4;
662 } else {
663 utf += 3;
664 }
665 } else {
666 utf += 2;
667 }
668 } else {
669 utf++;
670 }
671 ret++;
672 }
673 return(ret);
674}
675
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to @bytes len
 *
 * Read one UTF8 Char from @utf. On entry *@len is the number of bytes
 * available at @utf; sequences of 1 to 4 bytes are decoded and only
 * the trailing bytes of a sequence are checked for the 10xxxxxx form.
 *
 * Returns the char value or -1 in case of error, and updates *len with the
 *         number of bytes consumed (0 in case of error). Errors are:
 *         NULL @utf or @len, not enough bytes available, a bad
 *         trailing byte or a lead byte announcing more than 4 bytes.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
742
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true (1) if @utf is valid, 0 if it is not or if @utf
 *               is NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if (c & 0x80) {
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((c & 0xe0) == 0xe0) {
                if ((utf[ix + 2] & 0xc0) != 0x80)
                    return(0);
                if ((c & 0xf0) == 0xf0) {
                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                        return(0);
                    /* 4-byte code */
                    ix += 4;
                } else {
                    /* 3-byte code */
                    ix += 3;
                }
            } else {
                /* 2-byte code */
                ix += 2;
            }
        } else {
            /* 1-byte code */
            ix++;
        }
    }
    return(1);
}
786
787/**
788 * xmlUTF8Strsize:
789 * @utf: a sequence of UTF-8 encoded bytes
790 * @len: the number of characters in the array
791 *
792 * storage size of an UTF8 string
793 *
794 * Returns the storage size of
795 * the first 'len' characters of ARRAY
796 *
797 */
798
799int
800xmlUTF8Strsize(const xmlChar *utf, int len) {
801 const xmlChar *ptr=utf;
802 xmlChar ch;
803
804 if (len <= 0)
805 return(0);
806
807 while ( len-- > 0) {
808 if ( !*ptr )
809 break;
810 if ( (ch = *ptr++) & 0x80)
811 while ( (ch<<=1) & 0x80 )
812 ptr++;
813 }
814 return (ptr - utf);
815}
816
817
818/**
819 * xmlUTF8Strndup:
820 * @utf: the input UTF8 *
821 * @len: the len of @utf (in chars)
822 *
823 * a strndup for array of UTF8's
824 *
825 * Returns a new UTF8 * or NULL
826 */
827xmlChar *
828xmlUTF8Strndup(const xmlChar *utf, int len) {
829 xmlChar *ret;
830 int i;
831
832 if ((utf == NULL) || (len < 0)) return(NULL);
833 i = xmlUTF8Strsize(utf, len);
834 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
835 if (ret == NULL) {
836 xmlGenericError(xmlGenericErrorContext,
837 "malloc of %ld byte failed\n",
838 (len + 1) * (long)sizeof(xmlChar));
839 return(NULL);
840 }
841 memcpy(ret, utf, i * sizeof(xmlChar));
842 ret[i] = 0;
843 return(ret);
844}
845
846/**
847 * xmlUTF8Strpos:
848 * @utf: the input UTF8 *
849 * @pos: the position of the desired UTF8 char (in chars)
850 *
851 * a function to provide the equivalent of fetching a
852 * character from a string array
853 *
854 * Returns a pointer to the UTF8 character or NULL
855 */
856xmlChar *
857xmlUTF8Strpos(const xmlChar *utf, int pos) {
858 xmlChar ch;
859
860 if (utf == NULL) return(NULL);
861 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
862 return(NULL);
863 while (pos--) {
864 if ((ch=*utf++) == 0) return(NULL);
865 if ( ch & 0x80 ) {
866 /* if not simple ascii, verify proper format */
867 if ( (ch & 0xc0) != 0xc0 )
868 return(NULL);
869 /* then skip over remaining bytes for this char */
870 while ( (ch <<= 1) & 0x80 )
871 if ( (*utf++ & 0xc0) != 0x80 )
872 return(NULL);
873 }
874 }
875 return((xmlChar *)utf);
876}
877
878/**
879 * xmlUTF8Strloc:
880 * @utf: the input UTF8 *
881 * @utfchar: the UTF8 character to be found
882 *
883 * a function to provide the relative location of a UTF8 char
884 *
885 * Returns the relative character position of the desired char
886 * or -1 if not found
887 */
888int
889xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
890 int i, size;
891 xmlChar ch;
892
893 if (utf==NULL || utfchar==NULL) return -1;
894 size = xmlUTF8Strsize(utfchar, 1);
895 for(i=0; (ch=*utf) != 0; i++) {
896 if (xmlStrncmp(utf, utfchar, size)==0)
897 return(i);
898 utf++;
899 if ( ch & 0x80 ) {
900 /* if not simple ascii, verify proper format */
901 if ( (ch & 0xc0) != 0xc0 )
902 return(-1);
903 /* then skip over remaining bytes for this char */
904 while ( (ch <<= 1) & 0x80 )
905 if ( (*utf++ & 0xc0) != 0x80 )
906 return(-1);
907 }
908 }
909
910 return(-1);
911}
912/**
913 * xmlUTF8Strsub:
914 * @utf: a sequence of UTF-8 encoded bytes
915 * @start: relative pos of first char
916 * @len: total number to copy
917 *
918 * Create a substring from a given UTF-8 string
919 * Note: positions are given in units of UTF-8 chars
920 *
921 * Returns a pointer to a newly created string
922 * or NULL if any problem
923 */
924
925xmlChar *
926xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
927 int i;
928 xmlChar ch;
929
930 if (utf == NULL) return(NULL);
931 if (start < 0) return(NULL);
932 if (len < 0) return(NULL);
933
934 /*
935 * Skip over any leading chars
936 */
937 for (i = 0;i < start;i++) {
938 if ((ch=*utf++) == 0) return(NULL);
939 if ( ch & 0x80 ) {
940 /* if not simple ascii, verify proper format */
941 if ( (ch & 0xc0) != 0xc0 )
942 return(NULL);
943 /* then skip over remaining bytes for this char */
944 while ( (ch <<= 1) & 0x80 )
945 if ( (*utf++ & 0xc0) != 0x80 )
946 return(NULL);
947 }
948 }
949
950 return(xmlUTF8Strndup(utf, len));
951}