blob: 78acbff7531af962d41fcde938e9d63833386317 [file] [log] [blame]
William M. Bracka2e844a2004-01-06 11:52:13 +00001/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 * *
27 * Commodity functions to handle xmlChars *
28 * *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
43
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
88
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
95 for (i = 0;i < len;i++)
96 ret[i] = (xmlChar) cur[i];
97 ret[len] = 0;
98 return(ret);
99}
100
101/**
102 * xmlCharStrdup:
103 * @cur: the input char *
104 *
105 * a strdup for char's to xmlChar's
106 *
107 * Returns a new xmlChar * or NULL
108 */
109
110xmlChar *
111xmlCharStrdup(const char *cur) {
112 const char *p = cur;
113
114 if (cur == NULL) return(NULL);
115 while (*p != '\0') p++; /* non input consuming */
116 return(xmlCharStrndup(cur, p - cur));
117}
118
119/**
120 * xmlStrcmp:
121 * @str1: the first xmlChar *
122 * @str2: the second xmlChar *
123 *
124 * a strcmp for xmlChar's
125 *
126 * Returns the integer result of the comparison
127 */
128
129int
130xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
131 register int tmp;
132
133 if (str1 == str2) return(0);
134 if (str1 == NULL) return(-1);
135 if (str2 == NULL) return(1);
136 do {
137 tmp = *str1++ - *str2;
138 if (tmp != 0) return(tmp);
139 } while (*str2++ != 0);
140 return 0;
141}
142
143/**
144 * xmlStrEqual:
145 * @str1: the first xmlChar *
146 * @str2: the second xmlChar *
147 *
148 * Check if both string are equal of have same content
149 * Should be a bit more readable and faster than xmlStrEqual()
150 *
151 * Returns 1 if they are equal, 0 if they are different
152 */
153
154int
155xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
156 if (str1 == str2) return(1);
157 if (str1 == NULL) return(0);
158 if (str2 == NULL) return(0);
159 do {
160 if (*str1++ != *str2) return(0);
161 } while (*str2++);
162 return(1);
163}
164
165/**
166 * xmlStrQEqual:
167 * @pref: the prefix of the QName
168 * @name: the localname of the QName
169 * @str: the second xmlChar *
170 *
171 * Check if a QName is Equal to a given string
172 *
173 * Returns 1 if they are equal, 0 if they are different
174 */
175
176int
177xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
178 if (pref == NULL) return(xmlStrEqual(name, str));
179 if (name == NULL) return(0);
180 if (str == NULL) return(0);
181
182 do {
183 if (*pref++ != *str) return(0);
184 } while ((*str++) && (*pref));
185 if (*str++ != ':') return(0);
186 do {
187 if (*name++ != *str) return(0);
188 } while (*str++);
189 return(1);
190}
191
192/**
193 * xmlStrncmp:
194 * @str1: the first xmlChar *
195 * @str2: the second xmlChar *
196 * @len: the max comparison length
197 *
198 * a strncmp for xmlChar's
199 *
200 * Returns the integer result of the comparison
201 */
202
203int
204xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
205 register int tmp;
206
207 if (len <= 0) return(0);
208 if (str1 == str2) return(0);
209 if (str1 == NULL) return(-1);
210 if (str2 == NULL) return(1);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000211#ifdef __GNUC__
William M. Brackb7b54de2004-10-06 16:38:01 +0000212 tmp = strncmp((const char *)str1, (const char *)str2, len);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000213 return tmp;
214#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000215 do {
216 tmp = *str1++ - *str2;
217 if (tmp != 0 || --len == 0) return(tmp);
218 } while (*str2++ != 0);
219 return 0;
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000220#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000221}
222
223static const xmlChar casemap[256] = {
224 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
225 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
226 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
227 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
228 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
229 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
230 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
231 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
232 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
233 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
234 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
235 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
236 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
237 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
238 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
239 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
240 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
241 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
242 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
243 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
244 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
245 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
246 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
247 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
248 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
249 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
250 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
251 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
252 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
253 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
254 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
255 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
256};
257
258/**
259 * xmlStrcasecmp:
260 * @str1: the first xmlChar *
261 * @str2: the second xmlChar *
262 *
263 * a strcasecmp for xmlChar's
264 *
265 * Returns the integer result of the comparison
266 */
267
268int
269xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
270 register int tmp;
271
272 if (str1 == str2) return(0);
273 if (str1 == NULL) return(-1);
274 if (str2 == NULL) return(1);
275 do {
276 tmp = casemap[*str1++] - casemap[*str2];
277 if (tmp != 0) return(tmp);
278 } while (*str2++ != 0);
279 return 0;
280}
281
282/**
283 * xmlStrncasecmp:
284 * @str1: the first xmlChar *
285 * @str2: the second xmlChar *
286 * @len: the max comparison length
287 *
288 * a strncasecmp for xmlChar's
289 *
290 * Returns the integer result of the comparison
291 */
292
293int
294xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
295 register int tmp;
296
297 if (len <= 0) return(0);
298 if (str1 == str2) return(0);
299 if (str1 == NULL) return(-1);
300 if (str2 == NULL) return(1);
301 do {
302 tmp = casemap[*str1++] - casemap[*str2];
303 if (tmp != 0 || --len == 0) return(tmp);
304 } while (*str2++ != 0);
305 return 0;
306}
307
308/**
309 * xmlStrchr:
310 * @str: the xmlChar * array
311 * @val: the xmlChar to search
312 *
313 * a strchr for xmlChar's
314 *
315 * Returns the xmlChar * for the first occurrence or NULL.
316 */
317
318const xmlChar *
319xmlStrchr(const xmlChar *str, xmlChar val) {
320 if (str == NULL) return(NULL);
321 while (*str != 0) { /* non input consuming */
322 if (*str == val) return((xmlChar *) str);
323 str++;
324 }
325 return(NULL);
326}
327
328/**
329 * xmlStrstr:
330 * @str: the xmlChar * array (haystack)
331 * @val: the xmlChar to search (needle)
332 *
333 * a strstr for xmlChar's
334 *
335 * Returns the xmlChar * for the first occurrence or NULL.
336 */
337
338const xmlChar *
339xmlStrstr(const xmlChar *str, const xmlChar *val) {
340 int n;
341
342 if (str == NULL) return(NULL);
343 if (val == NULL) return(NULL);
344 n = xmlStrlen(val);
345
346 if (n == 0) return(str);
347 while (*str != 0) { /* non input consuming */
348 if (*str == *val) {
349 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
350 }
351 str++;
352 }
353 return(NULL);
354}
355
356/**
357 * xmlStrcasestr:
358 * @str: the xmlChar * array (haystack)
359 * @val: the xmlChar to search (needle)
360 *
361 * a case-ignoring strstr for xmlChar's
362 *
363 * Returns the xmlChar * for the first occurrence or NULL.
364 */
365
366const xmlChar *
367xmlStrcasestr(const xmlChar *str, xmlChar *val) {
368 int n;
369
370 if (str == NULL) return(NULL);
371 if (val == NULL) return(NULL);
372 n = xmlStrlen(val);
373
374 if (n == 0) return(str);
375 while (*str != 0) { /* non input consuming */
376 if (casemap[*str] == casemap[*val])
377 if (!xmlStrncasecmp(str, val, n)) return(str);
378 str++;
379 }
380 return(NULL);
381}
382
383/**
384 * xmlStrsub:
385 * @str: the xmlChar * array (haystack)
386 * @start: the index of the first char (zero based)
387 * @len: the length of the substring
388 *
389 * Extract a substring of a given string
390 *
391 * Returns the xmlChar * for the first occurrence or NULL.
392 */
393
394xmlChar *
395xmlStrsub(const xmlChar *str, int start, int len) {
396 int i;
397
398 if (str == NULL) return(NULL);
399 if (start < 0) return(NULL);
400 if (len < 0) return(NULL);
401
402 for (i = 0;i < start;i++) {
403 if (*str == 0) return(NULL);
404 str++;
405 }
406 if (*str == 0) return(NULL);
407 return(xmlStrndup(str, len));
408}
409
410/**
411 * xmlStrlen:
412 * @str: the xmlChar * array
413 *
414 * length of a xmlChar's string
415 *
416 * Returns the number of xmlChar contained in the ARRAY.
417 */
418
419int
420xmlStrlen(const xmlChar *str) {
421 int len = 0;
422
423 if (str == NULL) return(0);
424 while (*str != 0) { /* non input consuming */
425 str++;
426 len++;
427 }
428 return(len);
429}
430
431/**
432 * xmlStrncat:
433 * @cur: the original xmlChar * array
434 * @add: the xmlChar * array added
435 * @len: the length of @add
436 *
437 * a strncat for array of xmlChar's, it will extend @cur with the len
438 * first bytes of @add.
439 *
440 * Returns a new xmlChar *, the original @cur is reallocated if needed
441 * and should not be freed
442 */
443
444xmlChar *
445xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
446 int size;
447 xmlChar *ret;
448
449 if ((add == NULL) || (len == 0))
450 return(cur);
451 if (cur == NULL)
452 return(xmlStrndup(add, len));
453
454 size = xmlStrlen(cur);
455 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
456 if (ret == NULL) {
457 xmlErrMemory(NULL, NULL);
458 return(cur);
459 }
460 memcpy(&ret[size], add, len * sizeof(xmlChar));
461 ret[size + len] = 0;
462 return(ret);
463}
464
465/**
466 * xmlStrncatNew:
467 * @str1: first xmlChar string
468 * @str2: second xmlChar string
469 * @len: the len of @str2
470 *
471 * same as xmlStrncat, but creates a new string. The original
472 * two strings are not freed.
473 *
474 * Returns a new xmlChar * or NULL
475 */
476xmlChar *
477xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
478 int size;
479 xmlChar *ret;
480
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000481 if (len < 0)
482 len = xmlStrlen(str2);
William M. Bracka2e844a2004-01-06 11:52:13 +0000483 if ((str2 == NULL) || (len == 0))
484 return(xmlStrdup(str1));
485 if (str1 == NULL)
486 return(xmlStrndup(str2, len));
487
488 size = xmlStrlen(str1);
489 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
490 if (ret == NULL) {
491 xmlErrMemory(NULL, NULL);
492 return(xmlStrndup(str1, size));
493 }
494 memcpy(ret, str1, size * sizeof(xmlChar));
495 memcpy(&ret[size], str2, len * sizeof(xmlChar));
496 ret[size + len] = 0;
497 return(ret);
498}
499
500/**
501 * xmlStrcat:
502 * @cur: the original xmlChar * array
503 * @add: the xmlChar * array added
504 *
505 * a strcat for array of xmlChar's. Since they are supposed to be
506 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
507 * a termination mark of '0'.
508 *
509 * Returns a new xmlChar * containing the concatenated string.
510 */
511xmlChar *
512xmlStrcat(xmlChar *cur, const xmlChar *add) {
513 const xmlChar *p = add;
514
515 if (add == NULL) return(cur);
516 if (cur == NULL)
517 return(xmlStrdup(add));
518
519 while (*p != 0) p++; /* non input consuming */
520 return(xmlStrncat(cur, add, p - add));
521}
522
523/**
524 * xmlStrPrintf:
525 * @buf: the result buffer.
526 * @len: the result buffer length.
527 * @msg: the message with printf formatting.
528 * @...: extra parameters for the message.
529 *
530 * Formats @msg and places result into @buf.
531 *
532 * Returns the number of characters written to @buf or -1 if an error occurs.
533 */
534int
535xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
536 va_list args;
537 int ret;
538
539 if((buf == NULL) || (msg == NULL)) {
540 return(-1);
541 }
542
543 va_start(args, msg);
544 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
545 va_end(args);
546 buf[len - 1] = 0; /* be safe ! */
547
548 return(ret);
549}
550
551/**
552 * xmlStrVPrintf:
553 * @buf: the result buffer.
554 * @len: the result buffer length.
555 * @msg: the message with printf formatting.
556 * @ap: extra parameters for the message.
557 *
558 * Formats @msg and places result into @buf.
559 *
560 * Returns the number of characters written to @buf or -1 if an error occurs.
561 */
562int
563xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
564 int ret;
565
566 if((buf == NULL) || (msg == NULL)) {
567 return(-1);
568 }
569
570 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
571 buf[len - 1] = 0; /* be safe ! */
572
573 return(ret);
574}
575
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)                  *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * Values above 0xFFFF use the 4-byte form, which the routines below    *
 * also accept.                                                         *
 *                                                                      *
 ************************************************************************/
590
591
592/**
593 * xmlUTF8Size:
594 * @utf: pointer to the UTF8 character
595 *
596 * calculates the internal size of a UTF8 character
597 *
598 * returns the numbers of bytes in the character, -1 on format error
599 */
600int
601xmlUTF8Size(const xmlChar *utf) {
602 xmlChar mask;
603 int len;
604
605 if (utf == NULL)
606 return -1;
607 if (*utf < 0x80)
608 return 1;
609 /* check valid UTF8 character */
610 if (!(*utf & 0x40))
611 return -1;
612 /* determine number of bytes in char */
613 len = 2;
614 for (mask=0x20; mask != 0; mask>>=1) {
615 if (!(*utf & mask))
616 return len;
617 len++;
618 }
619 return -1;
620}
621
622/**
623 * xmlUTF8Charcmp:
624 * @utf1: pointer to first UTF8 char
625 * @utf2: pointer to second UTF8 char
626 *
627 * compares the two UCS4 values
628 *
629 * returns result of the compare as with xmlStrncmp
630 */
631int
632xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
633
634 if (utf1 == NULL ) {
635 if (utf2 == NULL)
636 return 0;
637 return -1;
638 }
639 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
640}
641
642/**
643 * xmlUTF8Strlen:
644 * @utf: a sequence of UTF-8 encoded bytes
645 *
646 * compute the length of an UTF8 string, it doesn't do a full UTF8
647 * checking of the content of the string.
648 *
649 * Returns the number of characters in the string or -1 in case of error
650 */
651int
652xmlUTF8Strlen(const xmlChar *utf) {
653 int ret = 0;
654
655 if (utf == NULL)
656 return(-1);
657
658 while (*utf != 0) {
659 if (utf[0] & 0x80) {
660 if ((utf[1] & 0xc0) != 0x80)
661 return(-1);
662 if ((utf[0] & 0xe0) == 0xe0) {
663 if ((utf[2] & 0xc0) != 0x80)
664 return(-1);
665 if ((utf[0] & 0xf0) == 0xf0) {
666 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
667 return(-1);
668 utf += 4;
669 } else {
670 utf += 3;
671 }
672 } else {
673 utf += 2;
674 }
675 } else {
676 utf++;
677 }
678 ret++;
679 }
680 return(ret);
681}
682
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Trailing bytes are checked
 * to be of the form 10xxxxxx and lead bytes announcing more than 4
 * bytes are rejected; no other validation is done.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error).
 *        A NULL @len is an error too; nothing is stored in that case.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when @len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
751
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid, false otherwise (including
 * when @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);
    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     *
     * A terminating 0 met where a trailing byte is expected fails the
     * 10xxxxxx test, and the || chains stop at the first failure, so
     * nothing past the terminator is read.
     */
    for (ix = 0; (c = utf[ix]);) {      /* string is 0-terminated */
        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
            if ((utf[ix+1] & 0xc0 ) != 0x80)
                return 0;
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80))
                    return 0;
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80) ||
                ((utf[ix+3] & 0xc0) != 0x80))
                    return 0;
            ix += 4;
        } else                          /* unknown encoding */
            return 0;
    }
    return(1);
}
802
803/**
804 * xmlUTF8Strsize:
805 * @utf: a sequence of UTF-8 encoded bytes
806 * @len: the number of characters in the array
807 *
808 * storage size of an UTF8 string
809 *
810 * Returns the storage size of
811 * the first 'len' characters of ARRAY
William M. Bracka2e844a2004-01-06 11:52:13 +0000812 */
813
814int
815xmlUTF8Strsize(const xmlChar *utf, int len) {
816 const xmlChar *ptr=utf;
817 xmlChar ch;
818
Daniel Veillard36e5cd52004-11-02 14:52:23 +0000819 if (utf == NULL)
820 return(0);
821
William M. Bracka2e844a2004-01-06 11:52:13 +0000822 if (len <= 0)
823 return(0);
824
825 while ( len-- > 0) {
826 if ( !*ptr )
827 break;
828 if ( (ch = *ptr++) & 0x80)
829 while ( (ch<<=1) & 0x80 )
830 ptr++;
831 }
832 return (ptr - utf);
833}
834
835
836/**
837 * xmlUTF8Strndup:
838 * @utf: the input UTF8 *
839 * @len: the len of @utf (in chars)
840 *
841 * a strndup for array of UTF8's
842 *
843 * Returns a new UTF8 * or NULL
844 */
845xmlChar *
846xmlUTF8Strndup(const xmlChar *utf, int len) {
847 xmlChar *ret;
848 int i;
849
850 if ((utf == NULL) || (len < 0)) return(NULL);
851 i = xmlUTF8Strsize(utf, len);
852 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
853 if (ret == NULL) {
854 xmlGenericError(xmlGenericErrorContext,
855 "malloc of %ld byte failed\n",
856 (len + 1) * (long)sizeof(xmlChar));
857 return(NULL);
858 }
859 memcpy(ret, utf, i * sizeof(xmlChar));
860 ret[i] = 0;
861 return(ret);
862}
863
864/**
865 * xmlUTF8Strpos:
866 * @utf: the input UTF8 *
867 * @pos: the position of the desired UTF8 char (in chars)
868 *
869 * a function to provide the equivalent of fetching a
870 * character from a string array
871 *
872 * Returns a pointer to the UTF8 character or NULL
873 */
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000874const xmlChar *
William M. Bracka2e844a2004-01-06 11:52:13 +0000875xmlUTF8Strpos(const xmlChar *utf, int pos) {
876 xmlChar ch;
877
878 if (utf == NULL) return(NULL);
879 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
880 return(NULL);
881 while (pos--) {
882 if ((ch=*utf++) == 0) return(NULL);
883 if ( ch & 0x80 ) {
884 /* if not simple ascii, verify proper format */
885 if ( (ch & 0xc0) != 0xc0 )
886 return(NULL);
887 /* then skip over remaining bytes for this char */
888 while ( (ch <<= 1) & 0x80 )
889 if ( (*utf++ & 0xc0) != 0x80 )
890 return(NULL);
891 }
892 }
893 return((xmlChar *)utf);
894}
895
896/**
897 * xmlUTF8Strloc:
898 * @utf: the input UTF8 *
899 * @utfchar: the UTF8 character to be found
900 *
901 * a function to provide the relative location of a UTF8 char
902 *
903 * Returns the relative character position of the desired char
904 * or -1 if not found
905 */
906int
907xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
908 int i, size;
909 xmlChar ch;
910
911 if (utf==NULL || utfchar==NULL) return -1;
912 size = xmlUTF8Strsize(utfchar, 1);
913 for(i=0; (ch=*utf) != 0; i++) {
914 if (xmlStrncmp(utf, utfchar, size)==0)
915 return(i);
916 utf++;
917 if ( ch & 0x80 ) {
918 /* if not simple ascii, verify proper format */
919 if ( (ch & 0xc0) != 0xc0 )
920 return(-1);
921 /* then skip over remaining bytes for this char */
922 while ( (ch <<= 1) & 0x80 )
923 if ( (*utf++ & 0xc0) != 0x80 )
924 return(-1);
925 }
926 }
927
928 return(-1);
929}
930/**
931 * xmlUTF8Strsub:
932 * @utf: a sequence of UTF-8 encoded bytes
933 * @start: relative pos of first char
934 * @len: total number to copy
935 *
936 * Create a substring from a given UTF-8 string
937 * Note: positions are given in units of UTF-8 chars
938 *
939 * Returns a pointer to a newly created string
940 * or NULL if any problem
941 */
942
943xmlChar *
944xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
945 int i;
946 xmlChar ch;
947
948 if (utf == NULL) return(NULL);
949 if (start < 0) return(NULL);
950 if (len < 0) return(NULL);
951
952 /*
953 * Skip over any leading chars
954 */
955 for (i = 0;i < start;i++) {
956 if ((ch=*utf++) == 0) return(NULL);
957 if ( ch & 0x80 ) {
958 /* if not simple ascii, verify proper format */
959 if ( (ch & 0xc0) != 0xc0 )
960 return(NULL);
961 /* then skip over remaining bytes for this char */
962 while ( (ch <<= 1) & 0x80 )
963 if ( (*utf++ & 0xc0) != 0x80 )
964 return(NULL);
965 }
966 }
967
968 return(xmlUTF8Strndup(utf, len));
969}