blob: d73c49cadbc2afed13bd643cc8eca256bd6f2daa [file] [log] [blame]
William M. Bracka2e844a2004-01-06 11:52:13 +00001/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 * *
27 * Commodity functions to handle xmlChars *
28 * *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
43
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
88
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
Daniel Veillard5ea30d72004-11-08 11:54:28 +000095 for (i = 0;i < len;i++) {
William M. Bracka2e844a2004-01-06 11:52:13 +000096 ret[i] = (xmlChar) cur[i];
Daniel Veillard5ea30d72004-11-08 11:54:28 +000097 if (ret[i] == 0) return(ret);
98 }
William M. Bracka2e844a2004-01-06 11:52:13 +000099 ret[len] = 0;
100 return(ret);
101}
102
103/**
104 * xmlCharStrdup:
105 * @cur: the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112xmlChar *
113xmlCharStrdup(const char *cur) {
114 const char *p = cur;
115
116 if (cur == NULL) return(NULL);
117 while (*p != '\0') p++; /* non input consuming */
118 return(xmlCharStrndup(cur, p - cur));
119}
120
121/**
122 * xmlStrcmp:
123 * @str1: the first xmlChar *
124 * @str2: the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131int
132xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133 register int tmp;
134
135 if (str1 == str2) return(0);
136 if (str1 == NULL) return(-1);
137 if (str2 == NULL) return(1);
138 do {
139 tmp = *str1++ - *str2;
140 if (tmp != 0) return(tmp);
141 } while (*str2++ != 0);
142 return 0;
143}
144
145/**
146 * xmlStrEqual:
147 * @str1: the first xmlChar *
148 * @str2: the second xmlChar *
149 *
150 * Check if both string are equal of have same content
151 * Should be a bit more readable and faster than xmlStrEqual()
152 *
153 * Returns 1 if they are equal, 0 if they are different
154 */
155
156int
157xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158 if (str1 == str2) return(1);
159 if (str1 == NULL) return(0);
160 if (str2 == NULL) return(0);
161 do {
162 if (*str1++ != *str2) return(0);
163 } while (*str2++);
164 return(1);
165}
166
167/**
168 * xmlStrQEqual:
169 * @pref: the prefix of the QName
170 * @name: the localname of the QName
171 * @str: the second xmlChar *
172 *
173 * Check if a QName is Equal to a given string
174 *
175 * Returns 1 if they are equal, 0 if they are different
176 */
177
178int
179xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180 if (pref == NULL) return(xmlStrEqual(name, str));
181 if (name == NULL) return(0);
182 if (str == NULL) return(0);
183
184 do {
185 if (*pref++ != *str) return(0);
186 } while ((*str++) && (*pref));
187 if (*str++ != ':') return(0);
188 do {
189 if (*name++ != *str) return(0);
190 } while (*str++);
191 return(1);
192}
193
194/**
195 * xmlStrncmp:
196 * @str1: the first xmlChar *
197 * @str2: the second xmlChar *
198 * @len: the max comparison length
199 *
200 * a strncmp for xmlChar's
201 *
202 * Returns the integer result of the comparison
203 */
204
205int
206xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207 register int tmp;
208
209 if (len <= 0) return(0);
210 if (str1 == str2) return(0);
211 if (str1 == NULL) return(-1);
212 if (str2 == NULL) return(1);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000213#ifdef __GNUC__
William M. Brackb7b54de2004-10-06 16:38:01 +0000214 tmp = strncmp((const char *)str1, (const char *)str2, len);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000215 return tmp;
216#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000217 do {
218 tmp = *str1++ - *str2;
219 if (tmp != 0 || --len == 0) return(tmp);
220 } while (*str2++ != 0);
221 return 0;
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000222#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000223}
224
225static const xmlChar casemap[256] = {
226 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
258};
259
260/**
261 * xmlStrcasecmp:
262 * @str1: the first xmlChar *
263 * @str2: the second xmlChar *
264 *
265 * a strcasecmp for xmlChar's
266 *
267 * Returns the integer result of the comparison
268 */
269
270int
271xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272 register int tmp;
273
274 if (str1 == str2) return(0);
275 if (str1 == NULL) return(-1);
276 if (str2 == NULL) return(1);
277 do {
278 tmp = casemap[*str1++] - casemap[*str2];
279 if (tmp != 0) return(tmp);
280 } while (*str2++ != 0);
281 return 0;
282}
283
284/**
285 * xmlStrncasecmp:
286 * @str1: the first xmlChar *
287 * @str2: the second xmlChar *
288 * @len: the max comparison length
289 *
290 * a strncasecmp for xmlChar's
291 *
292 * Returns the integer result of the comparison
293 */
294
295int
296xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297 register int tmp;
298
299 if (len <= 0) return(0);
300 if (str1 == str2) return(0);
301 if (str1 == NULL) return(-1);
302 if (str2 == NULL) return(1);
303 do {
304 tmp = casemap[*str1++] - casemap[*str2];
305 if (tmp != 0 || --len == 0) return(tmp);
306 } while (*str2++ != 0);
307 return 0;
308}
309
310/**
311 * xmlStrchr:
312 * @str: the xmlChar * array
313 * @val: the xmlChar to search
314 *
315 * a strchr for xmlChar's
316 *
317 * Returns the xmlChar * for the first occurrence or NULL.
318 */
319
320const xmlChar *
321xmlStrchr(const xmlChar *str, xmlChar val) {
322 if (str == NULL) return(NULL);
323 while (*str != 0) { /* non input consuming */
324 if (*str == val) return((xmlChar *) str);
325 str++;
326 }
327 return(NULL);
328}
329
330/**
331 * xmlStrstr:
332 * @str: the xmlChar * array (haystack)
333 * @val: the xmlChar to search (needle)
334 *
335 * a strstr for xmlChar's
336 *
337 * Returns the xmlChar * for the first occurrence or NULL.
338 */
339
340const xmlChar *
341xmlStrstr(const xmlChar *str, const xmlChar *val) {
342 int n;
343
344 if (str == NULL) return(NULL);
345 if (val == NULL) return(NULL);
346 n = xmlStrlen(val);
347
348 if (n == 0) return(str);
349 while (*str != 0) { /* non input consuming */
350 if (*str == *val) {
351 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352 }
353 str++;
354 }
355 return(NULL);
356}
357
358/**
359 * xmlStrcasestr:
360 * @str: the xmlChar * array (haystack)
361 * @val: the xmlChar to search (needle)
362 *
363 * a case-ignoring strstr for xmlChar's
364 *
365 * Returns the xmlChar * for the first occurrence or NULL.
366 */
367
368const xmlChar *
369xmlStrcasestr(const xmlChar *str, xmlChar *val) {
370 int n;
371
372 if (str == NULL) return(NULL);
373 if (val == NULL) return(NULL);
374 n = xmlStrlen(val);
375
376 if (n == 0) return(str);
377 while (*str != 0) { /* non input consuming */
378 if (casemap[*str] == casemap[*val])
379 if (!xmlStrncasecmp(str, val, n)) return(str);
380 str++;
381 }
382 return(NULL);
383}
384
385/**
386 * xmlStrsub:
387 * @str: the xmlChar * array (haystack)
388 * @start: the index of the first char (zero based)
389 * @len: the length of the substring
390 *
391 * Extract a substring of a given string
392 *
393 * Returns the xmlChar * for the first occurrence or NULL.
394 */
395
396xmlChar *
397xmlStrsub(const xmlChar *str, int start, int len) {
398 int i;
399
400 if (str == NULL) return(NULL);
401 if (start < 0) return(NULL);
402 if (len < 0) return(NULL);
403
404 for (i = 0;i < start;i++) {
405 if (*str == 0) return(NULL);
406 str++;
407 }
408 if (*str == 0) return(NULL);
409 return(xmlStrndup(str, len));
410}
411
412/**
413 * xmlStrlen:
414 * @str: the xmlChar * array
415 *
416 * length of a xmlChar's string
417 *
418 * Returns the number of xmlChar contained in the ARRAY.
419 */
420
421int
422xmlStrlen(const xmlChar *str) {
423 int len = 0;
424
425 if (str == NULL) return(0);
426 while (*str != 0) { /* non input consuming */
427 str++;
428 len++;
429 }
430 return(len);
431}
432
433/**
434 * xmlStrncat:
435 * @cur: the original xmlChar * array
436 * @add: the xmlChar * array added
437 * @len: the length of @add
438 *
439 * a strncat for array of xmlChar's, it will extend @cur with the len
440 * first bytes of @add.
441 *
442 * Returns a new xmlChar *, the original @cur is reallocated if needed
443 * and should not be freed
444 */
445
446xmlChar *
447xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
448 int size;
449 xmlChar *ret;
450
451 if ((add == NULL) || (len == 0))
452 return(cur);
453 if (cur == NULL)
454 return(xmlStrndup(add, len));
455
456 size = xmlStrlen(cur);
457 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
458 if (ret == NULL) {
459 xmlErrMemory(NULL, NULL);
460 return(cur);
461 }
462 memcpy(&ret[size], add, len * sizeof(xmlChar));
463 ret[size + len] = 0;
464 return(ret);
465}
466
467/**
468 * xmlStrncatNew:
469 * @str1: first xmlChar string
470 * @str2: second xmlChar string
471 * @len: the len of @str2
472 *
473 * same as xmlStrncat, but creates a new string. The original
474 * two strings are not freed.
475 *
476 * Returns a new xmlChar * or NULL
477 */
478xmlChar *
479xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
480 int size;
481 xmlChar *ret;
482
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000483 if (len < 0)
484 len = xmlStrlen(str2);
William M. Bracka2e844a2004-01-06 11:52:13 +0000485 if ((str2 == NULL) || (len == 0))
486 return(xmlStrdup(str1));
487 if (str1 == NULL)
488 return(xmlStrndup(str2, len));
489
490 size = xmlStrlen(str1);
491 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
492 if (ret == NULL) {
493 xmlErrMemory(NULL, NULL);
494 return(xmlStrndup(str1, size));
495 }
496 memcpy(ret, str1, size * sizeof(xmlChar));
497 memcpy(&ret[size], str2, len * sizeof(xmlChar));
498 ret[size + len] = 0;
499 return(ret);
500}
501
502/**
503 * xmlStrcat:
504 * @cur: the original xmlChar * array
505 * @add: the xmlChar * array added
506 *
507 * a strcat for array of xmlChar's. Since they are supposed to be
508 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
509 * a termination mark of '0'.
510 *
511 * Returns a new xmlChar * containing the concatenated string.
512 */
513xmlChar *
514xmlStrcat(xmlChar *cur, const xmlChar *add) {
515 const xmlChar *p = add;
516
517 if (add == NULL) return(cur);
518 if (cur == NULL)
519 return(xmlStrdup(add));
520
521 while (*p != 0) p++; /* non input consuming */
522 return(xmlStrncat(cur, add, p - add));
523}
524
525/**
526 * xmlStrPrintf:
527 * @buf: the result buffer.
528 * @len: the result buffer length.
529 * @msg: the message with printf formatting.
530 * @...: extra parameters for the message.
531 *
532 * Formats @msg and places result into @buf.
533 *
534 * Returns the number of characters written to @buf or -1 if an error occurs.
535 */
536int
537xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
538 va_list args;
539 int ret;
540
541 if((buf == NULL) || (msg == NULL)) {
542 return(-1);
543 }
544
545 va_start(args, msg);
546 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
547 va_end(args);
548 buf[len - 1] = 0; /* be safe ! */
549
550 return(ret);
551}
552
553/**
554 * xmlStrVPrintf:
555 * @buf: the result buffer.
556 * @len: the result buffer length.
557 * @msg: the message with printf formatting.
558 * @ap: extra parameters for the message.
559 *
560 * Formats @msg and places result into @buf.
561 *
562 * Returns the number of characters written to @buf or -1 if an error occurs.
563 */
564int
565xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
566 int ret;
567
568 if((buf == NULL) || (msg == NULL)) {
569 return(-1);
570 }
571
572 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
573 buf[len - 1] = 0; /* be safe ! */
574
575 return(ret);
576}
577
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon ! (Several of the   *
 * routines below nevertheless accept the 4-byte form.)                 *
 *                                                                      *
 ************************************************************************/
592
593
594/**
595 * xmlUTF8Size:
596 * @utf: pointer to the UTF8 character
597 *
598 * calculates the internal size of a UTF8 character
599 *
600 * returns the numbers of bytes in the character, -1 on format error
601 */
602int
603xmlUTF8Size(const xmlChar *utf) {
604 xmlChar mask;
605 int len;
606
607 if (utf == NULL)
608 return -1;
609 if (*utf < 0x80)
610 return 1;
611 /* check valid UTF8 character */
612 if (!(*utf & 0x40))
613 return -1;
614 /* determine number of bytes in char */
615 len = 2;
616 for (mask=0x20; mask != 0; mask>>=1) {
617 if (!(*utf & mask))
618 return len;
619 len++;
620 }
621 return -1;
622}
623
624/**
625 * xmlUTF8Charcmp:
626 * @utf1: pointer to first UTF8 char
627 * @utf2: pointer to second UTF8 char
628 *
629 * compares the two UCS4 values
630 *
631 * returns result of the compare as with xmlStrncmp
632 */
633int
634xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
635
636 if (utf1 == NULL ) {
637 if (utf2 == NULL)
638 return 0;
639 return -1;
640 }
641 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
642}
643
644/**
645 * xmlUTF8Strlen:
646 * @utf: a sequence of UTF-8 encoded bytes
647 *
648 * compute the length of an UTF8 string, it doesn't do a full UTF8
649 * checking of the content of the string.
650 *
651 * Returns the number of characters in the string or -1 in case of error
652 */
653int
654xmlUTF8Strlen(const xmlChar *utf) {
655 int ret = 0;
656
657 if (utf == NULL)
658 return(-1);
659
660 while (*utf != 0) {
661 if (utf[0] & 0x80) {
662 if ((utf[1] & 0xc0) != 0x80)
663 return(-1);
664 if ((utf[0] & 0xe0) == 0xe0) {
665 if ((utf[2] & 0xc0) != 0x80)
666 return(-1);
667 if ((utf[0] & 0xf0) == 0xf0) {
668 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
669 return(-1);
670 utf += 4;
671 } else {
672 utf += 3;
673 }
674 } else {
675 utf += 2;
676 }
677 } else {
678 utf++;
679 }
680 ret++;
681 }
682 return(ret);
683}
684
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int lead;
    unsigned int value;
    int avail;

    if (len == NULL)
        return(-1);
    if ((utf == NULL) || (*len < 1))
        goto invalid;

    avail = *len;
    lead = utf[0];

    /* 1-byte code */
    if ((lead & 0x80) == 0) {
        *len = 1;
        return(lead);
    }

    /* all longer codes need at least one continuation byte */
    if ((avail < 2) || ((utf[1] & 0xc0) != 0x80))
        goto invalid;
    if ((lead & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        value = (utf[0] & 0x1f) << 6;
        value |= utf[1] & 0x3f;
        return(value);
    }

    if ((avail < 3) || ((utf[2] & 0xc0) != 0x80))
        goto invalid;
    if ((lead & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        value = (utf[0] & 0xf) << 12;
        value |= (utf[1] & 0x3f) << 6;
        value |= utf[2] & 0x3f;
        return(value);
    }

    /* 4-byte code, anything announcing more bytes is rejected */
    if ((avail < 4) || ((lead & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto invalid;
    *len = 4;
    value = (utf[0] & 0x7) << 18;
    value |= (utf[1] & 0x3f) << 12;
    value |= (utf[2] & 0x3f) << 6;
    value |= utf[3] & 0x3f;
    return(value);

invalid:
    *len = 0;
    return(-1);
}
754
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    unsigned char lead;
    int trailing;
    int k;

    if (utf == NULL)
        return(0);
    /*
     * Each character is made of 1, 2, 3 or 4 bytes. The valid forms
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    for (cur = utf; (lead = *cur) != 0; cur += trailing + 1) {
        /* the first byte tells how many 10xxxxxx bytes must follow */
        if ((lead & 0x80) == 0x00)
            trailing = 0;       /* 1-byte code, starts with 0 */
        else if ((lead & 0xe0) == 0xc0)
            trailing = 1;       /* 2-byte code, starts with 110 */
        else if ((lead & 0xf0) == 0xe0)
            trailing = 2;       /* 3-byte code, starts with 1110 */
        else if ((lead & 0xf8) == 0xf0)
            trailing = 3;       /* 4-byte code, starts with 11110 */
        else                    /* unknown encoding */
            return 0;

        /* checked in order, so the terminator is never stepped over */
        for (k = 1; k <= trailing; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return 0;
        }
    }
    return(1);
}
807
808/**
809 * xmlUTF8Strsize:
810 * @utf: a sequence of UTF-8 encoded bytes
811 * @len: the number of characters in the array
812 *
813 * storage size of an UTF8 string
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000814 * the behaviour is not garanteed if the input string is not UTF-8
William M. Bracka2e844a2004-01-06 11:52:13 +0000815 *
816 * Returns the storage size of
817 * the first 'len' characters of ARRAY
William M. Bracka2e844a2004-01-06 11:52:13 +0000818 */
819
820int
821xmlUTF8Strsize(const xmlChar *utf, int len) {
822 const xmlChar *ptr=utf;
823 xmlChar ch;
824
Daniel Veillard36e5cd52004-11-02 14:52:23 +0000825 if (utf == NULL)
826 return(0);
827
William M. Bracka2e844a2004-01-06 11:52:13 +0000828 if (len <= 0)
829 return(0);
830
831 while ( len-- > 0) {
832 if ( !*ptr )
833 break;
834 if ( (ch = *ptr++) & 0x80)
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000835 while ((ch<<=1) & 0x80 ) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000836 ptr++;
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000837 if (*ptr == 0) break;
838 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000839 }
840 return (ptr - utf);
841}
842
843
844/**
845 * xmlUTF8Strndup:
846 * @utf: the input UTF8 *
847 * @len: the len of @utf (in chars)
848 *
849 * a strndup for array of UTF8's
850 *
851 * Returns a new UTF8 * or NULL
852 */
853xmlChar *
854xmlUTF8Strndup(const xmlChar *utf, int len) {
855 xmlChar *ret;
856 int i;
857
858 if ((utf == NULL) || (len < 0)) return(NULL);
859 i = xmlUTF8Strsize(utf, len);
860 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
861 if (ret == NULL) {
862 xmlGenericError(xmlGenericErrorContext,
863 "malloc of %ld byte failed\n",
864 (len + 1) * (long)sizeof(xmlChar));
865 return(NULL);
866 }
867 memcpy(ret, utf, i * sizeof(xmlChar));
868 ret[i] = 0;
869 return(ret);
870}
871
872/**
873 * xmlUTF8Strpos:
874 * @utf: the input UTF8 *
875 * @pos: the position of the desired UTF8 char (in chars)
876 *
877 * a function to provide the equivalent of fetching a
878 * character from a string array
879 *
880 * Returns a pointer to the UTF8 character or NULL
881 */
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000882const xmlChar *
William M. Bracka2e844a2004-01-06 11:52:13 +0000883xmlUTF8Strpos(const xmlChar *utf, int pos) {
884 xmlChar ch;
885
886 if (utf == NULL) return(NULL);
William M. Brack230c5502004-12-20 16:18:49 +0000887 if (pos < 0)
William M. Bracka2e844a2004-01-06 11:52:13 +0000888 return(NULL);
889 while (pos--) {
890 if ((ch=*utf++) == 0) return(NULL);
891 if ( ch & 0x80 ) {
892 /* if not simple ascii, verify proper format */
893 if ( (ch & 0xc0) != 0xc0 )
894 return(NULL);
895 /* then skip over remaining bytes for this char */
896 while ( (ch <<= 1) & 0x80 )
897 if ( (*utf++ & 0xc0) != 0x80 )
898 return(NULL);
899 }
900 }
901 return((xmlChar *)utf);
902}
903
904/**
905 * xmlUTF8Strloc:
906 * @utf: the input UTF8 *
907 * @utfchar: the UTF8 character to be found
908 *
909 * a function to provide the relative location of a UTF8 char
910 *
911 * Returns the relative character position of the desired char
912 * or -1 if not found
913 */
914int
915xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
916 int i, size;
917 xmlChar ch;
918
919 if (utf==NULL || utfchar==NULL) return -1;
920 size = xmlUTF8Strsize(utfchar, 1);
921 for(i=0; (ch=*utf) != 0; i++) {
922 if (xmlStrncmp(utf, utfchar, size)==0)
923 return(i);
924 utf++;
925 if ( ch & 0x80 ) {
926 /* if not simple ascii, verify proper format */
927 if ( (ch & 0xc0) != 0xc0 )
928 return(-1);
929 /* then skip over remaining bytes for this char */
930 while ( (ch <<= 1) & 0x80 )
931 if ( (*utf++ & 0xc0) != 0x80 )
932 return(-1);
933 }
934 }
935
936 return(-1);
937}
938/**
939 * xmlUTF8Strsub:
940 * @utf: a sequence of UTF-8 encoded bytes
941 * @start: relative pos of first char
942 * @len: total number to copy
943 *
944 * Create a substring from a given UTF-8 string
945 * Note: positions are given in units of UTF-8 chars
946 *
947 * Returns a pointer to a newly created string
948 * or NULL if any problem
949 */
950
951xmlChar *
952xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
953 int i;
954 xmlChar ch;
955
956 if (utf == NULL) return(NULL);
957 if (start < 0) return(NULL);
958 if (len < 0) return(NULL);
959
960 /*
961 * Skip over any leading chars
962 */
963 for (i = 0;i < start;i++) {
964 if ((ch=*utf++) == 0) return(NULL);
965 if ( ch & 0x80 ) {
966 /* if not simple ascii, verify proper format */
967 if ( (ch & 0xc0) != 0xc0 )
968 return(NULL);
969 /* then skip over remaining bytes for this char */
970 while ( (ch <<= 1) & 0x80 )
971 if ( (*utf++ & 0xc0) != 0x80 )
972 return(NULL);
973 }
974 }
975
976 return(xmlUTF8Strndup(utf, len));
977}
Daniel Veillard5d4644e2005-04-01 13:11:58 +0000978
979#define bottom_xmlstring
980#include "elfgcchack.h"