blob: cc85777eadc71889f5ab25ba8ef54c4724f5bc20 [file] [log] [blame]
William M. Bracka2e844a2004-01-06 11:52:13 +00001/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
Daniel Veillardf8e3db02012-09-11 13:26:36 +08006 * from the parser.c file (their original home).
William M. Bracka2e844a2004-01-06 11:52:13 +00007 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
24
25/************************************************************************
26 * *
27 * Commodity functions to handle xmlChars *
28 * *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +080043
William M. Bracka2e844a2004-01-06 11:52:13 +000044 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +080088
William M. Bracka2e844a2004-01-06 11:52:13 +000089 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
Daniel Veillard5ea30d72004-11-08 11:54:28 +000095 for (i = 0;i < len;i++) {
William M. Bracka2e844a2004-01-06 11:52:13 +000096 ret[i] = (xmlChar) cur[i];
Daniel Veillard5ea30d72004-11-08 11:54:28 +000097 if (ret[i] == 0) return(ret);
98 }
William M. Bracka2e844a2004-01-06 11:52:13 +000099 ret[len] = 0;
100 return(ret);
101}
102
103/**
104 * xmlCharStrdup:
105 * @cur: the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112xmlChar *
113xmlCharStrdup(const char *cur) {
114 const char *p = cur;
115
116 if (cur == NULL) return(NULL);
117 while (*p != '\0') p++; /* non input consuming */
118 return(xmlCharStrndup(cur, p - cur));
119}
120
121/**
122 * xmlStrcmp:
123 * @str1: the first xmlChar *
124 * @str2: the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131int
132xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133 register int tmp;
134
135 if (str1 == str2) return(0);
136 if (str1 == NULL) return(-1);
137 if (str2 == NULL) return(1);
138 do {
139 tmp = *str1++ - *str2;
140 if (tmp != 0) return(tmp);
141 } while (*str2++ != 0);
142 return 0;
143}
144
145/**
146 * xmlStrEqual:
147 * @str1: the first xmlChar *
148 * @str2: the second xmlChar *
149 *
Daniel Veillardd95ecf02005-12-22 14:58:32 +0000150 * Check if both strings are equal of have same content.
Daniel Veillard6a0baa02005-12-10 11:11:12 +0000151 * Should be a bit more readable and faster than xmlStrcmp()
William M. Bracka2e844a2004-01-06 11:52:13 +0000152 *
153 * Returns 1 if they are equal, 0 if they are different
154 */
155
156int
157xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158 if (str1 == str2) return(1);
159 if (str1 == NULL) return(0);
160 if (str2 == NULL) return(0);
161 do {
162 if (*str1++ != *str2) return(0);
163 } while (*str2++);
164 return(1);
165}
166
167/**
168 * xmlStrQEqual:
169 * @pref: the prefix of the QName
170 * @name: the localname of the QName
171 * @str: the second xmlChar *
172 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800173 * Check if a QName is Equal to a given string
William M. Bracka2e844a2004-01-06 11:52:13 +0000174 *
175 * Returns 1 if they are equal, 0 if they are different
176 */
177
178int
179xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180 if (pref == NULL) return(xmlStrEqual(name, str));
181 if (name == NULL) return(0);
182 if (str == NULL) return(0);
183
184 do {
185 if (*pref++ != *str) return(0);
186 } while ((*str++) && (*pref));
187 if (*str++ != ':') return(0);
188 do {
189 if (*name++ != *str) return(0);
190 } while (*str++);
191 return(1);
192}
193
194/**
195 * xmlStrncmp:
196 * @str1: the first xmlChar *
197 * @str2: the second xmlChar *
198 * @len: the max comparison length
199 *
200 * a strncmp for xmlChar's
201 *
202 * Returns the integer result of the comparison
203 */
204
205int
206xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207 register int tmp;
208
209 if (len <= 0) return(0);
210 if (str1 == str2) return(0);
211 if (str1 == NULL) return(-1);
212 if (str2 == NULL) return(1);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000213#ifdef __GNUC__
William M. Brackb7b54de2004-10-06 16:38:01 +0000214 tmp = strncmp((const char *)str1, (const char *)str2, len);
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000215 return tmp;
216#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000217 do {
218 tmp = *str1++ - *str2;
219 if (tmp != 0 || --len == 0) return(tmp);
220 } while (*str2++ != 0);
221 return 0;
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000222#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000223}
224
225static const xmlChar casemap[256] = {
226 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
258};
259
260/**
261 * xmlStrcasecmp:
262 * @str1: the first xmlChar *
263 * @str2: the second xmlChar *
264 *
265 * a strcasecmp for xmlChar's
266 *
267 * Returns the integer result of the comparison
268 */
269
270int
271xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272 register int tmp;
273
274 if (str1 == str2) return(0);
275 if (str1 == NULL) return(-1);
276 if (str2 == NULL) return(1);
277 do {
278 tmp = casemap[*str1++] - casemap[*str2];
279 if (tmp != 0) return(tmp);
280 } while (*str2++ != 0);
281 return 0;
282}
283
284/**
285 * xmlStrncasecmp:
286 * @str1: the first xmlChar *
287 * @str2: the second xmlChar *
288 * @len: the max comparison length
289 *
290 * a strncasecmp for xmlChar's
291 *
292 * Returns the integer result of the comparison
293 */
294
295int
296xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297 register int tmp;
298
299 if (len <= 0) return(0);
300 if (str1 == str2) return(0);
301 if (str1 == NULL) return(-1);
302 if (str2 == NULL) return(1);
303 do {
304 tmp = casemap[*str1++] - casemap[*str2];
305 if (tmp != 0 || --len == 0) return(tmp);
306 } while (*str2++ != 0);
307 return 0;
308}
309
310/**
311 * xmlStrchr:
312 * @str: the xmlChar * array
313 * @val: the xmlChar to search
314 *
315 * a strchr for xmlChar's
316 *
317 * Returns the xmlChar * for the first occurrence or NULL.
318 */
319
320const xmlChar *
321xmlStrchr(const xmlChar *str, xmlChar val) {
322 if (str == NULL) return(NULL);
323 while (*str != 0) { /* non input consuming */
324 if (*str == val) return((xmlChar *) str);
325 str++;
326 }
327 return(NULL);
328}
329
330/**
331 * xmlStrstr:
332 * @str: the xmlChar * array (haystack)
333 * @val: the xmlChar to search (needle)
334 *
335 * a strstr for xmlChar's
336 *
337 * Returns the xmlChar * for the first occurrence or NULL.
338 */
339
340const xmlChar *
341xmlStrstr(const xmlChar *str, const xmlChar *val) {
342 int n;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800343
William M. Bracka2e844a2004-01-06 11:52:13 +0000344 if (str == NULL) return(NULL);
345 if (val == NULL) return(NULL);
346 n = xmlStrlen(val);
347
348 if (n == 0) return(str);
349 while (*str != 0) { /* non input consuming */
350 if (*str == *val) {
351 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352 }
353 str++;
354 }
355 return(NULL);
356}
357
358/**
359 * xmlStrcasestr:
360 * @str: the xmlChar * array (haystack)
361 * @val: the xmlChar to search (needle)
362 *
363 * a case-ignoring strstr for xmlChar's
364 *
365 * Returns the xmlChar * for the first occurrence or NULL.
366 */
367
368const xmlChar *
Daniel Veillardfcf24572009-08-12 23:02:08 +0200369xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000370 int n;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800371
William M. Bracka2e844a2004-01-06 11:52:13 +0000372 if (str == NULL) return(NULL);
373 if (val == NULL) return(NULL);
374 n = xmlStrlen(val);
375
376 if (n == 0) return(str);
377 while (*str != 0) { /* non input consuming */
378 if (casemap[*str] == casemap[*val])
379 if (!xmlStrncasecmp(str, val, n)) return(str);
380 str++;
381 }
382 return(NULL);
383}
384
385/**
386 * xmlStrsub:
387 * @str: the xmlChar * array (haystack)
388 * @start: the index of the first char (zero based)
389 * @len: the length of the substring
390 *
391 * Extract a substring of a given string
392 *
393 * Returns the xmlChar * for the first occurrence or NULL.
394 */
395
396xmlChar *
397xmlStrsub(const xmlChar *str, int start, int len) {
398 int i;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800399
William M. Bracka2e844a2004-01-06 11:52:13 +0000400 if (str == NULL) return(NULL);
401 if (start < 0) return(NULL);
402 if (len < 0) return(NULL);
403
404 for (i = 0;i < start;i++) {
405 if (*str == 0) return(NULL);
406 str++;
407 }
408 if (*str == 0) return(NULL);
409 return(xmlStrndup(str, len));
410}
411
412/**
413 * xmlStrlen:
414 * @str: the xmlChar * array
415 *
416 * length of a xmlChar's string
417 *
418 * Returns the number of xmlChar contained in the ARRAY.
419 */
420
421int
422xmlStrlen(const xmlChar *str) {
423 int len = 0;
424
425 if (str == NULL) return(0);
426 while (*str != 0) { /* non input consuming */
427 str++;
428 len++;
429 }
430 return(len);
431}
432
433/**
434 * xmlStrncat:
435 * @cur: the original xmlChar * array
436 * @add: the xmlChar * array added
437 * @len: the length of @add
438 *
439 * a strncat for array of xmlChar's, it will extend @cur with the len
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000440 * first bytes of @add. Note that if @len < 0 then this is an API error
441 * and NULL will be returned.
William M. Bracka2e844a2004-01-06 11:52:13 +0000442 *
443 * Returns a new xmlChar *, the original @cur is reallocated if needed
444 * and should not be freed
445 */
446
447xmlChar *
448xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449 int size;
450 xmlChar *ret;
451
452 if ((add == NULL) || (len == 0))
453 return(cur);
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000454 if (len < 0)
455 return(NULL);
William M. Bracka2e844a2004-01-06 11:52:13 +0000456 if (cur == NULL)
457 return(xmlStrndup(add, len));
458
459 size = xmlStrlen(cur);
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800460 if (size < 0)
461 return(NULL);
William M. Bracka2e844a2004-01-06 11:52:13 +0000462 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
463 if (ret == NULL) {
464 xmlErrMemory(NULL, NULL);
465 return(cur);
466 }
467 memcpy(&ret[size], add, len * sizeof(xmlChar));
468 ret[size + len] = 0;
469 return(ret);
470}
471
472/**
473 * xmlStrncatNew:
474 * @str1: first xmlChar string
475 * @str2: second xmlChar string
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000476 * @len: the len of @str2 or < 0
William M. Bracka2e844a2004-01-06 11:52:13 +0000477 *
478 * same as xmlStrncat, but creates a new string. The original
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000479 * two strings are not freed. If @len is < 0 then the length
480 * will be calculated automatically.
William M. Bracka2e844a2004-01-06 11:52:13 +0000481 *
482 * Returns a new xmlChar * or NULL
483 */
484xmlChar *
485xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
486 int size;
487 xmlChar *ret;
488
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800489 if (len < 0) {
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000490 len = xmlStrlen(str2);
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800491 if (len < 0)
492 return(NULL);
493 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000494 if ((str2 == NULL) || (len == 0))
495 return(xmlStrdup(str1));
496 if (str1 == NULL)
497 return(xmlStrndup(str2, len));
498
499 size = xmlStrlen(str1);
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800500 if (size < 0)
501 return(NULL);
William M. Bracka2e844a2004-01-06 11:52:13 +0000502 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
503 if (ret == NULL) {
504 xmlErrMemory(NULL, NULL);
505 return(xmlStrndup(str1, size));
506 }
507 memcpy(ret, str1, size * sizeof(xmlChar));
508 memcpy(&ret[size], str2, len * sizeof(xmlChar));
509 ret[size + len] = 0;
510 return(ret);
511}
512
513/**
514 * xmlStrcat:
515 * @cur: the original xmlChar * array
516 * @add: the xmlChar * array added
517 *
518 * a strcat for array of xmlChar's. Since they are supposed to be
519 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
520 * a termination mark of '0'.
521 *
522 * Returns a new xmlChar * containing the concatenated string.
523 */
524xmlChar *
525xmlStrcat(xmlChar *cur, const xmlChar *add) {
526 const xmlChar *p = add;
527
528 if (add == NULL) return(cur);
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800529 if (cur == NULL)
William M. Bracka2e844a2004-01-06 11:52:13 +0000530 return(xmlStrdup(add));
531
532 while (*p != 0) p++; /* non input consuming */
533 return(xmlStrncat(cur, add, p - add));
534}
535
536/**
537 * xmlStrPrintf:
538 * @buf: the result buffer.
539 * @len: the result buffer length.
540 * @msg: the message with printf formatting.
541 * @...: extra parameters for the message.
542 *
543 * Formats @msg and places result into @buf.
544 *
545 * Returns the number of characters written to @buf or -1 if an error occurs.
546 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800547int XMLCDECL
David Kilzer4472c3a2016-05-13 15:13:17 +0800548xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000549 va_list args;
550 int ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800551
William M. Bracka2e844a2004-01-06 11:52:13 +0000552 if((buf == NULL) || (msg == NULL)) {
553 return(-1);
554 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800555
William M. Bracka2e844a2004-01-06 11:52:13 +0000556 va_start(args, msg);
557 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
558 va_end(args);
559 buf[len - 1] = 0; /* be safe ! */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800560
William M. Bracka2e844a2004-01-06 11:52:13 +0000561 return(ret);
562}
563
564/**
565 * xmlStrVPrintf:
566 * @buf: the result buffer.
567 * @len: the result buffer length.
568 * @msg: the message with printf formatting.
569 * @ap: extra parameters for the message.
570 *
571 * Formats @msg and places result into @buf.
572 *
573 * Returns the number of characters written to @buf or -1 if an error occurs.
574 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800575int
David Kilzer4472c3a2016-05-13 15:13:17 +0800576xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000577 int ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800578
William M. Bracka2e844a2004-01-06 11:52:13 +0000579 if((buf == NULL) || (msg == NULL)) {
580 return(-1);
581 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800582
William M. Bracka2e844a2004-01-06 11:52:13 +0000583 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
584 buf[len - 1] = 0; /* be safe ! */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800585
William M. Bracka2e844a2004-01-06 11:52:13 +0000586 return(ret);
587}
588
/************************************************************************
 *                                                                      *
 *               Generic UTF8 handling routines                         *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * Values > 0xFFFF were originally hoped not to be needed, but the      *
 * routines below do handle the 4-byte form shown in the last row.      *
 *                                                                      *
 ************************************************************************/
603
604
605/**
606 * xmlUTF8Size:
607 * @utf: pointer to the UTF8 character
608 *
609 * calculates the internal size of a UTF8 character
610 *
611 * returns the numbers of bytes in the character, -1 on format error
612 */
613int
614xmlUTF8Size(const xmlChar *utf) {
615 xmlChar mask;
616 int len;
617
618 if (utf == NULL)
619 return -1;
620 if (*utf < 0x80)
621 return 1;
622 /* check valid UTF8 character */
623 if (!(*utf & 0x40))
624 return -1;
625 /* determine number of bytes in char */
626 len = 2;
627 for (mask=0x20; mask != 0; mask>>=1) {
628 if (!(*utf & mask))
629 return len;
630 len++;
631 }
632 return -1;
633}
634
635/**
636 * xmlUTF8Charcmp:
637 * @utf1: pointer to first UTF8 char
638 * @utf2: pointer to second UTF8 char
639 *
640 * compares the two UCS4 values
641 *
642 * returns result of the compare as with xmlStrncmp
643 */
644int
645xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
646
647 if (utf1 == NULL ) {
648 if (utf2 == NULL)
649 return 0;
650 return -1;
651 }
652 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
653}
654
655/**
656 * xmlUTF8Strlen:
657 * @utf: a sequence of UTF-8 encoded bytes
658 *
659 * compute the length of an UTF8 string, it doesn't do a full UTF8
660 * checking of the content of the string.
661 *
662 * Returns the number of characters in the string or -1 in case of error
663 */
664int
665xmlUTF8Strlen(const xmlChar *utf) {
666 int ret = 0;
667
668 if (utf == NULL)
669 return(-1);
670
671 while (*utf != 0) {
672 if (utf[0] & 0x80) {
673 if ((utf[1] & 0xc0) != 0x80)
674 return(-1);
675 if ((utf[0] & 0xe0) == 0xe0) {
676 if ((utf[2] & 0xc0) != 0x80)
677 return(-1);
678 if ((utf[0] & 0xf0) == 0xf0) {
679 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
680 return(-1);
681 utf += 4;
682 } else {
683 utf += 3;
684 }
685 } else {
686 utf += 2;
687 }
688 } else {
689 utf++;
690 }
691 ret++;
692 }
693 return(ret);
694}
695
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. The first byte must be
 * either a plain 0xxxxxxx byte or a sequence start of at most 4 bytes;
 * a 10xxxxxx byte in first position is rejected since it can only
 * continue a sequence. Overlong encodings are not detected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a continuation byte can not start a character */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 11111xxx would announce more than 4 bytes */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
              /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
          /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
765
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    unsigned char lead;

    if (utf == NULL)
        return(0);
    /*
     * Each character is 1, 2, 3 or 4 bytes long. The valid forms
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cur = utf;
    while ((lead = *cur) != 0) {        /* string is 0-terminated */
        if ((lead & 0x80) == 0x00) {
            /* 1-byte code, starts with 0 */
            cur += 1;
        } else if ((lead & 0xe0) == 0xc0) {
            /* 2-byte code, starts with 110 */
            if ((cur[1] & 0xc0) != 0x80)
                return 0;
            cur += 2;
        } else if ((lead & 0xf0) == 0xe0) {
            /* 3-byte code, starts with 1110 */
            if ((cur[1] & 0xc0) != 0x80)
                return 0;
            if ((cur[2] & 0xc0) != 0x80)
                return 0;
            cur += 3;
        } else if ((lead & 0xf8) == 0xf0) {
            /* 4-byte code, starts with 11110 */
            if ((cur[1] & 0xc0) != 0x80)
                return 0;
            if ((cur[2] & 0xc0) != 0x80)
                return 0;
            if ((cur[3] & 0xc0) != 0x80)
                return 0;
            cur += 4;
        } else {
            /* unknown encoding */
            return 0;
        }
    }
    return(1);
}
818
819/**
820 * xmlUTF8Strsize:
821 * @utf: a sequence of UTF-8 encoded bytes
822 * @len: the number of characters in the array
823 *
824 * storage size of an UTF8 string
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000825 * the behaviour is not garanteed if the input string is not UTF-8
William M. Bracka2e844a2004-01-06 11:52:13 +0000826 *
827 * Returns the storage size of
828 * the first 'len' characters of ARRAY
William M. Bracka2e844a2004-01-06 11:52:13 +0000829 */
830
831int
832xmlUTF8Strsize(const xmlChar *utf, int len) {
833 const xmlChar *ptr=utf;
834 xmlChar ch;
835
Daniel Veillard36e5cd52004-11-02 14:52:23 +0000836 if (utf == NULL)
837 return(0);
838
William M. Bracka2e844a2004-01-06 11:52:13 +0000839 if (len <= 0)
840 return(0);
841
842 while ( len-- > 0) {
843 if ( !*ptr )
844 break;
845 if ( (ch = *ptr++) & 0x80)
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000846 while ((ch<<=1) & 0x80 ) {
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000847 if (*ptr == 0) break;
Nick Wellnhofer96a5c172016-04-21 19:03:47 +0200848 ptr++;
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000849 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000850 }
851 return (ptr - utf);
852}
853
854
855/**
856 * xmlUTF8Strndup:
857 * @utf: the input UTF8 *
858 * @len: the len of @utf (in chars)
859 *
860 * a strndup for array of UTF8's
861 *
862 * Returns a new UTF8 * or NULL
863 */
864xmlChar *
865xmlUTF8Strndup(const xmlChar *utf, int len) {
866 xmlChar *ret;
867 int i;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800868
William M. Bracka2e844a2004-01-06 11:52:13 +0000869 if ((utf == NULL) || (len < 0)) return(NULL);
870 i = xmlUTF8Strsize(utf, len);
871 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
872 if (ret == NULL) {
873 xmlGenericError(xmlGenericErrorContext,
874 "malloc of %ld byte failed\n",
875 (len + 1) * (long)sizeof(xmlChar));
876 return(NULL);
877 }
878 memcpy(ret, utf, i * sizeof(xmlChar));
879 ret[i] = 0;
880 return(ret);
881}
882
883/**
884 * xmlUTF8Strpos:
885 * @utf: the input UTF8 *
886 * @pos: the position of the desired UTF8 char (in chars)
887 *
888 * a function to provide the equivalent of fetching a
889 * character from a string array
890 *
891 * Returns a pointer to the UTF8 character or NULL
892 */
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000893const xmlChar *
William M. Bracka2e844a2004-01-06 11:52:13 +0000894xmlUTF8Strpos(const xmlChar *utf, int pos) {
895 xmlChar ch;
896
897 if (utf == NULL) return(NULL);
William M. Brack230c5502004-12-20 16:18:49 +0000898 if (pos < 0)
William M. Bracka2e844a2004-01-06 11:52:13 +0000899 return(NULL);
900 while (pos--) {
901 if ((ch=*utf++) == 0) return(NULL);
902 if ( ch & 0x80 ) {
903 /* if not simple ascii, verify proper format */
904 if ( (ch & 0xc0) != 0xc0 )
905 return(NULL);
906 /* then skip over remaining bytes for this char */
907 while ( (ch <<= 1) & 0x80 )
908 if ( (*utf++ & 0xc0) != 0x80 )
909 return(NULL);
910 }
911 }
912 return((xmlChar *)utf);
913}
914
915/**
916 * xmlUTF8Strloc:
917 * @utf: the input UTF8 *
918 * @utfchar: the UTF8 character to be found
919 *
920 * a function to provide the relative location of a UTF8 char
921 *
922 * Returns the relative character position of the desired char
923 * or -1 if not found
924 */
925int
926xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
927 int i, size;
928 xmlChar ch;
929
930 if (utf==NULL || utfchar==NULL) return -1;
931 size = xmlUTF8Strsize(utfchar, 1);
932 for(i=0; (ch=*utf) != 0; i++) {
933 if (xmlStrncmp(utf, utfchar, size)==0)
934 return(i);
935 utf++;
936 if ( ch & 0x80 ) {
937 /* if not simple ascii, verify proper format */
938 if ( (ch & 0xc0) != 0xc0 )
939 return(-1);
940 /* then skip over remaining bytes for this char */
941 while ( (ch <<= 1) & 0x80 )
942 if ( (*utf++ & 0xc0) != 0x80 )
943 return(-1);
944 }
945 }
946
947 return(-1);
948}
949/**
950 * xmlUTF8Strsub:
951 * @utf: a sequence of UTF-8 encoded bytes
952 * @start: relative pos of first char
953 * @len: total number to copy
954 *
955 * Create a substring from a given UTF-8 string
956 * Note: positions are given in units of UTF-8 chars
957 *
958 * Returns a pointer to a newly created string
959 * or NULL if any problem
960 */
961
962xmlChar *
963xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
964 int i;
965 xmlChar ch;
966
967 if (utf == NULL) return(NULL);
968 if (start < 0) return(NULL);
969 if (len < 0) return(NULL);
970
971 /*
972 * Skip over any leading chars
973 */
974 for (i = 0;i < start;i++) {
975 if ((ch=*utf++) == 0) return(NULL);
976 if ( ch & 0x80 ) {
977 /* if not simple ascii, verify proper format */
978 if ( (ch & 0xc0) != 0xc0 )
979 return(NULL);
980 /* then skip over remaining bytes for this char */
981 while ( (ch <<= 1) & 0x80 )
982 if ( (*utf++ & 0xc0) != 0x80 )
983 return(NULL);
984 }
985 }
986
987 return(xmlUTF8Strndup(utf, len));
988}
Daniel Veillard5d4644e2005-04-01 13:11:58 +0000989
David Kilzer502f6a62016-05-23 14:58:41 +0800990/**
991 * xmlEscapeFormatString:
992 * @msg: a pointer to the string in which to escape '%' characters.
993 * Must be a heap-allocated buffer created by libxml2 that may be
994 * returned, or that may be freed and replaced.
995 *
996 * Replaces the string pointed to by 'msg' with an escaped string.
997 * Returns the same string with all '%' characters escaped.
998 */
999xmlChar *
1000xmlEscapeFormatString(xmlChar **msg)
1001{
1002 xmlChar *msgPtr = NULL;
1003 xmlChar *result = NULL;
1004 xmlChar *resultPtr = NULL;
1005 size_t count = 0;
1006 size_t msgLen = 0;
1007 size_t resultLen = 0;
1008
1009 if (!msg || !*msg)
1010 return(NULL);
1011
1012 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1013 ++msgLen;
1014 if (*msgPtr == '%')
1015 ++count;
1016 }
1017
1018 if (count == 0)
1019 return(*msg);
1020
1021 resultLen = msgLen + count + 1;
1022 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1023 if (result == NULL) {
1024 /* Clear *msg to prevent format string vulnerabilities in
1025 out-of-memory situations. */
1026 xmlFree(*msg);
1027 *msg = NULL;
1028 xmlErrMemory(NULL, NULL);
1029 return(NULL);
1030 }
1031
1032 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1033 *resultPtr = *msgPtr;
1034 if (*msgPtr == '%')
1035 *(++resultPtr) = '%';
1036 }
1037 result[resultLen - 1] = '\0';
1038
1039 xmlFree(*msg);
1040 *msg = result;
1041
1042 return *msg;
1043}
1044
Daniel Veillard5d4644e2005-04-01 13:11:58 +00001045#define bottom_xmlstring
1046#include "elfgcchack.h"