blob: e8a1e45d1b3e890238f0ccb9ada60ccccf887074 [file] [log] [blame]
/*
 * string.c : an XML string utilities module
 *
 * This module provides various utility functions for manipulating
 * the xmlChar* type. All functions named xmlStr* have been moved here
 * from the parser.c file (their original home).
 *
 * See Copyright for the status of this software.
 *
 * UTF8 string routines from:
 * William Brack <wbrack@mmm.com.hk>
 *
 * daniel@veillard.com
 */
15
#define IN_LIBXML
#include "libxml.h"

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
24
/************************************************************************
 *                                                                      *
 *              Commodity functions to handle xmlChars                  *
 *                                                                      *
 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +080043
William M. Bracka2e844a2004-01-06 11:52:13 +000044 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +080088
William M. Bracka2e844a2004-01-06 11:52:13 +000089 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
Daniel Veillard5ea30d72004-11-08 11:54:28 +000095 for (i = 0;i < len;i++) {
William M. Bracka2e844a2004-01-06 11:52:13 +000096 ret[i] = (xmlChar) cur[i];
Daniel Veillard5ea30d72004-11-08 11:54:28 +000097 if (ret[i] == 0) return(ret);
98 }
William M. Bracka2e844a2004-01-06 11:52:13 +000099 ret[len] = 0;
100 return(ret);
101}
102
103/**
104 * xmlCharStrdup:
105 * @cur: the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112xmlChar *
113xmlCharStrdup(const char *cur) {
114 const char *p = cur;
115
116 if (cur == NULL) return(NULL);
117 while (*p != '\0') p++; /* non input consuming */
118 return(xmlCharStrndup(cur, p - cur));
119}
120
121/**
122 * xmlStrcmp:
123 * @str1: the first xmlChar *
124 * @str2: the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131int
132xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000133 if (str1 == str2) return(0);
134 if (str1 == NULL) return(-1);
135 if (str2 == NULL) return(1);
Haibo Huangf0a546b2020-09-01 20:28:19 -0700136#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
137 return(strcmp((const char *)str1, (const char *)str2));
138#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000139 do {
Haibo Huangf0a546b2020-09-01 20:28:19 -0700140 int tmp = *str1++ - *str2;
William M. Bracka2e844a2004-01-06 11:52:13 +0000141 if (tmp != 0) return(tmp);
142 } while (*str2++ != 0);
143 return 0;
Haibo Huangf0a546b2020-09-01 20:28:19 -0700144#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000145}
146
147/**
148 * xmlStrEqual:
149 * @str1: the first xmlChar *
150 * @str2: the second xmlChar *
151 *
Daniel Veillardd95ecf02005-12-22 14:58:32 +0000152 * Check if both strings are equal of have same content.
Daniel Veillard6a0baa02005-12-10 11:11:12 +0000153 * Should be a bit more readable and faster than xmlStrcmp()
William M. Bracka2e844a2004-01-06 11:52:13 +0000154 *
155 * Returns 1 if they are equal, 0 if they are different
156 */
157
158int
159xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
160 if (str1 == str2) return(1);
161 if (str1 == NULL) return(0);
162 if (str2 == NULL) return(0);
Haibo Huangf0a546b2020-09-01 20:28:19 -0700163#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
164 return(strcmp((const char *)str1, (const char *)str2) == 0);
165#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000166 do {
167 if (*str1++ != *str2) return(0);
168 } while (*str2++);
169 return(1);
Haibo Huangf0a546b2020-09-01 20:28:19 -0700170#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000171}
172
173/**
174 * xmlStrQEqual:
175 * @pref: the prefix of the QName
176 * @name: the localname of the QName
177 * @str: the second xmlChar *
178 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800179 * Check if a QName is Equal to a given string
William M. Bracka2e844a2004-01-06 11:52:13 +0000180 *
181 * Returns 1 if they are equal, 0 if they are different
182 */
183
184int
185xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
186 if (pref == NULL) return(xmlStrEqual(name, str));
187 if (name == NULL) return(0);
188 if (str == NULL) return(0);
189
190 do {
191 if (*pref++ != *str) return(0);
192 } while ((*str++) && (*pref));
193 if (*str++ != ':') return(0);
194 do {
195 if (*name++ != *str) return(0);
196 } while (*str++);
197 return(1);
198}
199
200/**
201 * xmlStrncmp:
202 * @str1: the first xmlChar *
203 * @str2: the second xmlChar *
204 * @len: the max comparison length
205 *
206 * a strncmp for xmlChar's
207 *
208 * Returns the integer result of the comparison
209 */
210
211int
212xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000213 if (len <= 0) return(0);
214 if (str1 == str2) return(0);
215 if (str1 == NULL) return(-1);
216 if (str2 == NULL) return(1);
Haibo Huangf0a546b2020-09-01 20:28:19 -0700217#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
218 return(strncmp((const char *)str1, (const char *)str2, len));
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000219#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000220 do {
Haibo Huangf0a546b2020-09-01 20:28:19 -0700221 int tmp = *str1++ - *str2;
William M. Bracka2e844a2004-01-06 11:52:13 +0000222 if (tmp != 0 || --len == 0) return(tmp);
223 } while (*str2++ != 0);
224 return 0;
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000225#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000226}
227
228static const xmlChar casemap[256] = {
229 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
230 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
231 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
232 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
233 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
234 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
235 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
236 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
237 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
238 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
239 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
240 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
241 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
242 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
243 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
244 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
245 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
246 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
247 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
248 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
249 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
250 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
251 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
252 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
253 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
254 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
255 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
256 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
257 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
258 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
259 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
260 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
261};
262
263/**
264 * xmlStrcasecmp:
265 * @str1: the first xmlChar *
266 * @str2: the second xmlChar *
267 *
268 * a strcasecmp for xmlChar's
269 *
270 * Returns the integer result of the comparison
271 */
272
273int
274xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
275 register int tmp;
276
277 if (str1 == str2) return(0);
278 if (str1 == NULL) return(-1);
279 if (str2 == NULL) return(1);
280 do {
281 tmp = casemap[*str1++] - casemap[*str2];
282 if (tmp != 0) return(tmp);
283 } while (*str2++ != 0);
284 return 0;
285}
286
287/**
288 * xmlStrncasecmp:
289 * @str1: the first xmlChar *
290 * @str2: the second xmlChar *
291 * @len: the max comparison length
292 *
293 * a strncasecmp for xmlChar's
294 *
295 * Returns the integer result of the comparison
296 */
297
298int
299xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
300 register int tmp;
301
302 if (len <= 0) return(0);
303 if (str1 == str2) return(0);
304 if (str1 == NULL) return(-1);
305 if (str2 == NULL) return(1);
306 do {
307 tmp = casemap[*str1++] - casemap[*str2];
308 if (tmp != 0 || --len == 0) return(tmp);
309 } while (*str2++ != 0);
310 return 0;
311}
312
313/**
314 * xmlStrchr:
315 * @str: the xmlChar * array
316 * @val: the xmlChar to search
317 *
318 * a strchr for xmlChar's
319 *
320 * Returns the xmlChar * for the first occurrence or NULL.
321 */
322
323const xmlChar *
324xmlStrchr(const xmlChar *str, xmlChar val) {
325 if (str == NULL) return(NULL);
326 while (*str != 0) { /* non input consuming */
327 if (*str == val) return((xmlChar *) str);
328 str++;
329 }
330 return(NULL);
331}
332
333/**
334 * xmlStrstr:
335 * @str: the xmlChar * array (haystack)
336 * @val: the xmlChar to search (needle)
337 *
338 * a strstr for xmlChar's
339 *
340 * Returns the xmlChar * for the first occurrence or NULL.
341 */
342
343const xmlChar *
344xmlStrstr(const xmlChar *str, const xmlChar *val) {
345 int n;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800346
William M. Bracka2e844a2004-01-06 11:52:13 +0000347 if (str == NULL) return(NULL);
348 if (val == NULL) return(NULL);
349 n = xmlStrlen(val);
350
351 if (n == 0) return(str);
352 while (*str != 0) { /* non input consuming */
353 if (*str == *val) {
354 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
355 }
356 str++;
357 }
358 return(NULL);
359}
360
361/**
362 * xmlStrcasestr:
363 * @str: the xmlChar * array (haystack)
364 * @val: the xmlChar to search (needle)
365 *
366 * a case-ignoring strstr for xmlChar's
367 *
368 * Returns the xmlChar * for the first occurrence or NULL.
369 */
370
371const xmlChar *
Daniel Veillardfcf24572009-08-12 23:02:08 +0200372xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000373 int n;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800374
William M. Bracka2e844a2004-01-06 11:52:13 +0000375 if (str == NULL) return(NULL);
376 if (val == NULL) return(NULL);
377 n = xmlStrlen(val);
378
379 if (n == 0) return(str);
380 while (*str != 0) { /* non input consuming */
381 if (casemap[*str] == casemap[*val])
382 if (!xmlStrncasecmp(str, val, n)) return(str);
383 str++;
384 }
385 return(NULL);
386}
387
388/**
389 * xmlStrsub:
390 * @str: the xmlChar * array (haystack)
391 * @start: the index of the first char (zero based)
392 * @len: the length of the substring
393 *
394 * Extract a substring of a given string
395 *
396 * Returns the xmlChar * for the first occurrence or NULL.
397 */
398
399xmlChar *
400xmlStrsub(const xmlChar *str, int start, int len) {
401 int i;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800402
William M. Bracka2e844a2004-01-06 11:52:13 +0000403 if (str == NULL) return(NULL);
404 if (start < 0) return(NULL);
405 if (len < 0) return(NULL);
406
407 for (i = 0;i < start;i++) {
408 if (*str == 0) return(NULL);
409 str++;
410 }
411 if (*str == 0) return(NULL);
412 return(xmlStrndup(str, len));
413}
414
415/**
416 * xmlStrlen:
417 * @str: the xmlChar * array
418 *
419 * length of a xmlChar's string
420 *
421 * Returns the number of xmlChar contained in the ARRAY.
422 */
423
424int
425xmlStrlen(const xmlChar *str) {
426 int len = 0;
427
428 if (str == NULL) return(0);
429 while (*str != 0) { /* non input consuming */
430 str++;
431 len++;
432 }
433 return(len);
434}
435
436/**
437 * xmlStrncat:
438 * @cur: the original xmlChar * array
439 * @add: the xmlChar * array added
440 * @len: the length of @add
441 *
442 * a strncat for array of xmlChar's, it will extend @cur with the len
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000443 * first bytes of @add. Note that if @len < 0 then this is an API error
444 * and NULL will be returned.
William M. Bracka2e844a2004-01-06 11:52:13 +0000445 *
Nick Wellnhofer5a0ae662017-06-17 23:20:38 +0200446 * Returns a new xmlChar *, the original @cur is reallocated and should
447 * not be freed.
William M. Bracka2e844a2004-01-06 11:52:13 +0000448 */
449
450xmlChar *
451xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
452 int size;
453 xmlChar *ret;
454
455 if ((add == NULL) || (len == 0))
456 return(cur);
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000457 if (len < 0)
458 return(NULL);
William M. Bracka2e844a2004-01-06 11:52:13 +0000459 if (cur == NULL)
460 return(xmlStrndup(add, len));
461
462 size = xmlStrlen(cur);
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800463 if (size < 0)
464 return(NULL);
William M. Bracka2e844a2004-01-06 11:52:13 +0000465 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
466 if (ret == NULL) {
467 xmlErrMemory(NULL, NULL);
468 return(cur);
469 }
470 memcpy(&ret[size], add, len * sizeof(xmlChar));
471 ret[size + len] = 0;
472 return(ret);
473}
474
475/**
476 * xmlStrncatNew:
477 * @str1: first xmlChar string
478 * @str2: second xmlChar string
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000479 * @len: the len of @str2 or < 0
William M. Bracka2e844a2004-01-06 11:52:13 +0000480 *
481 * same as xmlStrncat, but creates a new string. The original
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000482 * two strings are not freed. If @len is < 0 then the length
483 * will be calculated automatically.
William M. Bracka2e844a2004-01-06 11:52:13 +0000484 *
485 * Returns a new xmlChar * or NULL
486 */
487xmlChar *
488xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
489 int size;
490 xmlChar *ret;
491
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800492 if (len < 0) {
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000493 len = xmlStrlen(str2);
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800494 if (len < 0)
495 return(NULL);
496 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000497 if ((str2 == NULL) || (len == 0))
498 return(xmlStrdup(str1));
499 if (str1 == NULL)
500 return(xmlStrndup(str2, len));
501
502 size = xmlStrlen(str1);
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800503 if (size < 0)
504 return(NULL);
William M. Bracka2e844a2004-01-06 11:52:13 +0000505 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
506 if (ret == NULL) {
507 xmlErrMemory(NULL, NULL);
508 return(xmlStrndup(str1, size));
509 }
510 memcpy(ret, str1, size * sizeof(xmlChar));
511 memcpy(&ret[size], str2, len * sizeof(xmlChar));
512 ret[size + len] = 0;
513 return(ret);
514}
515
516/**
517 * xmlStrcat:
518 * @cur: the original xmlChar * array
519 * @add: the xmlChar * array added
520 *
521 * a strcat for array of xmlChar's. Since they are supposed to be
522 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
523 * a termination mark of '0'.
524 *
Nick Wellnhofer5a0ae662017-06-17 23:20:38 +0200525 * Returns a new xmlChar * containing the concatenated string. The original
526 * @cur is reallocated and should not be freed.
William M. Bracka2e844a2004-01-06 11:52:13 +0000527 */
528xmlChar *
529xmlStrcat(xmlChar *cur, const xmlChar *add) {
530 const xmlChar *p = add;
531
532 if (add == NULL) return(cur);
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800533 if (cur == NULL)
William M. Bracka2e844a2004-01-06 11:52:13 +0000534 return(xmlStrdup(add));
535
536 while (*p != 0) p++; /* non input consuming */
537 return(xmlStrncat(cur, add, p - add));
538}
539
540/**
541 * xmlStrPrintf:
542 * @buf: the result buffer.
543 * @len: the result buffer length.
544 * @msg: the message with printf formatting.
545 * @...: extra parameters for the message.
546 *
547 * Formats @msg and places result into @buf.
548 *
549 * Returns the number of characters written to @buf or -1 if an error occurs.
550 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800551int XMLCDECL
David Kilzer4472c3a2016-05-13 15:13:17 +0800552xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000553 va_list args;
554 int ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800555
William M. Bracka2e844a2004-01-06 11:52:13 +0000556 if((buf == NULL) || (msg == NULL)) {
557 return(-1);
558 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800559
William M. Bracka2e844a2004-01-06 11:52:13 +0000560 va_start(args, msg);
561 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
562 va_end(args);
563 buf[len - 1] = 0; /* be safe ! */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800564
William M. Bracka2e844a2004-01-06 11:52:13 +0000565 return(ret);
566}
567
568/**
569 * xmlStrVPrintf:
570 * @buf: the result buffer.
571 * @len: the result buffer length.
572 * @msg: the message with printf formatting.
573 * @ap: extra parameters for the message.
574 *
575 * Formats @msg and places result into @buf.
576 *
577 * Returns the number of characters written to @buf or -1 if an error occurs.
578 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800579int
David Kilzer4472c3a2016-05-13 15:13:17 +0800580xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000581 int ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800582
William M. Bracka2e844a2004-01-06 11:52:13 +0000583 if((buf == NULL) || (msg == NULL)) {
584 return(-1);
585 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800586
William M. Bracka2e844a2004-01-06 11:52:13 +0000587 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
588 buf[len - 1] = 0; /* be safe ! */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800589
William M. Bracka2e844a2004-01-06 11:52:13 +0000590 return(ret);
591}
592
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 *                                                                      *
 ************************************************************************/
607
608
609/**
610 * xmlUTF8Size:
611 * @utf: pointer to the UTF8 character
612 *
613 * calculates the internal size of a UTF8 character
614 *
615 * returns the numbers of bytes in the character, -1 on format error
616 */
617int
618xmlUTF8Size(const xmlChar *utf) {
619 xmlChar mask;
620 int len;
621
622 if (utf == NULL)
623 return -1;
624 if (*utf < 0x80)
625 return 1;
626 /* check valid UTF8 character */
627 if (!(*utf & 0x40))
628 return -1;
629 /* determine number of bytes in char */
630 len = 2;
631 for (mask=0x20; mask != 0; mask>>=1) {
632 if (!(*utf & mask))
633 return len;
634 len++;
635 }
636 return -1;
637}
638
639/**
640 * xmlUTF8Charcmp:
641 * @utf1: pointer to first UTF8 char
642 * @utf2: pointer to second UTF8 char
643 *
644 * compares the two UCS4 values
645 *
646 * returns result of the compare as with xmlStrncmp
647 */
648int
649xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
650
651 if (utf1 == NULL ) {
652 if (utf2 == NULL)
653 return 0;
654 return -1;
655 }
656 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
657}
658
659/**
660 * xmlUTF8Strlen:
661 * @utf: a sequence of UTF-8 encoded bytes
662 *
663 * compute the length of an UTF8 string, it doesn't do a full UTF8
664 * checking of the content of the string.
665 *
666 * Returns the number of characters in the string or -1 in case of error
667 */
668int
669xmlUTF8Strlen(const xmlChar *utf) {
670 int ret = 0;
671
672 if (utf == NULL)
673 return(-1);
674
675 while (*utf != 0) {
676 if (utf[0] & 0x80) {
677 if ((utf[1] & 0xc0) != 0x80)
678 return(-1);
679 if ((utf[0] & 0xe0) == 0xe0) {
680 if ((utf[2] & 0xc0) != 0x80)
681 return(-1);
682 if ((utf[0] & 0xf0) == 0xf0) {
683 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
684 return(-1);
685 utf += 4;
686 } else {
687 utf += 3;
688 }
689 } else {
690 utf += 2;
691 }
692 } else {
693 utf++;
694 }
695 ret++;
696 }
697 return(ret);
698}
699
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. A first byte of the form
 * 10xxxxxx (a continuation byte) is rejected. Overlong encodings and
 * the 0x10ffff maximum value are not checked.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a lead byte starts with 11; 10xxxxxx can only continue a char */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
769
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *p;

    if (utf == NULL)
        return(0);

    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    p = utf;
    while (*p != 0) {                   /* string is 0-terminated */
        unsigned char lead = *p;
        int trail;
        int k;

        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trail = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trail = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trail = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trail = 3;
        else                            /* unknown encoding */
            return 0;

        /*
         * Every trailing byte must be 10xxxxxx. Checking in order stops
         * at the terminator, so nothing past it is read.
         */
        for (k = 1; k <= trail; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return 0;
        }
        p += trail + 1;
    }
    return(1);
}
822
823/**
824 * xmlUTF8Strsize:
825 * @utf: a sequence of UTF-8 encoded bytes
826 * @len: the number of characters in the array
827 *
828 * storage size of an UTF8 string
Nick Wellnhofer8bbe4502017-06-17 16:15:09 +0200829 * the behaviour is not guaranteed if the input string is not UTF-8
William M. Bracka2e844a2004-01-06 11:52:13 +0000830 *
831 * Returns the storage size of
832 * the first 'len' characters of ARRAY
William M. Bracka2e844a2004-01-06 11:52:13 +0000833 */
834
835int
836xmlUTF8Strsize(const xmlChar *utf, int len) {
837 const xmlChar *ptr=utf;
838 xmlChar ch;
839
Daniel Veillard36e5cd52004-11-02 14:52:23 +0000840 if (utf == NULL)
841 return(0);
842
William M. Bracka2e844a2004-01-06 11:52:13 +0000843 if (len <= 0)
844 return(0);
845
846 while ( len-- > 0) {
847 if ( !*ptr )
848 break;
849 if ( (ch = *ptr++) & 0x80)
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000850 while ((ch<<=1) & 0x80 ) {
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000851 if (*ptr == 0) break;
Nick Wellnhofer96a5c172016-04-21 19:03:47 +0200852 ptr++;
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000853 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000854 }
855 return (ptr - utf);
856}
857
858
859/**
860 * xmlUTF8Strndup:
861 * @utf: the input UTF8 *
862 * @len: the len of @utf (in chars)
863 *
864 * a strndup for array of UTF8's
865 *
866 * Returns a new UTF8 * or NULL
867 */
868xmlChar *
869xmlUTF8Strndup(const xmlChar *utf, int len) {
870 xmlChar *ret;
871 int i;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800872
William M. Bracka2e844a2004-01-06 11:52:13 +0000873 if ((utf == NULL) || (len < 0)) return(NULL);
874 i = xmlUTF8Strsize(utf, len);
875 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
876 if (ret == NULL) {
877 xmlGenericError(xmlGenericErrorContext,
878 "malloc of %ld byte failed\n",
879 (len + 1) * (long)sizeof(xmlChar));
880 return(NULL);
881 }
882 memcpy(ret, utf, i * sizeof(xmlChar));
883 ret[i] = 0;
884 return(ret);
885}
886
887/**
888 * xmlUTF8Strpos:
889 * @utf: the input UTF8 *
890 * @pos: the position of the desired UTF8 char (in chars)
891 *
892 * a function to provide the equivalent of fetching a
893 * character from a string array
894 *
895 * Returns a pointer to the UTF8 character or NULL
896 */
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000897const xmlChar *
William M. Bracka2e844a2004-01-06 11:52:13 +0000898xmlUTF8Strpos(const xmlChar *utf, int pos) {
899 xmlChar ch;
900
901 if (utf == NULL) return(NULL);
William M. Brack230c5502004-12-20 16:18:49 +0000902 if (pos < 0)
William M. Bracka2e844a2004-01-06 11:52:13 +0000903 return(NULL);
904 while (pos--) {
905 if ((ch=*utf++) == 0) return(NULL);
906 if ( ch & 0x80 ) {
907 /* if not simple ascii, verify proper format */
908 if ( (ch & 0xc0) != 0xc0 )
909 return(NULL);
910 /* then skip over remaining bytes for this char */
911 while ( (ch <<= 1) & 0x80 )
912 if ( (*utf++ & 0xc0) != 0x80 )
913 return(NULL);
914 }
915 }
916 return((xmlChar *)utf);
917}
918
919/**
920 * xmlUTF8Strloc:
921 * @utf: the input UTF8 *
922 * @utfchar: the UTF8 character to be found
923 *
924 * a function to provide the relative location of a UTF8 char
925 *
926 * Returns the relative character position of the desired char
927 * or -1 if not found
928 */
929int
930xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
931 int i, size;
932 xmlChar ch;
933
934 if (utf==NULL || utfchar==NULL) return -1;
935 size = xmlUTF8Strsize(utfchar, 1);
936 for(i=0; (ch=*utf) != 0; i++) {
937 if (xmlStrncmp(utf, utfchar, size)==0)
938 return(i);
939 utf++;
940 if ( ch & 0x80 ) {
941 /* if not simple ascii, verify proper format */
942 if ( (ch & 0xc0) != 0xc0 )
943 return(-1);
944 /* then skip over remaining bytes for this char */
945 while ( (ch <<= 1) & 0x80 )
946 if ( (*utf++ & 0xc0) != 0x80 )
947 return(-1);
948 }
949 }
950
951 return(-1);
952}
953/**
954 * xmlUTF8Strsub:
955 * @utf: a sequence of UTF-8 encoded bytes
956 * @start: relative pos of first char
957 * @len: total number to copy
958 *
959 * Create a substring from a given UTF-8 string
960 * Note: positions are given in units of UTF-8 chars
961 *
962 * Returns a pointer to a newly created string
963 * or NULL if any problem
964 */
965
966xmlChar *
967xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
968 int i;
969 xmlChar ch;
970
971 if (utf == NULL) return(NULL);
972 if (start < 0) return(NULL);
973 if (len < 0) return(NULL);
974
975 /*
976 * Skip over any leading chars
977 */
978 for (i = 0;i < start;i++) {
979 if ((ch=*utf++) == 0) return(NULL);
980 if ( ch & 0x80 ) {
981 /* if not simple ascii, verify proper format */
982 if ( (ch & 0xc0) != 0xc0 )
983 return(NULL);
984 /* then skip over remaining bytes for this char */
985 while ( (ch <<= 1) & 0x80 )
986 if ( (*utf++ & 0xc0) != 0x80 )
987 return(NULL);
988 }
989 }
990
991 return(xmlUTF8Strndup(utf, len));
992}
Daniel Veillard5d4644e2005-04-01 13:11:58 +0000993
David Kilzer502f6a62016-05-23 14:58:41 +0800994/**
995 * xmlEscapeFormatString:
996 * @msg: a pointer to the string in which to escape '%' characters.
997 * Must be a heap-allocated buffer created by libxml2 that may be
998 * returned, or that may be freed and replaced.
999 *
1000 * Replaces the string pointed to by 'msg' with an escaped string.
1001 * Returns the same string with all '%' characters escaped.
1002 */
1003xmlChar *
1004xmlEscapeFormatString(xmlChar **msg)
1005{
1006 xmlChar *msgPtr = NULL;
1007 xmlChar *result = NULL;
1008 xmlChar *resultPtr = NULL;
1009 size_t count = 0;
1010 size_t msgLen = 0;
1011 size_t resultLen = 0;
1012
1013 if (!msg || !*msg)
1014 return(NULL);
1015
1016 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1017 ++msgLen;
1018 if (*msgPtr == '%')
1019 ++count;
1020 }
1021
1022 if (count == 0)
1023 return(*msg);
1024
1025 resultLen = msgLen + count + 1;
1026 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1027 if (result == NULL) {
1028 /* Clear *msg to prevent format string vulnerabilities in
1029 out-of-memory situations. */
1030 xmlFree(*msg);
1031 *msg = NULL;
1032 xmlErrMemory(NULL, NULL);
1033 return(NULL);
1034 }
1035
1036 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1037 *resultPtr = *msgPtr;
1038 if (*msgPtr == '%')
1039 *(++resultPtr) = '%';
1040 }
1041 result[resultLen - 1] = '\0';
1042
1043 xmlFree(*msg);
1044 *msg = result;
1045
1046 return *msg;
1047}
1048
Daniel Veillard5d4644e2005-04-01 13:11:58 +00001049#define bottom_xmlstring
1050#include "elfgcchack.h"