blob: a85685d7e138eaec59e1d193b1ce9bb6835e5efb [file] [log] [blame]
/*
 * string.c : an XML string utilities module
 *
 * This module provides various utility functions for manipulating
 * the xmlChar* type. All functions named xmlStr* have been moved here
 * from the parser.c file (their original home).
 *
 * See Copyright for the status of this software.
 *
 * UTF8 string routines from:
 * William Brack <wbrack@mmm.com.hk>
 *
 * daniel@veillard.com
 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
Elliott Hughesecdab2a2022-02-23 14:33:50 -080021#include <limits.h>
William M. Bracka2e844a2004-01-06 11:52:13 +000022#include <libxml/xmlmemory.h>
23#include <libxml/parserInternals.h>
24#include <libxml/xmlstring.h>
25
26/************************************************************************
27 * *
28 * Commodity functions to handle xmlChars *
29 * *
30 ************************************************************************/
31
32/**
33 * xmlStrndup:
34 * @cur: the input xmlChar *
35 * @len: the len of @cur
36 *
37 * a strndup for array of xmlChar's
38 *
39 * Returns a new xmlChar * or NULL
40 */
41xmlChar *
42xmlStrndup(const xmlChar *cur, int len) {
43 xmlChar *ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +080044
William M. Bracka2e844a2004-01-06 11:52:13 +000045 if ((cur == NULL) || (len < 0)) return(NULL);
Elliott Hughesecdab2a2022-02-23 14:33:50 -080046 ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar));
William M. Bracka2e844a2004-01-06 11:52:13 +000047 if (ret == NULL) {
48 xmlErrMemory(NULL, NULL);
49 return(NULL);
50 }
51 memcpy(ret, cur, len * sizeof(xmlChar));
52 ret[len] = 0;
53 return(ret);
54}
55
56/**
57 * xmlStrdup:
58 * @cur: the input xmlChar *
59 *
60 * a strdup for array of xmlChar's. Since they are supposed to be
61 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
62 * a termination mark of '0'.
63 *
64 * Returns a new xmlChar * or NULL
65 */
66xmlChar *
67xmlStrdup(const xmlChar *cur) {
68 const xmlChar *p = cur;
69
70 if (cur == NULL) return(NULL);
71 while (*p != 0) p++; /* non input consuming */
72 return(xmlStrndup(cur, p - cur));
73}
74
75/**
76 * xmlCharStrndup:
77 * @cur: the input char *
78 * @len: the len of @cur
79 *
80 * a strndup for char's to xmlChar's
81 *
82 * Returns a new xmlChar * or NULL
83 */
84
85xmlChar *
86xmlCharStrndup(const char *cur, int len) {
87 int i;
88 xmlChar *ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +080089
William M. Bracka2e844a2004-01-06 11:52:13 +000090 if ((cur == NULL) || (len < 0)) return(NULL);
Elliott Hughesecdab2a2022-02-23 14:33:50 -080091 ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar));
William M. Bracka2e844a2004-01-06 11:52:13 +000092 if (ret == NULL) {
93 xmlErrMemory(NULL, NULL);
94 return(NULL);
95 }
Daniel Veillard5ea30d72004-11-08 11:54:28 +000096 for (i = 0;i < len;i++) {
William M. Bracka2e844a2004-01-06 11:52:13 +000097 ret[i] = (xmlChar) cur[i];
Daniel Veillard5ea30d72004-11-08 11:54:28 +000098 if (ret[i] == 0) return(ret);
99 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000100 ret[len] = 0;
101 return(ret);
102}
103
104/**
105 * xmlCharStrdup:
106 * @cur: the input char *
107 *
108 * a strdup for char's to xmlChar's
109 *
110 * Returns a new xmlChar * or NULL
111 */
112
113xmlChar *
114xmlCharStrdup(const char *cur) {
115 const char *p = cur;
116
117 if (cur == NULL) return(NULL);
118 while (*p != '\0') p++; /* non input consuming */
119 return(xmlCharStrndup(cur, p - cur));
120}
121
122/**
123 * xmlStrcmp:
124 * @str1: the first xmlChar *
125 * @str2: the second xmlChar *
126 *
127 * a strcmp for xmlChar's
128 *
129 * Returns the integer result of the comparison
130 */
131
132int
133xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000134 if (str1 == str2) return(0);
135 if (str1 == NULL) return(-1);
136 if (str2 == NULL) return(1);
Haibo Huangf0a546b2020-09-01 20:28:19 -0700137#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
138 return(strcmp((const char *)str1, (const char *)str2));
139#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000140 do {
Haibo Huangf0a546b2020-09-01 20:28:19 -0700141 int tmp = *str1++ - *str2;
William M. Bracka2e844a2004-01-06 11:52:13 +0000142 if (tmp != 0) return(tmp);
143 } while (*str2++ != 0);
144 return 0;
Haibo Huangf0a546b2020-09-01 20:28:19 -0700145#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000146}
147
148/**
149 * xmlStrEqual:
150 * @str1: the first xmlChar *
151 * @str2: the second xmlChar *
152 *
Daniel Veillardd95ecf02005-12-22 14:58:32 +0000153 * Check if both strings are equal of have same content.
Daniel Veillard6a0baa02005-12-10 11:11:12 +0000154 * Should be a bit more readable and faster than xmlStrcmp()
William M. Bracka2e844a2004-01-06 11:52:13 +0000155 *
156 * Returns 1 if they are equal, 0 if they are different
157 */
158
159int
160xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
161 if (str1 == str2) return(1);
162 if (str1 == NULL) return(0);
163 if (str2 == NULL) return(0);
Haibo Huangf0a546b2020-09-01 20:28:19 -0700164#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
165 return(strcmp((const char *)str1, (const char *)str2) == 0);
166#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000167 do {
168 if (*str1++ != *str2) return(0);
169 } while (*str2++);
170 return(1);
Haibo Huangf0a546b2020-09-01 20:28:19 -0700171#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000172}
173
174/**
175 * xmlStrQEqual:
176 * @pref: the prefix of the QName
177 * @name: the localname of the QName
178 * @str: the second xmlChar *
179 *
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800180 * Check if a QName is Equal to a given string
William M. Bracka2e844a2004-01-06 11:52:13 +0000181 *
182 * Returns 1 if they are equal, 0 if they are different
183 */
184
185int
186xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
187 if (pref == NULL) return(xmlStrEqual(name, str));
188 if (name == NULL) return(0);
189 if (str == NULL) return(0);
190
191 do {
192 if (*pref++ != *str) return(0);
193 } while ((*str++) && (*pref));
194 if (*str++ != ':') return(0);
195 do {
196 if (*name++ != *str) return(0);
197 } while (*str++);
198 return(1);
199}
200
201/**
202 * xmlStrncmp:
203 * @str1: the first xmlChar *
204 * @str2: the second xmlChar *
205 * @len: the max comparison length
206 *
207 * a strncmp for xmlChar's
208 *
209 * Returns the integer result of the comparison
210 */
211
212int
213xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000214 if (len <= 0) return(0);
215 if (str1 == str2) return(0);
216 if (str1 == NULL) return(-1);
217 if (str2 == NULL) return(1);
Haibo Huangf0a546b2020-09-01 20:28:19 -0700218#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
219 return(strncmp((const char *)str1, (const char *)str2, len));
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000220#else
William M. Bracka2e844a2004-01-06 11:52:13 +0000221 do {
Haibo Huangf0a546b2020-09-01 20:28:19 -0700222 int tmp = *str1++ - *str2;
William M. Bracka2e844a2004-01-06 11:52:13 +0000223 if (tmp != 0 || --len == 0) return(tmp);
224 } while (*str2++ != 0);
225 return 0;
Daniel Veillardc82c57e2004-01-12 16:24:34 +0000226#endif
William M. Bracka2e844a2004-01-06 11:52:13 +0000227}
228
229static const xmlChar casemap[256] = {
230 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
231 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
232 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
233 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
234 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
235 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
236 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
237 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
238 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
242 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
243 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
244 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
245 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
246 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
247 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
248 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
249 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
250 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
251 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
252 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
253 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
254 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
255 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
256 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
257 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
258 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
259 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
260 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
261 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
262};
263
264/**
265 * xmlStrcasecmp:
266 * @str1: the first xmlChar *
267 * @str2: the second xmlChar *
268 *
269 * a strcasecmp for xmlChar's
270 *
271 * Returns the integer result of the comparison
272 */
273
274int
275xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
276 register int tmp;
277
278 if (str1 == str2) return(0);
279 if (str1 == NULL) return(-1);
280 if (str2 == NULL) return(1);
281 do {
282 tmp = casemap[*str1++] - casemap[*str2];
283 if (tmp != 0) return(tmp);
284 } while (*str2++ != 0);
285 return 0;
286}
287
288/**
289 * xmlStrncasecmp:
290 * @str1: the first xmlChar *
291 * @str2: the second xmlChar *
292 * @len: the max comparison length
293 *
294 * a strncasecmp for xmlChar's
295 *
296 * Returns the integer result of the comparison
297 */
298
299int
300xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
301 register int tmp;
302
303 if (len <= 0) return(0);
304 if (str1 == str2) return(0);
305 if (str1 == NULL) return(-1);
306 if (str2 == NULL) return(1);
307 do {
308 tmp = casemap[*str1++] - casemap[*str2];
309 if (tmp != 0 || --len == 0) return(tmp);
310 } while (*str2++ != 0);
311 return 0;
312}
313
314/**
315 * xmlStrchr:
316 * @str: the xmlChar * array
317 * @val: the xmlChar to search
318 *
319 * a strchr for xmlChar's
320 *
321 * Returns the xmlChar * for the first occurrence or NULL.
322 */
323
324const xmlChar *
325xmlStrchr(const xmlChar *str, xmlChar val) {
326 if (str == NULL) return(NULL);
327 while (*str != 0) { /* non input consuming */
328 if (*str == val) return((xmlChar *) str);
329 str++;
330 }
331 return(NULL);
332}
333
334/**
335 * xmlStrstr:
336 * @str: the xmlChar * array (haystack)
337 * @val: the xmlChar to search (needle)
338 *
339 * a strstr for xmlChar's
340 *
341 * Returns the xmlChar * for the first occurrence or NULL.
342 */
343
344const xmlChar *
345xmlStrstr(const xmlChar *str, const xmlChar *val) {
346 int n;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800347
William M. Bracka2e844a2004-01-06 11:52:13 +0000348 if (str == NULL) return(NULL);
349 if (val == NULL) return(NULL);
350 n = xmlStrlen(val);
351
352 if (n == 0) return(str);
353 while (*str != 0) { /* non input consuming */
354 if (*str == *val) {
355 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
356 }
357 str++;
358 }
359 return(NULL);
360}
361
362/**
363 * xmlStrcasestr:
364 * @str: the xmlChar * array (haystack)
365 * @val: the xmlChar to search (needle)
366 *
367 * a case-ignoring strstr for xmlChar's
368 *
369 * Returns the xmlChar * for the first occurrence or NULL.
370 */
371
372const xmlChar *
Daniel Veillardfcf24572009-08-12 23:02:08 +0200373xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000374 int n;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800375
William M. Bracka2e844a2004-01-06 11:52:13 +0000376 if (str == NULL) return(NULL);
377 if (val == NULL) return(NULL);
378 n = xmlStrlen(val);
379
380 if (n == 0) return(str);
381 while (*str != 0) { /* non input consuming */
382 if (casemap[*str] == casemap[*val])
383 if (!xmlStrncasecmp(str, val, n)) return(str);
384 str++;
385 }
386 return(NULL);
387}
388
389/**
390 * xmlStrsub:
391 * @str: the xmlChar * array (haystack)
392 * @start: the index of the first char (zero based)
393 * @len: the length of the substring
394 *
395 * Extract a substring of a given string
396 *
397 * Returns the xmlChar * for the first occurrence or NULL.
398 */
399
400xmlChar *
401xmlStrsub(const xmlChar *str, int start, int len) {
402 int i;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800403
William M. Bracka2e844a2004-01-06 11:52:13 +0000404 if (str == NULL) return(NULL);
405 if (start < 0) return(NULL);
406 if (len < 0) return(NULL);
407
408 for (i = 0;i < start;i++) {
409 if (*str == 0) return(NULL);
410 str++;
411 }
412 if (*str == 0) return(NULL);
413 return(xmlStrndup(str, len));
414}
415
416/**
417 * xmlStrlen:
418 * @str: the xmlChar * array
419 *
420 * length of a xmlChar's string
421 *
422 * Returns the number of xmlChar contained in the ARRAY.
423 */
424
425int
426xmlStrlen(const xmlChar *str) {
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800427 size_t len = 0;
William M. Bracka2e844a2004-01-06 11:52:13 +0000428
429 if (str == NULL) return(0);
430 while (*str != 0) { /* non input consuming */
431 str++;
432 len++;
433 }
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800434 return(len > INT_MAX ? 0 : len);
William M. Bracka2e844a2004-01-06 11:52:13 +0000435}
436
437/**
438 * xmlStrncat:
439 * @cur: the original xmlChar * array
440 * @add: the xmlChar * array added
441 * @len: the length of @add
442 *
443 * a strncat for array of xmlChar's, it will extend @cur with the len
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000444 * first bytes of @add. Note that if @len < 0 then this is an API error
445 * and NULL will be returned.
William M. Bracka2e844a2004-01-06 11:52:13 +0000446 *
Nick Wellnhofer5a0ae662017-06-17 23:20:38 +0200447 * Returns a new xmlChar *, the original @cur is reallocated and should
448 * not be freed.
William M. Bracka2e844a2004-01-06 11:52:13 +0000449 */
450
451xmlChar *
452xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
453 int size;
454 xmlChar *ret;
455
456 if ((add == NULL) || (len == 0))
457 return(cur);
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000458 if (len < 0)
459 return(NULL);
William M. Bracka2e844a2004-01-06 11:52:13 +0000460 if (cur == NULL)
461 return(xmlStrndup(add, len));
462
463 size = xmlStrlen(cur);
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800464 if ((size < 0) || (size > INT_MAX - len))
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800465 return(NULL);
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800466 ret = (xmlChar *) xmlRealloc(cur, ((size_t) size + len + 1) * sizeof(xmlChar));
William M. Bracka2e844a2004-01-06 11:52:13 +0000467 if (ret == NULL) {
468 xmlErrMemory(NULL, NULL);
469 return(cur);
470 }
471 memcpy(&ret[size], add, len * sizeof(xmlChar));
472 ret[size + len] = 0;
473 return(ret);
474}
475
476/**
477 * xmlStrncatNew:
478 * @str1: first xmlChar string
479 * @str2: second xmlChar string
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000480 * @len: the len of @str2 or < 0
William M. Bracka2e844a2004-01-06 11:52:13 +0000481 *
482 * same as xmlStrncat, but creates a new string. The original
Kasimier T. Buchcik5bb0c082005-12-20 10:48:33 +0000483 * two strings are not freed. If @len is < 0 then the length
484 * will be calculated automatically.
William M. Bracka2e844a2004-01-06 11:52:13 +0000485 *
486 * Returns a new xmlChar * or NULL
487 */
488xmlChar *
489xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
490 int size;
491 xmlChar *ret;
492
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800493 if (len < 0) {
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000494 len = xmlStrlen(str2);
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800495 if (len < 0)
496 return(NULL);
497 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000498 if ((str2 == NULL) || (len == 0))
499 return(xmlStrdup(str1));
500 if (str1 == NULL)
501 return(xmlStrndup(str2, len));
502
503 size = xmlStrlen(str1);
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800504 if ((size < 0) || (size > INT_MAX - len))
Pranjal Jumde8fbbf552016-03-08 17:29:00 -0800505 return(NULL);
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800506 ret = (xmlChar *) xmlMalloc(((size_t) size + len + 1) * sizeof(xmlChar));
William M. Bracka2e844a2004-01-06 11:52:13 +0000507 if (ret == NULL) {
508 xmlErrMemory(NULL, NULL);
509 return(xmlStrndup(str1, size));
510 }
511 memcpy(ret, str1, size * sizeof(xmlChar));
512 memcpy(&ret[size], str2, len * sizeof(xmlChar));
513 ret[size + len] = 0;
514 return(ret);
515}
516
517/**
518 * xmlStrcat:
519 * @cur: the original xmlChar * array
520 * @add: the xmlChar * array added
521 *
522 * a strcat for array of xmlChar's. Since they are supposed to be
523 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
524 * a termination mark of '0'.
525 *
Nick Wellnhofer5a0ae662017-06-17 23:20:38 +0200526 * Returns a new xmlChar * containing the concatenated string. The original
527 * @cur is reallocated and should not be freed.
William M. Bracka2e844a2004-01-06 11:52:13 +0000528 */
529xmlChar *
530xmlStrcat(xmlChar *cur, const xmlChar *add) {
531 const xmlChar *p = add;
532
533 if (add == NULL) return(cur);
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800534 if (cur == NULL)
William M. Bracka2e844a2004-01-06 11:52:13 +0000535 return(xmlStrdup(add));
536
537 while (*p != 0) p++; /* non input consuming */
538 return(xmlStrncat(cur, add, p - add));
539}
540
541/**
542 * xmlStrPrintf:
543 * @buf: the result buffer.
544 * @len: the result buffer length.
545 * @msg: the message with printf formatting.
546 * @...: extra parameters for the message.
547 *
548 * Formats @msg and places result into @buf.
549 *
550 * Returns the number of characters written to @buf or -1 if an error occurs.
551 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800552int XMLCDECL
David Kilzer4472c3a2016-05-13 15:13:17 +0800553xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000554 va_list args;
555 int ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800556
William M. Bracka2e844a2004-01-06 11:52:13 +0000557 if((buf == NULL) || (msg == NULL)) {
558 return(-1);
559 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800560
William M. Bracka2e844a2004-01-06 11:52:13 +0000561 va_start(args, msg);
562 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
563 va_end(args);
564 buf[len - 1] = 0; /* be safe ! */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800565
William M. Bracka2e844a2004-01-06 11:52:13 +0000566 return(ret);
567}
568
569/**
570 * xmlStrVPrintf:
571 * @buf: the result buffer.
572 * @len: the result buffer length.
573 * @msg: the message with printf formatting.
574 * @ap: extra parameters for the message.
575 *
576 * Formats @msg and places result into @buf.
577 *
578 * Returns the number of characters written to @buf or -1 if an error occurs.
579 */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800580int
David Kilzer4472c3a2016-05-13 15:13:17 +0800581xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000582 int ret;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800583
William M. Bracka2e844a2004-01-06 11:52:13 +0000584 if((buf == NULL) || (msg == NULL)) {
585 return(-1);
586 }
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800587
William M. Bracka2e844a2004-01-06 11:52:13 +0000588 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
589 buf[len - 1] = 0; /* be safe ! */
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800590
William M. Bracka2e844a2004-01-06 11:52:13 +0000591 return(ret);
592}
593
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-0010 FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 * (The routines below nevertheless accept 4-byte sequences.)           *
 *                                                                      *
 ************************************************************************/
608
609
610/**
611 * xmlUTF8Size:
612 * @utf: pointer to the UTF8 character
613 *
614 * calculates the internal size of a UTF8 character
615 *
616 * returns the numbers of bytes in the character, -1 on format error
617 */
618int
619xmlUTF8Size(const xmlChar *utf) {
620 xmlChar mask;
621 int len;
622
623 if (utf == NULL)
624 return -1;
625 if (*utf < 0x80)
626 return 1;
627 /* check valid UTF8 character */
628 if (!(*utf & 0x40))
629 return -1;
630 /* determine number of bytes in char */
631 len = 2;
632 for (mask=0x20; mask != 0; mask>>=1) {
633 if (!(*utf & mask))
634 return len;
635 len++;
636 }
637 return -1;
638}
639
640/**
641 * xmlUTF8Charcmp:
642 * @utf1: pointer to first UTF8 char
643 * @utf2: pointer to second UTF8 char
644 *
645 * compares the two UCS4 values
646 *
647 * returns result of the compare as with xmlStrncmp
648 */
649int
650xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
651
652 if (utf1 == NULL ) {
653 if (utf2 == NULL)
654 return 0;
655 return -1;
656 }
657 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
658}
659
660/**
661 * xmlUTF8Strlen:
662 * @utf: a sequence of UTF-8 encoded bytes
663 *
664 * compute the length of an UTF8 string, it doesn't do a full UTF8
665 * checking of the content of the string.
666 *
667 * Returns the number of characters in the string or -1 in case of error
668 */
669int
670xmlUTF8Strlen(const xmlChar *utf) {
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800671 size_t ret = 0;
William M. Bracka2e844a2004-01-06 11:52:13 +0000672
673 if (utf == NULL)
674 return(-1);
675
676 while (*utf != 0) {
677 if (utf[0] & 0x80) {
678 if ((utf[1] & 0xc0) != 0x80)
679 return(-1);
680 if ((utf[0] & 0xe0) == 0xe0) {
681 if ((utf[2] & 0xc0) != 0x80)
682 return(-1);
683 if ((utf[0] & 0xf0) == 0xf0) {
684 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
685 return(-1);
686 utf += 4;
687 } else {
688 utf += 3;
689 }
690 } else {
691 utf += 2;
692 }
693 } else {
694 utf++;
695 }
696 ret++;
697 }
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800698 return(ret > INT_MAX ? 0 : ret);
William M. Bracka2e844a2004-01-06 11:52:13 +0000699}
700
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int lead;
    unsigned int value;

    if ((utf == NULL) || (len == NULL))
        goto invalid;
    if (*len < 1)
        goto invalid;

    lead = utf[0];

    /* 1-byte code */
    if ((lead & 0x80) == 0) {
        *len = 1;
        return (int) lead;
    }

    /* every multi-byte code needs at least one continuation byte */
    if (*len < 2)
        goto invalid;
    if ((utf[1] & 0xc0) != 0x80)
        goto invalid;

    /* 2-byte code */
    if ((lead & 0xe0) != 0xe0) {
        *len = 2;
        value = (utf[0] & 0x1f) << 6;
        value |= utf[1] & 0x3f;
        return (int) value;
    }

    if (*len < 3)
        goto invalid;
    if ((utf[2] & 0xc0) != 0x80)
        goto invalid;

    /* 3-byte code */
    if ((lead & 0xf0) != 0xf0) {
        *len = 3;
        value = (utf[0] & 0xf) << 12;
        value |= (utf[1] & 0x3f) << 6;
        value |= utf[2] & 0x3f;
        return (int) value;
    }

    if (*len < 4)
        goto invalid;
    if (((lead & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto invalid;

    /* 4-byte code */
    *len = 4;
    value = (utf[0] & 0x7) << 18;
    value |= (utf[1] & 0x3f) << 12;
    value |= (utf[2] & 0x3f) << 6;
    value |= utf[3] & 0x3f;
    return (int) value;

invalid:
    if (len != NULL)
        *len = 0;
    return -1;
}
770
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;

    if (utf == NULL)
        return 0;

    /*
     * Accepted sequences, in "bit format":
     *    0xxxxxxx                                      1-byte
     *    110xxxxx 10xxxxxx                             2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           4-byte
     */
    for (cur = utf; *cur != 0; ) {      /* string is 0-terminated */
        unsigned char lead = *cur;
        int trailing;
        int k;

        if ((lead & 0x80) == 0x00)      /* starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0) /* starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0) /* starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0) /* starts with 11110 */
            trailing = 3;
        else                            /* unknown encoding */
            return 0;

        /* every trailing byte must look like 10xxxxxx */
        for (k = 1; k <= trailing; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return 0;
        }
        cur += trailing + 1;
    }
    return 1;
}
825
826/**
827 * xmlUTF8Strsize:
828 * @utf: a sequence of UTF-8 encoded bytes
829 * @len: the number of characters in the array
830 *
831 * storage size of an UTF8 string
Nick Wellnhofer8bbe4502017-06-17 16:15:09 +0200832 * the behaviour is not guaranteed if the input string is not UTF-8
William M. Bracka2e844a2004-01-06 11:52:13 +0000833 *
834 * Returns the storage size of
835 * the first 'len' characters of ARRAY
William M. Bracka2e844a2004-01-06 11:52:13 +0000836 */
837
838int
839xmlUTF8Strsize(const xmlChar *utf, int len) {
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800840 const xmlChar *ptr=utf;
841 int ch;
842 size_t ret;
William M. Bracka2e844a2004-01-06 11:52:13 +0000843
Daniel Veillard36e5cd52004-11-02 14:52:23 +0000844 if (utf == NULL)
845 return(0);
846
William M. Bracka2e844a2004-01-06 11:52:13 +0000847 if (len <= 0)
848 return(0);
849
850 while ( len-- > 0) {
851 if ( !*ptr )
852 break;
853 if ( (ch = *ptr++) & 0x80)
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000854 while ((ch<<=1) & 0x80 ) {
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000855 if (*ptr == 0) break;
Nick Wellnhofer96a5c172016-04-21 19:03:47 +0200856 ptr++;
Daniel Veillard5ea30d72004-11-08 11:54:28 +0000857 }
William M. Bracka2e844a2004-01-06 11:52:13 +0000858 }
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800859 ret = ptr - utf;
860 return (ret > INT_MAX ? 0 : ret);
William M. Bracka2e844a2004-01-06 11:52:13 +0000861}
862
863
864/**
865 * xmlUTF8Strndup:
866 * @utf: the input UTF8 *
867 * @len: the len of @utf (in chars)
868 *
869 * a strndup for array of UTF8's
870 *
871 * Returns a new UTF8 * or NULL
872 */
873xmlChar *
874xmlUTF8Strndup(const xmlChar *utf, int len) {
875 xmlChar *ret;
876 int i;
Daniel Veillardf8e3db02012-09-11 13:26:36 +0800877
William M. Bracka2e844a2004-01-06 11:52:13 +0000878 if ((utf == NULL) || (len < 0)) return(NULL);
879 i = xmlUTF8Strsize(utf, len);
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800880 ret = (xmlChar *) xmlMallocAtomic(((size_t) i + 1) * sizeof(xmlChar));
William M. Bracka2e844a2004-01-06 11:52:13 +0000881 if (ret == NULL) {
William M. Bracka2e844a2004-01-06 11:52:13 +0000882 return(NULL);
883 }
884 memcpy(ret, utf, i * sizeof(xmlChar));
885 ret[i] = 0;
886 return(ret);
887}
888
889/**
890 * xmlUTF8Strpos:
891 * @utf: the input UTF8 *
892 * @pos: the position of the desired UTF8 char (in chars)
893 *
894 * a function to provide the equivalent of fetching a
895 * character from a string array
896 *
897 * Returns a pointer to the UTF8 character or NULL
898 */
Daniel Veillard8a32fe42004-11-02 22:10:16 +0000899const xmlChar *
William M. Bracka2e844a2004-01-06 11:52:13 +0000900xmlUTF8Strpos(const xmlChar *utf, int pos) {
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800901 int ch;
William M. Bracka2e844a2004-01-06 11:52:13 +0000902
903 if (utf == NULL) return(NULL);
William M. Brack230c5502004-12-20 16:18:49 +0000904 if (pos < 0)
William M. Bracka2e844a2004-01-06 11:52:13 +0000905 return(NULL);
906 while (pos--) {
907 if ((ch=*utf++) == 0) return(NULL);
908 if ( ch & 0x80 ) {
909 /* if not simple ascii, verify proper format */
910 if ( (ch & 0xc0) != 0xc0 )
911 return(NULL);
912 /* then skip over remaining bytes for this char */
913 while ( (ch <<= 1) & 0x80 )
914 if ( (*utf++ & 0xc0) != 0x80 )
915 return(NULL);
916 }
917 }
918 return((xmlChar *)utf);
919}
920
921/**
922 * xmlUTF8Strloc:
923 * @utf: the input UTF8 *
924 * @utfchar: the UTF8 character to be found
925 *
926 * a function to provide the relative location of a UTF8 char
927 *
928 * Returns the relative character position of the desired char
929 * or -1 if not found
930 */
931int
932xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800933 size_t i;
934 int size;
935 int ch;
William M. Bracka2e844a2004-01-06 11:52:13 +0000936
937 if (utf==NULL || utfchar==NULL) return -1;
938 size = xmlUTF8Strsize(utfchar, 1);
939 for(i=0; (ch=*utf) != 0; i++) {
940 if (xmlStrncmp(utf, utfchar, size)==0)
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800941 return(i > INT_MAX ? 0 : i);
William M. Bracka2e844a2004-01-06 11:52:13 +0000942 utf++;
943 if ( ch & 0x80 ) {
944 /* if not simple ascii, verify proper format */
945 if ( (ch & 0xc0) != 0xc0 )
946 return(-1);
947 /* then skip over remaining bytes for this char */
948 while ( (ch <<= 1) & 0x80 )
949 if ( (*utf++ & 0xc0) != 0x80 )
950 return(-1);
951 }
952 }
953
954 return(-1);
955}
956/**
957 * xmlUTF8Strsub:
958 * @utf: a sequence of UTF-8 encoded bytes
959 * @start: relative pos of first char
960 * @len: total number to copy
961 *
962 * Create a substring from a given UTF-8 string
963 * Note: positions are given in units of UTF-8 chars
964 *
965 * Returns a pointer to a newly created string
966 * or NULL if any problem
967 */
968
969xmlChar *
970xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
Elliott Hughesecdab2a2022-02-23 14:33:50 -0800971 int i;
972 int ch;
William M. Bracka2e844a2004-01-06 11:52:13 +0000973
974 if (utf == NULL) return(NULL);
975 if (start < 0) return(NULL);
976 if (len < 0) return(NULL);
977
978 /*
979 * Skip over any leading chars
980 */
981 for (i = 0;i < start;i++) {
982 if ((ch=*utf++) == 0) return(NULL);
983 if ( ch & 0x80 ) {
984 /* if not simple ascii, verify proper format */
985 if ( (ch & 0xc0) != 0xc0 )
986 return(NULL);
987 /* then skip over remaining bytes for this char */
988 while ( (ch <<= 1) & 0x80 )
989 if ( (*utf++ & 0xc0) != 0x80 )
990 return(NULL);
991 }
992 }
993
994 return(xmlUTF8Strndup(utf, len));
995}
Daniel Veillard5d4644e2005-04-01 13:11:58 +0000996
David Kilzer502f6a62016-05-23 14:58:41 +0800997/**
998 * xmlEscapeFormatString:
999 * @msg: a pointer to the string in which to escape '%' characters.
1000 * Must be a heap-allocated buffer created by libxml2 that may be
1001 * returned, or that may be freed and replaced.
1002 *
1003 * Replaces the string pointed to by 'msg' with an escaped string.
1004 * Returns the same string with all '%' characters escaped.
1005 */
1006xmlChar *
1007xmlEscapeFormatString(xmlChar **msg)
1008{
1009 xmlChar *msgPtr = NULL;
1010 xmlChar *result = NULL;
1011 xmlChar *resultPtr = NULL;
1012 size_t count = 0;
1013 size_t msgLen = 0;
1014 size_t resultLen = 0;
1015
1016 if (!msg || !*msg)
1017 return(NULL);
1018
1019 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1020 ++msgLen;
1021 if (*msgPtr == '%')
1022 ++count;
1023 }
1024
1025 if (count == 0)
1026 return(*msg);
1027
Elliott Hughesecdab2a2022-02-23 14:33:50 -08001028 if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1029 return(NULL);
David Kilzer502f6a62016-05-23 14:58:41 +08001030 resultLen = msgLen + count + 1;
1031 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1032 if (result == NULL) {
1033 /* Clear *msg to prevent format string vulnerabilities in
1034 out-of-memory situations. */
1035 xmlFree(*msg);
1036 *msg = NULL;
1037 xmlErrMemory(NULL, NULL);
1038 return(NULL);
1039 }
1040
1041 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1042 *resultPtr = *msgPtr;
1043 if (*msgPtr == '%')
1044 *(++resultPtr) = '%';
1045 }
1046 result[resultLen - 1] = '\0';
1047
1048 xmlFree(*msg);
1049 *msg = result;
1050
1051 return *msg;
1052}
1053