Blame - xmlstring.c - platform/external/libxml2

blob: d73c49cadbc2afed13bd643cc8eca256bd6f2daa [file] [log] [blame]

William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	1	/*
				2	* string.c : an XML string utilities module
				3	*
				4	* This module provides various utility functions for manipulating
				5	* the xmlChar* type. All functions named xmlStr* have been moved here
				6	* from the parser.c file (their original home).
				7	*
				8	* See Copyright for the status of this software.
				9	*
				10	* UTF8 string routines from:
				11	* William Brack <wbrack@mmm.com.hk>
				12	*
				13	* daniel@veillard.com
				14	*/
				15
				16	#define IN_LIBXML
				17	#include "libxml.h"
				18
				19	#include <stdlib.h>
				20	#include <string.h>
				21	#include <libxml/xmlmemory.h>
				22	#include <libxml/parserInternals.h>
				23	#include <libxml/xmlstring.h>
				24
				25	/************************************************************************
				26	* *
				27	* Commodity functions to handle xmlChars *
				28	* *
				29	************************************************************************/
				30
				31	/**
				32	* xmlStrndup:
				33	* @cur: the input xmlChar *
				34	* @len: the len of @cur
				35	*
				36	* a strndup for array of xmlChar's
				37	*
				38	* Returns a new xmlChar * or NULL
				39	*/
				40	xmlChar *
				41	xmlStrndup(const xmlChar *cur, int len) {
				42	xmlChar *ret;
				43
				44	if ((cur == NULL) \|\| (len < 0)) return(NULL);
				45	ret = (xmlChar ) xmlMallocAtomic((len + 1) sizeof(xmlChar));
				46	if (ret == NULL) {
				47	xmlErrMemory(NULL, NULL);
				48	return(NULL);
				49	}
				50	memcpy(ret, cur, len * sizeof(xmlChar));
				51	ret[len] = 0;
				52	return(ret);
				53	}
				54
				55	/**
				56	* xmlStrdup:
				57	* @cur: the input xmlChar *
				58	*
				59	* a strdup for array of xmlChar's. Since they are supposed to be
				60	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
				61	* a termination mark of '0'.
				62	*
				63	* Returns a new xmlChar * or NULL
				64	*/
				65	xmlChar *
				66	xmlStrdup(const xmlChar *cur) {
				67	const xmlChar *p = cur;
				68
				69	if (cur == NULL) return(NULL);
				70	while (p != 0) p++; / non input consuming */
				71	return(xmlStrndup(cur, p - cur));
				72	}
				73
				74	/**
				75	* xmlCharStrndup:
				76	* @cur: the input char *
				77	* @len: the len of @cur
				78	*
				79	* a strndup for char's to xmlChar's
				80	*
				81	* Returns a new xmlChar * or NULL
				82	*/
				83
				84	xmlChar *
				85	xmlCharStrndup(const char *cur, int len) {
				86	int i;
				87	xmlChar *ret;
				88
				89	if ((cur == NULL) \|\| (len < 0)) return(NULL);
				90	ret = (xmlChar ) xmlMallocAtomic((len + 1) sizeof(xmlChar));
				91	if (ret == NULL) {
				92	xmlErrMemory(NULL, NULL);
				93	return(NULL);
				94	}
Daniel Veillard	5ea30d7	2004-11-08 11:54:28 +0000	[diff] [blame]	95	for (i = 0;i < len;i++) {
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	96	ret[i] = (xmlChar) cur[i];
Daniel Veillard	5ea30d7	2004-11-08 11:54:28 +0000	[diff] [blame]	97	if (ret[i] == 0) return(ret);
				98	}
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	99	ret[len] = 0;
				100	return(ret);
				101	}
				102
				103	/**
				104	* xmlCharStrdup:
				105	* @cur: the input char *
				106	*
				107	* a strdup for char's to xmlChar's
				108	*
				109	* Returns a new xmlChar * or NULL
				110	*/
				111
				112	xmlChar *
				113	xmlCharStrdup(const char *cur) {
				114	const char *p = cur;
				115
				116	if (cur == NULL) return(NULL);
				117	while (p != '\0') p++; / non input consuming */
				118	return(xmlCharStrndup(cur, p - cur));
				119	}
				120
				121	/**
				122	* xmlStrcmp:
				123	* @str1: the first xmlChar *
				124	* @str2: the second xmlChar *
				125	*
				126	* a strcmp for xmlChar's
				127	*
				128	* Returns the integer result of the comparison
				129	*/
				130
				131	int
				132	xmlStrcmp(const xmlChar str1, const xmlChar str2) {
				133	register int tmp;
				134
				135	if (str1 == str2) return(0);
				136	if (str1 == NULL) return(-1);
				137	if (str2 == NULL) return(1);
				138	do {
				139	tmp = str1++ - str2;
				140	if (tmp != 0) return(tmp);
				141	} while (*str2++ != 0);
				142	return 0;
				143	}
				144
				145	/**
				146	* xmlStrEqual:
				147	* @str1: the first xmlChar *
				148	* @str2: the second xmlChar *
				149	*
				150	* Check if both string are equal of have same content
				151	* Should be a bit more readable and faster than xmlStrEqual()
				152	*
				153	* Returns 1 if they are equal, 0 if they are different
				154	*/
				155
				156	int
				157	xmlStrEqual(const xmlChar str1, const xmlChar str2) {
				158	if (str1 == str2) return(1);
				159	if (str1 == NULL) return(0);
				160	if (str2 == NULL) return(0);
				161	do {
				162	if (str1++ != str2) return(0);
				163	} while (*str2++);
				164	return(1);
				165	}
				166
				167	/**
				168	* xmlStrQEqual:
				169	* @pref: the prefix of the QName
				170	* @name: the localname of the QName
				171	* @str: the second xmlChar *
				172	*
				173	* Check if a QName is Equal to a given string
				174	*
				175	* Returns 1 if they are equal, 0 if they are different
				176	*/
				177
				178	int
				179	xmlStrQEqual(const xmlChar pref, const xmlChar name, const xmlChar *str) {
				180	if (pref == NULL) return(xmlStrEqual(name, str));
				181	if (name == NULL) return(0);
				182	if (str == NULL) return(0);
				183
				184	do {
				185	if (pref++ != str) return(0);
				186	} while ((str++) && (pref));
				187	if (*str++ != ':') return(0);
				188	do {
				189	if (name++ != str) return(0);
				190	} while (*str++);
				191	return(1);
				192	}
				193
				194	/**
				195	* xmlStrncmp:
				196	* @str1: the first xmlChar *
				197	* @str2: the second xmlChar *
				198	* @len: the max comparison length
				199	*
				200	* a strncmp for xmlChar's
				201	*
				202	* Returns the integer result of the comparison
				203	*/
				204
				205	int
				206	xmlStrncmp(const xmlChar str1, const xmlChar str2, int len) {
				207	register int tmp;
				208
				209	if (len <= 0) return(0);
				210	if (str1 == str2) return(0);
				211	if (str1 == NULL) return(-1);
				212	if (str2 == NULL) return(1);
Daniel Veillard	c82c57e	2004-01-12 16:24:34 +0000	[diff] [blame]	213	#ifdef __GNUC__
William M. Brack	b7b54de	2004-10-06 16:38:01 +0000	[diff] [blame]	214	tmp = strncmp((const char )str1, (const char )str2, len);
Daniel Veillard	c82c57e	2004-01-12 16:24:34 +0000	[diff] [blame]	215	return tmp;
				216	#else
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	217	do {
				218	tmp = str1++ - str2;
				219	if (tmp != 0 \|\| --len == 0) return(tmp);
				220	} while (*str2++ != 0);
				221	return 0;
Daniel Veillard	c82c57e	2004-01-12 16:24:34 +0000	[diff] [blame]	222	#endif
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	223	}
				224
				225	static const xmlChar casemap[256] = {
				226	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
				227	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
				228	0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
				229	0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
				230	0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
				231	0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
				232	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
				233	0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
				234	0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
				235	0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
				236	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
				237	0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
				238	0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
				239	0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
				240	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
				241	0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
				242	0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
				243	0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
				244	0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
				245	0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
				246	0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
				247	0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
				248	0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
				249	0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
				250	0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
				251	0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
				252	0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
				253	0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
				254	0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
				255	0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
				256	0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
				257	0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
				258	};
				259
				260	/**
				261	* xmlStrcasecmp:
				262	* @str1: the first xmlChar *
				263	* @str2: the second xmlChar *
				264	*
				265	* a strcasecmp for xmlChar's
				266	*
				267	* Returns the integer result of the comparison
				268	*/
				269
				270	int
				271	xmlStrcasecmp(const xmlChar str1, const xmlChar str2) {
				272	register int tmp;
				273
				274	if (str1 == str2) return(0);
				275	if (str1 == NULL) return(-1);
				276	if (str2 == NULL) return(1);
				277	do {
				278	tmp = casemap[str1++] - casemap[str2];
				279	if (tmp != 0) return(tmp);
				280	} while (*str2++ != 0);
				281	return 0;
				282	}
				283
				284	/**
				285	* xmlStrncasecmp:
				286	* @str1: the first xmlChar *
				287	* @str2: the second xmlChar *
				288	* @len: the max comparison length
				289	*
				290	* a strncasecmp for xmlChar's
				291	*
				292	* Returns the integer result of the comparison
				293	*/
				294
				295	int
				296	xmlStrncasecmp(const xmlChar str1, const xmlChar str2, int len) {
				297	register int tmp;
				298
				299	if (len <= 0) return(0);
				300	if (str1 == str2) return(0);
				301	if (str1 == NULL) return(-1);
				302	if (str2 == NULL) return(1);
				303	do {
				304	tmp = casemap[str1++] - casemap[str2];
				305	if (tmp != 0 \|\| --len == 0) return(tmp);
				306	} while (*str2++ != 0);
				307	return 0;
				308	}
				309
				310	/**
				311	* xmlStrchr:
				312	* @str: the xmlChar * array
				313	* @val: the xmlChar to search
				314	*
				315	* a strchr for xmlChar's
				316	*
				317	* Returns the xmlChar * for the first occurrence or NULL.
				318	*/
				319
				320	const xmlChar *
				321	xmlStrchr(const xmlChar *str, xmlChar val) {
				322	if (str == NULL) return(NULL);
				323	while (str != 0) { / non input consuming */
				324	if (str == val) return((xmlChar ) str);
				325	str++;
				326	}
				327	return(NULL);
				328	}
				329
				330	/**
				331	* xmlStrstr:
				332	* @str: the xmlChar * array (haystack)
				333	* @val: the xmlChar to search (needle)
				334	*
				335	* a strstr for xmlChar's
				336	*
				337	* Returns the xmlChar * for the first occurrence or NULL.
				338	*/
				339
				340	const xmlChar *
				341	xmlStrstr(const xmlChar str, const xmlChar val) {
				342	int n;
				343
				344	if (str == NULL) return(NULL);
				345	if (val == NULL) return(NULL);
				346	n = xmlStrlen(val);
				347
				348	if (n == 0) return(str);
				349	while (str != 0) { / non input consuming */
				350	if (str == val) {
				351	if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
				352	}
				353	str++;
				354	}
				355	return(NULL);
				356	}
				357
				358	/**
				359	* xmlStrcasestr:
				360	* @str: the xmlChar * array (haystack)
				361	* @val: the xmlChar to search (needle)
				362	*
				363	* a case-ignoring strstr for xmlChar's
				364	*
				365	* Returns the xmlChar * for the first occurrence or NULL.
				366	*/
				367
				368	const xmlChar *
				369	xmlStrcasestr(const xmlChar str, xmlChar val) {
				370	int n;
				371
				372	if (str == NULL) return(NULL);
				373	if (val == NULL) return(NULL);
				374	n = xmlStrlen(val);
				375
				376	if (n == 0) return(str);
				377	while (str != 0) { / non input consuming */
				378	if (casemap[str] == casemap[val])
				379	if (!xmlStrncasecmp(str, val, n)) return(str);
				380	str++;
				381	}
				382	return(NULL);
				383	}
				384
				385	/**
				386	* xmlStrsub:
				387	* @str: the xmlChar * array (haystack)
				388	* @start: the index of the first char (zero based)
				389	* @len: the length of the substring
				390	*
				391	* Extract a substring of a given string
				392	*
				393	* Returns the xmlChar * for the first occurrence or NULL.
				394	*/
				395
				396	xmlChar *
				397	xmlStrsub(const xmlChar *str, int start, int len) {
				398	int i;
				399
				400	if (str == NULL) return(NULL);
				401	if (start < 0) return(NULL);
				402	if (len < 0) return(NULL);
				403
				404	for (i = 0;i < start;i++) {
				405	if (*str == 0) return(NULL);
				406	str++;
				407	}
				408	if (*str == 0) return(NULL);
				409	return(xmlStrndup(str, len));
				410	}
				411
				412	/**
				413	* xmlStrlen:
				414	* @str: the xmlChar * array
				415	*
				416	* length of a xmlChar's string
				417	*
				418	* Returns the number of xmlChar contained in the ARRAY.
				419	*/
				420
				421	int
				422	xmlStrlen(const xmlChar *str) {
				423	int len = 0;
				424
				425	if (str == NULL) return(0);
				426	while (str != 0) { / non input consuming */
				427	str++;
				428	len++;
				429	}
				430	return(len);
				431	}
				432
				433	/**
				434	* xmlStrncat:
				435	* @cur: the original xmlChar * array
				436	* @add: the xmlChar * array added
				437	* @len: the length of @add
				438	*
				439	* a strncat for array of xmlChar's, it will extend @cur with the len
				440	* first bytes of @add.
				441	*
				442	* Returns a new xmlChar *, the original @cur is reallocated if needed
				443	* and should not be freed
				444	*/
				445
				446	xmlChar *
				447	xmlStrncat(xmlChar cur, const xmlChar add, int len) {
				448	int size;
				449	xmlChar *ret;
				450
				451	if ((add == NULL) \|\| (len == 0))
				452	return(cur);
				453	if (cur == NULL)
				454	return(xmlStrndup(add, len));
				455
				456	size = xmlStrlen(cur);
				457	ret = (xmlChar ) xmlRealloc(cur, (size + len + 1) sizeof(xmlChar));
				458	if (ret == NULL) {
				459	xmlErrMemory(NULL, NULL);
				460	return(cur);
				461	}
				462	memcpy(&ret[size], add, len * sizeof(xmlChar));
				463	ret[size + len] = 0;
				464	return(ret);
				465	}
				466
				467	/**
				468	* xmlStrncatNew:
				469	* @str1: first xmlChar string
				470	* @str2: second xmlChar string
				471	* @len: the len of @str2
				472	*
				473	* same as xmlStrncat, but creates a new string. The original
				474	* two strings are not freed.
				475	*
				476	* Returns a new xmlChar * or NULL
				477	*/
				478	xmlChar *
				479	xmlStrncatNew(const xmlChar str1, const xmlChar str2, int len) {
				480	int size;
				481	xmlChar *ret;
				482
Daniel Veillard	8a32fe4	2004-11-02 22:10:16 +0000	[diff] [blame]	483	if (len < 0)
				484	len = xmlStrlen(str2);
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	485	if ((str2 == NULL) \|\| (len == 0))
				486	return(xmlStrdup(str1));
				487	if (str1 == NULL)
				488	return(xmlStrndup(str2, len));
				489
				490	size = xmlStrlen(str1);
				491	ret = (xmlChar ) xmlMalloc((size + len + 1) sizeof(xmlChar));
				492	if (ret == NULL) {
				493	xmlErrMemory(NULL, NULL);
				494	return(xmlStrndup(str1, size));
				495	}
				496	memcpy(ret, str1, size * sizeof(xmlChar));
				497	memcpy(&ret[size], str2, len * sizeof(xmlChar));
				498	ret[size + len] = 0;
				499	return(ret);
				500	}
				501
				502	/**
				503	* xmlStrcat:
				504	* @cur: the original xmlChar * array
				505	* @add: the xmlChar * array added
				506	*
				507	* a strcat for array of xmlChar's. Since they are supposed to be
				508	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
				509	* a termination mark of '0'.
				510	*
				511	* Returns a new xmlChar * containing the concatenated string.
				512	*/
				513	xmlChar *
				514	xmlStrcat(xmlChar cur, const xmlChar add) {
				515	const xmlChar *p = add;
				516
				517	if (add == NULL) return(cur);
				518	if (cur == NULL)
				519	return(xmlStrdup(add));
				520
				521	while (p != 0) p++; / non input consuming */
				522	return(xmlStrncat(cur, add, p - add));
				523	}
				524
				525	/**
				526	* xmlStrPrintf:
				527	* @buf: the result buffer.
				528	* @len: the result buffer length.
				529	* @msg: the message with printf formatting.
				530	* @...: extra parameters for the message.
				531	*
				532	* Formats @msg and places result into @buf.
				533	*
				534	* Returns the number of characters written to @buf or -1 if an error occurs.
				535	*/
				536	int
				537	xmlStrPrintf(xmlChar buf, int len, const xmlChar msg, ...) {
				538	va_list args;
				539	int ret;
				540
				541	if((buf == NULL) \|\| (msg == NULL)) {
				542	return(-1);
				543	}
				544
				545	va_start(args, msg);
				546	ret = vsnprintf((char ) buf, len, (const char ) msg, args);
				547	va_end(args);
				548	buf[len - 1] = 0; /* be safe ! */
				549
				550	return(ret);
				551	}
				552
				553	/**
				554	* xmlStrVPrintf:
				555	* @buf: the result buffer.
				556	* @len: the result buffer length.
				557	* @msg: the message with printf formatting.
				558	* @ap: extra parameters for the message.
				559	*
				560	* Formats @msg and places result into @buf.
				561	*
				562	* Returns the number of characters written to @buf or -1 if an error occurs.
				563	*/
				564	int
				565	xmlStrVPrintf(xmlChar buf, int len, const xmlChar msg, va_list ap) {
				566	int ret;
				567
				568	if((buf == NULL) \|\| (msg == NULL)) {
				569	return(-1);
				570	}
				571
				572	ret = vsnprintf((char ) buf, len, (const char ) msg, ap);
				573	buf[len - 1] = 0; /* be safe ! */
				574
				575	return(ret);
				576	}
				577
				578	/************************************************************************
				579	* *
				580	* Generic UTF8 handling routines *
				581	* *
				582	* From rfc2044: encoding of the Unicode values on UTF-8: *
				583	* *
				584	* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
				585	* 0000 0000-0000 007F 0xxxxxxx *
				586	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
				587	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
				588	* *
				589	* I hope we won't use values > 0xFFFF anytime soon ! *
				590	* *
				591	************************************************************************/
				592
				593
				594	/**
				595	* xmlUTF8Size:
				596	* @utf: pointer to the UTF8 character
				597	*
				598	* calculates the internal size of a UTF8 character
				599	*
				600	* returns the numbers of bytes in the character, -1 on format error
				601	*/
				602	int
				603	xmlUTF8Size(const xmlChar *utf) {
				604	xmlChar mask;
				605	int len;
				606
				607	if (utf == NULL)
				608	return -1;
				609	if (*utf < 0x80)
				610	return 1;
				611	/* check valid UTF8 character */
				612	if (!(*utf & 0x40))
				613	return -1;
				614	/* determine number of bytes in char */
				615	len = 2;
				616	for (mask=0x20; mask != 0; mask>>=1) {
				617	if (!(*utf & mask))
				618	return len;
				619	len++;
				620	}
				621	return -1;
				622	}
				623
				624	/**
				625	* xmlUTF8Charcmp:
				626	* @utf1: pointer to first UTF8 char
				627	* @utf2: pointer to second UTF8 char
				628	*
				629	* compares the two UCS4 values
				630	*
				631	* returns result of the compare as with xmlStrncmp
				632	*/
				633	int
				634	xmlUTF8Charcmp(const xmlChar utf1, const xmlChar utf2) {
				635
				636	if (utf1 == NULL ) {
				637	if (utf2 == NULL)
				638	return 0;
				639	return -1;
				640	}
				641	return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
				642	}
				643
				644	/**
				645	* xmlUTF8Strlen:
				646	* @utf: a sequence of UTF-8 encoded bytes
				647	*
				648	* compute the length of an UTF8 string, it doesn't do a full UTF8
				649	* checking of the content of the string.
				650	*
				651	* Returns the number of characters in the string or -1 in case of error
				652	*/
				653	int
				654	xmlUTF8Strlen(const xmlChar *utf) {
				655	int ret = 0;
				656
				657	if (utf == NULL)
				658	return(-1);
				659
				660	while (*utf != 0) {
				661	if (utf[0] & 0x80) {
				662	if ((utf[1] & 0xc0) != 0x80)
				663	return(-1);
				664	if ((utf[0] & 0xe0) == 0xe0) {
				665	if ((utf[2] & 0xc0) != 0x80)
				666	return(-1);
				667	if ((utf[0] & 0xf0) == 0xf0) {
				668	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				669	return(-1);
				670	utf += 4;
				671	} else {
				672	utf += 3;
				673	}
				674	} else {
				675	utf += 2;
				676	}
				677	} else {
				678	utf++;
				679	}
				680	ret++;
				681	}
				682	return(ret);
				683	}
				684
				685	/**
				686	* xmlGetUTF8Char:
				687	* @utf: a sequence of UTF-8 encoded bytes
William M. Brack	3e53016	2004-09-03 17:10:08 +0000	[diff] [blame]	688	* @len: a pointer to the minimum number of bytes present in
				689	* the sequence. This is used to assure the next character
				690	* is completely contained within the sequence.
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	691	*
William M. Brack	3e53016	2004-09-03 17:10:08 +0000	[diff] [blame]	692	* Read the first UTF8 character from @utf
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	693	*
William M. Brack	3e53016	2004-09-03 17:10:08 +0000	[diff] [blame]	694	* Returns the char value or -1 in case of error, and sets *len to
				695	* the actual number of bytes consumed (0 in case of error)
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	696	*/
				697	int
				698	xmlGetUTF8Char(const unsigned char utf, int len) {
				699	unsigned int c;
				700
				701	if (utf == NULL)
				702	goto error;
				703	if (len == NULL)
				704	goto error;
				705	if (*len < 1)
				706	goto error;
				707
				708	c = utf[0];
				709	if (c & 0x80) {
				710	if (*len < 2)
				711	goto error;
				712	if ((utf[1] & 0xc0) != 0x80)
				713	goto error;
				714	if ((c & 0xe0) == 0xe0) {
				715	if (*len < 3)
				716	goto error;
				717	if ((utf[2] & 0xc0) != 0x80)
				718	goto error;
				719	if ((c & 0xf0) == 0xf0) {
				720	if (*len < 4)
				721	goto error;
				722	if ((c & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				723	goto error;
				724	*len = 4;
				725	/* 4-byte code */
				726	c = (utf[0] & 0x7) << 18;
				727	c \|= (utf[1] & 0x3f) << 12;
				728	c \|= (utf[2] & 0x3f) << 6;
				729	c \|= utf[3] & 0x3f;
				730	} else {
				731	/* 3-byte code */
				732	*len = 3;
				733	c = (utf[0] & 0xf) << 12;
				734	c \|= (utf[1] & 0x3f) << 6;
				735	c \|= utf[2] & 0x3f;
				736	}
				737	} else {
				738	/* 2-byte code */
				739	*len = 2;
				740	c = (utf[0] & 0x1f) << 6;
				741	c \|= utf[1] & 0x3f;
				742	}
				743	} else {
				744	/* 1-byte code */
				745	*len = 1;
				746	}
				747	return(c);
				748
				749	error:
Daniel Veillard	ce682bc	2004-11-05 17:22:25 +0000	[diff] [blame]	750	if (len != NULL)
				751	*len = 0;
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	752	return(-1);
				753	}
				754
				755	/**
				756	* xmlCheckUTF8:
				757	* @utf: Pointer to putative UTF-8 encoded string.
				758	*
				759	* Checks @utf for being valid UTF-8. @utf is assumed to be
				760	* null-terminated. This function is not super-strict, as it will
				761	* allow longer UTF-8 sequences than necessary. Note that Java is
				762	* capable of producing these sequences if provoked. Also note, this
				763	* routine checks for the 4-byte maximum size, but does not check for
				764	* 0x10ffff maximum value.
				765	*
				766	* Return value: true if @utf is valid.
				767	**/
				768	int
				769	xmlCheckUTF8(const unsigned char *utf)
				770	{
				771	int ix;
				772	unsigned char c;
				773
Daniel Veillard	ce682bc	2004-11-05 17:22:25 +0000	[diff] [blame]	774	if (utf == NULL)
				775	return(0);
William M. Brack	3ffe90e	2004-08-28 01:33:30 +0000	[diff] [blame]	776	/*
				777	* utf is a string of 1, 2, 3 or 4 bytes. The valid strings
				778	* are as follows (in "bit format"):
				779	* 0xxxxxxx valid 1-byte
				780	* 110xxxxx 10xxxxxx valid 2-byte
				781	* 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte
				782	* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte
				783	*/
				784	for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */
William M. Brack	f409515	2004-08-31 16:49:26 +0000	[diff] [blame]	785	if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	786	ix++;
William M. Brack	bf5cf21	2004-08-31 06:47:17 +0000	[diff] [blame]	787	} else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
				788	if ((utf[ix+1] & 0xc0 ) != 0x80)
				789	return 0;
				790	ix += 2;
				791	} else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
				792	if (((utf[ix+1] & 0xc0) != 0x80) \|\|
				793	((utf[ix+2] & 0xc0) != 0x80))
				794	return 0;
				795	ix += 3;
				796	} else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
				797	if (((utf[ix+1] & 0xc0) != 0x80) \|\|
				798	((utf[ix+2] & 0xc0) != 0x80) \|\|
				799	((utf[ix+3] & 0xc0) != 0x80))
				800	return 0;
				801	ix += 4;
				802	} else /* unknown encoding */
				803	return 0;
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	804	}
				805	return(1);
				806	}
				807
				808	/**
				809	* xmlUTF8Strsize:
				810	* @utf: a sequence of UTF-8 encoded bytes
				811	* @len: the number of characters in the array
				812	*
				813	* storage size of an UTF8 string
Daniel Veillard	5ea30d7	2004-11-08 11:54:28 +0000	[diff] [blame]	814	* the behaviour is not garanteed if the input string is not UTF-8
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	815	*
				816	* Returns the storage size of
				817	* the first 'len' characters of ARRAY
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	818	*/
				819
				820	int
				821	xmlUTF8Strsize(const xmlChar *utf, int len) {
				822	const xmlChar *ptr=utf;
				823	xmlChar ch;
				824
Daniel Veillard	36e5cd5	2004-11-02 14:52:23 +0000	[diff] [blame]	825	if (utf == NULL)
				826	return(0);
				827
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	828	if (len <= 0)
				829	return(0);
				830
				831	while ( len-- > 0) {
				832	if ( !*ptr )
				833	break;
				834	if ( (ch = *ptr++) & 0x80)
Daniel Veillard	5ea30d7	2004-11-08 11:54:28 +0000	[diff] [blame]	835	while ((ch<<=1) & 0x80 ) {
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	836	ptr++;
Daniel Veillard	5ea30d7	2004-11-08 11:54:28 +0000	[diff] [blame]	837	if (*ptr == 0) break;
				838	}
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	839	}
				840	return (ptr - utf);
				841	}
				842
				843
				844	/**
				845	* xmlUTF8Strndup:
				846	* @utf: the input UTF8 *
				847	* @len: the len of @utf (in chars)
				848	*
				849	* a strndup for array of UTF8's
				850	*
				851	* Returns a new UTF8 * or NULL
				852	*/
				853	xmlChar *
				854	xmlUTF8Strndup(const xmlChar *utf, int len) {
				855	xmlChar *ret;
				856	int i;
				857
				858	if ((utf == NULL) \|\| (len < 0)) return(NULL);
				859	i = xmlUTF8Strsize(utf, len);
				860	ret = (xmlChar ) xmlMallocAtomic((i + 1) sizeof(xmlChar));
				861	if (ret == NULL) {
				862	xmlGenericError(xmlGenericErrorContext,
				863	"malloc of %ld byte failed\n",
				864	(len + 1) * (long)sizeof(xmlChar));
				865	return(NULL);
				866	}
				867	memcpy(ret, utf, i * sizeof(xmlChar));
				868	ret[i] = 0;
				869	return(ret);
				870	}
				871
				872	/**
				873	* xmlUTF8Strpos:
				874	* @utf: the input UTF8 *
				875	* @pos: the position of the desired UTF8 char (in chars)
				876	*
				877	* a function to provide the equivalent of fetching a
				878	* character from a string array
				879	*
				880	* Returns a pointer to the UTF8 character or NULL
				881	*/
Daniel Veillard	8a32fe4	2004-11-02 22:10:16 +0000	[diff] [blame]	882	const xmlChar *
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	883	xmlUTF8Strpos(const xmlChar *utf, int pos) {
				884	xmlChar ch;
				885
				886	if (utf == NULL) return(NULL);
William M. Brack	230c550	2004-12-20 16:18:49 +0000	[diff] [blame]	887	if (pos < 0)
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	888	return(NULL);
				889	while (pos--) {
				890	if ((ch=*utf++) == 0) return(NULL);
				891	if ( ch & 0x80 ) {
				892	/* if not simple ascii, verify proper format */
				893	if ( (ch & 0xc0) != 0xc0 )
				894	return(NULL);
				895	/* then skip over remaining bytes for this char */
				896	while ( (ch <<= 1) & 0x80 )
				897	if ( (*utf++ & 0xc0) != 0x80 )
				898	return(NULL);
				899	}
				900	}
				901	return((xmlChar *)utf);
				902	}
				903
				904	/**
				905	* xmlUTF8Strloc:
				906	* @utf: the input UTF8 *
				907	* @utfchar: the UTF8 character to be found
				908	*
				909	* a function to provide the relative location of a UTF8 char
				910	*
				911	* Returns the relative character position of the desired char
				912	* or -1 if not found
				913	*/
				914	int
				915	xmlUTF8Strloc(const xmlChar utf, const xmlChar utfchar) {
				916	int i, size;
				917	xmlChar ch;
				918
				919	if (utf==NULL \|\| utfchar==NULL) return -1;
				920	size = xmlUTF8Strsize(utfchar, 1);
				921	for(i=0; (ch=*utf) != 0; i++) {
				922	if (xmlStrncmp(utf, utfchar, size)==0)
				923	return(i);
				924	utf++;
				925	if ( ch & 0x80 ) {
				926	/* if not simple ascii, verify proper format */
				927	if ( (ch & 0xc0) != 0xc0 )
				928	return(-1);
				929	/* then skip over remaining bytes for this char */
				930	while ( (ch <<= 1) & 0x80 )
				931	if ( (*utf++ & 0xc0) != 0x80 )
				932	return(-1);
				933	}
				934	}
				935
				936	return(-1);
				937	}
				938	/**
				939	* xmlUTF8Strsub:
				940	* @utf: a sequence of UTF-8 encoded bytes
				941	* @start: relative pos of first char
				942	* @len: total number to copy
				943	*
				944	* Create a substring from a given UTF-8 string
				945	* Note: positions are given in units of UTF-8 chars
				946	*
				947	* Returns a pointer to a newly created string
				948	* or NULL if any problem
				949	*/
				950
				951	xmlChar *
				952	xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
				953	int i;
				954	xmlChar ch;
				955
				956	if (utf == NULL) return(NULL);
				957	if (start < 0) return(NULL);
				958	if (len < 0) return(NULL);
				959
				960	/*
				961	* Skip over any leading chars
				962	*/
				963	for (i = 0;i < start;i++) {
				964	if ((ch=*utf++) == 0) return(NULL);
				965	if ( ch & 0x80 ) {
				966	/* if not simple ascii, verify proper format */
				967	if ( (ch & 0xc0) != 0xc0 )
				968	return(NULL);
				969	/* then skip over remaining bytes for this char */
				970	while ( (ch <<= 1) & 0x80 )
				971	if ( (*utf++ & 0xc0) != 0x80 )
				972	return(NULL);
				973	}
				974	}
				975
				976	return(xmlUTF8Strndup(utf, len));
				977	}
Daniel Veillard	5d4644e	2005-04-01 13:11:58 +0000	[diff] [blame^]	978
				979	#define bottom_xmlstring
				980	#include "elfgcchack.h"