Blame - xmlstring.c - platform/external/libxml2

blob: 78acbff7531af962d41fcde938e9d63833386317 [file] [log] [blame]

William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	1	/*
				2	* string.c : an XML string utilities module
				3	*
				4	* This module provides various utility functions for manipulating
				5	* the xmlChar* type. All functions named xmlStr* have been moved here
				6	* from the parser.c file (their original home).
				7	*
				8	* See Copyright for the status of this software.
				9	*
				10	* UTF8 string routines from:
				11	* William Brack <wbrack@mmm.com.hk>
				12	*
				13	* daniel@veillard.com
				14	*/
				15
				16	#define IN_LIBXML
				17	#include "libxml.h"
				18
				19	#include <stdlib.h>
				20	#include <string.h>
				21	#include <libxml/xmlmemory.h>
				22	#include <libxml/parserInternals.h>
				23	#include <libxml/xmlstring.h>
				24
				25	/************************************************************************
				26	* *
				27	* Commodity functions to handle xmlChars *
				28	* *
				29	************************************************************************/
				30
				31	/**
				32	* xmlStrndup:
				33	* @cur: the input xmlChar *
				34	* @len: the len of @cur
				35	*
				36	* a strndup for array of xmlChar's
				37	*
				38	* Returns a new xmlChar * or NULL
				39	*/
				40	xmlChar *
				41	xmlStrndup(const xmlChar *cur, int len) {
				42	xmlChar *ret;
				43
				44	if ((cur == NULL) \|\| (len < 0)) return(NULL);
				45	ret = (xmlChar ) xmlMallocAtomic((len + 1) sizeof(xmlChar));
				46	if (ret == NULL) {
				47	xmlErrMemory(NULL, NULL);
				48	return(NULL);
				49	}
				50	memcpy(ret, cur, len * sizeof(xmlChar));
				51	ret[len] = 0;
				52	return(ret);
				53	}
				54
				55	/**
				56	* xmlStrdup:
				57	* @cur: the input xmlChar *
				58	*
				59	* a strdup for array of xmlChar's. Since they are supposed to be
				60	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
				61	* a termination mark of '0'.
				62	*
				63	* Returns a new xmlChar * or NULL
				64	*/
				65	xmlChar *
				66	xmlStrdup(const xmlChar *cur) {
				67	const xmlChar *p = cur;
				68
				69	if (cur == NULL) return(NULL);
				70	while (p != 0) p++; / non input consuming */
				71	return(xmlStrndup(cur, p - cur));
				72	}
				73
				74	/**
				75	* xmlCharStrndup:
				76	* @cur: the input char *
				77	* @len: the len of @cur
				78	*
				79	* a strndup for char's to xmlChar's
				80	*
				81	* Returns a new xmlChar * or NULL
				82	*/
				83
				84	xmlChar *
				85	xmlCharStrndup(const char *cur, int len) {
				86	int i;
				87	xmlChar *ret;
				88
				89	if ((cur == NULL) \|\| (len < 0)) return(NULL);
				90	ret = (xmlChar ) xmlMallocAtomic((len + 1) sizeof(xmlChar));
				91	if (ret == NULL) {
				92	xmlErrMemory(NULL, NULL);
				93	return(NULL);
				94	}
				95	for (i = 0;i < len;i++)
				96	ret[i] = (xmlChar) cur[i];
				97	ret[len] = 0;
				98	return(ret);
				99	}
				100
				101	/**
				102	* xmlCharStrdup:
				103	* @cur: the input char *
				104	*
				105	* a strdup for char's to xmlChar's
				106	*
				107	* Returns a new xmlChar * or NULL
				108	*/
				109
				110	xmlChar *
				111	xmlCharStrdup(const char *cur) {
				112	const char *p = cur;
				113
				114	if (cur == NULL) return(NULL);
				115	while (p != '\0') p++; / non input consuming */
				116	return(xmlCharStrndup(cur, p - cur));
				117	}
				118
				119	/**
				120	* xmlStrcmp:
				121	* @str1: the first xmlChar *
				122	* @str2: the second xmlChar *
				123	*
				124	* a strcmp for xmlChar's
				125	*
				126	* Returns the integer result of the comparison
				127	*/
				128
				129	int
				130	xmlStrcmp(const xmlChar str1, const xmlChar str2) {
				131	register int tmp;
				132
				133	if (str1 == str2) return(0);
				134	if (str1 == NULL) return(-1);
				135	if (str2 == NULL) return(1);
				136	do {
				137	tmp = str1++ - str2;
				138	if (tmp != 0) return(tmp);
				139	} while (*str2++ != 0);
				140	return 0;
				141	}
				142
				143	/**
				144	* xmlStrEqual:
				145	* @str1: the first xmlChar *
				146	* @str2: the second xmlChar *
				147	*
				148	* Check if both string are equal of have same content
				149	* Should be a bit more readable and faster than xmlStrEqual()
				150	*
				151	* Returns 1 if they are equal, 0 if they are different
				152	*/
				153
				154	int
				155	xmlStrEqual(const xmlChar str1, const xmlChar str2) {
				156	if (str1 == str2) return(1);
				157	if (str1 == NULL) return(0);
				158	if (str2 == NULL) return(0);
				159	do {
				160	if (str1++ != str2) return(0);
				161	} while (*str2++);
				162	return(1);
				163	}
				164
				165	/**
				166	* xmlStrQEqual:
				167	* @pref: the prefix of the QName
				168	* @name: the localname of the QName
				169	* @str: the second xmlChar *
				170	*
				171	* Check if a QName is Equal to a given string
				172	*
				173	* Returns 1 if they are equal, 0 if they are different
				174	*/
				175
				176	int
				177	xmlStrQEqual(const xmlChar pref, const xmlChar name, const xmlChar *str) {
				178	if (pref == NULL) return(xmlStrEqual(name, str));
				179	if (name == NULL) return(0);
				180	if (str == NULL) return(0);
				181
				182	do {
				183	if (pref++ != str) return(0);
				184	} while ((str++) && (pref));
				185	if (*str++ != ':') return(0);
				186	do {
				187	if (name++ != str) return(0);
				188	} while (*str++);
				189	return(1);
				190	}
				191
				192	/**
				193	* xmlStrncmp:
				194	* @str1: the first xmlChar *
				195	* @str2: the second xmlChar *
				196	* @len: the max comparison length
				197	*
				198	* a strncmp for xmlChar's
				199	*
				200	* Returns the integer result of the comparison
				201	*/
				202
				203	int
				204	xmlStrncmp(const xmlChar str1, const xmlChar str2, int len) {
				205	register int tmp;
				206
				207	if (len <= 0) return(0);
				208	if (str1 == str2) return(0);
				209	if (str1 == NULL) return(-1);
				210	if (str2 == NULL) return(1);
Daniel Veillard	c82c57e	2004-01-12 16:24:34 +0000	[diff] [blame]	211	#ifdef __GNUC__
William M. Brack	b7b54de	2004-10-06 16:38:01 +0000	[diff] [blame]	212	tmp = strncmp((const char )str1, (const char )str2, len);
Daniel Veillard	c82c57e	2004-01-12 16:24:34 +0000	[diff] [blame]	213	return tmp;
				214	#else
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	215	do {
				216	tmp = str1++ - str2;
				217	if (tmp != 0 \|\| --len == 0) return(tmp);
				218	} while (*str2++ != 0);
				219	return 0;
Daniel Veillard	c82c57e	2004-01-12 16:24:34 +0000	[diff] [blame]	220	#endif
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	221	}
				222
				223	static const xmlChar casemap[256] = {
				224	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
				225	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
				226	0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
				227	0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
				228	0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
				229	0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
				230	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
				231	0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
				232	0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
				233	0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
				234	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
				235	0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
				236	0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
				237	0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
				238	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
				239	0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
				240	0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
				241	0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
				242	0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
				243	0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
				244	0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
				245	0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
				246	0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
				247	0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
				248	0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
				249	0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
				250	0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
				251	0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
				252	0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
				253	0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
				254	0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
				255	0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
				256	};
				257
				258	/**
				259	* xmlStrcasecmp:
				260	* @str1: the first xmlChar *
				261	* @str2: the second xmlChar *
				262	*
				263	* a strcasecmp for xmlChar's
				264	*
				265	* Returns the integer result of the comparison
				266	*/
				267
				268	int
				269	xmlStrcasecmp(const xmlChar str1, const xmlChar str2) {
				270	register int tmp;
				271
				272	if (str1 == str2) return(0);
				273	if (str1 == NULL) return(-1);
				274	if (str2 == NULL) return(1);
				275	do {
				276	tmp = casemap[str1++] - casemap[str2];
				277	if (tmp != 0) return(tmp);
				278	} while (*str2++ != 0);
				279	return 0;
				280	}
				281
				282	/**
				283	* xmlStrncasecmp:
				284	* @str1: the first xmlChar *
				285	* @str2: the second xmlChar *
				286	* @len: the max comparison length
				287	*
				288	* a strncasecmp for xmlChar's
				289	*
				290	* Returns the integer result of the comparison
				291	*/
				292
				293	int
				294	xmlStrncasecmp(const xmlChar str1, const xmlChar str2, int len) {
				295	register int tmp;
				296
				297	if (len <= 0) return(0);
				298	if (str1 == str2) return(0);
				299	if (str1 == NULL) return(-1);
				300	if (str2 == NULL) return(1);
				301	do {
				302	tmp = casemap[str1++] - casemap[str2];
				303	if (tmp != 0 \|\| --len == 0) return(tmp);
				304	} while (*str2++ != 0);
				305	return 0;
				306	}
				307
				308	/**
				309	* xmlStrchr:
				310	* @str: the xmlChar * array
				311	* @val: the xmlChar to search
				312	*
				313	* a strchr for xmlChar's
				314	*
				315	* Returns the xmlChar * for the first occurrence or NULL.
				316	*/
				317
				318	const xmlChar *
				319	xmlStrchr(const xmlChar *str, xmlChar val) {
				320	if (str == NULL) return(NULL);
				321	while (str != 0) { / non input consuming */
				322	if (str == val) return((xmlChar ) str);
				323	str++;
				324	}
				325	return(NULL);
				326	}
				327
				328	/**
				329	* xmlStrstr:
				330	* @str: the xmlChar * array (haystack)
				331	* @val: the xmlChar to search (needle)
				332	*
				333	* a strstr for xmlChar's
				334	*
				335	* Returns the xmlChar * for the first occurrence or NULL.
				336	*/
				337
				338	const xmlChar *
				339	xmlStrstr(const xmlChar str, const xmlChar val) {
				340	int n;
				341
				342	if (str == NULL) return(NULL);
				343	if (val == NULL) return(NULL);
				344	n = xmlStrlen(val);
				345
				346	if (n == 0) return(str);
				347	while (str != 0) { / non input consuming */
				348	if (str == val) {
				349	if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
				350	}
				351	str++;
				352	}
				353	return(NULL);
				354	}
				355
				356	/**
				357	* xmlStrcasestr:
				358	* @str: the xmlChar * array (haystack)
				359	* @val: the xmlChar to search (needle)
				360	*
				361	* a case-ignoring strstr for xmlChar's
				362	*
				363	* Returns the xmlChar * for the first occurrence or NULL.
				364	*/
				365
				366	const xmlChar *
				367	xmlStrcasestr(const xmlChar str, xmlChar val) {
				368	int n;
				369
				370	if (str == NULL) return(NULL);
				371	if (val == NULL) return(NULL);
				372	n = xmlStrlen(val);
				373
				374	if (n == 0) return(str);
				375	while (str != 0) { / non input consuming */
				376	if (casemap[str] == casemap[val])
				377	if (!xmlStrncasecmp(str, val, n)) return(str);
				378	str++;
				379	}
				380	return(NULL);
				381	}
				382
				383	/**
				384	* xmlStrsub:
				385	* @str: the xmlChar * array (haystack)
				386	* @start: the index of the first char (zero based)
				387	* @len: the length of the substring
				388	*
				389	* Extract a substring of a given string
				390	*
				391	* Returns the xmlChar * for the first occurrence or NULL.
				392	*/
				393
				394	xmlChar *
				395	xmlStrsub(const xmlChar *str, int start, int len) {
				396	int i;
				397
				398	if (str == NULL) return(NULL);
				399	if (start < 0) return(NULL);
				400	if (len < 0) return(NULL);
				401
				402	for (i = 0;i < start;i++) {
				403	if (*str == 0) return(NULL);
				404	str++;
				405	}
				406	if (*str == 0) return(NULL);
				407	return(xmlStrndup(str, len));
				408	}
				409
				410	/**
				411	* xmlStrlen:
				412	* @str: the xmlChar * array
				413	*
				414	* length of a xmlChar's string
				415	*
				416	* Returns the number of xmlChar contained in the ARRAY.
				417	*/
				418
				419	int
				420	xmlStrlen(const xmlChar *str) {
				421	int len = 0;
				422
				423	if (str == NULL) return(0);
				424	while (str != 0) { / non input consuming */
				425	str++;
				426	len++;
				427	}
				428	return(len);
				429	}
				430
				431	/**
				432	* xmlStrncat:
				433	* @cur: the original xmlChar * array
				434	* @add: the xmlChar * array added
				435	* @len: the length of @add
				436	*
				437	* a strncat for array of xmlChar's, it will extend @cur with the len
				438	* first bytes of @add.
				439	*
				440	* Returns a new xmlChar *, the original @cur is reallocated if needed
				441	* and should not be freed
				442	*/
				443
				444	xmlChar *
				445	xmlStrncat(xmlChar cur, const xmlChar add, int len) {
				446	int size;
				447	xmlChar *ret;
				448
				449	if ((add == NULL) \|\| (len == 0))
				450	return(cur);
				451	if (cur == NULL)
				452	return(xmlStrndup(add, len));
				453
				454	size = xmlStrlen(cur);
				455	ret = (xmlChar ) xmlRealloc(cur, (size + len + 1) sizeof(xmlChar));
				456	if (ret == NULL) {
				457	xmlErrMemory(NULL, NULL);
				458	return(cur);
				459	}
				460	memcpy(&ret[size], add, len * sizeof(xmlChar));
				461	ret[size + len] = 0;
				462	return(ret);
				463	}
				464
				465	/**
				466	* xmlStrncatNew:
				467	* @str1: first xmlChar string
				468	* @str2: second xmlChar string
				469	* @len: the len of @str2
				470	*
				471	* same as xmlStrncat, but creates a new string. The original
				472	* two strings are not freed.
				473	*
				474	* Returns a new xmlChar * or NULL
				475	*/
				476	xmlChar *
				477	xmlStrncatNew(const xmlChar str1, const xmlChar str2, int len) {
				478	int size;
				479	xmlChar *ret;
				480
Daniel Veillard	8a32fe4	2004-11-02 22:10:16 +0000	[diff] [blame^]	481	if (len < 0)
				482	len = xmlStrlen(str2);
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	483	if ((str2 == NULL) \|\| (len == 0))
				484	return(xmlStrdup(str1));
				485	if (str1 == NULL)
				486	return(xmlStrndup(str2, len));
				487
				488	size = xmlStrlen(str1);
				489	ret = (xmlChar ) xmlMalloc((size + len + 1) sizeof(xmlChar));
				490	if (ret == NULL) {
				491	xmlErrMemory(NULL, NULL);
				492	return(xmlStrndup(str1, size));
				493	}
				494	memcpy(ret, str1, size * sizeof(xmlChar));
				495	memcpy(&ret[size], str2, len * sizeof(xmlChar));
				496	ret[size + len] = 0;
				497	return(ret);
				498	}
				499
				500	/**
				501	* xmlStrcat:
				502	* @cur: the original xmlChar * array
				503	* @add: the xmlChar * array added
				504	*
				505	* a strcat for array of xmlChar's. Since they are supposed to be
				506	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
				507	* a termination mark of '0'.
				508	*
				509	* Returns a new xmlChar * containing the concatenated string.
				510	*/
				511	xmlChar *
				512	xmlStrcat(xmlChar cur, const xmlChar add) {
				513	const xmlChar *p = add;
				514
				515	if (add == NULL) return(cur);
				516	if (cur == NULL)
				517	return(xmlStrdup(add));
				518
				519	while (p != 0) p++; / non input consuming */
				520	return(xmlStrncat(cur, add, p - add));
				521	}
				522
				523	/**
				524	* xmlStrPrintf:
				525	* @buf: the result buffer.
				526	* @len: the result buffer length.
				527	* @msg: the message with printf formatting.
				528	* @...: extra parameters for the message.
				529	*
				530	* Formats @msg and places result into @buf.
				531	*
				532	* Returns the number of characters written to @buf or -1 if an error occurs.
				533	*/
				534	int
				535	xmlStrPrintf(xmlChar buf, int len, const xmlChar msg, ...) {
				536	va_list args;
				537	int ret;
				538
				539	if((buf == NULL) \|\| (msg == NULL)) {
				540	return(-1);
				541	}
				542
				543	va_start(args, msg);
				544	ret = vsnprintf((char ) buf, len, (const char ) msg, args);
				545	va_end(args);
				546	buf[len - 1] = 0; /* be safe ! */
				547
				548	return(ret);
				549	}
				550
				551	/**
				552	* xmlStrVPrintf:
				553	* @buf: the result buffer.
				554	* @len: the result buffer length.
				555	* @msg: the message with printf formatting.
				556	* @ap: extra parameters for the message.
				557	*
				558	* Formats @msg and places result into @buf.
				559	*
				560	* Returns the number of characters written to @buf or -1 if an error occurs.
				561	*/
				562	int
				563	xmlStrVPrintf(xmlChar buf, int len, const xmlChar msg, va_list ap) {
				564	int ret;
				565
				566	if((buf == NULL) \|\| (msg == NULL)) {
				567	return(-1);
				568	}
				569
				570	ret = vsnprintf((char ) buf, len, (const char ) msg, ap);
				571	buf[len - 1] = 0; /* be safe ! */
				572
				573	return(ret);
				574	}
				575
				576	/************************************************************************
				577	* *
				578	* Generic UTF8 handling routines *
				579	* *
				580	* From rfc2044: encoding of the Unicode values on UTF-8: *
				581	* *
				582	* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
				583	* 0000 0000-0000 007F 0xxxxxxx *
				584	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
				585	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
				586	* *
				587	* I hope we won't use values > 0xFFFF anytime soon ! *
				588	* *
				589	************************************************************************/
				590
				591
				592	/**
				593	* xmlUTF8Size:
				594	* @utf: pointer to the UTF8 character
				595	*
				596	* calculates the internal size of a UTF8 character
				597	*
				598	* returns the numbers of bytes in the character, -1 on format error
				599	*/
				600	int
				601	xmlUTF8Size(const xmlChar *utf) {
				602	xmlChar mask;
				603	int len;
				604
				605	if (utf == NULL)
				606	return -1;
				607	if (*utf < 0x80)
				608	return 1;
				609	/* check valid UTF8 character */
				610	if (!(*utf & 0x40))
				611	return -1;
				612	/* determine number of bytes in char */
				613	len = 2;
				614	for (mask=0x20; mask != 0; mask>>=1) {
				615	if (!(*utf & mask))
				616	return len;
				617	len++;
				618	}
				619	return -1;
				620	}
				621
				622	/**
				623	* xmlUTF8Charcmp:
				624	* @utf1: pointer to first UTF8 char
				625	* @utf2: pointer to second UTF8 char
				626	*
				627	* compares the two UCS4 values
				628	*
				629	* returns result of the compare as with xmlStrncmp
				630	*/
				631	int
				632	xmlUTF8Charcmp(const xmlChar utf1, const xmlChar utf2) {
				633
				634	if (utf1 == NULL ) {
				635	if (utf2 == NULL)
				636	return 0;
				637	return -1;
				638	}
				639	return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
				640	}
				641
				642	/**
				643	* xmlUTF8Strlen:
				644	* @utf: a sequence of UTF-8 encoded bytes
				645	*
				646	* compute the length of an UTF8 string, it doesn't do a full UTF8
				647	* checking of the content of the string.
				648	*
				649	* Returns the number of characters in the string or -1 in case of error
				650	*/
				651	int
				652	xmlUTF8Strlen(const xmlChar *utf) {
				653	int ret = 0;
				654
				655	if (utf == NULL)
				656	return(-1);
				657
				658	while (*utf != 0) {
				659	if (utf[0] & 0x80) {
				660	if ((utf[1] & 0xc0) != 0x80)
				661	return(-1);
				662	if ((utf[0] & 0xe0) == 0xe0) {
				663	if ((utf[2] & 0xc0) != 0x80)
				664	return(-1);
				665	if ((utf[0] & 0xf0) == 0xf0) {
				666	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				667	return(-1);
				668	utf += 4;
				669	} else {
				670	utf += 3;
				671	}
				672	} else {
				673	utf += 2;
				674	}
				675	} else {
				676	utf++;
				677	}
				678	ret++;
				679	}
				680	return(ret);
				681	}
				682
				683	/**
				684	* xmlGetUTF8Char:
				685	* @utf: a sequence of UTF-8 encoded bytes
William M. Brack	3e53016	2004-09-03 17:10:08 +0000	[diff] [blame]	686	* @len: a pointer to the minimum number of bytes present in
				687	* the sequence. This is used to assure the next character
				688	* is completely contained within the sequence.
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	689	*
William M. Brack	3e53016	2004-09-03 17:10:08 +0000	[diff] [blame]	690	* Read the first UTF8 character from @utf
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	691	*
William M. Brack	3e53016	2004-09-03 17:10:08 +0000	[diff] [blame]	692	* Returns the char value or -1 in case of error, and sets *len to
				693	* the actual number of bytes consumed (0 in case of error)
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	694	*/
				695	int
				696	xmlGetUTF8Char(const unsigned char utf, int len) {
				697	unsigned int c;
				698
				699	if (utf == NULL)
				700	goto error;
				701	if (len == NULL)
				702	goto error;
				703	if (*len < 1)
				704	goto error;
				705
				706	c = utf[0];
				707	if (c & 0x80) {
				708	if (*len < 2)
				709	goto error;
				710	if ((utf[1] & 0xc0) != 0x80)
				711	goto error;
				712	if ((c & 0xe0) == 0xe0) {
				713	if (*len < 3)
				714	goto error;
				715	if ((utf[2] & 0xc0) != 0x80)
				716	goto error;
				717	if ((c & 0xf0) == 0xf0) {
				718	if (*len < 4)
				719	goto error;
				720	if ((c & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				721	goto error;
				722	*len = 4;
				723	/* 4-byte code */
				724	c = (utf[0] & 0x7) << 18;
				725	c \|= (utf[1] & 0x3f) << 12;
				726	c \|= (utf[2] & 0x3f) << 6;
				727	c \|= utf[3] & 0x3f;
				728	} else {
				729	/* 3-byte code */
				730	*len = 3;
				731	c = (utf[0] & 0xf) << 12;
				732	c \|= (utf[1] & 0x3f) << 6;
				733	c \|= utf[2] & 0x3f;
				734	}
				735	} else {
				736	/* 2-byte code */
				737	*len = 2;
				738	c = (utf[0] & 0x1f) << 6;
				739	c \|= utf[1] & 0x3f;
				740	}
				741	} else {
				742	/* 1-byte code */
				743	*len = 1;
				744	}
				745	return(c);
				746
				747	error:
				748	*len = 0;
				749	return(-1);
				750	}
				751
				752	/**
				753	* xmlCheckUTF8:
				754	* @utf: Pointer to putative UTF-8 encoded string.
				755	*
				756	* Checks @utf for being valid UTF-8. @utf is assumed to be
				757	* null-terminated. This function is not super-strict, as it will
				758	* allow longer UTF-8 sequences than necessary. Note that Java is
				759	* capable of producing these sequences if provoked. Also note, this
				760	* routine checks for the 4-byte maximum size, but does not check for
				761	* 0x10ffff maximum value.
				762	*
				763	* Return value: true if @utf is valid.
				764	**/
				765	int
				766	xmlCheckUTF8(const unsigned char *utf)
				767	{
				768	int ix;
				769	unsigned char c;
				770
William M. Brack	3ffe90e	2004-08-28 01:33:30 +0000	[diff] [blame]	771	/*
				772	* utf is a string of 1, 2, 3 or 4 bytes. The valid strings
				773	* are as follows (in "bit format"):
				774	* 0xxxxxxx valid 1-byte
				775	* 110xxxxx 10xxxxxx valid 2-byte
				776	* 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte
				777	* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte
				778	*/
				779	for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */
William M. Brack	f409515	2004-08-31 16:49:26 +0000	[diff] [blame]	780	if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	781	ix++;
William M. Brack	bf5cf21	2004-08-31 06:47:17 +0000	[diff] [blame]	782	} else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
				783	if ((utf[ix+1] & 0xc0 ) != 0x80)
				784	return 0;
				785	ix += 2;
				786	} else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
				787	if (((utf[ix+1] & 0xc0) != 0x80) \|\|
				788	((utf[ix+2] & 0xc0) != 0x80))
				789	return 0;
				790	ix += 3;
				791	} else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
				792	if (((utf[ix+1] & 0xc0) != 0x80) \|\|
				793	((utf[ix+2] & 0xc0) != 0x80) \|\|
				794	((utf[ix+3] & 0xc0) != 0x80))
				795	return 0;
				796	ix += 4;
				797	} else /* unknown encoding */
				798	return 0;
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	799	}
				800	return(1);
				801	}
				802
				803	/**
				804	* xmlUTF8Strsize:
				805	* @utf: a sequence of UTF-8 encoded bytes
				806	* @len: the number of characters in the array
				807	*
				808	* storage size of an UTF8 string
				809	*
				810	* Returns the storage size of
				811	* the first 'len' characters of ARRAY
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	812	*/
				813
				814	int
				815	xmlUTF8Strsize(const xmlChar *utf, int len) {
				816	const xmlChar *ptr=utf;
				817	xmlChar ch;
				818
Daniel Veillard	36e5cd5	2004-11-02 14:52:23 +0000	[diff] [blame]	819	if (utf == NULL)
				820	return(0);
				821
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	822	if (len <= 0)
				823	return(0);
				824
				825	while ( len-- > 0) {
				826	if ( !*ptr )
				827	break;
				828	if ( (ch = *ptr++) & 0x80)
				829	while ( (ch<<=1) & 0x80 )
				830	ptr++;
				831	}
				832	return (ptr - utf);
				833	}
				834
				835
				836	/**
				837	* xmlUTF8Strndup:
				838	* @utf: the input UTF8 *
				839	* @len: the len of @utf (in chars)
				840	*
				841	* a strndup for array of UTF8's
				842	*
				843	* Returns a new UTF8 * or NULL
				844	*/
				845	xmlChar *
				846	xmlUTF8Strndup(const xmlChar *utf, int len) {
				847	xmlChar *ret;
				848	int i;
				849
				850	if ((utf == NULL) \|\| (len < 0)) return(NULL);
				851	i = xmlUTF8Strsize(utf, len);
				852	ret = (xmlChar ) xmlMallocAtomic((i + 1) sizeof(xmlChar));
				853	if (ret == NULL) {
				854	xmlGenericError(xmlGenericErrorContext,
				855	"malloc of %ld byte failed\n",
				856	(len + 1) * (long)sizeof(xmlChar));
				857	return(NULL);
				858	}
				859	memcpy(ret, utf, i * sizeof(xmlChar));
				860	ret[i] = 0;
				861	return(ret);
				862	}
				863
				864	/**
				865	* xmlUTF8Strpos:
				866	* @utf: the input UTF8 *
				867	* @pos: the position of the desired UTF8 char (in chars)
				868	*
				869	* a function to provide the equivalent of fetching a
				870	* character from a string array
				871	*
				872	* Returns a pointer to the UTF8 character or NULL
				873	*/
Daniel Veillard	8a32fe4	2004-11-02 22:10:16 +0000	[diff] [blame^]	874	const xmlChar *
William M. Brack	a2e844a	2004-01-06 11:52:13 +0000	[diff] [blame]	875	xmlUTF8Strpos(const xmlChar *utf, int pos) {
				876	xmlChar ch;
				877
				878	if (utf == NULL) return(NULL);
				879	if ( (pos < 0) \|\| (pos >= xmlUTF8Strlen(utf)) )
				880	return(NULL);
				881	while (pos--) {
				882	if ((ch=*utf++) == 0) return(NULL);
				883	if ( ch & 0x80 ) {
				884	/* if not simple ascii, verify proper format */
				885	if ( (ch & 0xc0) != 0xc0 )
				886	return(NULL);
				887	/* then skip over remaining bytes for this char */
				888	while ( (ch <<= 1) & 0x80 )
				889	if ( (*utf++ & 0xc0) != 0x80 )
				890	return(NULL);
				891	}
				892	}
				893	return((xmlChar *)utf);
				894	}
				895
				896	/**
				897	* xmlUTF8Strloc:
				898	* @utf: the input UTF8 *
				899	* @utfchar: the UTF8 character to be found
				900	*
				901	* a function to provide the relative location of a UTF8 char
				902	*
				903	* Returns the relative character position of the desired char
				904	* or -1 if not found
				905	*/
				906	int
				907	xmlUTF8Strloc(const xmlChar utf, const xmlChar utfchar) {
				908	int i, size;
				909	xmlChar ch;
				910
				911	if (utf==NULL \|\| utfchar==NULL) return -1;
				912	size = xmlUTF8Strsize(utfchar, 1);
				913	for(i=0; (ch=*utf) != 0; i++) {
				914	if (xmlStrncmp(utf, utfchar, size)==0)
				915	return(i);
				916	utf++;
				917	if ( ch & 0x80 ) {
				918	/* if not simple ascii, verify proper format */
				919	if ( (ch & 0xc0) != 0xc0 )
				920	return(-1);
				921	/* then skip over remaining bytes for this char */
				922	while ( (ch <<= 1) & 0x80 )
				923	if ( (*utf++ & 0xc0) != 0x80 )
				924	return(-1);
				925	}
				926	}
				927
				928	return(-1);
				929	}
				930	/**
				931	* xmlUTF8Strsub:
				932	* @utf: a sequence of UTF-8 encoded bytes
				933	* @start: relative pos of first char
				934	* @len: total number to copy
				935	*
				936	* Create a substring from a given UTF-8 string
				937	* Note: positions are given in units of UTF-8 chars
				938	*
				939	* Returns a pointer to a newly created string
				940	* or NULL if any problem
				941	*/
				942
				943	xmlChar *
				944	xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
				945	int i;
				946	xmlChar ch;
				947
				948	if (utf == NULL) return(NULL);
				949	if (start < 0) return(NULL);
				950	if (len < 0) return(NULL);
				951
				952	/*
				953	* Skip over any leading chars
				954	*/
				955	for (i = 0;i < start;i++) {
				956	if ((ch=*utf++) == 0) return(NULL);
				957	if ( ch & 0x80 ) {
				958	/* if not simple ascii, verify proper format */
				959	if ( (ch & 0xc0) != 0xc0 )
				960	return(NULL);
				961	/* then skip over remaining bytes for this char */
				962	while ( (ch <<= 1) & 0x80 )
				963	if ( (*utf++ & 0xc0) != 0x80 )
				964	return(NULL);
				965	}
				966	}
				967
				968	return(xmlUTF8Strndup(utf, len));
				969	}