- xpath.c encoding.[ch]: William M. Brack provided a set of UTF8 string oriented functions and started cleaning the related areas in xpath.c which needed fixing in this respect Daniel

commit: 97ac13197ce5a6a754a7071a0e95b07f1f54ac6c [log] [tgz]
author: Daniel Veillard <veillard@src.gnome.org> Wed May 30 19:14:17 2001 +0000
committer: Daniel Veillard <veillard@src.gnome.org> Wed May 30 19:14:17 2001 +0000
tree: ef41f68f6d6861de2a3aaa5cc6147ae13e673ed6
parent: 2d70372ce33920712a2a4b0ebdae61c826418324 [diff] [blame]
diff --git a/encoding.c b/encoding.c
index 020f4de..c0b7316 100644
--- a/encoding.c
+++ b/encoding.c

@@ -13,11 +13,14 @@
  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
  *                Information Interchange, ANSI X3.4-1986.
  *
- * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
- *
  * See Copyright for the status of this software.
  *
  * Daniel.Veillard@w3.org
+ *
+ * UTF8 string routines from:
+ * "William M. Brack" <wbrack@mmm.com.hk>
+ *
+ * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
  */
 
 #include "libxml.h"
@@ -64,16 +67,20 @@
 
 static int xmlLittleEndian = 1;
 
-/*
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F   0xxxxxxx
- * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
- *
- * I hope we won't use values > 0xFFFF anytime soon !
- */
+/************************************************************************
+ *									*
+ *			Generic UTF8 handling routines			*
+ *									*
+ * From rfc2044: encoding of the Unicode values on UTF-8:		*
+ *									*
+ * UCS-4 range (hex.)           UTF-8 octet sequence (binary)		*
+ * 0000 0000-0000 007F   0xxxxxxx					*
+ * 0000 0080-0000 07FF   110xxxxx 10xxxxxx				*
+ * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 			*
+ *									*
+ * I hope we won't use values > 0xFFFF anytime soon !			*
+ *									*
+ ************************************************************************/
 
 /**
  * xmlUTF8Strlen:
@@ -85,7 +92,7 @@
  * Returns the number of characters in the string or -1 in case of error
  */
 int
-xmlUTF8Strlen(const unsigned char *utf) {
+xmlUTF8Strlen(const xmlChar *utf) {
     int ret = 0;
 
     if (utf == NULL)
@@ -228,6 +235,178 @@
 }
 
 /**
+ * xmlUTF8Strsize:
+ * @utf:  a sequence of UTF-8 encoded bytes (may be NULL)
+ * @len:  the number of characters to measure
+ *
+ * Computes the storage size, in bytes, of the first @len characters
+ * of @utf. Scanning stops at the terminating 0, even when it falls
+ * inside a truncated multi-byte sequence.
+ *
+ * Returns the size in bytes, or 0 if @utf is NULL or @len <= 0
+ */
+int
+xmlUTF8Strsize(const xmlChar *utf, int len) {
+    const xmlChar *ptr = utf;
+    xmlChar ch;
+
+    if ((utf == NULL) || (len <= 0))
+        return(0);
+
+    while ((len-- > 0) && (*ptr != 0)) {
+        ch = *ptr++;
+        /* each leading 1 bit after the first announces one more byte */
+        if (ch & 0x80) {
+            while (((ch <<= 1) & 0x80) && (*ptr != 0))
+                ptr++;
+        }
+    }
+    return((int) (ptr - utf));
+}
+
+
+/**
+ * xmlUTF8Strndup:
+ * @utf:  the input UTF-8 string
+ * @len:  the number of characters (not bytes) of @utf to copy
+ *
+ * A strndup() for UTF-8 strings; the copy is always 0 terminated.
+ *
+ * Returns the new string, or NULL on bad arguments or failed allocation
+ */
+xmlChar *
+xmlUTF8Strndup(const xmlChar *utf, int len) {
+    xmlChar *ret;
+    int i;
+
+    if ((utf == NULL) || (len < 0)) return(NULL);
+    i = xmlUTF8Strsize(utf, len);  /* bytes spanned by @len characters */
+    ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
+    if (ret == NULL) {
+        xmlGenericError(xmlGenericErrorContext,
+                "malloc of %ld byte failed\n",
+                (i + 1) * (long)sizeof(xmlChar));  /* bytes requested */
+        return(NULL);
+    }
+    memcpy(ret, utf, i * sizeof(xmlChar));
+    ret[i] = 0;
+    return(ret);
+}
+
+/**
+ * xmlUTF8Strpos:
+ * @utf:  the input UTF-8 string
+ * @pos:  the index of the wanted character, counted in characters
+ *
+ * The UTF-8 counterpart of indexing into a plain character array.
+ *
+ * Returns a pointer to character @pos, or NULL if out of range or invalid
+ */
+xmlChar *
+xmlUTF8Strpos(const xmlChar *utf, int pos) {
+    xmlChar lead;
+
+    if ((utf == NULL) || (pos < 0))
+        return(NULL);
+    if (pos >= xmlUTF8Strlen(utf))
+        return(NULL);
+    while (pos-- > 0) {
+        lead = *utf++;
+        if (lead == 0)
+            return(NULL);
+        if ((lead & 0x80) == 0)
+            continue;               /* plain ASCII, one byte */
+        if ((lead & 0xc0) != 0xc0)
+            return(NULL);           /* stray continuation byte */
+        for (lead <<= 1; lead & 0x80; lead <<= 1)
+            if ((*utf++ & 0xc0) != 0x80)
+                return(NULL);       /* bad continuation byte */
+    }
+    return((xmlChar *) utf);
+}
+
+/**
+ * xmlUTF8Strloc:
+ * @utf:  the input UTF-8 string
+ * @utfchar:  the UTF-8 character to look for
+ *
+ * Finds the first occurrence of the character @utfchar within @utf.
+ *
+ * Returns the index of that occurrence, counted in characters, or -1
+ * if it is not found, an argument is NULL or @utf is malformed
+ */
+int
+xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
+    int index = 0;
+    int nbytes;
+    xmlChar lead;
+
+    if ((utf == NULL) || (utfchar == NULL))
+        return(-1);
+    nbytes = xmlUTF8Strsize(utfchar, 1);    /* first character only */
+    while ((lead = *utf) != 0) {
+        if (xmlStrncmp(utf, utfchar, nbytes) == 0)
+            return(index);
+        utf++;
+        if (lead & 0x80) {
+            if ((lead & 0xc0) != 0xc0)
+                return(-1);         /* stray continuation byte */
+            for (lead <<= 1; lead & 0x80; lead <<= 1)
+                if ((*utf++ & 0xc0) != 0x80)
+                    return(-1);     /* bad continuation byte */
+        }
+        index++;
+    }
+    return(-1);
+}
+/**
+ * xmlUTF8Strsub:
+ * @utf:  a sequence of UTF-8 encoded bytes
+ * @start:  index of the first character to copy
+ * @len:  maximum number of characters to copy
+ *
+ * Builds a new string from the @len characters of @utf that begin at
+ * character @start. Both @start and @len are counted in UTF-8
+ * characters, not in bytes.
+ *
+ * Returns a pointer to a newly created string, or NULL if an argument
+ * is invalid, @start lies beyond the end of @utf, the skipped part is
+ * malformed or xmlUTF8Strndup() fails
+ */
+xmlChar *
+xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
+    int skipped = 0;
+    xmlChar lead;
+
+    if ((utf == NULL) || (start < 0) || (len < 0))
+        return(NULL);
+
+    /* step over the @start leading characters, validating as we go */
+    while (skipped < start) {
+        lead = *utf++;
+        if (lead == 0)
+            return(NULL);
+        if (lead & 0x80) {
+            if ((lead & 0xc0) != 0xc0)
+                return(NULL);       /* stray continuation byte */
+            for (lead <<= 1; lead & 0x80; lead <<= 1)
+                if ((*utf++ & 0xc0) != 0x80)
+                    return(NULL);   /* bad continuation byte */
+        }
+        skipped++;
+    }
+
+    /* @utf now points at character @start */
+    return(xmlUTF8Strndup(utf, len));
+}
+
+/************************************************************************
+ *									*
+ *		Conversions To/From UTF8 encoding			*
+ *									*
+ ************************************************************************/
+
+/**
  * asciiToUTF8:
  * @out:  a pointer to an array of bytes to store the result
  * @outlen:  the length of @out
@@ -912,6 +1091,12 @@
     return(0);
 }
 
+/************************************************************************
+ *									*
+ *		Generic encoding handling routines			*
+ *									*
+ ************************************************************************/
+
 /**
  * xmlDetectCharEncoding:
  * @in:  a pointer to the first bytes of the XML entity, must be at least
@@ -1256,11 +1441,12 @@
     return(NULL);
 }
 
-/****************************************************************
- *								*
- *		Char encoding handlers				*
- *								*
- ****************************************************************/
+/************************************************************************
+ *									*
+ *			Char encoding handlers				*
+ *									*
+ ************************************************************************/
+
 
 /* the size should be growable, but it's not a big deal ... */
 #define MAX_ENCODING_HANDLERS 50
@@ -1669,6 +1855,12 @@
     return(NULL);
 }
 
+/************************************************************************
+ *									*
+ *		ICONV based generic conversion functions		*
+ *									*
+ ************************************************************************/
+
 #ifdef LIBXML_ICONV_ENABLED
 /**
  * xmlIconvWrapper:
@@ -1730,6 +1922,12 @@
 }
 #endif /* LIBXML_ICONV_ENABLED */
 
+/************************************************************************
+ *									*
+ *		The real API used by libxml for on-the-fly conversion	*
+ *									*
+ ************************************************************************/
+
 /**
  * xmlCharEncFirstLine:
  * @handler:	char enconding transformation data structure
commit	97ac13197ce5a6a754a7071a0e95b07f1f54ac6c	[log] [tgz]
author	Daniel Veillard <veillard@src.gnome.org>	Wed May 30 19:14:17 2001 +0000
committer	Daniel Veillard <veillard@src.gnome.org>	Wed May 30 19:14:17 2001 +0000
tree	ef41f68f6d6861de2a3aaa5cc6147ae13e673ed6
parent	2d70372ce33920712a2a4b0ebdae61c826418324 [diff] [blame]