Unicode patch-up and stuff.

commit: 16571dcb63b48338c4d45a0a1e7c7610aa3ca92a [log] [tgz]
author: Linus Walleij <triad@df.lth.se> Thu Aug 17 20:27:46 2006 +0000
committer: Linus Walleij <triad@df.lth.se> Thu Aug 17 20:27:46 2006 +0000
tree: 08430991a6989b65189cd710deaae4a1910b0bcf
parent: a1124e79deb4a5cb743be1ddfe284d16c9720e69 [diff]
diff --git a/ChangeLog b/ChangeLog
index 1eb3522..eee83a7 100644
--- a/ChangeLog
+++ b/ChangeLog

@@ -1,3 +1,14 @@
+2006-08-17  Linus Walleij <triad@df.lth.se>
+
+	* configure.ac: bump to 0.0.12 and require iconv.h.
+
+2006-08-16  Linus Walleij <triad@df.lth.se>
+
+	* src/unicode.c: remove bigendian weirdness and switch
+	  to using iconv() instead.
+	* src/unicode.h: ditto.
+	* src/libmtp.c: reflect changes.
+	
 2006-08-09  Linus Walleij <triad@df.lth.se>
 
 	* Release libmtp 0.0.11.

diff --git a/configure.ac b/configure.ac
index e0fb658..25b7484 100644
--- a/configure.ac
+++ b/configure.ac

@@ -1,6 +1,6 @@
 # Process this file with autoconf to produce a configure script.
 AC_PREREQ(2.52)
-AC_INIT([libmtp], [0.0.11], [libmtp-users@lists.sourceforge.net])
+AC_INIT([libmtp], [0.0.12], [libmtp-users@lists.sourceforge.net])
 AM_INIT_AUTOMAKE([foreign])
 AC_CONFIG_SRCDIR([src/libmtp.c])
 AC_CONFIG_HEADER([config.h])
@@ -62,7 +62,7 @@
 AC_HEADER_TIME
 # zlib.h the day we need to decompress firmware
 AC_CHECK_HEADERS([ctype.h errno.h fcntl.h getopt.h libgen.h \
-	limits.h stdio.h string.h sys/stat.h sys/time.h unistd.h])
+	limits.h stdio.h string.h sys/stat.h sys/time.h unistd.h iconv.h])
 AC_CHECK_HEADER([usb.h],,
 	AC_MSG_ERROR([I can't find the libusb header file on your system.
 	You may need to set the CPPFLAGS environment variable to include

diff --git a/src/libmtp.c b/src/libmtp.c
index 09466c1..ecc07e1 100644
--- a/src/libmtp.c
+++ b/src/libmtp.c

@@ -454,6 +454,7 @@
 void LIBMTP_Init(void)
 {
   init_filemap();
+  unicode_init();
   return;
 }
 
@@ -485,7 +486,7 @@
   if (ret == PTP_RC_OK) {
     if (getUtf8 == 1) {
       if (propval.unistr != NULL) {
-	retstring = ucs2_to_utf8(propval.unistr, 0);
+	retstring = ucs2le_to_utf8(propval.unistr);
 	free(propval.unistr);
       }
     } else {
@@ -586,7 +587,7 @@
   }
 
   if (setUtf8 == 1) {
-    propval.unistr = utf8_to_ucs2((unsigned char const * const) string, 0);
+    propval.unistr = utf8_to_ucs2le((unsigned char const * const) string);
     ret = ptp_mtp_setobjectpropvalue(params, object_id, attribute_id, &propval, PTP_DTC_UNISTR);
     free(propval.unistr);
   } else {
@@ -1079,7 +1080,7 @@
     return NULL;
   }
   // Convert from UTF-16 to UTF-8
-  retstring = ucs2_to_utf8((uint16_t *) propval.unistr, 0);
+  retstring = ucs2le_to_utf8((uint16_t *) propval.unistr);
   free(propval.unistr);
   return retstring;
 }
@@ -1180,8 +1181,7 @@
 {
   PTPPropertyValue propval;
   PTPParams *params = (PTPParams *) device->params;
-  uint8_t *tmp;
-  uint32_t len;
+  uint16_t *tmp;
   int i;
 
   if (!ptp_property_issupported(params, property)) {
@@ -1197,20 +1197,17 @@
   }
 
   // Extract the actual array.
-  len = propval.a.count * 2 + 2;
-  tmp = malloc(len);
+  // printf("Array of %d elements\n", propval.a.count);
+  tmp = malloc((propval.a.count + 1)*sizeof(uint16_t));
   for (i = 0; i < propval.a.count; i++) {
-    // Force this to become a little-endian unicode string
-    // in order to be able to use the ucs2_to_utf8 function.
-    uint16_t tch = propval.a.v[i].u16;
-    tmp[i*2] = (uint8_t) tch & 0xFF;
-    tmp[(i*2)+1] = (uint8_t) tch >> 8;
+    tmp[i] = propval.a.v[i].u16;
+    // printf("%04x ", tmp[i]);
   }
-  tmp[len-1] = 0;
-  tmp[len-2] = 0;
+  tmp[propval.a.count] = 0x0000U;
   free(propval.a.v);
 
-  *unicstring = ucs2_to_utf8((uint16_t *) tmp, 0);
+  *unicstring = utf16_to_utf8(tmp);
+
   free(tmp);
 
   return 0;

diff --git a/src/unicode.c b/src/unicode.c
index 0f06645..9cdc535 100644
--- a/src/unicode.c
+++ b/src/unicode.c

@@ -17,10 +17,56 @@
 #include "unicode.h"
 #include "util.h"
 
+#ifdef USE_ICONV
+#include <iconv.h>
+// Default to not using iconv() until it's properly initialized.
+static int use_fallbacks = 1;
+
+/*
+ * iconv converters, since these conversions are stateless
+ * to/from UTF-8/UCS-2, we don't need any special thread
+ * handling here. (Like making the iconv() converters part
+ * of the device struct.)
+ */
+iconv_t cd_utf8_to_ucs2le;
+iconv_t cd_ucs2le_to_utf8;
+iconv_t cd_utf16_to_utf8;
+
+/* Opens the three iconv() descriptors; when any of them is unavailable the
+ * ones that did open are closed and the builtin converters stay in use. */
+void unicode_init(void)
+{
+  cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16");
+  cd_ucs2le_to_utf8 = iconv_open("UTF-8", "UCS-2LE");
+  cd_utf8_to_ucs2le = iconv_open("UCS-2LE", "UTF-8");
+  if (cd_utf16_to_utf8 == (iconv_t) -1 ||
+      cd_ucs2le_to_utf8 == (iconv_t) -1 ||
+      cd_utf8_to_ucs2le == (iconv_t) -1) {
+    if (cd_utf16_to_utf8 != (iconv_t) -1)
+      iconv_close(cd_utf16_to_utf8);
+    if (cd_ucs2le_to_utf8 != (iconv_t) -1)
+      iconv_close(cd_ucs2le_to_utf8);
+    if (cd_utf8_to_ucs2le != (iconv_t) -1)
+      iconv_close(cd_utf8_to_ucs2le);
+    // Stop here: falling through would re-enable iconv() on closed or
+    // invalid descriptors.
+    use_fallbacks = 1;
+    return;
+  }
+  // OK activate the iconv() stuff...
+  use_fallbacks = 0;
+}
+#else
+// Without iconv there is nothing to initialize.
+void unicode_init(void)
+{
+}
+#endif
+
 /**
  * The size of the buffer (in characters) used for creating string copies.
  */
-#define STRING_BUFFER_LENGTH 256
+#define STRING_BUFFER_LENGTH 1024
 
 /** 
  * Gets the length (in characters, not bytes) of a unicode 
@@ -71,7 +117,7 @@
  * @param unicstr the UCS-2 string to copy
  * @return a newly allocated copy of the string
  */
-static uint16_t *ucs2strdup(uint16_t const * const unicstr) {
+static uint16_t *ucs2_strdup(uint16_t const * const unicstr) {
   int length = ucs2_strlen(unicstr);
   uint8_t *data;
   
@@ -83,26 +129,12 @@
   return (uint16_t *) data;
 }
 
-
-/**
- * Converts a Unicode UCS-2 2-byte string to a UTF-8
- * string.
- *
- * @param unicstr the UCS-2 unicode string to convert
- * @param endianness the default endianness of the string. 0 means
- *        little-endian, any other value means big-endian.
- *        If a byte-order-mark (BOM) occurs in the string this
- *        will be honoured and switch the endianness.
- * @return a UTF-8 string.
- */
-char *ucs2_to_utf8(uint16_t const * const unicstr, 
-		   uint8_t const endianness) {
+static char *builtin_ucs2le_to_utf8(uint16_t const * const unicstr) {
   char *data = NULL;
   int i = 0;
   int l = 0;
   int length8;
   uint8_t *locstr = (uint8_t *) unicstr;
-  uint8_t locend = endianness;
 
   length8 = ucs2utf8len(unicstr);
   data = (char *) malloc(length8+1);
@@ -110,43 +142,19 @@
     return NULL;
   }
   for(l = 0; (locstr[l] | locstr[l+1]) != '\0'; l += 2) {
-    // This will honour the byte-order-mark properly
-    if (locstr[l] == 0xFF && locstr[l+1] == 0xFE) {
-      locend = 0;
-    } else if (locstr[l] == 0xFE && locstr[l+1] == 0xFF) {
-      locend = 1;
+    // This is for little-endian machines
+    if (locstr[l+1] == 0x00 && locstr[l] < 0x80U) {
+      data[i] = locstr[l];
+      i ++;
+    } else if (locstr[l+1] < 0x08) {
+      data[i] = 0xc0 | (locstr[l+1]<<2 & 0x1C) | (locstr[l]>>6  & 0x03);
+      data[i+1] = 0x80 | (locstr[l] & 0x3F);
+      i+=2;
     } else {
-      if (!locend) {
-	// This is for little-endian machines
-	if (locstr[l+1] == 0x00 && locstr[l] < 0x80U) {
-	  data[i] = locstr[l];
-	  i ++;
-	} else if (locstr[l+1] < 0x08) {
-	  data[i] = 0xc0 | (locstr[l+1]<<2 & 0x1C) | (locstr[l]>>6  & 0x03);
-	  data[i+1] = 0x80 | (locstr[l] & 0x3F);
-	i+=2;
-	} else {
-	  data[i] = 0xe0 | (locstr[l+1]>>4 & 0x0F);
-	  data[i+1] = 0x80 | (locstr[l+1]<<2 & 0x3C) | (locstr[l]>>6 & 0x03);
-	  data[i+2] = 0x80 | (locstr[l] & 0x3F);
-	  i+=3;
-	}
-      } else {
-	// This is for big-endian machines
-	if (locstr[l] == 0x00 && locstr[l+1] < 0x80U) {
-	  data[i] = locstr[l+1];
-	  i ++;
-	} else if (locstr[l] < 0x08) {
-	data[i] = 0xc0 | (locstr[l]<<2 & 0x1C) | (locstr[l+1]>>6  & 0x03);
-	data[i+1] = 0x80 | (locstr[l+1] & 0x3F);
-	i+=2;
-	} else {
-	  data[i] = 0xe0 | (locstr[l]>>4 & 0x0F);
-	  data[i+1] = 0x80 | (locstr[l]<<2 & 0x3C) | (locstr[l+1]>>6 & 0x03);
-	  data[i+2] = 0x80 | (locstr[l+1] & 0x3F);
-	  i+=3;
-	}
-      }
+      data[i] = 0xe0 | (locstr[l+1]>>4 & 0x0F);
+      data[i+1] = 0x80 | (locstr[l+1]<<2 & 0x3C) | (locstr[l]>>6 & 0x03);
+      data[i+2] = 0x80 | (locstr[l] & 0x3F);
+      i+=3;
     }
   }
   /* Terminate string */
@@ -156,15 +164,96 @@
 }
 
 /**
- * Convert a UTF-8 string to a unicode UCS-2 string.
+ * Converts a little-endian Unicode UCS-2 2-byte string
+ * to a UTF-8 string.
  *
- * @param str the UTF-8 string to convert.
- * @param endianness desired endianness of the returned string. 0 means
- *        little-endian, any other value means big-endian.
- * @return a pointer to a newly allocated UCS-2 string.
+ * @param unicstr the UCS-2 unicode string to convert
+ * @return a UTF-8 string.
  */
-uint16_t *utf8_to_ucs2(unsigned char const * const str,
-		       const uint8_t endianness) {
+#ifdef USE_ICONV
+char *ucs2le_to_utf8(uint16_t const * const unicstr) {
+  // A UCS-2 character needs at most three UTF-8 bytes; one spare
+  // byte is kept so the buffer can always be terminated.
+  char utf8buf[STRING_BUFFER_LENGTH*3+1];
+  char *src = (char *) unicstr;
+  char *dst = utf8buf;
+  size_t srcleft;
+  size_t dstleft = STRING_BUFFER_LENGTH*3;
+
+  if (use_fallbacks) {
+    return builtin_ucs2le_to_utf8(unicstr);
+  }
+  // The 16-bit terminator is converted too, so count it in.
+  srcleft = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t);
+  utf8buf[0] = '\0';
+  if (iconv(cd_ucs2le_to_utf8, &src, &srcleft, &dst, &dstleft) == (size_t) -1) {
+    return NULL;
+  }
+  utf8buf[STRING_BUFFER_LENGTH*3] = '\0';
+  return strdup(utf8buf);
+}
+#else
+char *ucs2le_to_utf8(uint16_t const * const unicstr) {
+  return builtin_ucs2le_to_utf8(unicstr);
+}
+#endif
+
+/**
+ * Converts a UTF-16 string, which may begin with a byte
+ * order mark (BOM), to a UTF-8 string without the BOM.
+ *
+ * @param unicstr the zero-terminated UTF-16 string to convert
+ * @return a newly allocated UTF-8 string, or NULL on failure.
+ */
+#ifdef USE_ICONV
+char *utf16_to_utf8(const uint16_t *unicstr)
+{
+  if (use_fallbacks) {
+    if (unicstr[0] == 0xFFFEU || unicstr[0] == 0xFEFFU) {
+      // Consume BOM, endianness is fixed at network layer.
+      return builtin_ucs2le_to_utf8(unicstr+1);
+    }
+    return builtin_ucs2le_to_utf8(unicstr);
+  } else {
+    char *stringp = (char *) unicstr;
+    char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
+    char *locp = loclstr;
+    size_t nconv;
+    size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
+    size_t convmax = STRING_BUFFER_LENGTH*3;
+    // Shared descriptor: reset it so earlier strings cannot affect this one.
+    (void) iconv(cd_utf16_to_utf8, NULL, NULL, NULL, NULL);
+    loclstr[0]='\0';
+    nconv = iconv(cd_utf16_to_utf8, &stringp, &convlen, &locp, &convmax);
+    if (nconv == (size_t) -1) {
+      // Return partial string anyway.
+      *locp = '\0';
+    }
+    loclstr[STRING_BUFFER_LENGTH*3] = '\0';
+    // Strip off any BOM, it's totally useless...
+    if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
+      return strdup(loclstr+3);
+    }
+    return strdup(loclstr);
+  }
+}
+#else
+char *utf16_to_utf8(const uint16_t *unicstr)
+{
+  if (unicstr[0] == 0xFFFEU) {
+    // FIXME: big-endian, swap bytes around or something
+    return NULL;
+  }
+  if (unicstr[0] == 0xFEFFU) {
+    // Consume BOM
+    return builtin_ucs2le_to_utf8(unicstr+1);
+  }
+  return builtin_ucs2le_to_utf8(unicstr);
+}
+#endif
+
+
+static uint16_t *builtin_utf8_to_ucs2le(unsigned char const * const str) {
   uint16_t *retval;
   int i;
   unsigned char buffer[STRING_BUFFER_LENGTH*2];    
@@ -172,13 +261,8 @@
     
   for(i = 0; str[i] != '\0' && length < (STRING_BUFFER_LENGTH*2-2);) {
     if (str[i] < 0x80) {
-      if (!endianness) {
-	buffer[length+1] = 0x00;
-	buffer[length] = str[i];
-      } else {
-	buffer[length] = 0x00;
-	buffer[length+1] = str[i];
-      }
+      buffer[length+1] = 0x00;
+      buffer[length] = str[i];
       length += 2;
       i++;
     } else {
@@ -194,24 +278,13 @@
       /* UCS-2 can handle no more than 3 UTF-8 encoded bytes */
       if (numbytes <= 3) {
 	if (numbytes == 2 && str[i+1] > 0x80) {
-	  /* This character can always be handled correctly */
-	  if (!endianness) {
-	    buffer[length+1] = (str[i]>>2 & 0x07);
-	    buffer[length] = (str[i]<<6 & 0xC0) | (str[i+1] & 0x3F);
-	  } else {
-	    buffer[length] = (str[i]>>2 & 0x07);
-	    buffer[length+1] = (str[i]<<6 & 0xC0) | (str[i+1] & 0x3F);
-	  }
+	  buffer[length+1] = (str[i]>>2 & 0x07);
+	  buffer[length] = (str[i]<<6 & 0xC0) | (str[i+1] & 0x3F);
 	  i += 2;
 	  length += 2;
 	} else if (numbytes == 3 && str[i+1] > 0x80 && str[i+2] > 0x80) {
-	  if (!endianness) {
-	    buffer[length+1] = (str[i]<<4 & 0xF0) | (str[i+1]>>2 & 0x0F);
-	    buffer[length]= (str[i+1]<<6 & 0xC0) | (str[i+2] & 0x3F);
-	  } else {
-	    buffer[length] = (str[i]<<4 & 0xF0) | (str[i+1]>>2 & 0x0F);
-	    buffer[length+1]= (str[i+1]<<6 & 0xC0) | (str[i+2] & 0x3F);
-	  }
+	  buffer[length+1] = (str[i]<<4 & 0xF0) | (str[i+1]>>2 & 0x0F);
+	  buffer[length]= (str[i+1]<<6 & 0xC0) | (str[i+2] & 0x3F);
 	  i += 3;
 	  length += 2;
 	} else {
@@ -229,6 +302,40 @@
   buffer[length+1] = 0x00;
 
   // Copy the buffer contents
-  retval = ucs2strdup((uint16_t *) buffer);
+  retval = ucs2_strdup((uint16_t *) buffer);
   return retval;
 }
+
+/**
+ * Produces a little-endian UCS-2 copy of a UTF-8
+ * string.
+ *
+ * @param str the UTF-8 string to convert.
+ * @return a newly allocated UCS-2 string, or NULL when the conversion fails.
+ */
+#ifdef USE_ICONV
+uint16_t *utf8_to_ucs2le(unsigned char const * const str) {
+  uint16_t widebuf[STRING_BUFFER_LENGTH+1];
+  char *dst = (char *) widebuf;
+  char *src = (char *) str;
+  size_t srcleft;
+  size_t dstleft = STRING_BUFFER_LENGTH * 2; // Byte budget, terminator included
+
+  if (use_fallbacks) {
+    return builtin_utf8_to_ucs2le(str);
+  }
+
+  // Feed the trailing NUL through iconv() as well so the result is terminated.
+  srcleft = strlen((char*)str) + 1;
+  widebuf[0] = 0x0000U;
+  if (iconv(cd_utf8_to_ucs2le, &src, &srcleft, &dst, &dstleft) == (size_t) -1) {
+    return NULL;
+  }
+  // Hand back a heap copy of the converted stack buffer.
+  return ucs2_strdup(widebuf);
+}
+#else
+uint16_t *utf8_to_ucs2le(unsigned char const * const str) {
+  return builtin_utf8_to_ucs2le(str);
+}
+#endif

diff --git a/src/unicode.h b/src/unicode.h
index 471170a..9d439d8 100644
--- a/src/unicode.h
+++ b/src/unicode.h

@@ -1,8 +1,15 @@
 #ifndef __MTP__UNICODE__H
 #define __MTP__UNICODE__H
 
+#include "config.h"
+#ifdef HAVE_ICONV_H
+#define USE_ICONV
+#endif
+
+void unicode_init(void);
 int ucs2_strlen(uint16_t const * const);
-char *ucs2_to_utf8(const uint16_t*, const uint8_t endianness);
-uint16_t *utf8_to_ucs2(unsigned char const * const, const uint8_t endianness);
+char *ucs2le_to_utf8(const uint16_t*);
+char *utf16_to_utf8(const uint16_t*);
+uint16_t *utf8_to_ucs2le(unsigned char const * const);
 
 #endif /* __MTP__UNICODE__H */
commit	16571dcb63b48338c4d45a0a1e7c7610aa3ca92a	[log] [tgz]
author	Linus Walleij <triad@df.lth.se>	Thu Aug 17 20:27:46 2006 +0000
committer	Linus Walleij <triad@df.lth.se>	Thu Aug 17 20:27:46 2006 +0000
tree	08430991a6989b65189cd710deaae4a1910b0bcf
parent	a1124e79deb4a5cb743be1ddfe284d16c9720e69 [diff]