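Unicode patch-up: drop the big-endian handling from the Unicode helpers, convert with iconv() instead (keeping the builtin routines as fallbacks), and bump the version to 0.0.12.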
diff --git a/ChangeLog b/ChangeLog
index 1eb3522..eee83a7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2006-08-17 Linus Walleij <triad@df.lth.se>
+
+ * configure.ac: bump to 0.0.12 and require iconv.h.
+
+2006-08-16 Linus Walleij <triad@df.lth.se>
+
+ * src/unicode.c: remove bigendian weirdness and switch
+ to using iconv() instead.
+ * src/unicode.h: ditto.
+ * src/libmtp.c: reflect changes.
+
2006-08-09 Linus Walleij <triad@df.lth.se>
* Release libmtp 0.0.11.
diff --git a/configure.ac b/configure.ac
index e0fb658..25b7484 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,6 +1,6 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.52)
-AC_INIT([libmtp], [0.0.11], [libmtp-users@lists.sourceforge.net])
+AC_INIT([libmtp], [0.0.12], [libmtp-users@lists.sourceforge.net])
AM_INIT_AUTOMAKE([foreign])
AC_CONFIG_SRCDIR([src/libmtp.c])
AC_CONFIG_HEADER([config.h])
@@ -62,7 +62,7 @@
AC_HEADER_TIME
# zlib.h the day we need to decompress firmware
AC_CHECK_HEADERS([ctype.h errno.h fcntl.h getopt.h libgen.h \
- limits.h stdio.h string.h sys/stat.h sys/time.h unistd.h])
+ limits.h stdio.h string.h sys/stat.h sys/time.h unistd.h iconv.h])
AC_CHECK_HEADER([usb.h],,
AC_MSG_ERROR([I can't find the libusb header file on your system.
You may need to set the CPPFLAGS environment variable to include
diff --git a/src/libmtp.c b/src/libmtp.c
index 09466c1..ecc07e1 100644
--- a/src/libmtp.c
+++ b/src/libmtp.c
@@ -454,6 +454,7 @@
void LIBMTP_Init(void)
{
init_filemap();
+ unicode_init();
return;
}
@@ -485,7 +486,7 @@
if (ret == PTP_RC_OK) {
if (getUtf8 == 1) {
if (propval.unistr != NULL) {
- retstring = ucs2_to_utf8(propval.unistr, 0);
+ retstring = ucs2le_to_utf8(propval.unistr);
free(propval.unistr);
}
} else {
@@ -586,7 +587,7 @@
}
if (setUtf8 == 1) {
- propval.unistr = utf8_to_ucs2((unsigned char const * const) string, 0);
+ propval.unistr = utf8_to_ucs2le((unsigned char const * const) string);
ret = ptp_mtp_setobjectpropvalue(params, object_id, attribute_id, &propval, PTP_DTC_UNISTR);
free(propval.unistr);
} else {
@@ -1079,7 +1080,7 @@
return NULL;
}
// Convert from UTF-16 to UTF-8
- retstring = ucs2_to_utf8((uint16_t *) propval.unistr, 0);
+ retstring = ucs2le_to_utf8((uint16_t *) propval.unistr);
free(propval.unistr);
return retstring;
}
@@ -1180,8 +1181,7 @@
{
PTPPropertyValue propval;
PTPParams *params = (PTPParams *) device->params;
- uint8_t *tmp;
- uint32_t len;
+ uint16_t *tmp;
int i;
if (!ptp_property_issupported(params, property)) {
@@ -1197,20 +1197,17 @@
}
// Extract the actual array.
- len = propval.a.count * 2 + 2;
- tmp = malloc(len);
+ // printf("Array of %d elements\n", propval.a.count);
+ tmp = malloc((propval.a.count + 1)*sizeof(uint16_t));
for (i = 0; i < propval.a.count; i++) {
- // Force this to become a little-endian unicode string
- // in order to be able to use the ucs2_to_utf8 function.
- uint16_t tch = propval.a.v[i].u16;
- tmp[i*2] = (uint8_t) tch & 0xFF;
- tmp[(i*2)+1] = (uint8_t) tch >> 8;
+ tmp[i] = propval.a.v[i].u16;
+ // printf("%04x ", tmp[i]);
}
- tmp[len-1] = 0;
- tmp[len-2] = 0;
+ tmp[propval.a.count] = 0x0000U;
free(propval.a.v);
- *unicstring = ucs2_to_utf8((uint16_t *) tmp, 0);
+ *unicstring = utf16_to_utf8(tmp);
+
free(tmp);
return 0;
diff --git a/src/unicode.c b/src/unicode.c
index 0f06645..9cdc535 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -17,10 +17,56 @@
#include "unicode.h"
#include "util.h"
+#ifdef USE_ICONV
+#include <iconv.h>
+// Default to not using iconv() until it's properly initialized.
+static int use_fallbacks = 1;
+
+/*
+ * iconv converters. Since these conversions to/from
+ * UTF-8/UCS-2 are stateless, we don't need any special
+ * thread handling here (like making the iconv converters
+ * part of the device struct).
+ */
+iconv_t cd_utf8_to_ucs2le;
+iconv_t cd_ucs2le_to_utf8;
+iconv_t cd_utf16_to_utf8;
+
+void unicode_init(void)
+{
+  // Open one converter per direction; iconv_open() yields (iconv_t) -1 on failure.
+  cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16");
+  cd_ucs2le_to_utf8 = iconv_open("UTF-8", "UCS-2LE");
+  cd_utf8_to_ucs2le = iconv_open("UCS-2LE", "UTF-8");
+  /*
+   * If any converter is unavailable on this machine, close the
+   * ones that did open and keep using the old builtin routines.
+   */
+  if (cd_utf16_to_utf8 == (iconv_t) -1 ||
+      cd_ucs2le_to_utf8 == (iconv_t) -1 ||
+      cd_utf8_to_ucs2le == (iconv_t) -1) {
+    if (cd_utf16_to_utf8 != (iconv_t) -1)
+      iconv_close(cd_utf16_to_utf8);
+    if (cd_ucs2le_to_utf8 != (iconv_t) -1)
+      iconv_close(cd_ucs2le_to_utf8);
+    if (cd_utf8_to_ucs2le != (iconv_t) -1)
+      iconv_close(cd_utf8_to_ucs2le);
+    use_fallbacks = 1;
+    return; // Must not fall through: the line below would re-enable iconv().
+  }
+  use_fallbacks = 0; // All converters opened, activate the iconv() paths.
+}
+#else
+void unicode_init(void)
+{
+ return;
+}
+#endif
+
/**
* The size of the buffer (in characters) used for creating string copies.
*/
-#define STRING_BUFFER_LENGTH 256
+#define STRING_BUFFER_LENGTH 1024
/**
* Gets the length (in characters, not bytes) of a unicode
@@ -71,7 +117,7 @@
* @param unicstr the UCS-2 string to copy
* @return a newly allocated copy of the string
*/
-static uint16_t *ucs2strdup(uint16_t const * const unicstr) {
+static uint16_t *ucs2_strdup(uint16_t const * const unicstr) {
int length = ucs2_strlen(unicstr);
uint8_t *data;
@@ -83,26 +129,12 @@
return (uint16_t *) data;
}
-
-/**
- * Converts a Unicode UCS-2 2-byte string to a UTF-8
- * string.
- *
- * @param unicstr the UCS-2 unicode string to convert
- * @param endianness the default endianness of the string. 0 means
- * little-endian, any other value means big-endian.
- * If a byte-order-mark (BOM) occurs in the string this
- * will be honoured and switch the endianness.
- * @return a UTF-8 string.
- */
-char *ucs2_to_utf8(uint16_t const * const unicstr,
- uint8_t const endianness) {
+static char *builtin_ucs2le_to_utf8(uint16_t const * const unicstr) {
char *data = NULL;
int i = 0;
int l = 0;
int length8;
uint8_t *locstr = (uint8_t *) unicstr;
- uint8_t locend = endianness;
length8 = ucs2utf8len(unicstr);
data = (char *) malloc(length8+1);
@@ -110,43 +142,19 @@
return NULL;
}
for(l = 0; (locstr[l] | locstr[l+1]) != '\0'; l += 2) {
- // This will honour the byte-order-mark properly
- if (locstr[l] == 0xFF && locstr[l+1] == 0xFE) {
- locend = 0;
- } else if (locstr[l] == 0xFE && locstr[l+1] == 0xFF) {
- locend = 1;
+ // The string is little-endian: low byte at [l], high byte at [l+1]
+ if (locstr[l+1] == 0x00 && locstr[l] < 0x80U) {
+ data[i] = locstr[l];
+ i ++;
+ } else if (locstr[l+1] < 0x08) {
+ data[i] = 0xc0 | (locstr[l+1]<<2 & 0x1C) | (locstr[l]>>6 & 0x03);
+ data[i+1] = 0x80 | (locstr[l] & 0x3F);
+ i+=2;
} else {
- if (!locend) {
- // This is for little-endian machines
- if (locstr[l+1] == 0x00 && locstr[l] < 0x80U) {
- data[i] = locstr[l];
- i ++;
- } else if (locstr[l+1] < 0x08) {
- data[i] = 0xc0 | (locstr[l+1]<<2 & 0x1C) | (locstr[l]>>6 & 0x03);
- data[i+1] = 0x80 | (locstr[l] & 0x3F);
- i+=2;
- } else {
- data[i] = 0xe0 | (locstr[l+1]>>4 & 0x0F);
- data[i+1] = 0x80 | (locstr[l+1]<<2 & 0x3C) | (locstr[l]>>6 & 0x03);
- data[i+2] = 0x80 | (locstr[l] & 0x3F);
- i+=3;
- }
- } else {
- // This is for big-endian machines
- if (locstr[l] == 0x00 && locstr[l+1] < 0x80U) {
- data[i] = locstr[l+1];
- i ++;
- } else if (locstr[l] < 0x08) {
- data[i] = 0xc0 | (locstr[l]<<2 & 0x1C) | (locstr[l+1]>>6 & 0x03);
- data[i+1] = 0x80 | (locstr[l+1] & 0x3F);
- i+=2;
- } else {
- data[i] = 0xe0 | (locstr[l]>>4 & 0x0F);
- data[i+1] = 0x80 | (locstr[l]<<2 & 0x3C) | (locstr[l+1]>>6 & 0x03);
- data[i+2] = 0x80 | (locstr[l+1] & 0x3F);
- i+=3;
- }
- }
+ data[i] = 0xe0 | (locstr[l+1]>>4 & 0x0F);
+ data[i+1] = 0x80 | (locstr[l+1]<<2 & 0x3C) | (locstr[l]>>6 & 0x03);
+ data[i+2] = 0x80 | (locstr[l] & 0x3F);
+ i+=3;
}
}
/* Terminate string */
@@ -156,15 +164,96 @@
}
/**
- * Convert a UTF-8 string to a unicode UCS-2 string.
+ * Converts a little-endian Unicode UCS-2 2-byte string
+ * to a UTF-8 string.
*
- * @param str the UTF-8 string to convert.
- * @param endianness desired endianness of the returned string. 0 means
- * little-endian, any other value means big-endian.
- * @return a pointer to a newly allocated UCS-2 string.
+ * @param unicstr the UCS-2 unicode string to convert
+ * @return a UTF-8 string.
*/
-uint16_t *utf8_to_ucs2(unsigned char const * const str,
- const uint8_t endianness) {
+#ifdef USE_ICONV
+char *ucs2le_to_utf8(uint16_t const * const unicstr) {
+ if (use_fallbacks) {
+ return builtin_ucs2le_to_utf8(unicstr);
+ } else {
+ char *stringp = (char *) unicstr;
+ char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
+ char *locp = loclstr;
+ size_t nconv;
+ size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
+ size_t convmax = STRING_BUFFER_LENGTH*3;
+
+ loclstr[0]='\0';
+ /* Do the conversion. */
+ nconv = iconv(cd_ucs2le_to_utf8, &stringp, &convlen, &locp, &convmax);
+ if (nconv == (size_t) -1) {
+ return NULL;
+ }
+ loclstr[STRING_BUFFER_LENGTH*3] = '\0';
+ return strdup(loclstr);
+ }
+}
+#else
+char *ucs2le_to_utf8(uint16_t const * const unicstr) {
+ return builtin_ucs2le_to_utf8(unicstr);
+}
+#endif
+
+/**
+ * Converts a UTF-16 2-byte string, optionally starting
+ * with a byte-order mark (BOM), to a UTF-8 string.
+ *
+ * @param unicstr the UTF-16 unicode string to convert
+ * @return a UTF-8 string.
+ */
+#ifdef USE_ICONV
+char *utf16_to_utf8(const uint16_t *unicstr)
+{
+ if (use_fallbacks) {
+ if (unicstr[0] == 0xFFFEU || unicstr[0] == 0xFEFFU) {
+ // Consume BOM, endianness is fixed at network layer.
+ return builtin_ucs2le_to_utf8(unicstr+1);
+ }
+ return builtin_ucs2le_to_utf8(unicstr);
+ } else {
+ char *stringp = (char *) unicstr;
+ char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
+ char *locp = loclstr;
+ size_t nconv;
+ size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
+ size_t convmax = STRING_BUFFER_LENGTH*3;
+
+ loclstr[0]='\0';
+ /* Do the conversion. */
+ nconv = iconv(cd_utf16_to_utf8, &stringp, &convlen, &locp, &convmax);
+ if (nconv == (size_t) -1) {
+ // Return partial string anyway.
+ *locp = '\0';
+ }
+ loclstr[STRING_BUFFER_LENGTH*3] = '\0';
+ // Strip off any BOM, it's totally useless...
+ if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
+ return strdup(loclstr+3);
+ }
+ return strdup(loclstr);
+ }
+}
+#else
+char *utf16_to_utf8(const uint16_t *unicstr)
+{
+ if (unicstr[0] == 0xFFFEU) {
+ // FIXME: big-endian, swap bytes around or something
+ return NULL;
+ }
+ if (unicstr[0] == 0xFEFFU) {
+ // Consume BOM
+ return builtin_ucs2le_to_utf8(unicstr+1);
+ }
+ return builtin_ucs2le_to_utf8(unicstr);
+}
+#endif
+
+
+static uint16_t *builtin_utf8_to_ucs2le(unsigned char const * const str) {
uint16_t *retval;
int i;
unsigned char buffer[STRING_BUFFER_LENGTH*2];
@@ -172,13 +261,8 @@
for(i = 0; str[i] != '\0' && length < (STRING_BUFFER_LENGTH*2-2);) {
if (str[i] < 0x80) {
- if (!endianness) {
- buffer[length+1] = 0x00;
- buffer[length] = str[i];
- } else {
- buffer[length] = 0x00;
- buffer[length+1] = str[i];
- }
+ buffer[length+1] = 0x00;
+ buffer[length] = str[i];
length += 2;
i++;
} else {
@@ -194,24 +278,13 @@
/* UCS-2 can handle no more than 3 UTF-8 encoded bytes */
if (numbytes <= 3) {
if (numbytes == 2 && str[i+1] > 0x80) {
- /* This character can always be handled correctly */
- if (!endianness) {
- buffer[length+1] = (str[i]>>2 & 0x07);
- buffer[length] = (str[i]<<6 & 0xC0) | (str[i+1] & 0x3F);
- } else {
- buffer[length] = (str[i]>>2 & 0x07);
- buffer[length+1] = (str[i]<<6 & 0xC0) | (str[i+1] & 0x3F);
- }
+ buffer[length+1] = (str[i]>>2 & 0x07);
+ buffer[length] = (str[i]<<6 & 0xC0) | (str[i+1] & 0x3F);
i += 2;
length += 2;
} else if (numbytes == 3 && str[i+1] > 0x80 && str[i+2] > 0x80) {
- if (!endianness) {
- buffer[length+1] = (str[i]<<4 & 0xF0) | (str[i+1]>>2 & 0x0F);
- buffer[length]= (str[i+1]<<6 & 0xC0) | (str[i+2] & 0x3F);
- } else {
- buffer[length] = (str[i]<<4 & 0xF0) | (str[i+1]>>2 & 0x0F);
- buffer[length+1]= (str[i+1]<<6 & 0xC0) | (str[i+2] & 0x3F);
- }
+ buffer[length+1] = (str[i]<<4 & 0xF0) | (str[i+1]>>2 & 0x0F);
+ buffer[length]= (str[i+1]<<6 & 0xC0) | (str[i+2] & 0x3F);
i += 3;
length += 2;
} else {
@@ -229,6 +302,40 @@
buffer[length+1] = 0x00;
// Copy the buffer contents
- retval = ucs2strdup((uint16_t *) buffer);
+ retval = ucs2_strdup((uint16_t *) buffer);
return retval;
}
+
+/**
+ * Convert a UTF-8 string to a little-endian Unicode
+ * UCS-2 string.
+ *
+ * @param str the UTF-8 string to convert.
+ * @return a pointer to a newly allocated UCS-2 string.
+ */
+#ifdef USE_ICONV
+uint16_t *utf8_to_ucs2le(unsigned char const * const str) {
+ if (use_fallbacks) {
+ return builtin_utf8_to_ucs2le(str);
+ } else {
+ uint16_t ucs2str[STRING_BUFFER_LENGTH+1];
+ char *ucs2strp = (char *) ucs2str;
+ char *stringp = (char *) str;
+ size_t nconv;
+ size_t convlen = strlen((char*)str) + 1; // Include the terminator in the conversion
+ size_t convmax = STRING_BUFFER_LENGTH * 2; // Includes the terminator
+
+ ucs2str[0] = 0x0000U;
+ // memset(ucs2strp, 0, (STRING_BUFFER_LENGTH+1)*sizeof(uint16_t));
+ nconv = iconv (cd_utf8_to_ucs2le, &stringp, &convlen, &ucs2strp, &convmax);
+ if (nconv == (size_t) -1) {
+ return NULL;
+ }
+ return ucs2_strdup(ucs2str);
+ }
+}
+#else
+uint16_t *utf8_to_ucs2le(unsigned char const * const str) {
+ return builtin_utf8_to_ucs2le(str);
+}
+#endif
diff --git a/src/unicode.h b/src/unicode.h
index 471170a..9d439d8 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -1,8 +1,15 @@
#ifndef __MTP__UNICODE__H
#define __MTP__UNICODE__H
+#include "config.h"
+#ifdef HAVE_ICONV_H
+#define USE_ICONV
+#endif
+
+void unicode_init(void);
int ucs2_strlen(uint16_t const * const);
-char *ucs2_to_utf8(const uint16_t*, const uint8_t endianness);
-uint16_t *utf8_to_ucs2(unsigned char const * const, const uint8_t endianness);
+char *ucs2le_to_utf8(const uint16_t*);
+char *utf16_to_utf8(const uint16_t*);
+uint16_t *utf8_to_ucs2le(unsigned char const * const);
#endif /* __MTP__UNICODE__H */