Linus Walleij | b9256fd | 2006-02-15 09:40:43 +0000 | [diff] [blame] | 1 | /** |
| 2 | * \file unicode.c |
| 3 | * |
| 4 | * This file contains general Unicode string manipulation functions. |
| 5 | * It mainly consists of functions for converting between UCS-2 (used on |
Linus Walleij | c00f70d | 2006-07-30 18:48:55 +0000 | [diff] [blame] | 6 | * the devices) and UTF-8 (used by several applications). |
| 7 | * |
| 8 | * For a deeper understanding of Unicode encoding formats see the |
| 9 | * Wikipedia entries for |
| 10 | * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a> |
| 11 | * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>. |
Linus Walleij | 2f45d22 | 2007-02-02 22:47:39 +0000 | [diff] [blame] | 12 | * |
Linus Walleij | 6db174f | 2009-05-09 13:15:26 +0000 | [diff] [blame] | 13 | * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se> |
Linus Walleij | 2f45d22 | 2007-02-02 22:47:39 +0000 | [diff] [blame] | 14 | * |
| 15 | * This library is free software; you can redistribute it and/or |
| 16 | * modify it under the terms of the GNU Lesser General Public |
| 17 | * License as published by the Free Software Foundation; either |
| 18 | * version 2 of the License, or (at your option) any later version. |
| 19 | * |
| 20 | * This library is distributed in the hope that it will be useful, |
| 21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 23 | * Lesser General Public License for more details. |
| 24 | * |
| 25 | * You should have received a copy of the GNU Lesser General Public |
| 26 | * License along with this library; if not, write to the |
| 27 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
| 28 | * Boston, MA 02111-1307, USA. |
| 29 | * |
Linus Walleij | b9256fd | 2006-02-15 09:40:43 +0000 | [diff] [blame] | 30 | */ |
| 31 | |
Lei Zhang | e65fc0b | 2013-02-13 15:11:04 -0800 | [diff] [blame] | 32 | #include "config.h" |
| 33 | |
Linus Walleij | b9256fd | 2006-02-15 09:40:43 +0000 | [diff] [blame] | 34 | #include <stdlib.h> |
| 35 | #include <string.h> |
Linus Walleij | 6db174f | 2009-05-09 13:15:26 +0000 | [diff] [blame] | 36 | #ifdef HAVE_ICONV |
Linus Walleij | c6d7c98 | 2009-05-10 13:24:36 +0000 | [diff] [blame] | 37 | #include "iconv.h" |
Linus Walleij | 6db174f | 2009-05-09 13:15:26 +0000 | [diff] [blame] | 38 | #else |
| 39 | #error "libmtp unicode.c needs fixing to work without iconv()!" |
| 40 | #endif |
Linus Walleij | b9256fd | 2006-02-15 09:40:43 +0000 | [diff] [blame] | 41 | #include "libmtp.h" |
| 42 | #include "unicode.h" |
| 43 | #include "util.h" |
Linus Walleij | d5d51c8 | 2006-09-11 06:57:50 +0000 | [diff] [blame] | 44 | #include "ptp.h" |
Linus Walleij | 16571dc | 2006-08-17 20:27:46 +0000 | [diff] [blame] | 45 | |
Linus Walleij | b9256fd | 2006-02-15 09:40:43 +0000 | [diff] [blame] | 46 | /** |
| 47 | * The size of the buffer (in UCS-2 characters) used for creating string copies. Longer strings are truncated when converted. |
| 48 | */ |
Linus Walleij | 16571dc | 2006-08-17 20:27:46 +0000 | [diff] [blame] | 49 | #define STRING_BUFFER_LENGTH 1024 |
Linus Walleij | b9256fd | 2006-02-15 09:40:43 +0000 | [diff] [blame] | 50 | |
/**
 * Counts the 16-bit code units in a zero-terminated UCS-2 string.
 * The terminating 0x0000 unit is not included in the count, so a
 * string consisting of one character followed by the terminator
 * yields 1, and a string holding only the terminator yields 0.
 *
 * @param unicstr a UCS-2 Unicode string, terminated by a 0x0000 unit
 * @return the number of characters before the terminator. The size in
 *         bytes including the terminator is (result + 1) * 2.
 */
int ucs2_strlen(uint16_t const * const unicstr)
{
  uint16_t const *cursor = unicstr;

  /* Walk forward until the 16-bit zero terminator is reached. */
  while (*cursor != 0x0000U) {
    cursor++;
  }
  return (int) (cursor - unicstr);
}
| 69 | |
| 70 | /** |
Linus Walleij | 16571dc | 2006-08-17 20:27:46 +0000 | [diff] [blame] | 71 | * Converts a big-endian UTF-16 2-byte string |
Linus Walleij | d5d51c8 | 2006-09-11 06:57:50 +0000 | [diff] [blame] | 72 | * to a UTF-8 string. Actually just a UCS-2 internal conversion |
| 73 | * routine that strips off the BOM if there is one. |
Linus Walleij | 16571dc | 2006-08-17 20:27:46 +0000 | [diff] [blame] | 74 | * |
Linus Walleij | eab650b | 2006-08-21 23:26:37 +0000 | [diff] [blame] | 75 | * @param device a pointer to the current device. |
Linus Walleij | 16571dc | 2006-08-17 20:27:46 +0000 | [diff] [blame] | 76 | * @param unicstr the UTF-16 unicode string to convert |
| 77 | * @return a UTF-8 string. |
| 78 | */ |
Linus Walleij | 3ec8631 | 2006-08-21 13:25:24 +0000 | [diff] [blame] | 79 | char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr) |
Linus Walleij | 16571dc | 2006-08-17 20:27:46 +0000 | [diff] [blame] | 80 | { |
Linus Walleij | d5d51c8 | 2006-09-11 06:57:50 +0000 | [diff] [blame] | 81 | PTPParams *params = (PTPParams *) device->params; |
| 82 | char *stringp = (char *) unicstr; |
| 83 | char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char. |
| 84 | char *locp = loclstr; |
| 85 | size_t nconv; |
| 86 | size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator |
| 87 | size_t convmax = STRING_BUFFER_LENGTH*3; |
Linus Walleij | f3c4405 | 2008-08-16 21:14:56 +0000 | [diff] [blame] | 88 | |
Linus Walleij | d5d51c8 | 2006-09-11 06:57:50 +0000 | [diff] [blame] | 89 | loclstr[0]='\0'; |
| 90 | /* Do the conversion. */ |
| 91 | nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax); |
| 92 | if (nconv == (size_t) -1) { |
| 93 | // Return partial string anyway. |
| 94 | *locp = '\0'; |
Linus Walleij | 16571dc | 2006-08-17 20:27:46 +0000 | [diff] [blame] | 95 | } |
Linus Walleij | d5d51c8 | 2006-09-11 06:57:50 +0000 | [diff] [blame] | 96 | loclstr[STRING_BUFFER_LENGTH*3] = '\0'; |
| 97 | // Strip off any BOM, it's totally useless... |
| 98 | if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) { |
| 99 | return strdup(loclstr+3); |
| 100 | } |
| 101 | return strdup(loclstr); |
Linus Walleij | 16571dc | 2006-08-17 20:27:46 +0000 | [diff] [blame] | 102 | } |
Linus Walleij | d3b7857 | 2007-08-24 21:28:24 +0000 | [diff] [blame] | 103 | |
| 104 | /** |
Linus Walleij | f3c4405 | 2008-08-16 21:14:56 +0000 | [diff] [blame] | 105 | * Converts a UTF-8 string to a big-endian UTF-16 2-byte string |
| 106 | * Actually just a UCS-2 internal conversion. |
| 107 | * |
| 108 | * @param device a pointer to the current device. |
| 109 | * @param localstr the UTF-8 unicode string to convert |
| 110 | * @return a UTF-16 string. |
| 111 | */ |
| 112 | uint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr) |
| 113 | { |
| 114 | PTPParams *params = (PTPParams *) device->params; |
| 115 | char *stringp = (char *) localstr; // cast away "const" |
| 116 | char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char. |
| 117 | char *unip = unicstr; |
| 118 | size_t nconv = 0; |
| 119 | size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator |
| 120 | size_t convmax = STRING_BUFFER_LENGTH*2; |
| 121 | |
| 122 | unicstr[0]='\0'; |
| 123 | unicstr[1]='\0'; |
| 124 | |
| 125 | /* Do the conversion. */ |
| 126 | nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax); |
| 127 | |
| 128 | if (nconv == (size_t) -1) { |
| 129 | // Return partial string anyway. |
| 130 | unip[0] = '\0'; |
| 131 | unip[1] = '\0'; |
| 132 | } |
| 133 | // make sure the string is null terminated |
| 134 | unicstr[STRING_BUFFER_LENGTH*2] = '\0'; |
| 135 | unicstr[STRING_BUFFER_LENGTH*2+1] = '\0'; |
| 136 | |
| 137 | // allocate the string to be returned |
| 138 | // Note: can't use strdup since every other byte is a null byte |
| 139 | int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2; |
| 140 | uint16_t* ret = malloc(ret_len); |
| 141 | memcpy(ret,unicstr,(size_t)ret_len); |
| 142 | return ret; |
| 143 | } |
| 144 | |
/**
 * Replaces every run of consecutive bytes > 0x7F in the string with a
 * single underscore, in place. In UTF-8 all bytes of a non-ASCII
 * character are > 0x7F, so a lone non-ASCII character becomes one
 * underscore; several adjacent non-ASCII characters form one run and
 * therefore also collapse into a single underscore. The result is
 * never longer than the input, so no extra storage is needed.
 *
 * @param str the zero-terminated string to rewrite in place
 */
void strip_7bit_from_utf8(char *str)
{
  const int total = strlen(str);
  int src = 0;
  int dst = 0;

  while (src < total) {
    if ((uint8_t) str[src] <= 0x7FU) {
      // Plain 7-bit byte: keep it as it is.
      str[dst++] = str[src++];
      continue;
    }
    // First byte of a high-bit run: emit one placeholder for the whole run...
    str[dst++] = '_';
    // ...and skip the remaining bytes of the run. The terminating '\0'
    // is <= 0x7F, so this stops at the end of the string at the latest.
    do {
      src++;
    } while ((uint8_t) str[src] > 0x7FU);
  }
  // Terminate the (possibly shortened) string.
  str[dst] = '\0';
}