/**
 * \file unicode.c
 *
 * This file contains general Unicode string manipulation functions.
 * It mainly consists of functions for converting between UCS-2 (used on
 * the devices), UTF-8 (used by several applications) and
 * ISO 8859-1 / Codepage 1252 (fallback).
 */
| 9 | |
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "libmtp.h"
#include "unicode.h"
#include "util.h"
| 15 | |
/**
 * The size of the buffer (in UCS-2 characters, i.e. two bytes each)
 * used for creating string copies.
 */
#define STRING_BUFFER_LENGTH 256
| 20 | |
/**
 * Gets the length (in characters, not bytes) of a unicode
 * UCS-2 string, not counting the terminating 0x0000 character.
 * E.g. a string which physically is 0x00 0x41 0x00 0x00
 * will return a value of 1.
 *
 * @param unicstr a zero-terminated UCS-2 Unicode string. A NULL
 *        pointer is treated as an empty string.
 * @return the length of the string, in number of characters. If you
 *         want to know the length in bytes, multiply this by two and
 *         add two (for the zero terminator).
 */
int ucs2_strlen(const uint16_t *unicstr)
{
  int length = 0;

  /* Nothing to measure: report an empty string instead of crashing */
  if (unicstr == NULL) {
    return 0;
  }
  /* Unicode strings are terminated with 2 * 0x00 */
  while (unicstr[length] != 0x0000U) {
    length++;
  }
  return length;
}
| 39 | |
/**
 * This routine returns the length in bytes that this
 * UCS-2 string would occupy if encoded as UTF-8, not counting
 * the UTF-8 zero terminator.
 *
 * The string is read as byte pairs with the low-order byte first,
 * which is the layout ucs2_to_utf8() decodes and utf8_to_ucs2()
 * produces. The size classes must match the encoder exactly, since
 * the result is used to size the encoder's output buffer:
 * characters below 0x80 take one byte, characters below 0x800 take
 * two bytes and all remaining characters take three.
 *
 * @param unicstr the zero-terminated Unicode UCS-2 string to analyze
 * @return the number of bytes this string would occupy
 *         in UTF-8
 */
static int ucs2utf8len(const uint16_t *unicstr)
{
  const uint8_t *bytes = (const uint8_t *) unicstr;
  int length = 0;
  int i;

  /* A pair of zero bytes terminates the string */
  for (i = 0; (bytes[i] | bytes[i+1]) != 0x00; i += 2) {
    const uint8_t lo = bytes[i];
    const uint8_t hi = bytes[i+1];

    if (hi == 0x00 && lo < 0x80) {
      length += 1;
    } else if (hi < 0x08) {
      length += 2;
    } else {
      length += 3;
    }
  }
  return length;
}
| 62 | |
/**
 * Duplicates a UCS-2 string into freshly allocated memory.
 *
 * @param unicstr the zero-terminated UCS-2 string to copy
 * @return a newly allocated (with malloc) copy of the string,
 *         terminator included, or NULL if the allocation failed
 */
static uint16_t *ucs2strdup(const uint16_t *unicstr) {
  /* Two bytes per character plus two for the zero terminator */
  int nbytes = ucs2_strlen(unicstr) * 2 + 2;
  uint16_t *copy = (uint16_t *) malloc(nbytes);

  if (copy != NULL) {
    memcpy(copy, unicstr, nbytes);
  }
  return copy;
}
| 81 | |
| 82 | |
/**
 * Builds a UTF-8 encoded copy of a Unicode UCS-2 2-byte string.
 *
 * The input is read as byte pairs with the low-order byte first.
 * Characters below 0x80 become one UTF-8 byte, characters below
 * 0x800 become two bytes and all other characters become three.
 * The output buffer is sized with ucs2utf8len().
 *
 * @param unicstr the zero-terminated UCS-2 unicode string to convert
 * @return a newly allocated, zero-terminated UTF-8 string, or NULL
 *         if the allocation failed.
 */
char *ucs2_to_utf8(const uint16_t *unicstr){
  const uint8_t *src = (const uint8_t *) unicstr;
  char *utf8;
  int out = 0;
  int in;

  /* Room for the encoded text plus the zero terminator */
  utf8 = (char *) malloc(ucs2utf8len(unicstr) + 1);
  if (utf8 == NULL) {
    return NULL;
  }
  for (in = 0; (src[in] | src[in+1]) != '\0'; in += 2) {
    const uint8_t lo = src[in];
    const uint8_t hi = src[in+1];

    if (hi == 0x00 && lo < 0x80U) {
      /* 0xxxxxxx */
      utf8[out++] = lo;
    } else if (hi < 0x08) {
      /* 110xxxxx 10xxxxxx */
      utf8[out++] = 0xc0 | (hi<<2 & 0x1C) | (lo>>6 & 0x03);
      utf8[out++] = 0x80 | (lo & 0x3F);
    } else {
      /* 1110xxxx 10xxxxxx 10xxxxxx */
      utf8[out++] = 0xe0 | (hi>>4 & 0x0F);
      utf8[out++] = 0x80 | (hi<<2 & 0x3C) | (lo>>6 & 0x03);
      utf8[out++] = 0x80 | (lo & 0x3F);
    }
  }
  /* Terminate string */
  utf8[out] = 0x00;

  return utf8;
}
| 122 | |
/**
 * Tells whether a byte is a UTF-8 continuation byte (10xxxxxx).
 */
static int utf8_is_continuation(unsigned char byte)
{
  return (byte & 0xC0) == 0x80;
}

/**
 * Convert a UTF-8 string to a unicode UCS-2 string.
 *
 * Each output character is stored as two bytes, low-order byte
 * first, and the result ends with a 0x0000 terminator. Only one-,
 * two- and three-byte UTF-8 sequences fit in UCS-2; longer sequences
 * and malformed input (stray continuation bytes, truncated
 * sequences, invalid lead bytes) are left out of the result. Valid
 * characters following a malformed sequence are kept.
 *
 * @param str the zero-terminated UTF-8 string to convert.
 * @return a pointer to a newly allocated UCS-2 string, or NULL if
 *         str is NULL or the allocation failed.
 */
uint16_t *utf8_to_ucs2(const unsigned char *str) {
  uint16_t *retval;
  unsigned char *out;
  size_t length = 0;
  size_t i = 0;

  if (str == NULL) {
    return NULL;
  }
  /*
   * Every UCS-2 character consumes at least one input byte, so the
   * input length plus one character for the terminator is always
   * enough room, however long the input is. The result may be
   * somewhat larger than strictly needed.
   */
  retval = calloc(strlen((const char *) str) + 1, sizeof *retval);
  if (retval == NULL) {
    return NULL;
  }
  /* Fill byte-wise so the stored byte order does not depend on the host */
  out = (unsigned char *) retval;

  while (str[i] != '\0') {
    unsigned char lenbyte = str[i];
    unsigned char numbytes = 0;

    if (lenbyte < 0x80) {
      /* Plain ASCII maps straight onto the low byte */
      out[length+1] = 0x00;
      out[length] = str[i];
      length += 2;
      i++;
      continue;
    }

    /* Read the number of encoded bytes from the leading 1 bits */
    while (lenbyte & 0x80) {
      numbytes++;
      lenbyte = (unsigned char) (lenbyte<<1);
    }

    /*
     * The continuation tests short-circuit on the terminator, so
     * str[i+2] is only read when str[i+1] is a real continuation byte.
     */
    if (numbytes == 2 && utf8_is_continuation(str[i+1])) {
      out[length+1] = (str[i]>>2 & 0x07);
      out[length] = (str[i]<<6 & 0xC0) | (str[i+1] & 0x3F);
      i += 2;
      length += 2;
    } else if (numbytes == 3 && utf8_is_continuation(str[i+1]) &&
               utf8_is_continuation(str[i+2])) {
      out[length+1] = (str[i]<<4 & 0xF0) | (str[i+1]>>2 & 0x0F);
      out[length] = (str[i+1]<<6 & 0xC0) | (str[i+2] & 0x3F);
      i += 3;
      length += 2;
    } else {
      /*
       * UCS-2 can handle no more than 3 UTF-8 encoded bytes, and
       * malformed sequences cannot be decoded at all. Skip the lead
       * byte and the continuation bytes that belong to it; stopping
       * at the first non-continuation byte never steps over the
       * terminator or over a following valid character.
       */
      i++;
      while (numbytes > 1 && utf8_is_continuation(str[i])) {
        i++;
        numbytes--;
      }
    }
  }
  /* Terminate the string */
  out[length+1] = 0x00;
  out[length] = 0x00;

  return retval;
}