blob: 89c2cbe87d2c7f9af4f11a99d6b0fc5c89004b60 [file] [log] [blame]
/**
 * \file unicode.c
 *
 * This file contains general Unicode string manipulation functions.
 * It mainly consists of functions for converting between UCS-2 (used on
 * the devices) and UTF-8 (used by several applications).
 *
 * For a deeper understanding of Unicode encoding formats see the
 * Wikipedia entries for
 * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
 * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
 *
 * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 */
31
Lei Zhange65fc0b2013-02-13 15:11:04 -080032#include "config.h"
33
Linus Walleijb9256fd2006-02-15 09:40:43 +000034#include <stdlib.h>
35#include <string.h>
Linus Walleij6db174f2009-05-09 13:15:26 +000036#ifdef HAVE_ICONV
Linus Walleijc6d7c982009-05-10 13:24:36 +000037#include "iconv.h"
Linus Walleij6db174f2009-05-09 13:15:26 +000038#else
39#error "libmtp unicode.c needs fixing to work without iconv()!"
40#endif
Linus Walleijb9256fd2006-02-15 09:40:43 +000041#include "libmtp.h"
42#include "unicode.h"
43#include "util.h"
Linus Walleijd5d51c82006-09-11 06:57:50 +000044#include "ptp.h"
Linus Walleij16571dc2006-08-17 20:27:46 +000045
/**
 * The size (in UCS-2 characters, not bytes) of the temporary on-stack
 * buffers used when creating string copies. The conversion routines in
 * this file scale it to bytes themselves (times 2 for UCS-2 output,
 * times 3 for UTF-8 output), so longer strings are truncated.
 */
#define STRING_BUFFER_LENGTH 1024
Linus Walleijb9256fd2006-02-15 09:40:43 +000050
/**
 * Gets the length (in characters, not bytes) of a zero-terminated
 * UCS-2 string, e.g. a string which physically is 0x00 0x41 0x00 0x00
 * will return a value of 1.
 *
 * @param unicstr a UCS-2 Unicode string terminated by a 16-bit zero
 *        code unit, or NULL.
 * @return the length of the string, in number of characters, not
 *         counting the terminator; 0 if unicstr is NULL. If you
 *         want to know the length in bytes, multiply this by two and
 *         add two (for zero terminator).
 */
int ucs2_strlen(uint16_t const * const unicstr)
{
  int length = 0;

  /* Treat a missing string like an empty one instead of dereferencing NULL. */
  if (unicstr == NULL) {
    return 0;
  }

  /* Unicode strings are terminated with 2 * 0x00, i.e. one zero code unit. */
  while (unicstr[length] != 0x0000U) {
    length++;
  }
  return length;
}
69
70/**
Linus Walleij16571dc2006-08-17 20:27:46 +000071 * Converts a big-endian UTF-16 2-byte string
Linus Walleijd5d51c82006-09-11 06:57:50 +000072 * to a UTF-8 string. Actually just a UCS-2 internal conversion
73 * routine that strips off the BOM if there is one.
Linus Walleij16571dc2006-08-17 20:27:46 +000074 *
Linus Walleijeab650b2006-08-21 23:26:37 +000075 * @param device a pointer to the current device.
Linus Walleij16571dc2006-08-17 20:27:46 +000076 * @param unicstr the UTF-16 unicode string to convert
77 * @return a UTF-8 string.
78 */
Linus Walleij3ec86312006-08-21 13:25:24 +000079char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr)
Linus Walleij16571dc2006-08-17 20:27:46 +000080{
Linus Walleijd5d51c82006-09-11 06:57:50 +000081 PTPParams *params = (PTPParams *) device->params;
82 char *stringp = (char *) unicstr;
83 char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
84 char *locp = loclstr;
85 size_t nconv;
86 size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
87 size_t convmax = STRING_BUFFER_LENGTH*3;
Linus Walleijf3c44052008-08-16 21:14:56 +000088
Linus Walleijd5d51c82006-09-11 06:57:50 +000089 loclstr[0]='\0';
90 /* Do the conversion. */
91 nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax);
92 if (nconv == (size_t) -1) {
93 // Return partial string anyway.
94 *locp = '\0';
Linus Walleij16571dc2006-08-17 20:27:46 +000095 }
Linus Walleijd5d51c82006-09-11 06:57:50 +000096 loclstr[STRING_BUFFER_LENGTH*3] = '\0';
97 // Strip off any BOM, it's totally useless...
98 if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
99 return strdup(loclstr+3);
100 }
101 return strdup(loclstr);
Linus Walleij16571dc2006-08-17 20:27:46 +0000102}
Linus Walleijd3b78572007-08-24 21:28:24 +0000103
104/**
Linus Walleijf3c44052008-08-16 21:14:56 +0000105 * Converts a UTF-8 string to a big-endian UTF-16 2-byte string
106 * Actually just a UCS-2 internal conversion.
107 *
108 * @param device a pointer to the current device.
109 * @param localstr the UTF-8 unicode string to convert
110 * @return a UTF-16 string.
111 */
112uint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr)
113{
114 PTPParams *params = (PTPParams *) device->params;
115 char *stringp = (char *) localstr; // cast away "const"
116 char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char.
117 char *unip = unicstr;
118 size_t nconv = 0;
119 size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator
120 size_t convmax = STRING_BUFFER_LENGTH*2;
121
122 unicstr[0]='\0';
123 unicstr[1]='\0';
124
125 /* Do the conversion. */
126 nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax);
127
128 if (nconv == (size_t) -1) {
129 // Return partial string anyway.
130 unip[0] = '\0';
131 unip[1] = '\0';
132 }
133 // make sure the string is null terminated
134 unicstr[STRING_BUFFER_LENGTH*2] = '\0';
135 unicstr[STRING_BUFFER_LENGTH*2+1] = '\0';
136
137 // allocate the string to be returned
138 // Note: can't use strdup since every other byte is a null byte
139 int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2;
140 uint16_t* ret = malloc(ret_len);
141 memcpy(ret,unicstr,(size_t)ret_len);
142 return ret;
143}
144
/**
 * This helper function makes a UTF-8 string 7-bit clean by replacing
 * every character that is not plain ASCII with a single underscore.
 *
 * In UTF-8 a non-ASCII character is one lead byte followed by zero or
 * more continuation bytes of the form 10xxxxxx, so each lead byte
 * together with the continuation bytes after it collapses into one
 * '_'. Two adjacent non-ASCII characters therefore give two
 * underscores. Bytes > 0x7F that do not form valid UTF-8 are replaced
 * too, so the result never contains a byte above 0x7F.
 *
 * The conversion is done in place. It will only shrink the string in
 * size so no copying is needed.
 *
 * @param str the zero-terminated string to clean up, modified in
 *        place. NULL is accepted and ignored.
 */
void strip_7bit_from_utf8(char *str)
{
  const unsigned char *src;
  char *dst;

  if (str == NULL) {
    return;
  }

  src = (const unsigned char *) str;
  dst = str;
  // dst never gets ahead of src, so unread input is never overwritten.
  while (*src != '\0') {
    if (*src > 0x7FU) {
      *dst++ = '_';
      src++;
      // Skip the continuation bytes (10xxxxxx) of this character only;
      // the next lead byte starts a new character and a new underscore.
      // The terminator is not a continuation byte, so this stops there.
      while ((*src & 0xC0U) == 0x80U) {
        src++;
      }
    } else {
      *dst++ = (char) *src++;
    }
  }
  // Terminate stripped string...
  *dst = '\0';
}