/**
 * \file unicode.c
 *
 * This file contains general Unicode string manipulation functions.
 * It mainly consists of functions for converting between UCS-2 (used on
 * the devices), UTF-8 (used by several applications) and
 * ISO 8859-1 / Codepage 1252 (fallback).
 */
9
10#include <stdlib.h>
11#include <string.h>
12#include "libmtp.h"
13#include "unicode.h"
14#include "util.h"
15
16/**
17 * The size of the buffer (in characters) used for creating string copies.
18 */
19#define STRING_BUFFER_LENGTH 256
20
/**
 * Counts the 16-bit code units in a UCS-2 string, stopping at (and not
 * counting) the first 0x0000 unit. A string stored as the four bytes
 * 0x00 0x41 0x00 0x00 therefore has length 1.
 *
 * @param unicstr a zero-terminated UCS-2 Unicode string
 * @return the number of characters before the terminator. The storage
 *         occupied by the string, terminator included, is this value
 *         times two plus two bytes.
 */
int ucs2_strlen(const uint16_t *unicstr)
{
  const uint16_t *cursor = unicstr;

  /* Walk forward until the 16-bit zero terminator is reached */
  while (*cursor != 0x0000U) {
    cursor++;
  }
  return (int) (cursor - unicstr);
}
39
/**
 * Computes how many bytes a UCS-2 string needs when encoded as UTF-8,
 * not counting the terminating zero byte.
 *
 * The string is examined byte-wise, two bytes per character, with the
 * first byte of each pair taken as the low byte and the second as the
 * high byte. This is the same layout ucs2_to_utf8() uses when it
 * encodes, so the value returned here always matches the number of
 * bytes that function writes.
 *
 * @param unicstr the zero-terminated Unicode UCS-2 string to analyze
 * @return the number of bytes this string would occupy in UTF-8,
 *         excluding the terminator
 */
static int ucs2utf8len(const uint16_t *unicstr)
{
  const uint8_t *locstr = (const uint8_t *) unicstr;
  int length = 0;
  int i;

  /* A pair of zero bytes terminates the string */
  for (i = 0; (locstr[i] | locstr[i+1]) != '\0'; i += 2) {
    const uint8_t lo = locstr[i];
    const uint8_t hi = locstr[i+1];

    if (hi == 0x00 && lo < 0x80) {
      /* U+0000..U+007F: one byte */
      length++;
    } else if (hi < 0x08) {
      /* U+0080..U+07FF: two bytes */
      length += 2;
    } else {
      /* U+0800..U+FFFF: three bytes */
      length += 3;
    }
  }
  return length;
}
62
/**
 * Duplicates a UCS-2 string into freshly allocated memory, including
 * its 16-bit zero terminator.
 *
 * @param unicstr the zero-terminated UCS-2 string to copy
 * @return a newly allocated copy of the string obtained from malloc(),
 *         or NULL if the allocation failed
 */
static uint16_t *ucs2strdup(const uint16_t *unicstr)
{
  /* Two bytes per character plus two bytes for the terminator */
  const int nbytes = ucs2_strlen(unicstr) * 2 + 2;
  uint16_t *copy = malloc(nbytes);

  if (copy != NULL) {
    memcpy(copy, unicstr, nbytes);
  }
  return copy;
}
81
82
/**
 * Encodes a UCS-2 string as a newly allocated, zero-terminated UTF-8
 * string.
 *
 * The input is read byte-wise, two bytes per character: the first byte
 * of each pair is used as the low byte and the second as the high byte.
 * The output buffer is sized with ucs2utf8len().
 *
 * @param unicstr the zero-terminated UCS-2 unicode string to convert
 * @return a UTF-8 string obtained from malloc(), or NULL if the
 *         allocation failed
 */
char *ucs2_to_utf8(const uint16_t *unicstr)
{
  const uint8_t *src = (const uint8_t *) unicstr;
  char *utf8;
  char *dst;

  utf8 = malloc(ucs2utf8len(unicstr) + 1);
  if (utf8 == NULL) {
    return NULL;
  }

  dst = utf8;
  /* A pair of zero bytes marks the end of the input */
  while ((src[0] | src[1]) != '\0') {
    const uint8_t lo = src[0];
    const uint8_t hi = src[1];

    if (hi == 0x00 && lo < 0x80U) {
      /* Seven-bit character: copied through unchanged */
      *dst++ = (char) lo;
    } else if (hi < 0x08) {
      /* Up to eleven bits: 110xxxxx 10xxxxxx */
      *dst++ = (char) (0xc0 | ((hi << 2) & 0x1C) | ((lo >> 6) & 0x03));
      *dst++ = (char) (0x80 | (lo & 0x3F));
    } else {
      /* Up to sixteen bits: 1110xxxx 10xxxxxx 10xxxxxx */
      *dst++ = (char) (0xe0 | ((hi >> 4) & 0x0F));
      *dst++ = (char) (0x80 | ((hi << 2) & 0x3C) | ((lo >> 6) & 0x03));
      *dst++ = (char) (0x80 | (lo & 0x3F));
    }
    src += 2;
  }
  /* Terminate string */
  *dst = 0x00;

  return utf8;
}
122
/**
 * Convert a UTF-8 string to a unicode UCS-2 string.
 *
 * Each UCS-2 character is stored as two bytes, low byte first, and the
 * result ends with a 16-bit zero terminator. The output buffer is sized
 * from the input length, so strings of any length are converted in full.
 *
 * Input that cannot be represented is dropped rather than converted:
 * lead bytes announcing sequences longer than three bytes, stray
 * continuation bytes, sequences cut short by a non-continuation byte or
 * by the end of the string, and sequences that decode to U+0000 (which
 * would otherwise end the result early). When a lead byte is dropped,
 * the continuation bytes directly following it are dropped with it;
 * scanning never moves past the input's zero terminator.
 *
 * @param str the zero-terminated UTF-8 string to convert.
 * @return a pointer to a newly allocated UCS-2 string that the caller
 *         releases with free(), or NULL if str is NULL or the
 *         allocation failed.
 */
uint16_t *utf8_to_ucs2(const unsigned char *str)
{
  uint16_t *retval;
  uint8_t *out;
  size_t inlen;
  size_t outpos = 0;
  size_t i = 0;

  if (str == NULL) {
    return NULL;
  }

  /*
   * Every output character consumes at least one input byte, so
   * inlen characters plus the terminator is always enough room.
   */
  inlen = strlen((const char *) str);
  retval = calloc(inlen + 1, sizeof *retval);
  if (retval == NULL) {
    return NULL;
  }
  /* Written byte-wise so the low-byte-first layout is explicit */
  out = (uint8_t *) retval;

  while (str[i] != '\0') {
    const unsigned char lead = str[i];
    uint8_t lo;
    uint8_t hi;
    size_t used;

    if (lead < 0x80) {
      /* Plain seven-bit character */
      lo = lead;
      hi = 0x00;
      used = 1;
    } else if ((lead & 0xE0) == 0xC0 &&
               (str[i+1] & 0xC0) == 0x80) {
      /* 110xxxxx 10xxxxxx */
      hi = (uint8_t) ((lead >> 2) & 0x07);
      lo = (uint8_t) (((lead << 6) & 0xC0) | (str[i+1] & 0x3F));
      used = 2;
    } else if ((lead & 0xF0) == 0xE0 &&
               (str[i+1] & 0xC0) == 0x80 &&
               (str[i+2] & 0xC0) == 0x80) {
      /*
       * 1110xxxx 10xxxxxx 10xxxxxx. Reading str[i+2] is safe because
       * str[i+1] was just shown to be a (non-zero) continuation byte.
       */
      hi = (uint8_t) (((lead << 4) & 0xF0) | ((str[i+1] >> 2) & 0x0F));
      lo = (uint8_t) (((str[i+1] << 6) & 0xC0) | (str[i+2] & 0x3F));
      used = 3;
    } else {
      /*
       * Abnormal or unrepresentable sequence: skip the lead byte and
       * the continuation bytes that follow it. The terminator is not
       * a continuation byte, so this cannot run off the string.
       */
      i++;
      while ((str[i] & 0xC0) == 0x80) {
        i++;
      }
      continue;
    }

    i += used;
    if (hi == 0x00 && lo == 0x00) {
      /* Overlong encoding of U+0000: would truncate the result */
      continue;
    }
    out[outpos] = lo;
    out[outpos+1] = hi;
    outpos += 2;
  }

  /* Terminate string */
  out[outpos] = 0x00;
  out[outpos+1] = 0x00;

  return retval;
}