blob: bb3717f94aa0f4a95bc470ed921f857557e5e018 [file] [log] [blame]
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include <media/mediascanner.h>
18
19#include <utils/StringArray.h>
20
21#include "autodetect.h"
22#include "unicode/ucnv.h"
23#include "unicode/ustring.h"
24
25namespace android {
26
27MediaScannerClient::MediaScannerClient()
28 : mNames(NULL),
29 mValues(NULL),
30 mLocaleEncoding(kEncodingNone)
31{
32}
33
34MediaScannerClient::~MediaScannerClient()
35{
36 delete mNames;
37 delete mValues;
38}
39
40void MediaScannerClient::setLocale(const char* locale)
41{
42 if (!locale) return;
43
44 if (!strncmp(locale, "ja", 2))
45 mLocaleEncoding = kEncodingShiftJIS;
46 else if (!strncmp(locale, "ko", 2))
47 mLocaleEncoding = kEncodingEUCKR;
48 else if (!strncmp(locale, "zh", 2)) {
49 if (!strcmp(locale, "zh_CN")) {
50 // simplified chinese for mainland China
51 mLocaleEncoding = kEncodingGBK;
52 } else {
53 // assume traditional for non-mainland Chinese locales (Taiwan, Hong Kong, Singapore)
54 mLocaleEncoding = kEncodingBig5;
55 }
56 }
57}
58
59void MediaScannerClient::beginFile()
60{
61 mNames = new StringArray;
62 mValues = new StringArray;
63}
64
65bool MediaScannerClient::addStringTag(const char* name, const char* value)
66{
Marco Nelissenb45b0842010-02-10 09:56:05 -080067 // don't bother caching strings that are all ASCII.
68 // call handleStringTag directly instead.
69 // check to see if value (which should be utf8) has any non-ASCII characters
70 bool nonAscii = false;
71 const char* chp = value;
72 char ch;
73 while ((ch = *chp++)) {
74 if (ch & 0x80) {
75 nonAscii = true;
76 break;
Andreas Huberbfb9fb12009-12-03 11:31:19 -080077 }
Andreas Huberbfb9fb12009-12-03 11:31:19 -080078 }
79
Marco Nelissenb45b0842010-02-10 09:56:05 -080080 if (nonAscii) {
81 // save the strings for later so they can be used for native encoding detection
82 mNames->push_back(name);
83 mValues->push_back(value);
84 return true;
85 }
86 // else fall through
87
Andreas Huberbfb9fb12009-12-03 11:31:19 -080088 // autodetection is not necessary, so no need to cache the values
89 // pass directly to the client instead
90 return handleStringTag(name, value);
91}
92
93static uint32_t possibleEncodings(const char* s)
94{
95 uint32_t result = kEncodingAll;
96 // if s contains a native encoding, then it was mistakenly encoded in utf8 as if it were latin-1
97 // so we need to reverse the latin-1 -> utf8 conversion to get the native chars back
98 uint8_t ch1, ch2;
99 uint8_t* chp = (uint8_t *)s;
100
101 while ((ch1 = *chp++)) {
102 if (ch1 & 0x80) {
103 ch2 = *chp++;
104 ch1 = ((ch1 << 6) & 0xC0) | (ch2 & 0x3F);
105 // ch1 is now the first byte of the potential native char
106
107 ch2 = *chp++;
108 if (ch2 & 0x80)
109 ch2 = ((ch2 << 6) & 0xC0) | (*chp++ & 0x3F);
110 // ch2 is now the second byte of the potential native char
111 int ch = (int)ch1 << 8 | (int)ch2;
112 result &= findPossibleEncodings(ch);
113 }
114 // else ASCII character, which could be anything
115 }
116
117 return result;
118}
119
120void MediaScannerClient::convertValues(uint32_t encoding)
121{
122 const char* enc = NULL;
123 switch (encoding) {
124 case kEncodingShiftJIS:
125 enc = "shift-jis";
126 break;
127 case kEncodingGBK:
128 enc = "gbk";
129 break;
130 case kEncodingBig5:
131 enc = "Big5";
132 break;
133 case kEncodingEUCKR:
134 enc = "EUC-KR";
135 break;
136 }
137
138 if (enc) {
139 UErrorCode status = U_ZERO_ERROR;
140
141 UConverter *conv = ucnv_open(enc, &status);
142 if (U_FAILURE(status)) {
143 LOGE("could not create UConverter for %s\n", enc);
144 return;
145 }
146 UConverter *utf8Conv = ucnv_open("UTF-8", &status);
147 if (U_FAILURE(status)) {
148 LOGE("could not create UConverter for UTF-8\n");
149 ucnv_close(conv);
150 return;
151 }
152
153 // for each value string, convert from native encoding to UTF-8
154 for (int i = 0; i < mNames->size(); i++) {
155 // first we need to untangle the utf8 and convert it back to the original bytes
156 // since we are reducing the length of the string, we can do this in place
157 uint8_t* src = (uint8_t *)mValues->getEntry(i);
158 int len = strlen((char *)src);
159 uint8_t* dest = src;
160
161 uint8_t uch;
162 while ((uch = *src++)) {
163 if (uch & 0x80)
164 *dest++ = ((uch << 6) & 0xC0) | (*src++ & 0x3F);
165 else
166 *dest++ = uch;
167 }
168 *dest = 0;
169
170 // now convert from native encoding to UTF-8
171 const char* source = mValues->getEntry(i);
172 int targetLength = len * 3 + 1;
173 char* buffer = new char[targetLength];
174 if (!buffer)
175 break;
176 char* target = buffer;
177
178 ucnv_convertEx(utf8Conv, conv, &target, target + targetLength,
179 &source, (const char *)dest, NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
180 if (U_FAILURE(status)) {
181 LOGE("ucnv_convertEx failed: %d\n", status);
182 mValues->setEntry(i, "???");
183 } else {
184 // zero terminate
185 *target = 0;
186 mValues->setEntry(i, buffer);
187 }
188
189 delete[] buffer;
190 }
191
192 ucnv_close(conv);
193 ucnv_close(utf8Conv);
194 }
195}
196
197void MediaScannerClient::endFile()
198{
Marco Nelissenb45b0842010-02-10 09:56:05 -0800199 int size = mNames->size();
200 uint32_t encoding = kEncodingAll;
Andreas Huberbfb9fb12009-12-03 11:31:19 -0800201
Marco Nelissenb45b0842010-02-10 09:56:05 -0800202 // compute a bit mask containing all possible encodings
203 for (int i = 0; i < mNames->size(); i++)
204 encoding &= possibleEncodings(mValues->getEntry(i));
Andreas Huberbfb9fb12009-12-03 11:31:19 -0800205
Marco Nelissenb45b0842010-02-10 09:56:05 -0800206 // If one of the possible encodings matches the locale encoding, use that.
207 // Otherwise, if there is only one possible encoding, use that.
208 if (encoding & mLocaleEncoding)
209 convertValues(mLocaleEncoding);
210 else if ((encoding & (encoding - 1)) == 0)
211 convertValues(encoding);
212 else {
213 // TODO: try harder to disambiguate the encoding, perhaps by looking at
214 // other files by same artist, or even the user's entire collection.
215 // For now, fall through and insert the strings as they are.
216 }
Andreas Huberbfb9fb12009-12-03 11:31:19 -0800217
Marco Nelissenb45b0842010-02-10 09:56:05 -0800218 // finally, push all name/value pairs to the client
219 for (int i = 0; i < mNames->size(); i++) {
220 if (!handleStringTag(mNames->getEntry(i), mValues->getEntry(i)))
221 break;
Andreas Huberbfb9fb12009-12-03 11:31:19 -0800222 }
223 // else addStringTag() has done all the work so we have nothing to do
224
225 delete mNames;
226 delete mValues;
227 mNames = NULL;
228 mValues = NULL;
229}
230
231} // namespace android
232