blob: 8b87c70ad50a2c9f224ae95ec8b2b008db3a01fd [file] [log] [blame]
/**
* \file unicode.c
*
* This file contains general Unicode string manipulation functions.
* It mainly consists of functions for converting between UCS-2 (used on
* the devices) and UTF-8 (used by several applications).
*
* For a deeper understanding of Unicode encoding formats see the
* Wikipedia entries for
* <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
* and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
*
* Copyright (C) 2005-2007 Linus Walleij <triad@df.lth.se>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*
*/
#include <stdlib.h>
#include <string.h>
#include <iconv.h>
#include "libmtp.h"
#include "unicode.h"
#include "util.h"
#include "ptp.h"
/**
 * The size of the buffer (in characters) used for creating string copies.
 *
 * utf16_to_utf8() sizes its on-stack output buffer from this value,
 * reserving three bytes per character plus one byte for the terminator,
 * so it bounds the length of the converted string that can be returned.
 */
#define STRING_BUFFER_LENGTH 1024
/**
 * Counts the 16-bit characters in a zero-terminated UCS-2 string.
 * The terminating 0x0000 unit is not included in the count, so a
 * string stored as the bytes 0x00 0x41 0x00 0x00 has a length of 1.
 *
 * @param unicstr a UCS-2 Unicode string, terminated by a 0x0000 unit.
 * @return the number of characters before the terminator. To get the
 *         size in bytes including the terminator, multiply this by
 *         two and add two.
 */
int ucs2_strlen(uint16_t const * const unicstr)
{
  const uint16_t *cursor = unicstr;

  /* Walk forward until the 16-bit zero terminator is reached. */
  while (*cursor != 0x0000U) {
    cursor++;
  }
  return (int) (cursor - unicstr);
}
/**
 * Converts a zero-terminated string of 16-bit Unicode characters to an
 * 8-bit string by running it through the <code>cd_ucs2_to_locale</code>
 * iconv descriptor stored in the device's PTP parameters. If the
 * converted text starts with a UTF-8 byte order mark (0xEF 0xBB 0xBF),
 * the mark is removed from the result.
 *
 * The conversion goes through a fixed-size stack buffer of
 * <code>STRING_BUFFER_LENGTH*3</code> bytes plus a terminator. If iconv
 * reports an error (for example because the output does not fit), the
 * part converted so far is returned instead of failing.
 *
 * @param device a pointer to the current device; its <code>params</code>
 *        member is dereferenced, so it must not be NULL.
 * @param unicstr the zero-terminated 16-bit Unicode string to convert.
 * @return a copy of the converted string made with strdup(), so the
 *         caller is expected to free() it; NULL if strdup() fails.
 */
char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr)
{
  PTPParams *params = (PTPParams *) device->params;
  /* Room for three output bytes per input character, plus a terminator. */
  char converted[STRING_BUFFER_LENGTH*3+1];
  /* iconv() takes a non-const input pointer and advances it as it reads. */
  char *src = (char *) unicstr;
  char *dst = converted;
  /* Input size in bytes, counting the 16-bit terminator as well. */
  size_t src_left = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t);
  /* The final byte of the buffer is kept back for the terminator. */
  size_t dst_left = STRING_BUFFER_LENGTH*3;
  const char *result = converted;

  converted[0] = '\0';

  if (iconv(params->cd_ucs2_to_locale, &src, &src_left, &dst, &dst_left) == (size_t) -1) {
    /* Conversion stopped early: terminate what was produced so far. */
    *dst = '\0';
  }
  /* Guarantee termination even when the output filled the whole buffer. */
  converted[STRING_BUFFER_LENGTH*3] = '\0';

  /* Skip over a leading byte order mark, it carries no information here. */
  if ((uint8_t) converted[0] == 0xEFU &&
      (uint8_t) converted[1] == 0xBBU &&
      (uint8_t) converted[2] == 0xBFU) {
    result += 3;
  }
  return strdup(result);
}