Blame - src/unicode.c - platform/external/libmtp

blob: 89c2cbe87d2c7f9af4f11a99d6b0fc5c89004b60 [file] [log] [blame]

Linus Walleij	b9256fd	2006-02-15 09:40:43 +0000	[diff] [blame]	1	/**
				2	* \file unicode.c
				3	*
				4	* This file contains general Unicode string manipulation functions.
				5	* It mainly consists of functions for converting between UCS-2 (used on
Linus Walleij	c00f70d	2006-07-30 18:48:55 +0000	[diff] [blame]	6	* the devices) and UTF-8 (used by several applications).
				7	*
				8	* For a deeper understanding of Unicode encoding formats see the
				9	* Wikipedia entries for
				10	* <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
				11	* and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
Linus Walleij	2f45d22	2007-02-02 22:47:39 +0000	[diff] [blame]	12	*
Linus Walleij	6db174f	2009-05-09 13:15:26 +0000	[diff] [blame]	13	* Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se>
Linus Walleij	2f45d22	2007-02-02 22:47:39 +0000	[diff] [blame]	14	*
				15	* This library is free software; you can redistribute it and/or
				16	* modify it under the terms of the GNU Lesser General Public
				17	* License as published by the Free Software Foundation; either
				18	* version 2 of the License, or (at your option) any later version.
				19	*
				20	* This library is distributed in the hope that it will be useful,
				21	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				22	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				23	* Lesser General Public License for more details.
				24	*
				25	* You should have received a copy of the GNU Lesser General Public
				26	* License along with this library; if not, write to the
				27	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				28	* Boston, MA 02111-1307, USA.
				29	*
Linus Walleij	b9256fd	2006-02-15 09:40:43 +0000	[diff] [blame]	30	*/
				31
Lei Zhang	e65fc0b	2013-02-13 15:11:04 -0800	[diff] [blame]	32	#include "config.h"
				33
Linus Walleij	b9256fd	2006-02-15 09:40:43 +0000	[diff] [blame]	34	#include <stdlib.h>
				35	#include <string.h>
Linus Walleij	6db174f	2009-05-09 13:15:26 +0000	[diff] [blame]	36	#ifdef HAVE_ICONV
Linus Walleij	c6d7c98	2009-05-10 13:24:36 +0000	[diff] [blame]	37	#include "iconv.h"
Linus Walleij	6db174f	2009-05-09 13:15:26 +0000	[diff] [blame]	38	#else
				39	#error "libmtp unicode.c needs fixing to work without iconv()!"
				40	#endif
Linus Walleij	b9256fd	2006-02-15 09:40:43 +0000	[diff] [blame]	41	#include "libmtp.h"
				42	#include "unicode.h"
				43	#include "util.h"
Linus Walleij	d5d51c8	2006-09-11 06:57:50 +0000	[diff] [blame]	44	#include "ptp.h"
Linus Walleij	16571dc	2006-08-17 20:27:46 +0000	[diff] [blame]	45
/**
 * The size of the buffer (in characters) used for creating string copies.
 *
 * The conversion routines in this file size their on-stack scratch
 * buffers from this value: utf16_to_utf8() reserves three bytes per
 * character plus a terminator, utf8_to_utf16() reserves two bytes per
 * character plus a terminator. Converted output that does not fit is
 * cut off at the buffer boundary.
 */
#define STRING_BUFFER_LENGTH 1024
Linus Walleij	b9256fd	2006-02-15 09:40:43 +0000	[diff] [blame]	50
/**
 * Counts the 16-bit code units of a zero-terminated UCS-2 string.
 * The terminating 0x0000 unit itself is not counted, so a string
 * stored as the bytes 0x00 0x41 0x00 0x00 has length 1.
 *
 * @param unicstr a UCS-2 string ending in a 0x0000 code unit
 * @return the number of characters in front of the terminator. The
 *         size in bytes, terminator included, is this value times
 *         two, plus two.
 */
int ucs2_strlen(uint16_t const * const unicstr)
{
  uint16_t const *cursor = unicstr;
  int count = 0;

  /* Step through the code units until the 0x0000 terminator shows up. */
  while (*cursor != 0x0000U) {
    cursor++;
    count++;
  }
  return count;
}
				69
				70	/**
Linus Walleij	16571dc	2006-08-17 20:27:46 +0000	[diff] [blame]	71	* Converts a big-endian UTF-16 2-byte string
Linus Walleij	d5d51c8	2006-09-11 06:57:50 +0000	[diff] [blame]	72	* to a UTF-8 string. Actually just a UCS-2 internal conversion
				73	* routine that strips off the BOM if there is one.
Linus Walleij	16571dc	2006-08-17 20:27:46 +0000	[diff] [blame]	74	*
Linus Walleij	eab650b	2006-08-21 23:26:37 +0000	[diff] [blame]	75	* @param device a pointer to the current device.
Linus Walleij	16571dc	2006-08-17 20:27:46 +0000	[diff] [blame]	76	* @param unicstr the UTF-16 unicode string to convert
				77	* @return a UTF-8 string.
				78	*/
Linus Walleij	3ec8631	2006-08-21 13:25:24 +0000	[diff] [blame]	79	char utf16_to_utf8(LIBMTP_mtpdevice_t device, const uint16_t *unicstr)
Linus Walleij	16571dc	2006-08-17 20:27:46 +0000	[diff] [blame]	80	{
Linus Walleij	d5d51c8	2006-09-11 06:57:50 +0000	[diff] [blame]	81	PTPParams params = (PTPParams ) device->params;
				82	char stringp = (char ) unicstr;
				83	char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
				84	char *locp = loclstr;
				85	size_t nconv;
				86	size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
				87	size_t convmax = STRING_BUFFER_LENGTH*3;
Linus Walleij	f3c4405	2008-08-16 21:14:56 +0000	[diff] [blame]	88
Linus Walleij	d5d51c8	2006-09-11 06:57:50 +0000	[diff] [blame]	89	loclstr[0]='\0';
				90	/* Do the conversion. */
				91	nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax);
				92	if (nconv == (size_t) -1) {
				93	// Return partial string anyway.
				94	*locp = '\0';
Linus Walleij	16571dc	2006-08-17 20:27:46 +0000	[diff] [blame]	95	}
Linus Walleij	d5d51c8	2006-09-11 06:57:50 +0000	[diff] [blame]	96	loclstr[STRING_BUFFER_LENGTH*3] = '\0';
				97	// Strip off any BOM, it's totally useless...
				98	if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
				99	return strdup(loclstr+3);
				100	}
				101	return strdup(loclstr);
Linus Walleij	16571dc	2006-08-17 20:27:46 +0000	[diff] [blame]	102	}
Linus Walleij	d3b7857	2007-08-24 21:28:24 +0000	[diff] [blame]	103
				104	/**
Linus Walleij	f3c4405	2008-08-16 21:14:56 +0000	[diff] [blame]	105	* Converts a UTF-8 string to a big-endian UTF-16 2-byte string
				106	* Actually just a UCS-2 internal conversion.
				107	*
				108	* @param device a pointer to the current device.
				109	* @param localstr the UTF-8 unicode string to convert
				110	* @return a UTF-16 string.
				111	*/
				112	uint16_t utf8_to_utf16(LIBMTP_mtpdevice_t device, const char *localstr)
				113	{
				114	PTPParams params = (PTPParams ) device->params;
				115	char stringp = (char ) localstr; // cast away "const"
				116	char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char.
				117	char *unip = unicstr;
				118	size_t nconv = 0;
				119	size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator
				120	size_t convmax = STRING_BUFFER_LENGTH*2;
				121
				122	unicstr[0]='\0';
				123	unicstr[1]='\0';
				124
				125	/* Do the conversion. */
				126	nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax);
				127
				128	if (nconv == (size_t) -1) {
				129	// Return partial string anyway.
				130	unip[0] = '\0';
				131	unip[1] = '\0';
				132	}
				133	// make sure the string is null terminated
				134	unicstr[STRING_BUFFER_LENGTH*2] = '\0';
				135	unicstr[STRING_BUFFER_LENGTH*2+1] = '\0';
				136
				137	// allocate the string to be returned
				138	// Note: can't use strdup since every other byte is a null byte
				139	int ret_len = ucs2_strlen((uint16_t)unicstr)sizeof(uint16_t)+2;
				140	uint16_t* ret = malloc(ret_len);
				141	memcpy(ret,unicstr,(size_t)ret_len);
				142	return ret;
				143	}
				144
				145	/**
Linus Walleij	d3b7857	2007-08-24 21:28:24 +0000	[diff] [blame]	146	* This helper function simply removes any consecutive chars
				147	* > 0x7F and replace then with an underscore. In UTF-8
				148	* consequtive chars > 0x7F represent one single character so
				149	* it has to be done like this (and it's elegant). It will only
				150	* shrink the string in size so no copying is needed.
				151	*/
				152	void strip_7bit_from_utf8(char *str)
				153	{
				154	int i,j,k;
				155	i = 0;
				156	j = 0;
				157	k = strlen(str);
				158	while (i < k) {
				159	if ((uint8_t) str[i] > 0x7FU) {
				160	str[j] = '_';
Linus Walleij	1a90559	2007-08-25 18:24:36 +0000	[diff] [blame]	161	i++;
Linus Walleij	d3b7857	2007-08-24 21:28:24 +0000	[diff] [blame]	162	// Skip over any consequtive > 0x7F chars.
				163	while((uint8_t) str[i] > 0x7FU) {
				164	i++;
				165	}
				166	} else {
				167	str[j] = str[i];
				168	i++;
				169	}
				170	j++;
				171	}
				172	// Terminate stripped string...
				173	str[j] = '\0';
				174	}