Blame - src/unicode.c - platform/external/libmtp

blob: be07e2105aa49ee751a39a466d8005d3ae536105 [file] [log] [blame]

Linus Walleij	b9256fd	2006-02-15 09:40:43 +0000	[diff] [blame]	1	/**
				2	* \file unicode.c
				3	*
				4	* This file contains general Unicode string manipulation functions.
				5	* It mainly consists of functions for converting between UCS-2 (used on
				6	* the devices), UTF-8 (used by several applications) and
				7	* ISO 8859-1 / Codepage 1252 (fallback).
				8	*/
				9
				10	#include <stdlib.h>
				11	#include <string.h>
				12	#include "libmtp.h"
				13	#include "unicode.h"
				14	#include "util.h"
				15
				16	/**
				17	* The size of the buffer (in characters) used for creating string copies.
				18	*/
				19	#define STRING_BUFFER_LENGTH 256
				20
				/**
				 * Gets the length (in characters, not bytes) of a UCS-2 string,
				 * e.g. a string which physically is 0x41 0x00 0x00 0x00 yields 1.
				 *
				 * @param unicstr a UCS-2 string terminated by a 16-bit zero; may be NULL
				 * @return the number of 16-bit characters before the terminator, or 0
				 *         when unicstr is NULL. The size in bytes, including the
				 *         terminator, is this value times two plus two.
				 */
				int ucs2_strlen(const uint16_t *unicstr)
				{
				  int length = 0;

				  if (unicstr == NULL) {
				    return 0;
				  }
				  /* The string ends at the first 16-bit unit whose two bytes are both zero */
				  while (unicstr[length] != 0x0000U) {
				    length++;
				  }
				  return length;
				}
				39
				/**
				 * Returns the number of bytes a UCS-2 string would occupy when
				 * encoded as UTF-8, not counting the terminating zero byte.
				 *
				 * Each 16-bit unit is read as two bytes, low byte first, which is the
				 * same byte order ucs2_to_utf8() uses when it performs the conversion.
				 * The two routines must classify every character identically, since
				 * this result is used to size the buffer the conversion writes into.
				 *
				 * @param unicstr the UCS-2 string to analyze; may be NULL
				 * @return the UTF-8 length in bytes, or 0 when unicstr is NULL
				 */
				static int ucs2utf8len(const uint16_t *unicstr)
				{
				  const unsigned char *locstr = (const unsigned char *) unicstr;
				  int length = 0;
				  int i;

				  if (unicstr == NULL) {
				    return 0;
				  }
				  /* Stop at the first unit whose low and high bytes are both zero */
				  for (i = 0; (locstr[i] | locstr[i+1]) != 0; i += 2) {
				    const unsigned char lo = locstr[i];
				    const unsigned char hi = locstr[i+1];

				    if (hi == 0x00 && lo < 0x80) {
				      /* U+0000..U+007F: one byte */
				      length += 1;
				    } else if (hi < 0x08) {
				      /* U+0080..U+07FF: two bytes */
				      length += 2;
				    } else {
				      /* U+0800..U+FFFF: three bytes */
				      length += 3;
				    }
				  }
				  return length;
				}
				62
				/**
				 * Creates a newly allocated UCS-2 string that is a copy of the
				 * parameter, including its 16-bit zero terminator.
				 *
				 * @param unicstr the UCS-2 string to copy; may be NULL
				 * @return a malloc()ed copy that the caller must free(), or NULL when
				 *         unicstr is NULL or the allocation fails
				 */
				static uint16_t *ucs2strdup(const uint16_t *unicstr)
				{
				  size_t nbytes;
				  uint16_t *copy;

				  if (unicstr == NULL) {
				    return NULL;
				  }
				  /* Two bytes per character plus two for the terminator */
				  nbytes = ((size_t) ucs2_strlen(unicstr) + 1) * sizeof(uint16_t);
				  copy = malloc(nbytes);
				  if (copy == NULL) {
				    return NULL;
				  }
				  memcpy(copy, unicstr, nbytes);
				  return copy;
				}
				81
				82
				/**
				 * Converts a UCS-2 string (two bytes per character, low byte first)
				 * to a newly allocated, zero-terminated UTF-8 string.
				 *
				 * @param unicstr the UCS-2 string to convert; may be NULL
				 * @return a malloc()ed UTF-8 string that the caller must free(), or
				 *         NULL when unicstr is NULL or the allocation fails
				 */
				char *ucs2_to_utf8(const uint16_t *unicstr)
				{
				  const unsigned char *locstr = (const unsigned char *) unicstr;
				  /* Built as unsigned bytes so values above 0x7F do not depend on
				   * whether plain char is signed */
				  unsigned char *data;
				  int length8;
				  int i = 0;
				  int l;

				  if (unicstr == NULL) {
				    return NULL;
				  }

				  length8 = ucs2utf8len(unicstr);
				  data = malloc((size_t) length8 + 1);
				  if (data == NULL) {
				    return NULL;
				  }

				  for (l = 0; (locstr[l] | locstr[l+1]) != 0; l += 2) {
				    const unsigned char lo = locstr[l];
				    const unsigned char hi = locstr[l+1];
				    int need;

				    if (hi == 0x00 && lo < 0x80) {
				      need = 1;
				    } else if (hi < 0x08) {
				      need = 2;
				    } else {
				      need = 3;
				    }
				    /* Never write past the size the buffer was allocated with, even
				     * if the length computation and this loop were to disagree */
				    if (i + need > length8) {
				      break;
				    }

				    if (need == 1) {
				      data[i] = lo;
				    } else if (need == 2) {
				      data[i]   = (unsigned char) (0xC0 | ((hi << 2) & 0x1C) | ((lo >> 6) & 0x03));
				      data[i+1] = (unsigned char) (0x80 | (lo & 0x3F));
				    } else {
				      data[i]   = (unsigned char) (0xE0 | ((hi >> 4) & 0x0F));
				      data[i+1] = (unsigned char) (0x80 | ((hi << 2) & 0x3C) | ((lo >> 6) & 0x03));
				      data[i+2] = (unsigned char) (0x80 | (lo & 0x3F));
				    }
				    i += need;
				  }
				  /* Terminate string */
				  data[i] = 0x00;

				  return (char *) data;
				}
				122
				123	/**
				124	* Convert a UTF-8 string to a unicode UCS-2 string.
				125	*
				126	* @param str the UTF-8 string to convert.
				127	* @return a pointer to a newly allocated UCS-2 string.
				128	*/
				129	uint16_t utf8_to_ucs2(const unsigned char str) {
				130	uint16_t *retval;
				131	int i;
				132	unsigned char buffer[STRING_BUFFER_LENGTH*2];
				133	int length=0;
				134
				135	for(i = 0; str[i] != '\0';) {
				136	if (str[i] < 0x80) {
				137	buffer[length+1] = 0x00;
				138	buffer[length] = str[i];
				139	length += 2;
				140	i++;
				141	} else {
				142	unsigned char numbytes = 0;
				143	unsigned char lenbyte = 0;
				144
				145	/* Read the number of encoded bytes */
				146	lenbyte = str[i];
				147	while (lenbyte & 0x80) {
				148	numbytes++;
				149	lenbyte = lenbyte<<1;
				150	}
				151	/* UCS-2 can handle no more than 3 UTF-8 encoded bytes */
				152	if (numbytes <= 3) {
				153	if (numbytes == 2 && str[i+1] > 0x80) {
				154	/* This character can always be handled correctly */
				155	buffer[length+1] = (str[i]>>2 & 0x07);
				156	buffer[length] = (str[i]<<6 & 0xC0) \| (str[i+1] & 0x3F);
				157	i += 2;
				158	length += 2;
				159	} else if (numbytes == 3 && str[i+1] > 0x80 && str[i+2] > 0x80) {
				160	buffer[length+1] = (str[i]<<4 & 0xF0) \| (str[i+1]>>2 & 0x0F);
				161	buffer[length]= (str[i+1]<<6 & 0xC0) \| (str[i+2] & 0x3F);
				162	i += 3;
				163	length += 2;
				164	} else {
				165	/* Abnormal string character, just skip */
				166	i += numbytes;
				167	}
				168	} else {
				169	/* Just skip that character */
				170	i += numbytes;
				171	}
				172	}
				173	}
				174	/* Copy the buffer contents */
				175	buffer[length+1] = 0x00;
				176	buffer[length] = 0x00;
				177
				178	retval = ucs2strdup((uint16_t *) buffer);
				179	return retval;
				180	}