src/unicode.c - platform/external/libmtp - Gitiles

 /**
  * \file unicode.c
  *
  * This file contains general Unicode string manipulation functions.
  * It mainly consist of functions for converting between UCS-2 (used on
  * the devices), UTF-8 (used by several applications) and
  * ISO 8859-1 / Codepage 1252 (fallback).
  */

 #include <stdlib.h>
 #include <string.h>
 #include "libmtp.h"
 #include "unicode.h"
 #include "util.h"

 /**
  * The size of the buffer (in characters) used for creating string copies.
  */
 #define STRING_BUFFER_LENGTH 256

 /**
  * Gets the length (in characters, not bytes) of a unicode
  * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00
  * will return a value of 1.
  *
  * @param unicstr a UCS-2 Unicode string
  * @return the length of the string, in number of characters. If you
  *         want to know the length in bytes, multiply this by two and
  *         add two (for zero terminator).
  */
 int ucs2_strlen(const uint16_t *unicstr)
 {
   int length;

   /* Unicode strings are terminated with 2 * 0x00 */
   for(length = 0; unicstr[length] != 0x0000U; length ++);
   return length;
 }

 /**
  * This routine returns the length in bytes that this
  * UCS-2 string would occupy if encoded as UTF-8
  *
  * @param unicstr the Unicode UCS-2 string to analyze
  * @return the number of bytes this string would occupy
  *         in UTF-8
  */
 static int ucs2utf8len(const uint16_t *unicstr){
   int length=0;
   int i;
   uint8_t *locstr = (uint8_t *) unicstr;
   for(i = 0; (locstr[i] | locstr[i+1]) != '\0'; i+=2) {
     if (locstr[i] == 0x00 && locstr[i+1] < 0x80)
       length ++;
     else if (locstr[i] < 0x08)
       length += 2;
     else
       length += 3;
   }
   return length;
 }

 /**
  * Create a new, allocated UCS-2 string that is a copy
  * of the parameter
  *
  * @param unicstr the UCS-2 string to copy
  * @return a newly allocated copy of the string
  */
 static uint16_t *ucs2strdup(const uint16_t *unicstr) {
   int length = ucs2_strlen(unicstr);
   uint8_t *data;

   data = (uint8_t *) malloc(length*2+2);
   if ( data == NULL ) {
     return NULL;
   }
   memcpy(data, unicstr, length*2+2);
   return (uint16_t *) data;
 }


 /**
  * Converts a Unicode UCS-2 2-byte string to a UTF-8
  * string.
  *
  * @param unicstr the UCS-2 unicode string to convert
  * @return a UTF-8 string.
  */
 char *ucs2_to_utf8(const uint16_t *unicstr){
   char *data = NULL;
   int i = 0;
   int l = 0;
   int length8;
   uint8_t *locstr = (uint8_t *) unicstr;

   length8 = ucs2utf8len(unicstr);
   data = (char *) malloc(length8+1);
   if ( data == NULL ) {
     return NULL;
   }
   for(l = 0; (locstr[l] | locstr[l+1]) != '\0'; l += 2) {
     if (locstr[l+1] == 0x00 && locstr[l] < 0x80U) {
       data[i] = locstr[l];
       i ++;
     } else if (locstr[l+1] < 0x08) {
       data[i] = 0xc0 | (locstr[l+1]<<2 & 0x1C) | (locstr[l]>>6  & 0x03);
       data[i+1] = 0x80 | (locstr[l] & 0x3F);
       i+=2;
     } else {
       data[i] = 0xe0 | (locstr[l+1]>>4 & 0x0F);
       data[i+1] = 0x80 | (locstr[l+1]<<2 & 0x3C) | (locstr[l]>>6 & 0x03);
       data[i+2] = 0x80 | (locstr[l] & 0x3F);
       i+=3;
     }
   }
   /* Terminate string */
   data[i] = 0x00;

   return data;
 }

 /**
  * Convert a UTF-8 string to a unicode UCS-2 string.
  *
  * @param str the UTF-8 string to convert.
  * @return a pointer to a newly allocated UCS-2 string.
  */
 uint16_t *utf8_to_ucs2(const unsigned char *str) {
   uint16_t *retval;
   int i;
   unsigned char buffer[STRING_BUFFER_LENGTH*2];
   int length=0;

   for(i = 0; str[i] != '\0';) {
     if (str[i] < 0x80) {
       buffer[length+1] = 0x00;
       buffer[length] = str[i];
       length += 2;
       i++;
     } else {
       unsigned char numbytes = 0;
       unsigned char lenbyte = 0;

       /* Read the number of encoded bytes */
       lenbyte = str[i];
       while (lenbyte & 0x80) {
 	numbytes++;
 	lenbyte = lenbyte<<1;
       }
       /* UCS-2 can handle no more than 3 UTF-8 encoded bytes */
       if (numbytes <= 3) {
 	if (numbytes == 2 && str[i+1] > 0x80) {
 	  /* This character can always be handled correctly */
 	  buffer[length+1] = (str[i]>>2 & 0x07);
 	  buffer[length] = (str[i]<<6 & 0xC0) | (str[i+1] & 0x3F);
 	  i += 2;
 	  length += 2;
 	} else if (numbytes == 3 && str[i+1] > 0x80 && str[i+2] > 0x80) {
 	  buffer[length+1] = (str[i]<<4 & 0xF0) | (str[i+1]>>2 & 0x0F);
 	  buffer[length]= (str[i+1]<<6 & 0xC0) | (str[i+2] & 0x3F);
 	  i += 3;
 	  length += 2;
 	} else {
 	  /* Abnormal string character, just skip */
 	  i += numbytes;
 	}
       } else {
 	/* Just skip that character */
 	i += numbytes;
       }
     }
   }
   /* Copy the buffer contents */
   buffer[length+1] = 0x00;
   buffer[length] = 0x00;

   retval = ucs2strdup((uint16_t *) buffer);
   return retval;
 }
	/**
	* \file unicode.c
	*
	* This file contains general Unicode string manipulation functions.
	* It mainly consist of functions for converting between UCS-2 (used on
	* the devices), UTF-8 (used by several applications) and
	* ISO 8859-1 / Codepage 1252 (fallback).
	*/

	#include <stdlib.h>
	#include <string.h>
	#include "libmtp.h"
	#include "unicode.h"
	#include "util.h"

	/**
	* The size of the buffer (in characters) used for creating string copies.
	*/
	#define STRING_BUFFER_LENGTH 256

	/**
	* Gets the length (in characters, not bytes) of a unicode
	* UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00
	* will return a value of 1.
	*
	* @param unicstr a UCS-2 Unicode string
	* @return the length of the string, in number of characters. If you
	* want to know the length in bytes, multiply this by two and
	* add two (for zero terminator).
	*/
	int ucs2_strlen(const uint16_t *unicstr)
	{
	int length;

	/* Unicode strings are terminated with 2 * 0x00 */
	for(length = 0; unicstr[length] != 0x0000U; length ++);
	return length;
	}

	/**
	* This routine returns the length in bytes that this
	* UCS-2 string would occupy if encoded as UTF-8
	*
	* @param unicstr the Unicode UCS-2 string to analyze
	* @return the number of bytes this string would occupy
	* in UTF-8
	*/
	static int ucs2utf8len(const uint16_t *unicstr){
	int length=0;
	int i;
	uint8_t locstr = (uint8_t ) unicstr;
	for(i = 0; (locstr[i] \| locstr[i+1]) != '\0'; i+=2) {
	if (locstr[i] == 0x00 && locstr[i+1] < 0x80)
	length ++;
	else if (locstr[i] < 0x08)
	length += 2;
	else
	length += 3;
	}
	return length;
	}

	/**
	* Create a new, allocated UCS-2 string that is a copy
	* of the parameter
	*
	* @param unicstr the UCS-2 string to copy
	* @return a newly allocated copy of the string
	*/
	static uint16_t ucs2strdup(const uint16_t unicstr) {
	int length = ucs2_strlen(unicstr);
	uint8_t *data;

	data = (uint8_t ) malloc(length2+2);
	if ( data == NULL ) {
	return NULL;
	}
	memcpy(data, unicstr, length*2+2);
	return (uint16_t *) data;
	}


	/**
	* Converts a Unicode UCS-2 2-byte string to a UTF-8
	* string.
	*
	* @param unicstr the UCS-2 unicode string to convert
	* @return a UTF-8 string.
	*/
	char ucs2_to_utf8(const uint16_t unicstr){
	char *data = NULL;
	int i = 0;
	int l = 0;
	int length8;
	uint8_t locstr = (uint8_t ) unicstr;

	length8 = ucs2utf8len(unicstr);
	data = (char *) malloc(length8+1);
	if ( data == NULL ) {
	return NULL;
	}
	for(l = 0; (locstr[l] \| locstr[l+1]) != '\0'; l += 2) {
	if (locstr[l+1] == 0x00 && locstr[l] < 0x80U) {
	data[i] = locstr[l];
	i ++;
	} else if (locstr[l+1] < 0x08) {
	data[i] = 0xc0 \| (locstr[l+1]<<2 & 0x1C) \| (locstr[l]>>6 & 0x03);
	data[i+1] = 0x80 \| (locstr[l] & 0x3F);
	i+=2;
	} else {
	data[i] = 0xe0 \| (locstr[l+1]>>4 & 0x0F);
	data[i+1] = 0x80 \| (locstr[l+1]<<2 & 0x3C) \| (locstr[l]>>6 & 0x03);
	data[i+2] = 0x80 \| (locstr[l] & 0x3F);
	i+=3;
	}
	}
	/* Terminate string */
	data[i] = 0x00;

	return data;
	}

	/**
	* Convert a UTF-8 string to a unicode UCS-2 string.
	*
	* @param str the UTF-8 string to convert.
	* @return a pointer to a newly allocated UCS-2 string.
	*/
	uint16_t utf8_to_ucs2(const unsigned char str) {
	uint16_t *retval;
	int i;
	unsigned char buffer[STRING_BUFFER_LENGTH*2];
	int length=0;

	for(i = 0; str[i] != '\0';) {
	if (str[i] < 0x80) {
	buffer[length+1] = 0x00;
	buffer[length] = str[i];
	length += 2;
	i++;
	} else {
	unsigned char numbytes = 0;
	unsigned char lenbyte = 0;

	/* Read the number of encoded bytes */
	lenbyte = str[i];
	while (lenbyte & 0x80) {
	numbytes++;
	lenbyte = lenbyte<<1;
	}
	/* UCS-2 can handle no more than 3 UTF-8 encoded bytes */
	if (numbytes <= 3) {
	if (numbytes == 2 && str[i+1] > 0x80) {
	/* This character can always be handled correctly */
	buffer[length+1] = (str[i]>>2 & 0x07);
	buffer[length] = (str[i]<<6 & 0xC0) \| (str[i+1] & 0x3F);
	i += 2;
	length += 2;
	} else if (numbytes == 3 && str[i+1] > 0x80 && str[i+2] > 0x80) {
	buffer[length+1] = (str[i]<<4 & 0xF0) \| (str[i+1]>>2 & 0x0F);
	buffer[length]= (str[i+1]<<6 & 0xC0) \| (str[i+2] & 0x3F);
	i += 3;
	length += 2;
	} else {
	/* Abnormal string character, just skip */
	i += numbytes;
	}
	} else {
	/* Just skip that character */
	i += numbytes;
	}
	}
	}
	/* Copy the buffer contents */
	buffer[length+1] = 0x00;
	buffer[length] = 0x00;

	retval = ucs2strdup((uint16_t *) buffer);
	return retval;
	}