// media/libdrm/mobile2/src/util/ustl-1.0/utf8.h - platform/frameworks/base - Gitiles

 // This file is part of the ustl library, an STL implementation.
 //
 // Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
 // This file is free software, distributed under the MIT License.
 //
 // utf8.h
 //
 // This file contains stream iterators that read and write UTF-8 encoded
 // characters. The encoding is defined as follows:
 //
 // U-00000000 - U-0000007F: 0xxxxxxx
 // U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
 // U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 // U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 // U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 // U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 // U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 //
 // The last range is not in the UTF-8 standard because Unicode forbids
 // characters of those values. However, since ustl::string uses this code
 // to write its length, the support is here. The reason it was put here
 // in the first place, is that extra code would have been necessary to
 // flag that range as invalid.
 //
 #ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
 #define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4

 #include "uiterator.h"

 namespace ustl {

 //----------------------------------------------------------------------

 typedef uint8_t utf8subchar_t;	///< Type for the encoding subcharacters.

 //----------------------------------------------------------------------

 /// Returns how many bytes the UTF-8 encoding of \p v occupies (1 through 7).
 inline size_t Utf8Bytes (wchar_t v)
 {
     // c_MaxValue[k] is the largest value that fits in a (k + 1)-byte sequence.
     // The final entry covers every 32-bit value, so the scan below always stops.
     static const uint32_t c_MaxValue[] = {
         0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF,
         0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF
     };
     const uint32_t code = uint32_t(v);
     size_t nBytes = 1;
     for (size_t k = 0; c_MaxValue[k] < code; ++k)
         ++nBytes;
     return (nBytes);
 }

 /// Returns the length in bytes of the UTF-8 sequence whose first byte is \p c.
 /// Only the low 8 bits of \p c are examined; it is a wchar_t to keep c in a
 /// full register.
 inline size_t Utf8SequenceBytes (wchar_t c)
 {
     // The header of a lead byte is a run of one-bits, one per byte of the
     // sequence, closed by a zero-bit. Counting the ones from bit 7 down gives:
     //   0      - a plain single byte character.
     //   1      - a byte from the middle of a character (an error here). It is
     //            reported as length 1 so a misaligned reader steps one byte
     //            at a time until it reaches the next character.
     //   2 or more - a multibyte character of that many bytes.
     // Errors are not reported, since the user can not correct them.
     size_t nLeadingOnes = 0;
     wchar_t probe = 0x80;
     while (c & probe) {
         probe >>= 1;
         ++nLeadingOnes;
     }
     if (!nLeadingOnes)
         return (1);        // A sequence is always at least 1 byte.
     return (nLeadingOnes);
 }

 //----------------------------------------------------------------------

 /// \class utf8in_iterator utf8.h ustl.h
 /// \ingroup IteratorAdaptors
 ///
 /// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
 ///
 /// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
 /// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
 /// There is no error handling; if the reading frame slips you'll get extra
 /// characters, one for every misaligned byte. Although it is possible to skip
 /// to the start of the next character, that would result in omitting the
 /// misformatted character and the one after it, making it very difficult to
 /// detect by the user. It is better to write some strange characters and let
 /// the user know his file is corrupted. Another problem is overflow on bad
 /// encodings (like a 0xFF on the end of a string). This is checked through
 /// the end-of-string nul character, which will always be there as long as
 /// you are using the string class.
 ///
 template <typename Iterator, typename WChar = wchar_t>
 class utf8in_iterator {
 public:
     typedef typename iterator_traits<Iterator>::value_type          value_type;
     typedef typename iterator_traits<Iterator>::difference_type     difference_type;
     typedef typename iterator_traits<Iterator>::pointer             pointer;
     typedef typename iterator_traits<Iterator>::reference           reference;
 public:
     /// Starts at \p is and decodes the character found there right away.
     explicit utf8in_iterator (const Iterator& is)
         : m_i (is), m_v (0)
     {
         Read();
     }
     /// Copies both the position and the already decoded value of \p other.
     utf8in_iterator (const utf8in_iterator& other)
         : m_i (other.m_i), m_v (other.m_v)
     {
     }
     const utf8in_iterator& operator= (const utf8in_iterator& other)
     {
         m_i = other.m_i;
         m_v = other.m_v;
         return (*this);
     }
     /// Returns the stored position moved back by one less than the encoded
     /// size of the current value, as computed by Utf8Bytes.
     Iterator base () const
     {
         return (m_i - (Utf8Bytes(m_v) - 1));
     }
     /// Returns the value decoded by the most recent Read().
     WChar operator* () const
     {
         return (m_v);
     }
     /// Moves past the current byte and decodes the character that follows.
     utf8in_iterator& operator++ ()
     {
         ++m_i;
         Read();
         return (*this);
     }
     utf8in_iterator operator++ (int)
     {
         utf8in_iterator previous (*this);
         operator++();
         return (previous);
     }
     /// Advances by \p n characters, one increment at a time.
     utf8in_iterator& operator+= (uoff_t n)
     {
         for (; n; --n)
             operator++();
         return (*this);
     }
     utf8in_iterator operator+ (uoff_t n)
     {
         utf8in_iterator moved (*this);
         moved += n;
         return (moved);
     }
     // Comparisons look only at the stored position, not at the decoded value.
     bool operator== (const utf8in_iterator& other) const
     {
         return (m_i == other.m_i);
     }
     bool operator< (const utf8in_iterator& other) const
     {
         return (m_i < other.m_i);
     }
     difference_type operator- (const utf8in_iterator& i) const;
 private:
     void Read (void);
 private:
     Iterator    m_i;    ///< Position in the underlying byte sequence.
     WChar       m_v;    ///< Value returned by operator*.
 };

 /// Decodes the character whose first byte is at m_i into m_v, leaving m_i
 /// on the last byte that was examined.
 template <typename Iterator, typename WChar>
 void utf8in_iterator<Iterator,WChar>::Read (void)
 {
     const utf8subchar_t lead = *m_i;
     size_t nLeft = Utf8SequenceBytes (lead);
     // The header sits in the top bits of the lead byte; keep what is below it.
     m_v = lead & (0xFF >> nLeft);
     // Fold in 6 payload bits per trailing byte. A zero byte ends the
     // character early, with m_i left pointing at that zero.
     for (--nLeft; nLeft; --nLeft) {
         ++m_i;
         if (!*m_i)
             break;
         m_v = (m_v << 6) | (*m_i & 0x3F);
     }
 }

 /// Returns the distance from \p last to this iterator in characters (as
 /// opposed to bytes). The count is made by walking forward from the position
 /// of \p last one sequence at a time; it is 0 when that position is not
 /// before this one.
 template <typename Iterator, typename WChar>
 typename utf8in_iterator<Iterator,WChar>::difference_type
 utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const
 {
     difference_type nChars = 0;
     Iterator pos (last.m_i);
     while (pos < m_i) {
         // Skip over one whole sequence, judged by its first byte.
         pos = advance (pos, Utf8SequenceBytes (*pos));
         ++nChars;
     }
     return (nChars);
 }

 //----------------------------------------------------------------------

 /// \class utf8out_iterator utf8.h ustl.h
 /// \ingroup IteratorAdaptors
 ///
 /// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
 ///
 template <typename Iterator, typename WChar = wchar_t>
 class utf8out_iterator {
 public:
     typedef typename iterator_traits<Iterator>::value_type          value_type;
     typedef typename iterator_traits<Iterator>::difference_type     difference_type;
     typedef typename iterator_traits<Iterator>::pointer             pointer;
     typedef typename iterator_traits<Iterator>::reference           reference;
 public:
     /// Wraps a copy of \p os.
     explicit utf8out_iterator (const Iterator& os)
         : m_i (os)
     {
     }
     utf8out_iterator (const utf8out_iterator& other)
         : m_i (other.m_i)
     {
     }
     /// Gives access to the wrapped iterator.
     const Iterator& base () const
     {
         return (m_i);
     }
     /// Writes \p v into the stream.
     utf8out_iterator& operator= (WChar v);
     // Dereference and both increments do nothing but hand back this
     // iterator (post-increment hands back a copy of it).
     utf8out_iterator& operator* ()
     {
         return (*this);
     }
     utf8out_iterator& operator++ ()
     {
         return (*this);
     }
     utf8out_iterator operator++ (int)
     {
         return (*this);
     }
     // Comparisons are those of the wrapped iterators.
     bool operator== (const utf8out_iterator& other) const
     {
         return (m_i == other.m_i);
     }
     bool operator< (const utf8out_iterator& other) const
     {
         return (m_i < other.m_i);
     }
 private:
     Iterator    m_i;    ///< Wrapped output iterator.
 };

 /// Encodes \p v as UTF-8 and writes the resulting bytes through the wrapped
 /// iterator, advancing it once per byte written. Returns this iterator.
 template <typename Iterator, typename WChar>
 utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)
 {
     const size_t nBytes = Utf8Bytes (v);
     if (nBytes > 1) {
         // Work on an unsigned 32-bit copy (the same conversion Utf8Bytes
         // applies) so that right shifts never sign-extend a negative WChar.
         const uint32_t bits = uint32_t(v);
         // Bit offset of the payload carried by the lead byte.
         size_t shift = (nBytes - 1) * 6;
         // A 7-byte sequence puts the lead payload at bit 36, beyond the
         // 32-bit value. Shifting that far is undefined; the payload is 0.
         const uint32_t leadBits = (shift < 32) ? (bits >> shift) : 0;
         // The header is nBytes one-bits at the top of the byte, then a zero.
         *m_i++ = (leadBits & 0x3F) | (0xFF << (8 - nBytes));
         // Every following byte is 10xxxxxx holding the next 6 bits of the value.
         while (shift) {
             shift -= 6;
             *m_i++ = ((bits >> shift) & 0x3F) | 0x80;
         }
     } else      // If only one byte, there is no header.
         *m_i++ = v;
     return (*this);
 }

 //----------------------------------------------------------------------

 /// Wraps \p i in a utf8out_iterator, giving a UTF-8 adaptor that writes to
 /// \p i. Useful in conjunction with back_insert_iterator.
 template <typename Iterator>
 inline utf8out_iterator<Iterator> utf8out (Iterator i)
 {
     typedef utf8out_iterator<Iterator> writer_t;
     return (writer_t (i));
 }

 /// Wraps \p i in a utf8in_iterator, giving a UTF-8 adaptor that reads from \p i.
 template <typename Iterator>
 inline utf8in_iterator<Iterator> utf8in (Iterator i)
 {
     typedef utf8in_iterator<Iterator> reader_t;
     return (reader_t (i));
 }

 //----------------------------------------------------------------------

 } // namespace ustl

 #endif
	// This file is part of the ustl library, an STL implementation.
	//
	// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
	// This file is free software, distributed under the MIT License.
	//
	// utf8.h
	//
	// This file contains stream iterators that read and write UTF-8 encoded
	// characters. The encoding is defined as follows:
	//
	// U-00000000 - U-0000007F: 0xxxxxxx
	// U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
	// U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
	// U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	// U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	// U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	// U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	//
	// The last range is not in the UTF-8 standard because Unicode forbids
	// characters of those values. However, since ustl::string uses this code
	// to write its length, the support is here. The reason it was put here
	// in the first place, is that extra code would have been necessary to
	// flag that range as invalid.
	//
	#ifndef UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4
	#define UTF8_H_3D7AEEEB3A88928D4D280B785F78B6F4

	#include "uiterator.h"

	namespace ustl {

	//----------------------------------------------------------------------

	typedef uint8_t utf8subchar_t; ///< Type for the encoding subcharacters.

	//----------------------------------------------------------------------

	/// Returns how many bytes the UTF-8 encoding of \p v occupies (1 through 7).
	inline size_t Utf8Bytes (wchar_t v)
	{
	    // c_MaxValue[k] is the largest value that fits in a (k + 1)-byte sequence.
	    // The final entry covers every 32-bit value, so the scan below always stops.
	    static const uint32_t c_MaxValue[] = {
	        0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF,
	        0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF
	    };
	    const uint32_t code = uint32_t(v);
	    size_t nBytes = 1;
	    for (size_t k = 0; c_MaxValue[k] < code; ++k)
	        ++nBytes;
	    return (nBytes);
	}

	/// Returns the length in bytes of the UTF-8 sequence whose first byte is \p c.
	/// Only the low 8 bits of \p c are examined; it is a wchar_t to keep c in a
	/// full register.
	inline size_t Utf8SequenceBytes (wchar_t c)
	{
	    // The header of a lead byte is a run of one-bits, one per byte of the
	    // sequence, closed by a zero-bit. Counting the ones from bit 7 down gives:
	    //   0      - a plain single byte character.
	    //   1      - a byte from the middle of a character (an error here). It is
	    //            reported as length 1 so a misaligned reader steps one byte
	    //            at a time until it reaches the next character.
	    //   2 or more - a multibyte character of that many bytes.
	    // Errors are not reported, since the user can not correct them.
	    size_t nLeadingOnes = 0;
	    wchar_t probe = 0x80;
	    while (c & probe) {
	        probe >>= 1;
	        ++nLeadingOnes;
	    }
	    if (!nLeadingOnes)
	        return (1);        // A sequence is always at least 1 byte.
	    return (nLeadingOnes);
	}

	//----------------------------------------------------------------------

	/// \class utf8in_iterator utf8.h ustl.h
	/// \ingroup IteratorAdaptors
	///
	/// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
	///
	/// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
	/// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
	/// There is no error handling; if the reading frame slips you'll get extra
	/// characters, one for every misaligned byte. Although it is possible to skip
	/// to the start of the next character, that would result in omitting the
	/// misformatted character and the one after it, making it very difficult to
	/// detect by the user. It is better to write some strange characters and let
	/// the user know his file is corrupted. Another problem is overflow on bad
	/// encodings (like a 0xFF on the end of a string). This is checked through
	/// the end-of-string nul character, which will always be there as long as
	/// you are using the string class.
	///
	template <typename Iterator, typename WChar = wchar_t>
	class utf8in_iterator {
	public:
	    typedef typename iterator_traits<Iterator>::value_type          value_type;
	    typedef typename iterator_traits<Iterator>::difference_type     difference_type;
	    typedef typename iterator_traits<Iterator>::pointer             pointer;
	    typedef typename iterator_traits<Iterator>::reference           reference;
	public:
	    /// Starts at \p is and decodes the character found there right away.
	    explicit utf8in_iterator (const Iterator& is)
	        : m_i (is), m_v (0)
	    {
	        Read();
	    }
	    /// Copies both the position and the already decoded value of \p other.
	    utf8in_iterator (const utf8in_iterator& other)
	        : m_i (other.m_i), m_v (other.m_v)
	    {
	    }
	    const utf8in_iterator& operator= (const utf8in_iterator& other)
	    {
	        m_i = other.m_i;
	        m_v = other.m_v;
	        return (*this);
	    }
	    /// Returns the stored position moved back by one less than the encoded
	    /// size of the current value, as computed by Utf8Bytes.
	    Iterator base () const
	    {
	        return (m_i - (Utf8Bytes(m_v) - 1));
	    }
	    /// Returns the value decoded by the most recent Read().
	    WChar operator* () const
	    {
	        return (m_v);
	    }
	    /// Moves past the current byte and decodes the character that follows.
	    utf8in_iterator& operator++ ()
	    {
	        ++m_i;
	        Read();
	        return (*this);
	    }
	    utf8in_iterator operator++ (int)
	    {
	        utf8in_iterator previous (*this);
	        operator++();
	        return (previous);
	    }
	    /// Advances by \p n characters, one increment at a time.
	    utf8in_iterator& operator+= (uoff_t n)
	    {
	        for (; n; --n)
	            operator++();
	        return (*this);
	    }
	    utf8in_iterator operator+ (uoff_t n)
	    {
	        utf8in_iterator moved (*this);
	        moved += n;
	        return (moved);
	    }
	    // Comparisons look only at the stored position, not at the decoded value.
	    bool operator== (const utf8in_iterator& other) const
	    {
	        return (m_i == other.m_i);
	    }
	    bool operator< (const utf8in_iterator& other) const
	    {
	        return (m_i < other.m_i);
	    }
	    difference_type operator- (const utf8in_iterator& i) const;
	private:
	    void Read (void);
	private:
	    Iterator    m_i;    ///< Position in the underlying byte sequence.
	    WChar       m_v;    ///< Value returned by operator*.
	};

	/// Decodes the character whose first byte is at m_i into m_v, leaving m_i
	/// on the last byte that was examined.
	template <typename Iterator, typename WChar>
	void utf8in_iterator<Iterator,WChar>::Read (void)
	{
	    const utf8subchar_t c = *m_i;
	    size_t nBytes = Utf8SequenceBytes (c);
	    // First byte contains bits after the header.
	    m_v = c & (0xFF >> nBytes);
	    // Each subsequent byte has 6 bits. A zero byte ends the character
	    // early, with m_i left pointing at that zero.
	    while (--nBytes && *++m_i)
	        m_v = (m_v << 6) | (*m_i & 0x3F);
	}

	/// Returns the distance from \p last to this iterator in characters (as
	/// opposed to bytes). The count is made by walking forward from the position
	/// of \p last one sequence at a time; it is 0 when that position is not
	/// before this one.
	template <typename Iterator, typename WChar>
	typename utf8in_iterator<Iterator,WChar>::difference_type
	utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const
	{
	    difference_type nChars = 0;
	    Iterator pos (last.m_i);
	    while (pos < m_i) {
	        // Skip over one whole sequence, judged by its first byte.
	        pos = advance (pos, Utf8SequenceBytes (*pos));
	        ++nChars;
	    }
	    return (nChars);
	}

	//----------------------------------------------------------------------

	/// \class utf8out_iterator utf8.h ustl.h
	/// \ingroup IteratorAdaptors
	///
	/// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
	///
	template <typename Iterator, typename WChar = wchar_t>
	class utf8out_iterator {
	public:
	    typedef typename iterator_traits<Iterator>::value_type          value_type;
	    typedef typename iterator_traits<Iterator>::difference_type     difference_type;
	    typedef typename iterator_traits<Iterator>::pointer             pointer;
	    typedef typename iterator_traits<Iterator>::reference           reference;
	public:
	    /// Wraps a copy of \p os.
	    explicit utf8out_iterator (const Iterator& os)
	        : m_i (os)
	    {
	    }
	    utf8out_iterator (const utf8out_iterator& other)
	        : m_i (other.m_i)
	    {
	    }
	    /// Gives access to the wrapped iterator.
	    const Iterator& base () const
	    {
	        return (m_i);
	    }
	    /// Writes \p v into the stream.
	    utf8out_iterator& operator= (WChar v);
	    // Dereference and both increments do nothing but hand back this
	    // iterator (post-increment hands back a copy of it).
	    utf8out_iterator& operator* ()
	    {
	        return (*this);
	    }
	    utf8out_iterator& operator++ ()
	    {
	        return (*this);
	    }
	    utf8out_iterator operator++ (int)
	    {
	        return (*this);
	    }
	    // Comparisons are those of the wrapped iterators.
	    bool operator== (const utf8out_iterator& other) const
	    {
	        return (m_i == other.m_i);
	    }
	    bool operator< (const utf8out_iterator& other) const
	    {
	        return (m_i < other.m_i);
	    }
	private:
	    Iterator    m_i;    ///< Wrapped output iterator.
	};

	/// Encodes \p v as UTF-8 and writes the resulting bytes through the wrapped
	/// iterator, advancing it once per byte written. Returns this iterator.
	template <typename Iterator, typename WChar>
	utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)
	{
	    const size_t nBytes = Utf8Bytes (v);
	    if (nBytes > 1) {
	        // Work on an unsigned 32-bit copy (the same conversion Utf8Bytes
	        // applies) so that right shifts never sign-extend a negative WChar.
	        const uint32_t bits = uint32_t(v);
	        // Bit offset of the payload carried by the lead byte.
	        size_t shift = (nBytes - 1) * 6;
	        // A 7-byte sequence puts the lead payload at bit 36, beyond the
	        // 32-bit value. Shifting that far is undefined; the payload is 0.
	        const uint32_t leadBits = (shift < 32) ? (bits >> shift) : 0;
	        // The header is nBytes one-bits at the top of the byte, then a zero.
	        *m_i++ = (leadBits & 0x3F) | (0xFF << (8 - nBytes));
	        // Every following byte is 10xxxxxx holding the next 6 bits of the value.
	        while (shift) {
	            shift -= 6;
	            *m_i++ = ((bits >> shift) & 0x3F) | 0x80;
	        }
	    } else      // If only one byte, there is no header.
	        *m_i++ = v;
	    return (*this);
	}

	//----------------------------------------------------------------------

	/// Wraps \p i in a utf8out_iterator, giving a UTF-8 adaptor that writes to
	/// \p i. Useful in conjunction with back_insert_iterator.
	template <typename Iterator>
	inline utf8out_iterator<Iterator> utf8out (Iterator i)
	{
	    typedef utf8out_iterator<Iterator> writer_t;
	    return (writer_t (i));
	}

	/// Wraps \p i in a utf8in_iterator, giving a UTF-8 adaptor that reads from \p i.
	template <typename Iterator>
	inline utf8in_iterator<Iterator> utf8in (Iterator i)
	{
	    typedef utf8in_iterator<Iterator> reader_t;
	    return (reader_t (i));
	}

	//----------------------------------------------------------------------

	} // namespace ustl

	#endif