Blame - src/unicode.h - fp2-dev/platform/external/v8

blob: fb9e6339e1279dd2e4c5ce6dd08333f805b03c19 [file] [log] [blame]

Ben Murdoch	592a9fc	2012-03-05 11:04:45 +0000	[diff] [blame^]	1	// Copyright 2011 the V8 project authors. All rights reserved.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#ifndef V8_UNICODE_H_
				29	#define V8_UNICODE_H_
				30
				31	#include <sys/types.h>
				32
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
				37
				38	namespace unibrow {
				39
				40	typedef unsigned int uchar;
				41	typedef unsigned char byte;
				42
				43	/**
				44	* The max length of the result of converting the case of a single
				45	* character.
				46	*/
Ben Murdoch	592a9fc	2012-03-05 11:04:45 +0000	[diff] [blame^]	47	const int kMaxMappingSize = 4;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	48
				49	template <class T, int size = 256>
				50	class Predicate {
				51	public:
				52	inline Predicate() { }
				53	inline bool get(uchar c);
				54	private:
				55	friend class Test;
				56	bool CalculateValue(uchar c);
				57	struct CacheEntry {
				58	inline CacheEntry() : code_point_(0), value_(0) { }
				59	inline CacheEntry(uchar code_point, bool value)
				60	: code_point_(code_point),
				61	value_(value) { }
				62	uchar code_point_ : 21;
				63	bool value_ : 1;
				64	};
				65	static const int kSize = size;
				66	static const int kMask = kSize - 1;
				67	CacheEntry entries_[kSize];
				68	};
				69
				70	// A cache used in case conversion. It caches the value for characters
				71	// that either have no mapping or map to a single character independent
				72	// of context. Characters that map to more than one character or that
				73	// map differently depending on context are always looked up.
				74	template <class T, int size = 256>
				75	class Mapping {
				76	public:
				77	inline Mapping() { }
				78	inline int get(uchar c, uchar n, uchar* result);
				79	private:
				80	friend class Test;
				81	int CalculateValue(uchar c, uchar n, uchar* result);
				82	struct CacheEntry {
				83	inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
				84	inline CacheEntry(uchar code_point, signed offset)
				85	: code_point_(code_point),
				86	offset_(offset) { }
				87	uchar code_point_;
				88	signed offset_;
				89	static const int kNoChar = (1 << 21) - 1;
				90	};
				91	static const int kSize = size;
				92	static const int kMask = kSize - 1;
				93	CacheEntry entries_[kSize];
				94	};
				95
				96	class UnicodeData {
				97	private:
				98	friend class Test;
				99	static int GetByteCount();
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	100	static const uchar kMaxCodePoint;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	101	};
				102
				103	// --- U t f 8 ---
				104
				105	template <typename Data>
				106	class Buffer {
				107	public:
				108	inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
				109	inline Buffer() : data_(0), length_(0) { }
				110	Data data() { return data_; }
				111	unsigned length() { return length_; }
				112	private:
				113	Data data_;
				114	unsigned length_;
				115	};
				116
				117	class Utf8 {
				118	public:
				119	static inline uchar Length(uchar chr);
				120	static inline unsigned Encode(char* out, uchar c);
				121	static const byte* ReadBlock(Buffer<const char> str, byte buffer,
				122	unsigned capacity, unsigned* chars_read, unsigned* offset);
Kristian Monsen	0d5e116	2010-09-30 15:31:59 +0100	[diff] [blame]	123	static uchar CalculateValue(const byte* str,
				124	unsigned length,
				125	unsigned* cursor);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	126	static const uchar kBadChar = 0xFFFD;
				127	static const unsigned kMaxEncodedSize = 4;
				128	static const unsigned kMaxOneByteChar = 0x7f;
				129	static const unsigned kMaxTwoByteChar = 0x7ff;
				130	static const unsigned kMaxThreeByteChar = 0xffff;
				131	static const unsigned kMaxFourByteChar = 0x1fffff;
				132
				133	private:
				134	template <unsigned s> friend class Utf8InputBuffer;
				135	friend class Test;
				136	static inline uchar ValueOf(const byte* str,
				137	unsigned length,
				138	unsigned* cursor);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	139	};
				140
				141	// --- C h a r a c t e r S t r e a m ---
				142
				143	class CharacterStream {
				144	public:
				145	inline uchar GetNext();
				146	inline bool has_more() { return remaining_ != 0; }
				147	// Note that default implementation is not efficient.
				148	virtual void Seek(unsigned);
				149	unsigned Length();
				150	virtual ~CharacterStream() { }
				151	static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
				152	unsigned& offset);
				153	static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
				154	unsigned capacity, unsigned& offset);
				155	static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
				156	unsigned capacity, unsigned& offset);
				157	static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
				158	virtual void Rewind() = 0;
				159	protected:
				160	virtual void FillBuffer() = 0;
				161	// The number of characters left in the current buffer
				162	unsigned remaining_;
				163	// The current offset within the buffer
				164	unsigned cursor_;
				165	// The buffer containing the decoded characters.
				166	const byte* buffer_;
				167	};
				168
				169	// --- I n p u t B u f f e r ---
				170
				171	/**
				172	* Provides efficient access to encoded characters in strings. It
				173	* does so by reading characters one block at a time, rather than one
				174	* character at a time, which gives string implementations an
				175	* opportunity to optimize the decoding.
				176	*/
				177	template <class Reader, class Input = Reader*, unsigned kSize = 256>
				178	class InputBuffer : public CharacterStream {
				179	public:
				180	virtual void Rewind();
				181	inline void Reset(Input input);
				182	void Seek(unsigned position);
				183	inline void Reset(unsigned position, Input input);
				184	protected:
				185	InputBuffer() { }
				186	explicit InputBuffer(Input input) { Reset(input); }
				187	virtual void FillBuffer();
				188
				189	// A custom offset that can be used by the string implementation to
				190	// mark progress within the encoded string.
				191	unsigned offset_;
				192	// The input string
				193	Input input_;
				194	// To avoid heap allocation, we keep an internal buffer to which
				195	// the encoded string can write its characters. The string
				196	// implementation is free to decide whether it wants to use this
				197	// buffer or not.
				198	byte util_buffer_[kSize];
				199	};
				200
				201	// --- U t f 8 I n p u t B u f f e r ---
				202
				203	template <unsigned s = 256>
				204	class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
				205	public:
				206	inline Utf8InputBuffer() { }
				207	inline Utf8InputBuffer(const char* data, unsigned length);
				208	inline void Reset(const char* data, unsigned length) {
				209	InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
				210	Buffer<const char*>(data, length));
				211	}
				212	};
				213
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	214
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	215	struct Uppercase {
				216	static bool Is(uchar c);
				217	};
				218	struct Lowercase {
				219	static bool Is(uchar c);
				220	};
				221	struct Letter {
				222	static bool Is(uchar c);
				223	};
				224	struct Space {
				225	static bool Is(uchar c);
				226	};
				227	struct Number {
				228	static bool Is(uchar c);
				229	};
				230	struct WhiteSpace {
				231	static bool Is(uchar c);
				232	};
				233	struct LineTerminator {
				234	static bool Is(uchar c);
				235	};
				236	struct CombiningMark {
				237	static bool Is(uchar c);
				238	};
				239	struct ConnectorPunctuation {
				240	static bool Is(uchar c);
				241	};
				242	struct ToLowercase {
				243	static const int kMaxWidth = 3;
				244	static int Convert(uchar c,
				245	uchar n,
				246	uchar* result,
				247	bool* allow_caching_ptr);
				248	};
				249	struct ToUppercase {
				250	static const int kMaxWidth = 3;
				251	static int Convert(uchar c,
				252	uchar n,
				253	uchar* result,
				254	bool* allow_caching_ptr);
				255	};
				256	struct Ecma262Canonicalize {
				257	static const int kMaxWidth = 1;
				258	static int Convert(uchar c,
				259	uchar n,
				260	uchar* result,
				261	bool* allow_caching_ptr);
				262	};
				263	struct Ecma262UnCanonicalize {
				264	static const int kMaxWidth = 4;
				265	static int Convert(uchar c,
				266	uchar n,
				267	uchar* result,
				268	bool* allow_caching_ptr);
				269	};
				270	struct CanonicalizationRange {
				271	static const int kMaxWidth = 1;
				272	static int Convert(uchar c,
				273	uchar n,
				274	uchar* result,
				275	bool* allow_caching_ptr);
				276	};
				277
				278	} // namespace unibrow
				279
				280	#endif // V8_UNICODE_H_