Blame - utf.h - platform/external/libutf

blob: 02ba472aef6db26bbefe4dd5ada5d039b4087eec [file] [log] [blame]

Alexander Gutkin	96039b7	2014-03-04 17:22:31 +0000	[diff] [blame]	1	#ifndef _UTFH_
				2	#define _UTFH_ 1
Alexander Gutkin	439f3d1	2014-02-28 11:33:45 +0000	[diff] [blame]	3
Alexander Gutkin	96039b7	2014-03-04 17:22:31 +0000	[diff] [blame]	4	#include <stdint.h>
				5
				6	typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
Alexander Gutkin	439f3d1	2014-02-28 11:33:45 +0000	[diff] [blame]	7
				8	enum
				9	{
Alexander Gutkin	96039b7	2014-03-04 17:22:31 +0000	[diff] [blame]	10	UTFmax = 4, /* maximum bytes per rune */
				11	Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
				12	Runeself = 0x80, /* rune and UTF sequences are the same (<) */
				13	Runeerror = 0xFFFD, /* decoding error in UTF */
				14	Runemax = 0x10FFFF, /* maximum rune value */
Alexander Gutkin	439f3d1	2014-02-28 11:33:45 +0000	[diff] [blame]	15	};
				16
Alexander Gutkin	96039b7	2014-03-04 17:22:31 +0000	[diff] [blame]	17	#ifdef __cplusplus
				18	extern "C" {
				19	#endif
Alexander Gutkin	439f3d1	2014-02-28 11:33:45 +0000	[diff] [blame]	20
Alexander Gutkin	96039b7	2014-03-04 17:22:31 +0000	[diff] [blame]	21	/*
				22	* rune routines
				23	*/
				24
				25	/*
				26	* These routines were written by Rob Pike and Ken Thompson
				27	* and first appeared in Plan 9.
				28	* SEE ALSO
				29	* utf (7)
				30	* tcs (1)
				31	*/
				32
				33	// runetochar copies (encodes) one rune, pointed to by r, to at most
				34	// UTFmax bytes starting at s and returns the number of bytes generated.
				35
				36	int runetochar(char* s, const Rune* r);
				37
				38
				39	// chartorune copies (decodes) at most UTFmax bytes starting at s to
				40	// one rune, pointed to by r, and returns the number of bytes consumed.
				41	// If the input is not exactly in UTF format, chartorune will set *r
				42	// to Runeerror and return 1.
				43	//
				44	// Note: There is no special case for a "null-terminated" string. A
				45	// string whose first byte has the value 0 is the UTF8 encoding of the
				46	// Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
				47	// anywhere else in a UTF sequence.
				48
				49	int chartorune(Rune* r, const char* s);
				50
				51
				52	// charntorune is like chartorune, except that it will access at most
				53	// n bytes of s. If the UTF sequence is incomplete within n bytes,
				54	// charntorune will set *r to Runeerror and return 0. If it is complete
				55	// but not in UTF format, it will set *r to Runeerror and return 1.
				56	//
				57	// Added 2004-09-24 by Wei-Hwa Huang
				58
				59	int charntorune(Rune* r, const char* s, int n);
				60
				61	// isvalidcharntorune(str, n, r, consumed)
				62	// is a convenience function that calls "*consumed = charntorune(r, str, n)"
				63	// and returns an int (logically boolean) indicating whether the first
				64	// n bytes of str was a valid and complete UTF sequence.
				65
				66	int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
				67
				68	// runelen returns the number of bytes required to convert r into UTF.
				69
				70	int runelen(Rune r);
				71
				72
				73	// runenlen returns the number of bytes required to convert the n
				74	// runes pointed to by r into UTF.
				75
				76	int runenlen(const Rune* r, int n);
				77
				78
				79	// fullrune returns 1 if the string s of length n is long enough to be
				80	// decoded by chartorune, and 0 otherwise. This does not guarantee
				81	// that the string contains a legal UTF encoding. This routine is used
				82	// by programs that obtain input one byte at a time and need to know
				83	// when a full rune has arrived.
				84
				85	int fullrune(const char* s, int n);
				86
				87	// The following routines are analogous to the corresponding string
				88	// routines with "utf" substituted for "str", and "rune" substituted
				89	// for "chr".
				90
				91	// utflen returns the number of runes that are represented by the UTF
				92	// string s. (cf. strlen)
				93
				94	int utflen(const char* s);
				95
				96
				97	// utfnlen returns the number of complete runes that are represented
				98	// by the first n bytes of the UTF string s. If the last few bytes of
				99	// the string contain an incompletely coded rune, utfnlen will not
				100	// count them; in this way, it differs from utflen, which includes
				101	// every byte of the string. (cf. strnlen)
				102
				103	int utfnlen(const char* s, long n);
				104
				105
				106	// utfrune returns a pointer to the first occurrence of rune r in the
				107	// UTF string s, or 0 if r does not occur in the string. The NUL
				108	// byte terminating a string is considered to be part of the string s.
				109	// (cf. strchr)
				110
				111	const char* utfrune(const char* s, Rune r);
				112
				113
				114	// utfrrune returns a pointer to the last occurrence of rune r in the
				115	// UTF string s, or 0 if r does not occur in the string. The NUL
				116	// byte terminating a string is considered to be part of the string s.
				117	// (cf. strrchr)
				118
				119	const char* utfrrune(const char* s, Rune r);
				120
				121
				122	// utfutf returns a pointer to the first occurrence of the UTF string
				123	// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
				124	// null string, utfutf returns s1. (cf. strstr)
				125
				126	const char* utfutf(const char* s1, const char* s2);
				127
				128
				129	// utfecpy copies UTF sequences until a null sequence has been copied,
				130	// but writes no sequences beyond es1. If any sequences are copied,
				131	// s1 is terminated by a null sequence, and a pointer to that sequence
				132	// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
				133
				134	char* utfecpy(char* s1, char* es1, const char* s2);
				135
				136
				137
				138	// These functions are rune-string analogues of the corresponding
				139	// functions in strcat (3).
				140	//
				141	// These routines first appeared in Plan 9.
				142	// SEE ALSO
				143	// memmove (3)
				144	// rune (3)
				145	// strcat (2)
				146	//
				147	// BUGS: The outcome of overlapping moves varies among implementations.
				148
				149	Rune* runestrcat(Rune* s1, const Rune* s2);
				150	Rune* runestrncat(Rune* s1, const Rune* s2, long n);
				151
				152	const Rune* runestrchr(const Rune* s, Rune c);
				153
				154	int runestrcmp(const Rune* s1, const Rune* s2);
				155	int runestrncmp(const Rune* s1, const Rune* s2, long n);
				156
				157	Rune* runestrcpy(Rune* s1, const Rune* s2);
				158	Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
				159	Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
				160
				161	Rune* runestrdup(const Rune* s);
				162
				163	const Rune* runestrrchr(const Rune* s, Rune c);
				164	long runestrlen(const Rune* s);
				165	const Rune* runestrstr(const Rune* s1, const Rune* s2);
				166
				167
				168
				169	// The following routines test types and modify cases for Unicode
				170	// characters. Unicode defines some characters as letters and
				171	// specifies three cases: upper, lower, and title. Mappings among the
				172	// cases are also defined, although they are not exhaustive: some
				173	// upper case letters have no lower case mapping, and so on. Unicode
				174	// also defines several character properties, a subset of which are
				175	// checked by these routines. These routines are based on Unicode
				176	// version 3.0.0.
				177	//
				178	// NOTE: The routines are implemented in C, so the boolean functions
				179	// (e.g., isupperrune) return 0 for false and 1 for true.
				180	//
				181	//
				182	// toupperrune, tolowerrune, and totitlerune are the Unicode case
				183	// mappings. These routines return the character unchanged if it has
				184	// no defined mapping.
				185
				186	Rune toupperrune(Rune r);
				187	Rune tolowerrune(Rune r);
				188	Rune totitlerune(Rune r);
				189
				190
				191	// isupperrune tests for upper case characters, including Unicode
				192	// upper case letters and targets of the toupper mapping. islowerrune
				193	// and istitlerune are defined analogously.
				194
				195	int isupperrune(Rune r);
				196	int islowerrune(Rune r);
				197	int istitlerune(Rune r);
				198
				199
				200	// isalpharune tests for Unicode letters; this includes ideographs in
				201	// addition to alphabetic characters.
				202
				203	int isalpharune(Rune r);
				204
				205
				206	// isdigitrune tests for digits. Non-digit numbers, such as Roman
				207	// numerals, are not included.
				208
				209	int isdigitrune(Rune r);
				210
				211
				212	// isideographicrune tests for ideographic characters and numbers, as
				213	// defined by the Unicode standard.
				214
				215	int isideographicrune(Rune r);
				216
				217
				218	// isspacerune tests for whitespace characters, including "C" locale
				219	// whitespace, Unicode defined whitespace, and the "zero-width
				220	// non-break space" character.
				221
				222	int isspacerune(Rune r);
				223
				224
				225	// (The comments in this file were copied from the manpage files rune.3,
				226	// isalpharune.3, and runestrcat.3. Some formatting changes were also made
				227	// to conform to Google style. /JRM 11/11/05)
				228
				229	#ifdef __cplusplus
Alexander Gutkin	439f3d1	2014-02-28 11:33:45 +0000	[diff] [blame]	230	}
				231	#endif
Alexander Gutkin	96039b7	2014-03-04 17:22:31 +0000	[diff] [blame]	232
Alexander Gutkin	439f3d1	2014-02-28 11:33:45 +0000	[diff] [blame]	233	#endif