Blame - Doc/lib/libunicodedata.tex - platform/external/python/cpython2

blob: 81ad67dc2e3388c4bf5ed7edded1dafe8cd27aa4 [file] [log] [blame]

Fred Drake	28b2944	2000-06-13 20:50:50 +0000	[diff] [blame]	1	\section{\module{unicodedata} ---
				2	Unicode Database}
				3
				4	\declaremodule{standard}{unicodedata}
				5	\modulesynopsis{Access the Unicode Database.}
				6	\moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
				7	\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
Martin v. Löwis	677bde2	2002-11-23 22:08:15 +0000	[diff] [blame]	8	\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
Fred Drake	28b2944	2000-06-13 20:50:50 +0000	[diff] [blame]	9
				10	\index{Unicode}
				11	\index{character}
				12	\indexii{Unicode}{database}
				13
				14	This module provides access to the Unicode Character Database which
				15	defines character properties for all Unicode characters. The data in
				16	this database is based on the \file{UnicodeData.txt} file version
Martin v. Löwis	677bde2	2002-11-23 22:08:15 +0000	[diff] [blame]	17	3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
Fred Drake	28b2944	2000-06-13 20:50:50 +0000	[diff] [blame]	18
				19	The module uses the same names and symbols as defined by the
Martin v. Löwis	677bde2	2002-11-23 22:08:15 +0000	[diff] [blame]	20	UnicodeData File Format 3.2.0 (see
Hye-Shik Chang	4c560ea	2005-06-04 07:31:48 +0000	[diff] [blame^]	21	\url{http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html}). It
Fred Drake	28b2944	2000-06-13 20:50:50 +0000	[diff] [blame]	22	defines the following functions:
				23
Fredrik Lundh	0110d3b	2001-01-24 08:10:07 +0000	[diff] [blame]	24	\begin{funcdesc}{lookup}{name}
				25	Look up character by name. If a character with the
				26	given name is found, return the corresponding Unicode
				27	character. If not found, \exception{KeyError} is raised.
				28	\end{funcdesc}
				29
				30	\begin{funcdesc}{name}{unichr\optional{, default}}
				31	Returns the name assigned to the Unicode character
				32	\var{unichr} as a string. If no name is defined,
				33	\var{default} is returned, or, if not given,
				34	\exception{ValueError} is raised.
				35	\end{funcdesc}
				36
Fred Drake	28b2944	2000-06-13 20:50:50 +0000	[diff] [blame]	37	\begin{funcdesc}{decimal}{unichr\optional{, default}}
				38	Returns the decimal value assigned to the Unicode character
				39	\var{unichr} as integer. If no such value is defined,
				40	\var{default} is returned, or, if not given,
				41	\exception{ValueError} is raised.
				42	\end{funcdesc}
				43
				44	\begin{funcdesc}{digit}{unichr\optional{, default}}
				45	Returns the digit value assigned to the Unicode character
				46	\var{unichr} as integer. If no such value is defined,
				47	\var{default} is returned, or, if not given,
				48	\exception{ValueError} is raised.
				49	\end{funcdesc}
				50
				51	\begin{funcdesc}{numeric}{unichr\optional{, default}}
				52	Returns the numeric value assigned to the Unicode character
				53	\var{unichr} as float. If no such value is defined, \var{default} is
				54	returned, or, if not given, \exception{ValueError} is raised.
				55	\end{funcdesc}
				56
				57	\begin{funcdesc}{category}{unichr}
				58	Returns the general category assigned to the Unicode character
				59	\var{unichr} as string.
				60	\end{funcdesc}
				61
				62	\begin{funcdesc}{bidirectional}{unichr}
				63	Returns the bidirectional category assigned to the Unicode character
				64	\var{unichr} as string. If no such value is defined, an empty string
				65	is returned.
				66	\end{funcdesc}
				67
				68	\begin{funcdesc}{combining}{unichr}
				69	Returns the canonical combining class assigned to the Unicode
				70	character \var{unichr} as integer. Returns \code{0} if no combining
				71	class is defined.
				72	\end{funcdesc}
				73
Hye-Shik Chang	e9ddfbb	2004-08-04 07:38:35 +0000	[diff] [blame]	74	\begin{funcdesc}{east_asian_width}{unichr}
Neal Norwitz	8623b36	2004-08-20 02:36:27 +0000	[diff] [blame]	75	Returns the East Asian width assigned to the Unicode character
Hye-Shik Chang	e9ddfbb	2004-08-04 07:38:35 +0000	[diff] [blame]	76	\var{unichr} as string.
Neal Norwitz	8623b36	2004-08-20 02:36:27 +0000	[diff] [blame]	77	\versionadded{2.4}
Hye-Shik Chang	e9ddfbb	2004-08-04 07:38:35 +0000	[diff] [blame]	78	\end{funcdesc}
				79
Fred Drake	28b2944	2000-06-13 20:50:50 +0000	[diff] [blame]	80	\begin{funcdesc}{mirrored}{unichr}
Neal Norwitz	7109b28	2004-08-20 23:13:26 +0000	[diff] [blame]	81	Returns the mirrored property assigned to the Unicode character
Fred Drake	28b2944	2000-06-13 20:50:50 +0000	[diff] [blame]	82	\var{unichr} as integer. Returns \code{1} if the character has been
				83	identified as a ``mirrored'' character in bidirectional text,
				84	\code{0} otherwise.
				85	\end{funcdesc}
				86
				87	\begin{funcdesc}{decomposition}{unichr}
				88	Returns the character decomposition mapping assigned to the Unicode
				89	character \var{unichr} as string. An empty string is returned in case
				90	no such mapping is defined.
				91	\end{funcdesc}
Martin v. Löwis	677bde2	2002-11-23 22:08:15 +0000	[diff] [blame]	92
				93	\begin{funcdesc}{normalize}{form, unistr}
				94
				95	Return the normal form \var{form} for the Unicode string \var{unistr}.
				96	Valid values for \var{form} are \code{'NFC'}, \code{'NFKC'}, \code{'NFD'}, and \code{'NFKD'}.
				97
				98	The Unicode standard defines various normalization forms of a Unicode
				99	string, based on the definition of canonical equivalence and
				100	compatibility equivalence. In Unicode, several characters can be
				101	expressed in various ways. For example, the character U+00C7 (LATIN
				102	CAPITAL LETTER C WITH CEDILLA) can also be expressed as the sequence
				103	U+0043 (LATIN CAPITAL LETTER C) U+0327 (COMBINING CEDILLA).
				104
				105	For each character, there are two normal forms: normal form C and
				106	normal form D. Normal form D (NFD) is also known as canonical
				107	decomposition, and translates each character into its decomposed form.
				108	Normal form C (NFC) first applies a canonical decomposition, then
				109	composes pre-combined characters again.
				110
				111	In addition to these two forms, there are two additional normal forms
				112	based on compatibility equivalence. In Unicode, certain characters are
				113	supported which normally would be unified with other characters. For
				114	example, U+2160 (ROMAN NUMERAL ONE) is really the same thing as U+0049
				115	(LATIN CAPITAL LETTER I). However, it is supported in Unicode for
				116	compatibility with existing character sets (e.g.\ gb2312).
				117
				118	The normal form KD (NFKD) will apply the compatibility decomposition,
				119	i.e.\ replace all compatibility characters with their equivalents. The
				120	normal form KC (NFKC) first applies the compatibility decomposition,
				121	followed by the canonical composition.
				122
				123	\versionadded{2.3}
				124	\end{funcdesc}
				125
Martin v. Löwis	b5c980b	2002-11-25 09:13:37 +0000	[diff] [blame]	126	In addition, the module exposes the following constant:
				127
				128	\begin{datadesc}{unidata_version}
				129	The version of the Unicode database used in this module.
				130
				131	\versionadded{2.3}
Hye-Shik Chang	e9ddfbb	2004-08-04 07:38:35 +0000	[diff] [blame]	132	\end{datadesc}