blob: 62ab957e215c0cd7cbb159f7c7386a8893c2bfa6 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
20/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000021
/* Property record for a code point; _getrecord_ex() returns a pointer to
   one element of _PyUnicode_Database_Records. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
33
/* Per-code-point override record handed out by a previous database
   version (see PreviousDBVersion.getrecord).

   The accessors in this file treat 0xFF in bidir_changed, category_changed,
   decimal_changed and mirrored_changed as "no override", and
   category_changed == 0 as "code point unassigned in that version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
42
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000043/* data file generated by Tools/unicode/makeunicodedata.py */
44#include "unicodedata_db.h"
45
46static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000047_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000050 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051 index = 0;
52 else {
53 index = index1[(code>>SHIFT)];
54 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 }
56
57 return &_PyUnicode_Database_Records[index];
58}
59
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000060/* ------------- Previous-version API ------------------------------------- */
61typedef struct previous_version {
62 PyObject_HEAD
63 const char *name;
64 const change_record* (*getrecord)(Py_UCS4);
65 Py_UCS4 (*normalization)(Py_UCS4);
66} PreviousDBVersion;
67
/* Fetch the change record for code point v from the previous-version
   object self.  The macro argument is parenthesized so that any pointer
   expression can be passed safely. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
69
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000070static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000071 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000072 {NULL}
73};
74
Thomas Wouters89f507f2006-12-13 04:49:30 +000075/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000076static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000077#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078
79static PyObject*
80new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
81 Py_UCS4 (*normalization)(Py_UCS4))
82{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 PreviousDBVersion *self;
84 self = PyObject_New(PreviousDBVersion, &UCD_Type);
85 if (self == NULL)
86 return NULL;
87 self->name = name;
88 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000091}
92
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000093
94static Py_UCS4 getuchar(PyUnicodeObject *obj)
95{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020096 if (PyUnicode_READY(obj))
97 return (Py_UCS4)-1;
98 if (PyUnicode_GET_LENGTH(obj) == 1) {
99 if (PyUnicode_READY(obj))
100 return (Py_UCS4)-1;
101 return PyUnicode_READ_CHAR(obj, 0);
102 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
106}
107
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000108/* --- Module API --------------------------------------------------------- */
109
Larry Hastings31826802013-10-19 00:09:25 -0700110/*[clinic]
111module unicodedata
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800112class unicodedata.UCD
113unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700114
115 unichr: object(type='str')
116 default: object=NULL
117 /
118
119Converts a Unicode character into its equivalent decimal value.
120
121Returns the decimal value assigned to the Unicode character unichr
122as integer. If no such value is defined, default is returned, or, if
123not given, ValueError is raised.
124[clinic]*/
125
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800126PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
Larry Hastings31826802013-10-19 00:09:25 -0700127"Converts a Unicode character into its equivalent decimal value.\n"
128"\n"
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800129"unicodedata.UCD.decimal(unichr, default=None)\n"
Larry Hastings31826802013-10-19 00:09:25 -0700130"\n"
131"Returns the decimal value assigned to the Unicode character unichr\n"
132"as integer. If no such value is defined, default is returned, or, if\n"
133"not given, ValueError is raised.");
134
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800135#define UNICODEDATA_UCD_DECIMAL_METHODDEF \
136 {"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__},
Larry Hastings31826802013-10-19 00:09:25 -0700137
138static PyObject *
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800139unicodedata_UCD_decimal_impl(PyObject *self, PyObject *unichr, PyObject *default_value);
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000140
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000141static PyObject *
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800142unicodedata_UCD_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000143{
Larry Hastings31826802013-10-19 00:09:25 -0700144 PyObject *return_value = NULL;
145 PyObject *unichr;
146 PyObject *default_value = NULL;
147
148 if (!PyArg_ParseTuple(args,
149 "O!|O:decimal",
150 &PyUnicode_Type, &unichr, &default_value))
151 goto exit;
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800152 return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value);
Larry Hastings31826802013-10-19 00:09:25 -0700153
154exit:
155 return return_value;
156}
157
158static PyObject *
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800159unicodedata_UCD_decimal_impl(PyObject *self, PyObject *unichr, PyObject *default_value)
160/*[clinic checksum: a0980c387387287e2ac230c37d95b26f6903e0d2]*/
Larry Hastings31826802013-10-19 00:09:25 -0700161{
162 PyUnicodeObject *v = (PyUnicodeObject *)unichr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000163 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000164 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000165 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000166
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000167 c = getuchar(v);
168 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000170
Martin v. Löwis1a214512008-06-11 05:26:20 +0000171 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000172 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000173 if (old->category_changed == 0) {
174 /* unassigned */
175 have_old = 1;
176 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000178 else if (old->decimal_changed != 0xFF) {
179 have_old = 1;
180 rc = old->decimal_changed;
181 }
182 }
183
184 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000185 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000186 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700187 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000188 PyErr_SetString(PyExc_ValueError,
189 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000190 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
192 else {
Larry Hastings31826802013-10-19 00:09:25 -0700193 Py_INCREF(default_value);
194 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000196 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000197 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000198}
199
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000200PyDoc_STRVAR(unicodedata_digit__doc__,
201"digit(unichr[, default])\n\
202\n\
203Returns the digit value assigned to the Unicode character unichr as\n\
204integer. If no such value is defined, default is returned, or, if\n\
205not given, ValueError is raised.");
206
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000208unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209{
210 PyUnicodeObject *v;
211 PyObject *defobj = NULL;
212 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000213 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000214
Fredrik Lundh06d12682001-01-24 07:59:11 +0000215 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000216 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000217 c = getuchar(v);
218 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000219 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000220 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 if (defobj == NULL) {
223 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000224 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000225 }
226 else {
227 Py_INCREF(defobj);
228 return defobj;
229 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000230 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000231 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000232}
233
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000234PyDoc_STRVAR(unicodedata_numeric__doc__,
235"numeric(unichr[, default])\n\
236\n\
237Returns the numeric value assigned to the Unicode character unichr\n\
238as float. If no such value is defined, default is returned, or, if\n\
239not given, ValueError is raised.");
240
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000242unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243{
244 PyUnicodeObject *v;
245 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000246 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000247 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000248 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000249
Fredrik Lundh06d12682001-01-24 07:59:11 +0000250 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000252 c = getuchar(v);
253 if (c == (Py_UCS4)-1)
254 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000255
Martin v. Löwis1a214512008-06-11 05:26:20 +0000256 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000257 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000258 if (old->category_changed == 0) {
259 /* unassigned */
260 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000261 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000262 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000263 else if (old->decimal_changed != 0xFF) {
264 have_old = 1;
265 rc = old->decimal_changed;
266 }
267 }
268
269 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000270 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000271 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000272 if (defobj == NULL) {
273 PyErr_SetString(PyExc_ValueError, "not a numeric character");
274 return NULL;
275 }
276 else {
277 Py_INCREF(defobj);
278 return defobj;
279 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280 }
281 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282}
283
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000284PyDoc_STRVAR(unicodedata_category__doc__,
285"category(unichr)\n\
286\n\
287Returns the general category assigned to the Unicode character\n\
288unichr as string.");
289
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000290static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000291unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000292{
293 PyUnicodeObject *v;
294 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000295 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000296
297 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 &PyUnicode_Type, &v))
299 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000300 c = getuchar(v);
301 if (c == (Py_UCS4)-1)
302 return NULL;
303 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000304 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000305 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000306 if (old->category_changed != 0xFF)
307 index = old->category_changed;
308 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000309 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000310}
311
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000312PyDoc_STRVAR(unicodedata_bidirectional__doc__,
313"bidirectional(unichr)\n\
314\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200315Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000316unichr as string. If no such value is defined, an empty string is\n\
317returned.");
318
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000319static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000320unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000321{
322 PyUnicodeObject *v;
323 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000324 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000325
326 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000327 &PyUnicode_Type, &v))
328 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000329 c = getuchar(v);
330 if (c == (Py_UCS4)-1)
331 return NULL;
332 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000333 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000334 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000335 if (old->category_changed == 0)
336 index = 0; /* unassigned */
337 else if (old->bidir_changed != 0xFF)
338 index = old->bidir_changed;
339 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000340 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341}
342
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000343PyDoc_STRVAR(unicodedata_combining__doc__,
344"combining(unichr)\n\
345\n\
346Returns the canonical combining class assigned to the Unicode\n\
347character unichr as integer. Returns 0 if no combining class is\n\
348defined.");
349
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000350static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000351unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000352{
353 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000354 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000355 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000356
357 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000358 &PyUnicode_Type, &v))
359 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000360 c = getuchar(v);
361 if (c == (Py_UCS4)-1)
362 return NULL;
363 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000364 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000365 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000366 if (old->category_changed == 0)
367 index = 0; /* unassigned */
368 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000369 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000370}
371
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000372PyDoc_STRVAR(unicodedata_mirrored__doc__,
373"mirrored(unichr)\n\
374\n\
375Returns the mirrored property assigned to the Unicode character\n\
376unichr as integer. Returns 1 if the character has been identified as\n\
377a \"mirrored\" character in bidirectional text, 0 otherwise.");
378
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000379static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000380unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000381{
382 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000383 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000384 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000385
386 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 &PyUnicode_Type, &v))
388 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000389 c = getuchar(v);
390 if (c == (Py_UCS4)-1)
391 return NULL;
392 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000393 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000394 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000395 if (old->category_changed == 0)
396 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000397 else if (old->mirrored_changed != 0xFF)
398 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000399 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000400 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000401}
402
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000403PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
404"east_asian_width(unichr)\n\
405\n\
406Returns the east asian width assigned to the Unicode character\n\
407unichr as string.");
408
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000409static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000410unicodedata_east_asian_width(PyObject *self, PyObject *args)
411{
412 PyUnicodeObject *v;
413 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000414 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000415
416 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000417 &PyUnicode_Type, &v))
418 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000419 c = getuchar(v);
420 if (c == (Py_UCS4)-1)
421 return NULL;
422 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000423 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000424 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000425 if (old->category_changed == 0)
426 index = 0; /* unassigned */
427 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000428 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000429}
430
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000431PyDoc_STRVAR(unicodedata_decomposition__doc__,
432"decomposition(unichr)\n\
433\n\
434Returns the character decomposition mapping assigned to the Unicode\n\
435character unichr as string. An empty string is returned in case no\n\
436such mapping is defined.");
437
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000438static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000439unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000440{
441 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000442 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000443 int code, index, count;
444 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000445 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000446 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000447
448 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000449 &PyUnicode_Type, &v))
450 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000451 c = getuchar(v);
452 if (c == (Py_UCS4)-1)
453 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000455 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000456
Martin v. Löwis1a214512008-06-11 05:26:20 +0000457 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000458 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000459 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000460 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000461 }
462
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000463 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000464 index = 0;
465 else {
466 index = decomp_index1[(code>>DECOMP_SHIFT)];
467 index = decomp_index2[(index<<DECOMP_SHIFT)+
468 (code&((1<<DECOMP_SHIFT)-1))];
469 }
470
Tim Peters69b83b12001-11-30 07:23:05 +0000471 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000472 is prefix code (from*/
473 count = decomp_data[index] >> 8;
474
475 /* XXX: could allocate the PyString up front instead
476 (strlen(prefix) + 5 * count + 1 bytes) */
477
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000478 /* Based on how index is calculated above and decomp_data is generated
479 from Tools/unicode/makeunicodedata.py, it should not be possible
480 to overflow decomp_prefix. */
481 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200482 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000483
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000484 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000485 i = strlen(decomp_prefix[prefix_index]);
486 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000487
488 while (count-- > 0) {
489 if (i)
490 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000491 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000492 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
493 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000494 i += strlen(decomp + i);
495 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000496 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000497}
498
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000499static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000500get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000501{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000502 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000503 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000504 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000505 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000506 /* unassigned in old version */
507 *index = 0;
508 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000509 else {
510 *index = decomp_index1[(code>>DECOMP_SHIFT)];
511 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
512 (code&((1<<DECOMP_SHIFT)-1))];
513 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000514
Martin v. Löwis677bde22002-11-23 22:08:15 +0000515 /* high byte is number of hex bytes (usually one or two), low byte
516 is prefix code (from*/
517 *count = decomp_data[*index] >> 8;
518 *prefix = decomp_data[*index] & 255;
519
520 (*index)++;
521}
522
/* Constants for the arithmetic Hangul decomposition in nfd_nfkd():
   a code point in [SBase, SBase+SCount) is split into L, V and optional T
   parts computed from LBase, VBase and TBase. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
532
533static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000534nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000535{
536 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200537 Py_UCS4 *output;
538 Py_ssize_t i, o, osize;
539 int kind;
540 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000541 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200542 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000543 Py_ssize_t space, isize;
544 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000545 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546
Martin v. Löwis677bde22002-11-23 22:08:15 +0000547 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200548 isize = PyUnicode_GET_LENGTH(input);
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300549 /* Overallocate at most 10 characters. */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000550 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200551 osize = space;
552 output = PyMem_Malloc(space * sizeof(Py_UCS4));
553 if (!output) {
554 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000555 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200556 }
557 i = o = 0;
558 kind = PyUnicode_KIND(input);
559 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000560
Martin v. Löwis22970662011-09-29 13:39:38 +0200561 while (i < isize) {
562 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000563 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200564 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000565 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300566 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000567 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000568 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200569 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000570 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000571 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
572 if (new_output == NULL) {
573 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200574 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000575 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200576 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000577 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000578 }
579 /* Hangul Decomposition. */
580 if (SBase <= code && code < (SBase+SCount)) {
581 int SIndex = code - SBase;
582 int L = LBase + SIndex / NCount;
583 int V = VBase + (SIndex % NCount) / TCount;
584 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200585 output[o++] = L;
586 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000587 space -= 2;
588 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200589 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000590 space --;
591 }
592 continue;
593 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000594 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000595 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000596 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
597 if (value != 0) {
598 stack[stackptr++] = value;
599 continue;
600 }
601 }
602
603 /* Other decompositions. */
604 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000605
606 /* Copy character if it is not decomposable, or has a
607 compatibility decomposition, but we do NFD. */
608 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200609 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000610 space--;
611 continue;
612 }
613 /* Copy decomposition onto the stack, in reverse
614 order. */
615 while(count) {
616 code = decomp_data[index + (--count)];
617 stack[stackptr++] = code;
618 }
619 }
620 }
621
Martin v. Löwis22970662011-09-29 13:39:38 +0200622 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
623 output, o);
624 PyMem_Free(output);
625 if (!result)
626 return NULL;
627 /* result is guaranteed to be ready, as it is compact. */
628 kind = PyUnicode_KIND(result);
629 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000630
631 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200632 i = 0;
633 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
634 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
635 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000636 if (prev == 0 || cur == 0 || prev <= cur) {
637 prev = cur;
638 continue;
639 }
640 /* Non-canonical order. Need to switch *i with previous. */
641 o = i - 1;
642 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200643 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
644 PyUnicode_WRITE(kind, data, o+1,
645 PyUnicode_READ(kind, data, o));
646 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000647 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200648 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000649 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200650 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651 if (prev == 0 || prev <= cur)
652 break;
653 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200654 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000655 }
656 return result;
657}
658
659static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200660find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000661{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200662 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000663 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200664 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000665 if (code < start)
666 return -1;
667 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200668 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000669 return nfc[index].index + delta;
670 }
671 }
672 return -1;
673}
674
675static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000676nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000677{
678 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200679 int kind;
680 void *data;
681 Py_UCS4 *output;
682 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000683 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200684 Py_UCS4 code;
685 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000686 int cskipped = 0;
687
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000688 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000689 if (!result)
690 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200691 /* result will be "ready". */
692 kind = PyUnicode_KIND(result);
693 data = PyUnicode_DATA(result);
694 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000695
Martin v. Löwis22970662011-09-29 13:39:38 +0200696 /* We allocate a buffer for the output.
697 If we find that we made no changes, we still return
698 the NFD result. */
699 output = PyMem_Malloc(len * sizeof(Py_UCS4));
700 if (!output) {
701 PyErr_NoMemory();
702 Py_DECREF(result);
703 return 0;
704 }
705 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706
Martin v. Löwis677bde22002-11-23 22:08:15 +0000707 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200708 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000709 for (index = 0; index < cskipped; index++) {
710 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000712 Remove from list. */
713 skipped[index] = skipped[cskipped-1];
714 cskipped--;
715 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000716 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000717 }
718 }
719 /* Hangul Composition. We don't need to check for <LV,T>
720 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200721 code = PyUnicode_READ(kind, data, i);
722 if (LBase <= code && code < (LBase+LCount) &&
723 i + 1 < len &&
724 VBase <= PyUnicode_READ(kind, data, i+1) &&
725 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000726 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200727 LIndex = code - LBase;
728 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000729 code = SBase + (LIndex*VCount+VIndex)*TCount;
730 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200731 if (i < len &&
732 TBase <= PyUnicode_READ(kind, data, i) &&
733 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
734 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000735 i++;
736 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200737 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000738 continue;
739 }
740
Martin v. Löwis22970662011-09-29 13:39:38 +0200741 /* code is still input[i] here */
742 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200744 output[o++] = code;
745 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000746 continue;
747 }
748 /* Find next unblocked character. */
749 i1 = i+1;
750 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200751 /* output base character for now; might be updated later. */
752 output[o] = PyUnicode_READ(kind, data, i);
753 while (i1 < len) {
754 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
755 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000756 if (comb) {
757 if (comb1 == 0)
758 break;
759 if (comb >= comb1) {
760 /* Character is blocked. */
761 i1++;
762 continue;
763 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000764 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200765 l = find_nfc_index(self, nfc_last, code1);
766 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000767 is a starter, we don't need to look further.
768 Otherwise, record the combining class. */
769 if (l == -1) {
770 not_combinable:
771 if (comb1 == 0)
772 break;
773 comb = comb1;
774 i1++;
775 continue;
776 }
777 index = f*TOTAL_LAST + l;
778 index1 = comp_index[index >> COMP_SHIFT];
779 code = comp_data[(index1<<COMP_SHIFT)+
780 (index&((1<<COMP_SHIFT)-1))];
781 if (code == 0)
782 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783
Martin v. Löwis677bde22002-11-23 22:08:15 +0000784 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200785 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000786 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000787 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000788 skipped[cskipped++] = i1;
789 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200790 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000791 if (f == -1)
792 break;
793 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200794 /* Output character was already written.
795 Just advance the indices. */
796 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000797 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200798 if (o == len) {
799 /* No changes. Return original string. */
800 PyMem_Free(output);
801 return result;
802 }
803 Py_DECREF(result);
804 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
805 output, o);
806 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000807 return result;
808}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000809
810/* Return 1 if the input is certainly normalized, 0 if it might not be. */
811static int
812is_normalized(PyObject *self, PyObject *input, int nfc, int k)
813{
Martin v. Löwis22970662011-09-29 13:39:38 +0200814 Py_ssize_t i, len;
815 int kind;
816 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000817 unsigned char prev_combining = 0, quickcheck_mask;
818
819 /* An older version of the database is requested, quickchecks must be
820 disabled. */
821 if (self && UCD_Check(self))
822 return 0;
823
824 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
825 as described in http://unicode.org/reports/tr15/#Annex8. */
826 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
827
Martin v. Löwis22970662011-09-29 13:39:38 +0200828 i = 0;
829 kind = PyUnicode_KIND(input);
830 data = PyUnicode_DATA(input);
831 len = PyUnicode_GET_LENGTH(input);
832 while (i < len) {
833 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
834 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000835 unsigned char combining = record->combining;
836 unsigned char quickcheck = record->normalization_quick_check;
837
838 if (quickcheck & quickcheck_mask)
839 return 0; /* this string might need normalization */
840 if (combining && prev_combining > combining)
841 return 0; /* non-canonical sort order, not normalized */
842 prev_combining = combining;
843 }
844 return 1; /* certainly normalized */
845}
846
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000847PyDoc_STRVAR(unicodedata_normalize__doc__,
848"normalize(form, unistr)\n\
849\n\
850Return the normal form 'form' for the Unicode string unistr. Valid\n\
851values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
852
Martin v. Löwis677bde22002-11-23 22:08:15 +0000853static PyObject*
854unicodedata_normalize(PyObject *self, PyObject *args)
855{
856 char *form;
857 PyObject *input;
858
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000859 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000860 &form, &PyUnicode_Type, &input))
861 return NULL;
862
Martin v. Löwis22970662011-09-29 13:39:38 +0200863 if (PyUnicode_READY(input) == -1)
864 return NULL;
865
866 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000867 /* Special case empty input strings, since resizing
868 them later would cause internal errors. */
869 Py_INCREF(input);
870 return input;
871 }
872
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000873 if (strcmp(form, "NFC") == 0) {
874 if (is_normalized(self, input, 1, 0)) {
875 Py_INCREF(input);
876 return input;
877 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000878 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000879 }
880 if (strcmp(form, "NFKC") == 0) {
881 if (is_normalized(self, input, 1, 1)) {
882 Py_INCREF(input);
883 return input;
884 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000885 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000886 }
887 if (strcmp(form, "NFD") == 0) {
888 if (is_normalized(self, input, 0, 0)) {
889 Py_INCREF(input);
890 return input;
891 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000892 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000893 }
894 if (strcmp(form, "NFKD") == 0) {
895 if (is_normalized(self, input, 0, 1)) {
896 Py_INCREF(input);
897 return input;
898 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000899 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000900 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000901 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
902 return NULL;
903}
904
Fredrik Lundh06d12682001-01-24 07:59:11 +0000905/* -------------------------------------------------------------------- */
906/* unicode character name tables */
907
908/* data file generated by Tools/unicode/makeunicodedata.py */
909#include "unicodename_db.h"
910
911/* -------------------------------------------------------------------- */
912/* database code (cut and pasted from the unidb package) */
913
/* Case-insensitive hash of the first `len` bytes of `s`.

   Every byte is upper-cased and mixed in as h = h * scale + byte; whenever
   bits 24..31 become non-zero they are folded back into the low byte, so
   the running value is kept within 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top != 0)
            hash = (hash ^ ((top >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
928
/* Name fragments used to spell Hangul syllable names.

   Column 0 is indexed by the leading-consonant index, column 1 by the
   vowel index and column 2 by the trailing-consonant index.  The columns
   have different lengths; the shorter ones are padded with NULL, so only
   the first LCount / VCount / TCount rows of a column may be read. */
static char *hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
959
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000960/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000961static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000962is_unified_ideograph(Py_UCS4 code)
963{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000964 return
965 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500966 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000967 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
968 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
969 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000970}
971
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that an expression argument (for example
 * `x & mask`) keeps its own precedence.  It is still evaluated twice, so do
 * not pass an expression with side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
977
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000978static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300979_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
980 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000981{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300982 /* Find the name associated with the given codepoint.
983 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
984 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000985 int offset;
986 int i;
987 int word;
988 unsigned char* w;
989
Martin v. Löwisc3509122006-03-11 12:16:23 +0000990 if (code >= 0x110000)
991 return 0;
992
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300993 /* XXX should we just skip all the codepoints in the PUAs here? */
994 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
995 return 0;
996
Martin v. Löwis1a214512008-06-11 05:26:20 +0000997 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300998 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300999 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001000 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1001 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001002 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001003 if (old->category_changed == 0) {
1004 /* unassigned */
1005 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001006 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001007 }
1008
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001009 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001010 /* Hangul syllable. */
1011 int SIndex = code - SBase;
1012 int L = SIndex / NCount;
1013 int V = (SIndex % NCount) / TCount;
1014 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001015
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001016 if (buflen < 27)
1017 /* Worst case: HANGUL SYLLABLE <10chars>. */
1018 return 0;
1019 strcpy(buffer, "HANGUL SYLLABLE ");
1020 buffer += 16;
1021 strcpy(buffer, hangul_syllables[L][0]);
1022 buffer += strlen(hangul_syllables[L][0]);
1023 strcpy(buffer, hangul_syllables[V][1]);
1024 buffer += strlen(hangul_syllables[V][1]);
1025 strcpy(buffer, hangul_syllables[T][2]);
1026 buffer += strlen(hangul_syllables[T][2]);
1027 *buffer = '\0';
1028 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001029 }
1030
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001031 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001032 if (buflen < 28)
1033 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1034 return 0;
1035 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1036 return 1;
1037 }
1038
Fredrik Lundh06d12682001-01-24 07:59:11 +00001039 /* get offset into phrasebook */
1040 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1041 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1042 (code&((1<<phrasebook_shift)-1))];
1043 if (!offset)
1044 return 0;
1045
1046 i = 0;
1047
1048 for (;;) {
1049 /* get word index */
1050 word = phrasebook[offset] - phrasebook_short;
1051 if (word >= 0) {
1052 word = (word << 8) + phrasebook[offset+1];
1053 offset += 2;
1054 } else
1055 word = phrasebook[offset++];
1056 if (i) {
1057 if (i > buflen)
1058 return 0; /* buffer overflow */
1059 buffer[i++] = ' ';
1060 }
1061 /* copy word string from lexicon. the last character in the
1062 word has bit 7 set. the last word in a string ends with
1063 0x80 */
1064 w = lexicon + lexicon_offset[word];
1065 while (*w < 128) {
1066 if (i >= buflen)
1067 return 0; /* buffer overflow */
1068 buffer[i++] = *w++;
1069 }
1070 if (i >= buflen)
1071 return 0; /* buffer overflow */
1072 buffer[i++] = *w & 127;
1073 if (*w == 128)
1074 break; /* end of word */
1075 }
1076
1077 return 1;
1078}
1079
1080static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001081_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001082{
1083 /* check if code corresponds to the given name */
1084 int i;
1085 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001086 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001087 return 0;
1088 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001089 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001090 return 0;
1091 }
1092 return buffer[namelen] == '\0';
1093}
1094
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001095static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001096find_syllable(const char *str, int *len, int *pos, int count, int column)
1097{
1098 int i, len1;
1099 *len = -1;
1100 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001102 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001103 if (len1 <= *len)
1104 continue;
1105 if (strncmp(str, s, len1) == 0) {
1106 *len = len1;
1107 *pos = i;
1108 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001109 }
1110 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001111 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001112 }
1113}
1114
Fredrik Lundh06d12682001-01-24 07:59:11 +00001115static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001116_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001117{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001118 /* check if named sequences are allowed */
1119 if (!with_named_seq && IS_NAMED_SEQ(cp))
1120 return 0;
1121 /* if the codepoint is in the PUA range that we use for aliases,
1122 * convert it to obtain the right codepoint */
1123 if (IS_ALIAS(cp))
1124 *code = name_aliases[cp-aliases_start];
1125 else
1126 *code = cp;
1127 return 1;
1128}
1129
1130static int
1131_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1132 int with_named_seq)
1133{
1134 /* Return the codepoint associated with the given name.
1135 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1136 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1137 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001138 unsigned int h, v;
1139 unsigned int mask = code_size-1;
1140 unsigned int i, incr;
1141
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001142 /* Check for hangul syllables. */
1143 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 int len, L = -1, V = -1, T = -1;
1145 const char *pos = name + 16;
1146 find_syllable(pos, &len, &L, LCount, 0);
1147 pos += len;
1148 find_syllable(pos, &len, &V, VCount, 1);
1149 pos += len;
1150 find_syllable(pos, &len, &T, TCount, 2);
1151 pos += len;
1152 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1153 *code = SBase + (L*VCount+V)*TCount + T;
1154 return 1;
1155 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001156 /* Otherwise, it's an illegal syllable name. */
1157 return 0;
1158 }
1159
1160 /* Check for unified ideographs. */
1161 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1162 /* Four or five hexdigits must follow. */
1163 v = 0;
1164 name += 22;
1165 namelen -= 22;
1166 if (namelen != 4 && namelen != 5)
1167 return 0;
1168 while (namelen--) {
1169 v *= 16;
1170 if (*name >= '0' && *name <= '9')
1171 v += *name - '0';
1172 else if (*name >= 'A' && *name <= 'F')
1173 v += *name - 'A' + 10;
1174 else
1175 return 0;
1176 name++;
1177 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001178 if (!is_unified_ideograph(v))
1179 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001180 *code = v;
1181 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001182 }
1183
Fredrik Lundh06d12682001-01-24 07:59:11 +00001184 /* the following is the same as python's dictionary lookup, with
1185 only minor changes. see the makeunicodedata script for more
1186 details */
1187
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001188 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001189 i = (~h) & mask;
1190 v = code_hash[i];
1191 if (!v)
1192 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001193 if (_cmpname(self, v, name, namelen))
1194 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001195 incr = (h ^ (h >> 3)) & mask;
1196 if (!incr)
1197 incr = mask;
1198 for (;;) {
1199 i = (i + incr) & mask;
1200 v = code_hash[i];
1201 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001202 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001203 if (_cmpname(self, v, name, namelen))
1204 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001205 incr = incr << 1;
1206 if (incr > mask)
1207 incr = incr ^ code_poly;
1208 }
1209}
1210
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001211static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001212{
1213 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001214 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001215 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001216};
1217
1218/* -------------------------------------------------------------------- */
1219/* Python bindings */
1220
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001221PyDoc_STRVAR(unicodedata_name__doc__,
1222"name(unichr[, default])\n\
1223Returns the name assigned to the Unicode character unichr as a\n\
1224string. If no name is defined, default is returned, or, if not\n\
1225given, ValueError is raised.");
1226
Fredrik Lundh06d12682001-01-24 07:59:11 +00001227static PyObject *
1228unicodedata_name(PyObject* self, PyObject* args)
1229{
1230 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001231 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001232
1233 PyUnicodeObject* v;
1234 PyObject* defobj = NULL;
1235 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1236 return NULL;
1237
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001238 c = getuchar(v);
1239 if (c == (Py_UCS4)-1)
1240 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001241
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001242 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 if (defobj == NULL) {
1244 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001245 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001246 }
1247 else {
1248 Py_INCREF(defobj);
1249 return defobj;
1250 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001251 }
1252
Walter Dörwald4254e762007-06-05 16:04:09 +00001253 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001254}
1255
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001256PyDoc_STRVAR(unicodedata_lookup__doc__,
1257"lookup(name)\n\
1258\n\
1259Look up character by name. If a character with the\n\
1260given name is found, return the corresponding Unicode\n\
1261character. If not found, KeyError is raised.");
1262
Fredrik Lundh06d12682001-01-24 07:59:11 +00001263static PyObject *
1264unicodedata_lookup(PyObject* self, PyObject* args)
1265{
1266 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001267
1268 char* name;
1269 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001270 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001271 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1272 return NULL;
1273
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001274 if (!_getcode(self, name, namelen, &code, 1)) {
1275 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001276 return NULL;
1277 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001278 /* check if code is in the PUA range that we use for named sequences
1279 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001280 if (IS_NAMED_SEQ(code)) {
1281 index = code-named_sequences_start;
1282 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1283 named_sequences[index].seq,
1284 named_sequences[index].seqlen);
1285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001287}
1288
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001289/* XXX Add doc strings. */
1290
1291static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001292 UNICODEDATA_UCD_DECIMAL_METHODDEF
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001293 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1294 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1295 {"category", unicodedata_category, METH_VARARGS,
1296 unicodedata_category__doc__},
1297 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1298 unicodedata_bidirectional__doc__},
1299 {"combining", unicodedata_combining, METH_VARARGS,
1300 unicodedata_combining__doc__},
1301 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1302 unicodedata_mirrored__doc__},
1303 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1304 unicodedata_east_asian_width__doc__},
1305 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1306 unicodedata_decomposition__doc__},
1307 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1308 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1309 {"normalize", unicodedata_normalize, METH_VARARGS,
1310 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001312};
1313
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001314static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 /* The ob_type field must be initialized in the module init function
1316 * to be portable to Windows without using C++. */
1317 PyVarObject_HEAD_INIT(NULL, 0)
1318 "unicodedata.UCD", /*tp_name*/
1319 sizeof(PreviousDBVersion), /*tp_basicsize*/
1320 0, /*tp_itemsize*/
1321 /* methods */
1322 (destructor)PyObject_Del, /*tp_dealloc*/
1323 0, /*tp_print*/
1324 0, /*tp_getattr*/
1325 0, /*tp_setattr*/
1326 0, /*tp_reserved*/
1327 0, /*tp_repr*/
1328 0, /*tp_as_number*/
1329 0, /*tp_as_sequence*/
1330 0, /*tp_as_mapping*/
1331 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001332 0, /*tp_call*/
1333 0, /*tp_str*/
1334 PyObject_GenericGetAttr,/*tp_getattro*/
1335 0, /*tp_setattro*/
1336 0, /*tp_as_buffer*/
1337 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1338 0, /*tp_doc*/
1339 0, /*tp_traverse*/
1340 0, /*tp_clear*/
1341 0, /*tp_richcompare*/
1342 0, /*tp_weaklistoffset*/
1343 0, /*tp_iter*/
1344 0, /*tp_iternext*/
1345 unicodedata_functions, /*tp_methods*/
1346 DB_members, /*tp_members*/
1347 0, /*tp_getset*/
1348 0, /*tp_base*/
1349 0, /*tp_dict*/
1350 0, /*tp_descr_get*/
1351 0, /*tp_descr_set*/
1352 0, /*tp_dictoffset*/
1353 0, /*tp_init*/
1354 0, /*tp_alloc*/
1355 0, /*tp_new*/
1356 0, /*tp_free*/
1357 0, /*tp_is_gc*/
1358};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001359
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001360PyDoc_STRVAR(unicodedata_docstring,
1361"This module provides access to the Unicode Character Database which\n\
1362defines character properties for all Unicode characters. The data in\n\
1363this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001364" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001365\n\
1366The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001367UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001368
1369static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001370 PyModuleDef_HEAD_INIT,
1371 "unicodedata",
1372 unicodedata_docstring,
1373 -1,
1374 unicodedata_functions,
1375 NULL,
1376 NULL,
1377 NULL,
1378 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001379};
1380
Mark Hammond62b1ab12002-07-23 06:31:15 +00001381PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001382PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001383{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001384 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001385
Christian Heimes90aa7642007-12-19 02:45:37 +00001386 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001387
Martin v. Löwis1a214512008-06-11 05:26:20 +00001388 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001389 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001390 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001391
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001392 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001393 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001394 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001395
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001396 /* Previous versions */
1397 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1398 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001399 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001400
Fredrik Lundh06d12682001-01-24 07:59:11 +00001401 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001402 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001403 if (v != NULL)
1404 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001405 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001406}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001407
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001408/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001409Local variables:
1410c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001411indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001412End:
1413*/