blob: b2c8733458748852506d9da6b2de546ef0033218 [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00003 unicodedata -- Provides access to the Unicode 4.1 data base.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005 Data was extracted from the Unicode 4.1 UnicodeData.txt file.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00006
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Martin v. Löwis7d41e292002-11-23 12:22:32 +00009 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000010
Fredrik Lundhcfcea492000-09-25 08:07:06 +000011 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000012
13 ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One entry of the generated property table.  Entries are looked up by
   _getrecord_ex() and read by the property accessors in this file. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Differences between the current database and a previous database
   version for one code point.  The accessors in this file treat 0xFF in
   a *_changed byte as "no change", and category_changed == 0 as "code
   point unassigned in the previous version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
39
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000040/* data file generated by Tools/unicode/makeunicodedata.py */
41#include "unicodedata_db.h"
42
43static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000044_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000045{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000047 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 index = 0;
49 else {
50 index = index1[(code>>SHIFT)];
51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
52 }
53
54 return &_PyUnicode_Database_Records[index];
55}
56
Martin v. Löwis677bde22002-11-23 22:08:15 +000057static const _PyUnicode_DatabaseRecord*
58_getrecord(PyUnicodeObject* v)
59{
60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
61}
62
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000063/* ------------- Previous-version API ------------------------------------- */
64typedef struct previous_version {
65 PyObject_HEAD
66 const char *name;
67 const change_record* (*getrecord)(Py_UCS4);
68 Py_UCS4 (*normalization)(Py_UCS4);
69} PreviousDBVersion;
70
71#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
72
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000073static PyMemberDef DB_members[] = {
74 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
75 {NULL}
76};
77
Thomas Wouters89f507f2006-12-13 04:49:30 +000078/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000079static PyTypeObject UCD_Type;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080
81static PyObject*
82new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
83 Py_UCS4 (*normalization)(Py_UCS4))
84{
85 PreviousDBVersion *self;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000086 self = PyObject_New(PreviousDBVersion, &UCD_Type);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000087 if (self == NULL)
88 return NULL;
89 self->name = name;
90 self->getrecord = getrecord;
91 self->normalization = normalization;
92 return (PyObject*)self;
93}
94
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000095/* --- Module API --------------------------------------------------------- */
96
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +000097PyDoc_STRVAR(unicodedata_decimal__doc__,
98"decimal(unichr[, default])\n\
99\n\
100Returns the decimal value assigned to the Unicode character unichr\n\
101as integer. If no such value is defined, default is returned, or, if\n\
102not given, ValueError is raised.");
103
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000104static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000105unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000106{
107 PyUnicodeObject *v;
108 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000109 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000110 long rc;
111
Fredrik Lundh06d12682001-01-24 07:59:11 +0000112 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000113 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000114 if (PyUnicode_GET_SIZE(v) != 1) {
115 PyErr_SetString(PyExc_TypeError,
116 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000119
120 if (self) {
121 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
122 if (old->category_changed == 0) {
123 /* unassigned */
124 have_old = 1;
125 rc = -1;
126 }
127 else if (old->decimal_changed != 0xFF) {
128 have_old = 1;
129 rc = old->decimal_changed;
130 }
131 }
132
133 if (!have_old)
134 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000135 if (rc < 0) {
136 if (defobj == NULL) {
137 PyErr_SetString(PyExc_ValueError,
138 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000139 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000140 }
141 else {
142 Py_INCREF(defobj);
143 return defobj;
144 }
145 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000146 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147}
148
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000149PyDoc_STRVAR(unicodedata_digit__doc__,
150"digit(unichr[, default])\n\
151\n\
152Returns the digit value assigned to the Unicode character unichr as\n\
153integer. If no such value is defined, default is returned, or, if\n\
154not given, ValueError is raised.");
155
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000157unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158{
159 PyUnicodeObject *v;
160 PyObject *defobj = NULL;
161 long rc;
162
Fredrik Lundh06d12682001-01-24 07:59:11 +0000163 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000164 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000165 if (PyUnicode_GET_SIZE(v) != 1) {
166 PyErr_SetString(PyExc_TypeError,
167 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169 }
170 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
171 if (rc < 0) {
172 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000173 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000174 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175 }
176 else {
177 Py_INCREF(defobj);
178 return defobj;
179 }
180 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000181 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182}
183
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000184PyDoc_STRVAR(unicodedata_numeric__doc__,
185"numeric(unichr[, default])\n\
186\n\
187Returns the numeric value assigned to the Unicode character unichr\n\
188as float. If no such value is defined, default is returned, or, if\n\
189not given, ValueError is raised.");
190
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000192unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193{
194 PyUnicodeObject *v;
195 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000196 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000197 double rc;
198
Fredrik Lundh06d12682001-01-24 07:59:11 +0000199 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000200 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 if (PyUnicode_GET_SIZE(v) != 1) {
202 PyErr_SetString(PyExc_TypeError,
203 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000206
207 if (self) {
208 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
209 if (old->category_changed == 0) {
210 /* unassigned */
211 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000212 rc = -1.0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000213 }
214 else if (old->decimal_changed != 0xFF) {
215 have_old = 1;
216 rc = old->decimal_changed;
217 }
218 }
219
220 if (!have_old)
221 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000222 if (rc == -1.0) {
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000223 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000224 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000225 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000226 }
227 else {
228 Py_INCREF(defobj);
229 return defobj;
230 }
231 }
232 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000233}
234
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000235PyDoc_STRVAR(unicodedata_category__doc__,
236"category(unichr)\n\
237\n\
238Returns the general category assigned to the Unicode character\n\
239unichr as string.");
240
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000242unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243{
244 PyUnicodeObject *v;
245 int index;
246
247 if (!PyArg_ParseTuple(args, "O!:category",
248 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000249 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250 if (PyUnicode_GET_SIZE(v) != 1) {
251 PyErr_SetString(PyExc_TypeError,
252 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000253 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000255 index = (int) _getrecord(v)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000256 if (self) {
257 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
258 if (old->category_changed != 0xFF)
259 index = old->category_changed;
260 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000261 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000262}
263
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000264PyDoc_STRVAR(unicodedata_bidirectional__doc__,
265"bidirectional(unichr)\n\
266\n\
267Returns the bidirectional category assigned to the Unicode character\n\
268unichr as string. If no such value is defined, an empty string is\n\
269returned.");
270
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000272unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000273{
274 PyUnicodeObject *v;
275 int index;
276
277 if (!PyArg_ParseTuple(args, "O!:bidirectional",
278 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000279 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280 if (PyUnicode_GET_SIZE(v) != 1) {
281 PyErr_SetString(PyExc_TypeError,
282 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000284 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000285 index = (int) _getrecord(v)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000286 if (self) {
287 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
288 if (old->category_changed == 0)
289 index = 0; /* unassigned */
290 else if (old->bidir_changed != 0xFF)
291 index = old->bidir_changed;
292 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000293 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294}
295
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000296PyDoc_STRVAR(unicodedata_combining__doc__,
297"combining(unichr)\n\
298\n\
299Returns the canonical combining class assigned to the Unicode\n\
300character unichr as integer. Returns 0 if no combining class is\n\
301defined.");
302
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000304unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000305{
306 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000307 int index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000308
309 if (!PyArg_ParseTuple(args, "O!:combining",
310 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000311 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312 if (PyUnicode_GET_SIZE(v) != 1) {
313 PyErr_SetString(PyExc_TypeError,
314 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000315 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000317 index = (int) _getrecord(v)->combining;
318 if (self) {
319 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
320 if (old->category_changed == 0)
321 index = 0; /* unassigned */
322 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000323 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000324}
325
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000326PyDoc_STRVAR(unicodedata_mirrored__doc__,
327"mirrored(unichr)\n\
328\n\
329Returns the mirrored property assigned to the Unicode character\n\
330unichr as integer. Returns 1 if the character has been identified as\n\
331a \"mirrored\" character in bidirectional text, 0 otherwise.");
332
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000333static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000334unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000335{
336 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000337 int index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000338
339 if (!PyArg_ParseTuple(args, "O!:mirrored",
340 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342 if (PyUnicode_GET_SIZE(v) != 1) {
343 PyErr_SetString(PyExc_TypeError,
344 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000345 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000347 index = (int) _getrecord(v)->mirrored;
348 if (self) {
349 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
350 if (old->category_changed == 0)
351 index = 0; /* unassigned */
352 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000353 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000354}
355
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000356PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
357"east_asian_width(unichr)\n\
358\n\
359Returns the east asian width assigned to the Unicode character\n\
360unichr as string.");
361
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000362static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000363unicodedata_east_asian_width(PyObject *self, PyObject *args)
364{
365 PyUnicodeObject *v;
366 int index;
367
368 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
369 &PyUnicode_Type, &v))
370 return NULL;
371 if (PyUnicode_GET_SIZE(v) != 1) {
372 PyErr_SetString(PyExc_TypeError,
373 "need a single Unicode character as parameter");
374 return NULL;
375 }
376 index = (int) _getrecord(v)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377 if (self) {
378 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
379 if (old->category_changed == 0)
380 index = 0; /* unassigned */
381 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000382 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000383}
384
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000385PyDoc_STRVAR(unicodedata_decomposition__doc__,
386"decomposition(unichr)\n\
387\n\
388Returns the character decomposition mapping assigned to the Unicode\n\
389character unichr as string. An empty string is returned in case no\n\
390such mapping is defined.");
391
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000392static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000393unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000394{
395 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000396 char decomp[256];
397 int code, index, count, i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000398 unsigned int prefix_index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000399
400 if (!PyArg_ParseTuple(args, "O!:decomposition",
401 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000402 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000403 if (PyUnicode_GET_SIZE(v) != 1) {
404 PyErr_SetString(PyExc_TypeError,
405 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000406 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000408
409 code = (int) *PyUnicode_AS_UNICODE(v);
410
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000411 if (self) {
412 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
413 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000414 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000415 }
416
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000417 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000418 index = 0;
419 else {
420 index = decomp_index1[(code>>DECOMP_SHIFT)];
421 index = decomp_index2[(index<<DECOMP_SHIFT)+
422 (code&((1<<DECOMP_SHIFT)-1))];
423 }
424
Tim Peters69b83b12001-11-30 07:23:05 +0000425 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000426 is prefix code (from*/
427 count = decomp_data[index] >> 8;
428
429 /* XXX: could allocate the PyString up front instead
430 (strlen(prefix) + 5 * count + 1 bytes) */
431
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000432 /* Based on how index is calculated above and decomp_data is generated
433 from Tools/unicode/makeunicodedata.py, it should not be possible
434 to overflow decomp_prefix. */
435 prefix_index = decomp_data[index] & 255;
436 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
437
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000438 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000439 i = strlen(decomp_prefix[prefix_index]);
440 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000441
442 while (count-- > 0) {
443 if (i)
444 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000445 assert((size_t)i < sizeof(decomp));
446 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
447 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448 i += strlen(decomp + i);
449 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000450
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000451 decomp[i] = '\0';
452
Walter Dörwald4254e762007-06-05 16:04:09 +0000453 return PyUnicode_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000454}
455
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000456static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000457get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000458{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000459 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000460 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000461 } else if (self && get_old_record(self, code)->category_changed==0) {
462 /* unassigned in old version */
463 *index = 0;
464 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000465 else {
466 *index = decomp_index1[(code>>DECOMP_SHIFT)];
467 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
468 (code&((1<<DECOMP_SHIFT)-1))];
469 }
470
471 /* high byte is number of hex bytes (usually one or two), low byte
472 is prefix code (from*/
473 *count = decomp_data[*index] >> 8;
474 *prefix = decomp_data[*index] & 255;
475
476 (*index)++;
477}
478
/* Constants for the arithmetic Hangul syllable (de)composition used
   below: S = precomposed syllable, L/V/T = the three jamo positions.
   TBase is one below the first T value, so a T index of 0 means
   "no trailing jamo". */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
488
489static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000490nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000491{
492 PyObject *result;
493 Py_UNICODE *i, *end, *o;
494 /* Longest decomposition in Unicode 3.2: U+FDFA */
495 Py_UNICODE stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000496 Py_ssize_t space, isize;
497 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000498 unsigned char prev, cur;
499
500 stackptr = 0;
501 isize = PyUnicode_GET_SIZE(input);
502 /* Overallocate atmost 10 characters. */
503 space = (isize > 10 ? 10 : isize) + isize;
504 result = PyUnicode_FromUnicode(NULL, space);
505 if (!result)
506 return NULL;
507 i = PyUnicode_AS_UNICODE(input);
508 end = i + isize;
509 o = PyUnicode_AS_UNICODE(result);
510
511 while (i < end) {
512 stack[stackptr++] = *i++;
513 while(stackptr) {
514 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000515 /* Hangul Decomposition adds three characters in
516 a single step, so we need atleast that much room. */
517 if (space < 3) {
Martin v. Löwis5b222132007-06-10 09:51:05 +0000518 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000519 space += 10;
520 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000521 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000522 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000523 }
524 /* Hangul Decomposition. */
525 if (SBase <= code && code < (SBase+SCount)) {
526 int SIndex = code - SBase;
527 int L = LBase + SIndex / NCount;
528 int V = VBase + (SIndex % NCount) / TCount;
529 int T = TBase + SIndex % TCount;
530 *o++ = L;
531 *o++ = V;
532 space -= 2;
533 if (T != TBase) {
534 *o++ = T;
535 space --;
536 }
537 continue;
538 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000539 /* normalization changes */
540 if (self) {
541 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
542 if (value != 0) {
543 stack[stackptr++] = value;
544 continue;
545 }
546 }
547
548 /* Other decompositions. */
549 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000550
551 /* Copy character if it is not decomposable, or has a
552 compatibility decomposition, but we do NFD. */
553 if (!count || (prefix && !k)) {
554 *o++ = code;
555 space--;
556 continue;
557 }
558 /* Copy decomposition onto the stack, in reverse
559 order. */
560 while(count) {
561 code = decomp_data[index + (--count)];
562 stack[stackptr++] = code;
563 }
564 }
565 }
566
567 /* Drop overallocation. Cannot fail. */
568 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
569
570 /* Sort canonically. */
571 i = PyUnicode_AS_UNICODE(result);
572 prev = _getrecord_ex(*i)->combining;
573 end = i + PyUnicode_GET_SIZE(result);
574 for (i++; i < end; i++) {
575 cur = _getrecord_ex(*i)->combining;
576 if (prev == 0 || cur == 0 || prev <= cur) {
577 prev = cur;
578 continue;
579 }
580 /* Non-canonical order. Need to switch *i with previous. */
581 o = i - 1;
582 while (1) {
583 Py_UNICODE tmp = o[1];
584 o[1] = o[0];
585 o[0] = tmp;
586 o--;
587 if (o < PyUnicode_AS_UNICODE(result))
588 break;
589 prev = _getrecord_ex(*o)->combining;
590 if (prev == 0 || prev <= cur)
591 break;
592 }
593 prev = _getrecord_ex(*i)->combining;
594 }
595 return result;
596}
597
598static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000599find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000600{
601 int index;
602 for (index = 0; nfc[index].start; index++) {
603 int start = nfc[index].start;
604 if (code < start)
605 return -1;
606 if (code <= start + nfc[index].count) {
607 int delta = code - start;
608 return nfc[index].index + delta;
609 }
610 }
611 return -1;
612}
613
614static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000615nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000616{
617 PyObject *result;
618 Py_UNICODE *i, *i1, *o, *end;
619 int f,l,index,index1,comb;
620 Py_UNICODE code;
621 Py_UNICODE *skipped[20];
622 int cskipped = 0;
623
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000624 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000625 if (!result)
626 return NULL;
627
628 /* We are going to modify result in-place.
629 If nfd_nfkd is changed to sometimes return the input,
630 this code needs to be reviewed. */
631 assert(result != input);
632
633 i = PyUnicode_AS_UNICODE(result);
634 end = i + PyUnicode_GET_SIZE(result);
635 o = PyUnicode_AS_UNICODE(result);
636
637 again:
638 while (i < end) {
639 for (index = 0; index < cskipped; index++) {
640 if (skipped[index] == i) {
641 /* *i character is skipped.
642 Remove from list. */
643 skipped[index] = skipped[cskipped-1];
644 cskipped--;
645 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000646 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000647 }
648 }
649 /* Hangul Composition. We don't need to check for <LV,T>
650 pairs, since we always have decomposed data. */
651 if (LBase <= *i && *i < (LBase+LCount) &&
652 i + 1 < end &&
653 VBase <= i[1] && i[1] <= (VBase+VCount)) {
654 int LIndex, VIndex;
655 LIndex = i[0] - LBase;
656 VIndex = i[1] - VBase;
657 code = SBase + (LIndex*VCount+VIndex)*TCount;
658 i+=2;
659 if (i < end &&
660 TBase <= *i && *i <= (TBase+TCount)) {
661 code += *i-TBase;
662 i++;
663 }
664 *o++ = code;
665 continue;
666 }
667
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000668 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000669 if (f == -1) {
670 *o++ = *i++;
671 continue;
672 }
673 /* Find next unblocked character. */
674 i1 = i+1;
675 comb = 0;
676 while (i1 < end) {
677 int comb1 = _getrecord_ex(*i1)->combining;
678 if (comb1 && comb == comb1) {
679 /* Character is blocked. */
680 i1++;
681 continue;
682 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000683 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000684 /* *i1 cannot be combined with *i. If *i1
685 is a starter, we don't need to look further.
686 Otherwise, record the combining class. */
687 if (l == -1) {
688 not_combinable:
689 if (comb1 == 0)
690 break;
691 comb = comb1;
692 i1++;
693 continue;
694 }
695 index = f*TOTAL_LAST + l;
696 index1 = comp_index[index >> COMP_SHIFT];
697 code = comp_data[(index1<<COMP_SHIFT)+
698 (index&((1<<COMP_SHIFT)-1))];
699 if (code == 0)
700 goto not_combinable;
701
702 /* Replace the original character. */
703 *i = code;
704 /* Mark the second character unused. */
705 skipped[cskipped++] = i1;
706 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000707 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000708 if (f == -1)
709 break;
710 }
711 *o++ = *i++;
712 }
713 if (o != end)
714 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
715 return result;
716}
717
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000718PyDoc_STRVAR(unicodedata_normalize__doc__,
719"normalize(form, unistr)\n\
720\n\
721Return the normal form 'form' for the Unicode string unistr. Valid\n\
722values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
723
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724static PyObject*
725unicodedata_normalize(PyObject *self, PyObject *args)
726{
727 char *form;
728 PyObject *input;
729
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000730 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000731 &form, &PyUnicode_Type, &input))
732 return NULL;
733
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000734 if (PyUnicode_GetSize(input) == 0) {
735 /* Special case empty input strings, since resizing
736 them later would cause internal errors. */
737 Py_INCREF(input);
738 return input;
739 }
740
Martin v. Löwis677bde22002-11-23 22:08:15 +0000741 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000742 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000744 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000746 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000748 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000749 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
750 return NULL;
751}
752
Fredrik Lundh06d12682001-01-24 07:59:11 +0000753/* -------------------------------------------------------------------- */
754/* unicode character name tables */
755
756/* data file generated by Tools/unicode/makeunicodedata.py */
757#include "unicodename_db.h"
758
759/* -------------------------------------------------------------------- */
760/* database code (cut and pasted from the unidb package) */
761
/* Case-insensitive hash of the first len bytes of s, using scale as the
   multiplier.  After each step, any bits in 0xff000000 are folded back
   into the low byte and the hash is reduced to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
776
/* Name fragments for Hangul syllables, one row per index.  The three
   columns hold the fragments for the L, V and T positions; the columns
   have different lengths, so unused trailing cells are null pointers. */
static char *hangul_syllables[][3] = {
    /*  L      V      T   */
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
807
Fredrik Lundh06d12682001-01-24 07:59:11 +0000808static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000809is_unified_ideograph(Py_UCS4 code)
810{
811 return (
812 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwisc3509122006-03-11 12:16:23 +0000813 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000814 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
815}
816
817static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000818_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000819{
820 int offset;
821 int i;
822 int word;
823 unsigned char* w;
824
Martin v. Löwisc3509122006-03-11 12:16:23 +0000825 if (code >= 0x110000)
826 return 0;
827
828 if (self) {
829 const change_record *old = get_old_record(self, code);
830 if (old->category_changed == 0) {
831 /* unassigned */
832 return 0;
833 }
834 }
835
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000836 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000837 /* Hangul syllable. */
838 int SIndex = code - SBase;
839 int L = SIndex / NCount;
840 int V = (SIndex % NCount) / TCount;
841 int T = SIndex % TCount;
842
843 if (buflen < 27)
844 /* Worst case: HANGUL SYLLABLE <10chars>. */
845 return 0;
846 strcpy(buffer, "HANGUL SYLLABLE ");
847 buffer += 16;
848 strcpy(buffer, hangul_syllables[L][0]);
849 buffer += strlen(hangul_syllables[L][0]);
850 strcpy(buffer, hangul_syllables[V][1]);
851 buffer += strlen(hangul_syllables[V][1]);
852 strcpy(buffer, hangul_syllables[T][2]);
853 buffer += strlen(hangul_syllables[T][2]);
854 *buffer = '\0';
855 return 1;
856 }
857
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000858 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000859 if (buflen < 28)
860 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
861 return 0;
862 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
863 return 1;
864 }
865
Fredrik Lundh06d12682001-01-24 07:59:11 +0000866 /* get offset into phrasebook */
867 offset = phrasebook_offset1[(code>>phrasebook_shift)];
868 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
869 (code&((1<<phrasebook_shift)-1))];
870 if (!offset)
871 return 0;
872
873 i = 0;
874
875 for (;;) {
876 /* get word index */
877 word = phrasebook[offset] - phrasebook_short;
878 if (word >= 0) {
879 word = (word << 8) + phrasebook[offset+1];
880 offset += 2;
881 } else
882 word = phrasebook[offset++];
883 if (i) {
884 if (i > buflen)
885 return 0; /* buffer overflow */
886 buffer[i++] = ' ';
887 }
888 /* copy word string from lexicon. the last character in the
889 word has bit 7 set. the last word in a string ends with
890 0x80 */
891 w = lexicon + lexicon_offset[word];
892 while (*w < 128) {
893 if (i >= buflen)
894 return 0; /* buffer overflow */
895 buffer[i++] = *w++;
896 }
897 if (i >= buflen)
898 return 0; /* buffer overflow */
899 buffer[i++] = *w & 127;
900 if (*w == 128)
901 break; /* end of word */
902 }
903
904 return 1;
905}
906
907static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000908_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000909{
910 /* check if code corresponds to the given name */
911 int i;
912 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000913 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000914 return 0;
915 for (i = 0; i < namelen; i++) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000916 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000917 return 0;
918 }
919 return buffer[namelen] == '\0';
920}
921
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000922static void
923find_syllable(const char *str, int *len, int *pos, int count, int column)
924{
925 int i, len1;
926 *len = -1;
927 for (i = 0; i < count; i++) {
928 char *s = hangul_syllables[i][column];
929 len1 = strlen(s);
930 if (len1 <= *len)
931 continue;
932 if (strncmp(str, s, len1) == 0) {
933 *len = len1;
934 *pos = i;
935 }
936 }
937 if (*len == -1) {
938 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000939 }
940}
941
Fredrik Lundh06d12682001-01-24 07:59:11 +0000942static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000943_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000944{
945 unsigned int h, v;
946 unsigned int mask = code_size-1;
947 unsigned int i, incr;
948
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000949 /* Check for hangul syllables. */
950 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Thomas Wouters1e365b22006-03-01 21:58:30 +0000951 int len, L = -1, V = -1, T = -1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000952 const char *pos = name + 16;
953 find_syllable(pos, &len, &L, LCount, 0);
954 pos += len;
955 find_syllable(pos, &len, &V, VCount, 1);
956 pos += len;
957 find_syllable(pos, &len, &T, TCount, 2);
958 pos += len;
Martin v. Löwis8b291e22005-09-18 08:17:56 +0000959 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000960 *code = SBase + (L*VCount+V)*TCount + T;
961 return 1;
962 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000963 /* Otherwise, it's an illegal syllable name. */
964 return 0;
965 }
966
967 /* Check for unified ideographs. */
968 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
969 /* Four or five hexdigits must follow. */
970 v = 0;
971 name += 22;
972 namelen -= 22;
973 if (namelen != 4 && namelen != 5)
974 return 0;
975 while (namelen--) {
976 v *= 16;
977 if (*name >= '0' && *name <= '9')
978 v += *name - '0';
979 else if (*name >= 'A' && *name <= 'F')
980 v += *name - 'A' + 10;
981 else
982 return 0;
983 name++;
984 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000985 if (!is_unified_ideograph(v))
986 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000987 *code = v;
988 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000989 }
990
Fredrik Lundh06d12682001-01-24 07:59:11 +0000991 /* the following is the same as python's dictionary lookup, with
992 only minor changes. see the makeunicodedata script for more
993 details */
994
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000995 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000996 i = (~h) & mask;
997 v = code_hash[i];
998 if (!v)
999 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001000 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001001 *code = v;
1002 return 1;
1003 }
1004 incr = (h ^ (h >> 3)) & mask;
1005 if (!incr)
1006 incr = mask;
1007 for (;;) {
1008 i = (i + incr) & mask;
1009 v = code_hash[i];
1010 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001011 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001012 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001013 *code = v;
1014 return 1;
1015 }
1016 incr = incr << 1;
1017 if (incr > mask)
1018 incr = incr ^ code_poly;
1019 }
1020}
1021
1022static const _PyUnicode_Name_CAPI hashAPI =
1023{
1024 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001025 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001026 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001027};
1028
1029/* -------------------------------------------------------------------- */
1030/* Python bindings */
1031
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001032PyDoc_STRVAR(unicodedata_name__doc__,
1033"name(unichr[, default])\n\
1034Returns the name assigned to the Unicode character unichr as a\n\
1035string. If no name is defined, default is returned, or, if not\n\
1036given, ValueError is raised.");
1037
Fredrik Lundh06d12682001-01-24 07:59:11 +00001038static PyObject *
1039unicodedata_name(PyObject* self, PyObject* args)
1040{
1041 char name[NAME_MAXLEN];
1042
1043 PyUnicodeObject* v;
1044 PyObject* defobj = NULL;
1045 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1046 return NULL;
1047
1048 if (PyUnicode_GET_SIZE(v) != 1) {
1049 PyErr_SetString(PyExc_TypeError,
1050 "need a single Unicode character as parameter");
1051 return NULL;
1052 }
1053
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001054 if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
1055 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001056 if (defobj == NULL) {
1057 PyErr_SetString(PyExc_ValueError, "no such name");
1058 return NULL;
1059 }
1060 else {
1061 Py_INCREF(defobj);
1062 return defobj;
1063 }
1064 }
1065
Walter Dörwald4254e762007-06-05 16:04:09 +00001066 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001067}
1068
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001069PyDoc_STRVAR(unicodedata_lookup__doc__,
1070"lookup(name)\n\
1071\n\
1072Look up character by name. If a character with the\n\
1073given name is found, return the corresponding Unicode\n\
1074character. If not found, KeyError is raised.");
1075
Fredrik Lundh06d12682001-01-24 07:59:11 +00001076static PyObject *
1077unicodedata_lookup(PyObject* self, PyObject* args)
1078{
1079 Py_UCS4 code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001080 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001081
1082 char* name;
1083 int namelen;
1084 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1085 return NULL;
1086
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001087 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001088 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1089 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001090 return NULL;
1091 }
1092
Guido van Rossum806c2462007-08-06 23:33:07 +00001093#ifndef Py_UNICODE_WIDE
1094 if (code >= 0x10000) {
1095 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1096 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1097 return PyUnicode_FromUnicode(str, 2);
1098 }
1099#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001100 str[0] = (Py_UNICODE) code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001101 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001102}
1103
/* Method table, used both for the module-level functions and as the
   method list of UCD_Type. */
1105
1106static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001107 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1108 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1109 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1110 {"category", unicodedata_category, METH_VARARGS,
1111 unicodedata_category__doc__},
1112 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1113 unicodedata_bidirectional__doc__},
1114 {"combining", unicodedata_combining, METH_VARARGS,
1115 unicodedata_combining__doc__},
1116 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1117 unicodedata_mirrored__doc__},
1118 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1119 unicodedata_east_asian_width__doc__},
1120 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1121 unicodedata_decomposition__doc__},
1122 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1123 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1124 {"normalize", unicodedata_normalize, METH_VARARGS,
1125 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001126 {NULL, NULL} /* sentinel */
1127};
1128
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001129static PyTypeObject UCD_Type = {
1130 /* The ob_type field must be initialized in the module init function
1131 * to be portable to Windows without using C++. */
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001132 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001133 "unicodedata.UCD", /*tp_name*/
1134 sizeof(PreviousDBVersion), /*tp_basicsize*/
1135 0, /*tp_itemsize*/
1136 /* methods */
1137 (destructor)PyObject_Del, /*tp_dealloc*/
1138 0, /*tp_print*/
1139 0, /*tp_getattr*/
1140 0, /*tp_setattr*/
1141 0, /*tp_compare*/
1142 0, /*tp_repr*/
1143 0, /*tp_as_number*/
1144 0, /*tp_as_sequence*/
1145 0, /*tp_as_mapping*/
1146 0, /*tp_hash*/
1147 0, /*tp_call*/
1148 0, /*tp_str*/
1149 PyObject_GenericGetAttr,/*tp_getattro*/
1150 0, /*tp_setattro*/
1151 0, /*tp_as_buffer*/
1152 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1153 0, /*tp_doc*/
1154 0, /*tp_traverse*/
1155 0, /*tp_clear*/
1156 0, /*tp_richcompare*/
1157 0, /*tp_weaklistoffset*/
1158 0, /*tp_iter*/
1159 0, /*tp_iternext*/
1160 unicodedata_functions, /*tp_methods*/
1161 DB_members, /*tp_members*/
1162 0, /*tp_getset*/
1163 0, /*tp_base*/
1164 0, /*tp_dict*/
1165 0, /*tp_descr_get*/
1166 0, /*tp_descr_set*/
1167 0, /*tp_dictoffset*/
1168 0, /*tp_init*/
1169 0, /*tp_alloc*/
1170 0, /*tp_new*/
1171 0, /*tp_free*/
1172 0, /*tp_is_gc*/
1173};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001174
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001175PyDoc_STRVAR(unicodedata_docstring,
1176"This module provides access to the Unicode Character Database which\n\
1177defines character properties for all Unicode characters. The data in\n\
1178this database is based on the UnicodeData.txt file version\n\
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011794.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001180\n\
1181The module uses the same names and symbols as defined by the\n\
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001182UnicodeData File Format 4.1.0 (see\n\
1183http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001184
Mark Hammond62b1ab12002-07-23 06:31:15 +00001185PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001186initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001187{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001188 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001189
Christian Heimes90aa7642007-12-19 02:45:37 +00001190 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001191
Fred Drakef585bef2001-03-03 19:41:55 +00001192 m = Py_InitModule3(
1193 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001194 if (!m)
1195 return;
1196
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001197 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001198 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001199 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001200
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001201 /* Previous versions */
1202 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1203 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001204 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001205
Fredrik Lundh06d12682001-01-24 07:59:11 +00001206 /* Export C API */
1207 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001208 if (v != NULL)
1209 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001210}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001211
1212/*
1213Local variables:
1214c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001215indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001216End:
1217*/