blob: a8548376b9d2a302d5c0746891659152fc607246 [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Martin v. Löwis7d41e292002-11-23 12:22:32 +00003 unicodedata -- Provides access to the Unicode 3.2 data base.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Martin v. Löwis7d41e292002-11-23 12:22:32 +00005 Data was extracted from the Unicode 3.2 UnicodeData.txt file.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00006
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Martin v. Löwis7d41e292002-11-23 12:22:32 +00009 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000010
Fredrik Lundhcfcea492000-09-25 08:07:06 +000011 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000012
13 ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One row of per-character properties.  Rows are stored in
   _PyUnicode_Database_Records and located through _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Per-character differences between an older database version and the
   current one.  In this file a value of 0xFF in bidir_changed,
   category_changed or decimal_changed is treated as "no override", and
   category_changed == 0 is treated as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* index into _PyUnicode_BidirectionalNames */
    const unsigned char category_changed;   /* index into _PyUnicode_CategoryNames */
    const unsigned char decimal_changed;    /* old decimal value */
    const int numeric_changed;
} change_record;
39
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000040/* data file generated by Tools/unicode/makeunicodedata.py */
41#include "unicodedata_db.h"
42
43static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000044_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000045{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000047 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 index = 0;
49 else {
50 index = index1[(code>>SHIFT)];
51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
52 }
53
54 return &_PyUnicode_Database_Records[index];
55}
56
Martin v. Löwis677bde22002-11-23 22:08:15 +000057static const _PyUnicode_DatabaseRecord*
58_getrecord(PyUnicodeObject* v)
59{
60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
61}
62
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000063/* ------------- Previous-version API ------------------------------------- */
64typedef struct previous_version {
65 PyObject_HEAD
66 const char *name;
67 const change_record* (*getrecord)(Py_UCS4);
68 Py_UCS4 (*normalization)(Py_UCS4);
69} PreviousDBVersion;
70
71#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
72
73/* Forward declaration */
74static PyMethodDef unicodedata_functions[];
75
76static PyMemberDef DB_members[] = {
77 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
78 {NULL}
79};
80
81static PyTypeObject Xxo_Type = {
82 /* The ob_type field must be initialized in the module init function
83 * to be portable to Windows without using C++. */
84 PyObject_HEAD_INIT(NULL)
85 0, /*ob_size*/
86 "unicodedata.DB", /*tp_name*/
87 sizeof(PreviousDBVersion), /*tp_basicsize*/
88 0, /*tp_itemsize*/
89 /* methods */
90 (destructor)PyObject_Del, /*tp_dealloc*/
91 0, /*tp_print*/
92 0, /*tp_getattr*/
93 0, /*tp_setattr*/
94 0, /*tp_compare*/
95 0, /*tp_repr*/
96 0, /*tp_as_number*/
97 0, /*tp_as_sequence*/
98 0, /*tp_as_mapping*/
99 0, /*tp_hash*/
100 0, /*tp_call*/
101 0, /*tp_str*/
102 PyObject_GenericGetAttr,/*tp_getattro*/
103 0, /*tp_setattro*/
104 0, /*tp_as_buffer*/
105 Py_TPFLAGS_DEFAULT, /*tp_flags*/
106 0, /*tp_doc*/
107 0, /*tp_traverse*/
108 0, /*tp_clear*/
109 0, /*tp_richcompare*/
110 0, /*tp_weaklistoffset*/
111 0, /*tp_iter*/
112 0, /*tp_iternext*/
113 unicodedata_functions, /*tp_methods*/
114 DB_members, /*tp_members*/
115 0, /*tp_getset*/
116 0, /*tp_base*/
117 0, /*tp_dict*/
118 0, /*tp_descr_get*/
119 0, /*tp_descr_set*/
120 0, /*tp_dictoffset*/
121 0, /*tp_init*/
122 0, /*tp_alloc*/
123 0, /*tp_new*/
124 0, /*tp_free*/
125 0, /*tp_is_gc*/
126};
127
128static PyObject*
129new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
130 Py_UCS4 (*normalization)(Py_UCS4))
131{
132 PreviousDBVersion *self;
133 self = PyObject_New(PreviousDBVersion, &Xxo_Type);
134 if (self == NULL)
135 return NULL;
136 self->name = name;
137 self->getrecord = getrecord;
138 self->normalization = normalization;
139 return (PyObject*)self;
140}
141
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000142/* --- Module API --------------------------------------------------------- */
143
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000144PyDoc_STRVAR(unicodedata_decimal__doc__,
145"decimal(unichr[, default])\n\
146\n\
147Returns the decimal value assigned to the Unicode character unichr\n\
148as integer. If no such value is defined, default is returned, or, if\n\
149not given, ValueError is raised.");
150
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000151static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000152unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000153{
154 PyUnicodeObject *v;
155 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000156 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157 long rc;
158
Fredrik Lundh06d12682001-01-24 07:59:11 +0000159 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000160 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000161 if (PyUnicode_GET_SIZE(v) != 1) {
162 PyErr_SetString(PyExc_TypeError,
163 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000164 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000165 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000166
167 if (self) {
168 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
169 if (old->category_changed == 0) {
170 /* unassigned */
171 have_old = 1;
172 rc = -1;
173 }
174 else if (old->decimal_changed != 0xFF) {
175 have_old = 1;
176 rc = old->decimal_changed;
177 }
178 }
179
180 if (!have_old)
181 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
183 if (defobj == NULL) {
184 PyErr_SetString(PyExc_ValueError,
185 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000186 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000187 }
188 else {
189 Py_INCREF(defobj);
190 return defobj;
191 }
192 }
193 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000194}
195
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000196PyDoc_STRVAR(unicodedata_digit__doc__,
197"digit(unichr[, default])\n\
198\n\
199Returns the digit value assigned to the Unicode character unichr as\n\
200integer. If no such value is defined, default is returned, or, if\n\
201not given, ValueError is raised.");
202
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205{
206 PyUnicodeObject *v;
207 PyObject *defobj = NULL;
208 long rc;
209
Fredrik Lundh06d12682001-01-24 07:59:11 +0000210 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000212 if (PyUnicode_GET_SIZE(v) != 1) {
213 PyErr_SetString(PyExc_TypeError,
214 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000215 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000216 }
217 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
218 if (rc < 0) {
219 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000220 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000221 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000222 }
223 else {
224 Py_INCREF(defobj);
225 return defobj;
226 }
227 }
228 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000229}
230
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000231PyDoc_STRVAR(unicodedata_numeric__doc__,
232"numeric(unichr[, default])\n\
233\n\
234Returns the numeric value assigned to the Unicode character unichr\n\
235as float. If no such value is defined, default is returned, or, if\n\
236not given, ValueError is raised.");
237
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000238static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000239unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000240{
241 PyUnicodeObject *v;
242 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000243 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000244 double rc;
245
Fredrik Lundh06d12682001-01-24 07:59:11 +0000246 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000247 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000248 if (PyUnicode_GET_SIZE(v) != 1) {
249 PyErr_SetString(PyExc_TypeError,
250 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000253
254 if (self) {
255 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
256 if (old->category_changed == 0) {
257 /* unassigned */
258 have_old = 1;
259 rc = -1;
260 }
261 else if (old->decimal_changed != 0xFF) {
262 have_old = 1;
263 rc = old->decimal_changed;
264 }
265 }
266
267 if (!have_old)
268 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000269 if (rc < 0) {
270 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000271 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000272 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000273 }
274 else {
275 Py_INCREF(defobj);
276 return defobj;
277 }
278 }
279 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280}
281
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000282PyDoc_STRVAR(unicodedata_category__doc__,
283"category(unichr)\n\
284\n\
285Returns the general category assigned to the Unicode character\n\
286unichr as string.");
287
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000288static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000289unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000290{
291 PyUnicodeObject *v;
292 int index;
293
294 if (!PyArg_ParseTuple(args, "O!:category",
295 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000296 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297 if (PyUnicode_GET_SIZE(v) != 1) {
298 PyErr_SetString(PyExc_TypeError,
299 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000300 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000301 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000302 index = (int) _getrecord(v)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000303 if (self) {
304 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
305 if (old->category_changed != 0xFF)
306 index = old->category_changed;
307 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000308 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000309}
310
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000311PyDoc_STRVAR(unicodedata_bidirectional__doc__,
312"bidirectional(unichr)\n\
313\n\
314Returns the bidirectional category assigned to the Unicode character\n\
315unichr as string. If no such value is defined, an empty string is\n\
316returned.");
317
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000318static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000319unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000320{
321 PyUnicodeObject *v;
322 int index;
323
324 if (!PyArg_ParseTuple(args, "O!:bidirectional",
325 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000326 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000327 if (PyUnicode_GET_SIZE(v) != 1) {
328 PyErr_SetString(PyExc_TypeError,
329 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000330 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000331 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000332 index = (int) _getrecord(v)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000333 if (self) {
334 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
335 if (old->category_changed == 0)
336 index = 0; /* unassigned */
337 else if (old->bidir_changed != 0xFF)
338 index = old->bidir_changed;
339 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000340 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341}
342
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000343PyDoc_STRVAR(unicodedata_combining__doc__,
344"combining(unichr)\n\
345\n\
346Returns the canonical combining class assigned to the Unicode\n\
347character unichr as integer. Returns 0 if no combining class is\n\
348defined.");
349
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000350static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000351unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000352{
353 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000354 int index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000355
356 if (!PyArg_ParseTuple(args, "O!:combining",
357 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000358 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000359 if (PyUnicode_GET_SIZE(v) != 1) {
360 PyErr_SetString(PyExc_TypeError,
361 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000362 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000364 index = (int) _getrecord(v)->combining;
365 if (self) {
366 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
367 if (old->category_changed == 0)
368 index = 0; /* unassigned */
369 }
370 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000371}
372
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000373PyDoc_STRVAR(unicodedata_mirrored__doc__,
374"mirrored(unichr)\n\
375\n\
376Returns the mirrored property assigned to the Unicode character\n\
377unichr as integer. Returns 1 if the character has been identified as\n\
378a \"mirrored\" character in bidirectional text, 0 otherwise.");
379
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000380static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000381unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000382{
383 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000384 int index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000385
386 if (!PyArg_ParseTuple(args, "O!:mirrored",
387 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000388 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000389 if (PyUnicode_GET_SIZE(v) != 1) {
390 PyErr_SetString(PyExc_TypeError,
391 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000392 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000393 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000394 index = (int) _getrecord(v)->mirrored;
395 if (self) {
396 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
397 if (old->category_changed == 0)
398 index = 0; /* unassigned */
399 }
400 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000401}
402
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000403PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
404"east_asian_width(unichr)\n\
405\n\
406Returns the east asian width assigned to the Unicode character\n\
407unichr as string.");
408
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000409static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000410unicodedata_east_asian_width(PyObject *self, PyObject *args)
411{
412 PyUnicodeObject *v;
413 int index;
414
415 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
416 &PyUnicode_Type, &v))
417 return NULL;
418 if (PyUnicode_GET_SIZE(v) != 1) {
419 PyErr_SetString(PyExc_TypeError,
420 "need a single Unicode character as parameter");
421 return NULL;
422 }
423 index = (int) _getrecord(v)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000424 if (self) {
425 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
426 if (old->category_changed == 0)
427 index = 0; /* unassigned */
428 }
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000429 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
430}
431
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000432PyDoc_STRVAR(unicodedata_decomposition__doc__,
433"decomposition(unichr)\n\
434\n\
435Returns the character decomposition mapping assigned to the Unicode\n\
436character unichr as string. An empty string is returned in case no\n\
437such mapping is defined.");
438
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000439static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000440unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000441{
442 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000443 char decomp[256];
444 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000445
446 if (!PyArg_ParseTuple(args, "O!:decomposition",
447 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000449 if (PyUnicode_GET_SIZE(v) != 1) {
450 PyErr_SetString(PyExc_TypeError,
451 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000452 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000453 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454
455 code = (int) *PyUnicode_AS_UNICODE(v);
456
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000457 if (self) {
458 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
459 if (old->category_changed == 0)
460 return PyString_FromString(""); /* unassigned */
461 }
462
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000463 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000464 index = 0;
465 else {
466 index = decomp_index1[(code>>DECOMP_SHIFT)];
467 index = decomp_index2[(index<<DECOMP_SHIFT)+
468 (code&((1<<DECOMP_SHIFT)-1))];
469 }
470
Tim Peters69b83b12001-11-30 07:23:05 +0000471 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000472 is prefix code (from*/
473 count = decomp_data[index] >> 8;
474
475 /* XXX: could allocate the PyString up front instead
476 (strlen(prefix) + 5 * count + 1 bytes) */
477
478 /* copy prefix */
479 i = strlen(decomp_prefix[decomp_data[index] & 255]);
480 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
481
482 while (count-- > 0) {
483 if (i)
484 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000485 assert((size_t)i < sizeof(decomp));
486 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
487 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000488 i += strlen(decomp + i);
489 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000490
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000491 decomp[i] = '\0';
492
493 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000494}
495
Martin v. Löwis677bde22002-11-23 22:08:15 +0000496void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000497get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000498{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000499 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000500 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000501 } else if (self && get_old_record(self, code)->category_changed==0) {
502 /* unassigned in old version */
503 *index = 0;
504 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 else {
506 *index = decomp_index1[(code>>DECOMP_SHIFT)];
507 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
508 (code&((1<<DECOMP_SHIFT)-1))];
509 }
510
511 /* high byte is number of hex bytes (usually one or two), low byte
512 is prefix code (from*/
513 *count = decomp_data[*index] >> 8;
514 *prefix = decomp_data[*index] & 255;
515
516 (*index)++;
517}
518
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one below the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of syllables */
528
529static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000530nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000531{
532 PyObject *result;
533 Py_UNICODE *i, *end, *o;
534 /* Longest decomposition in Unicode 3.2: U+FDFA */
535 Py_UNICODE stack[20];
536 int space, stackptr, isize;
537 int index, prefix, count;
538 unsigned char prev, cur;
539
540 stackptr = 0;
541 isize = PyUnicode_GET_SIZE(input);
542 /* Overallocate atmost 10 characters. */
543 space = (isize > 10 ? 10 : isize) + isize;
544 result = PyUnicode_FromUnicode(NULL, space);
545 if (!result)
546 return NULL;
547 i = PyUnicode_AS_UNICODE(input);
548 end = i + isize;
549 o = PyUnicode_AS_UNICODE(result);
550
551 while (i < end) {
552 stack[stackptr++] = *i++;
553 while(stackptr) {
554 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000555 /* Hangul Decomposition adds three characters in
556 a single step, so we need atleast that much room. */
557 if (space < 3) {
558 int newsize = PyString_GET_SIZE(result) + 10;
559 space += 10;
560 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000561 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000562 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000563 }
564 /* Hangul Decomposition. */
565 if (SBase <= code && code < (SBase+SCount)) {
566 int SIndex = code - SBase;
567 int L = LBase + SIndex / NCount;
568 int V = VBase + (SIndex % NCount) / TCount;
569 int T = TBase + SIndex % TCount;
570 *o++ = L;
571 *o++ = V;
572 space -= 2;
573 if (T != TBase) {
574 *o++ = T;
575 space --;
576 }
577 continue;
578 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000579 /* normalization changes */
580 if (self) {
581 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
582 if (value != 0) {
583 stack[stackptr++] = value;
584 continue;
585 }
586 }
587
588 /* Other decompositions. */
589 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000590
591 /* Copy character if it is not decomposable, or has a
592 compatibility decomposition, but we do NFD. */
593 if (!count || (prefix && !k)) {
594 *o++ = code;
595 space--;
596 continue;
597 }
598 /* Copy decomposition onto the stack, in reverse
599 order. */
600 while(count) {
601 code = decomp_data[index + (--count)];
602 stack[stackptr++] = code;
603 }
604 }
605 }
606
607 /* Drop overallocation. Cannot fail. */
608 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
609
610 /* Sort canonically. */
611 i = PyUnicode_AS_UNICODE(result);
612 prev = _getrecord_ex(*i)->combining;
613 end = i + PyUnicode_GET_SIZE(result);
614 for (i++; i < end; i++) {
615 cur = _getrecord_ex(*i)->combining;
616 if (prev == 0 || cur == 0 || prev <= cur) {
617 prev = cur;
618 continue;
619 }
620 /* Non-canonical order. Need to switch *i with previous. */
621 o = i - 1;
622 while (1) {
623 Py_UNICODE tmp = o[1];
624 o[1] = o[0];
625 o[0] = tmp;
626 o--;
627 if (o < PyUnicode_AS_UNICODE(result))
628 break;
629 prev = _getrecord_ex(*o)->combining;
630 if (prev == 0 || prev <= cur)
631 break;
632 }
633 prev = _getrecord_ex(*i)->combining;
634 }
635 return result;
636}
637
638static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000639find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640{
641 int index;
642 for (index = 0; nfc[index].start; index++) {
643 int start = nfc[index].start;
644 if (code < start)
645 return -1;
646 if (code <= start + nfc[index].count) {
647 int delta = code - start;
648 return nfc[index].index + delta;
649 }
650 }
651 return -1;
652}
653
654static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000655nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000656{
657 PyObject *result;
658 Py_UNICODE *i, *i1, *o, *end;
659 int f,l,index,index1,comb;
660 Py_UNICODE code;
661 Py_UNICODE *skipped[20];
662 int cskipped = 0;
663
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000664 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000665 if (!result)
666 return NULL;
667
668 /* We are going to modify result in-place.
669 If nfd_nfkd is changed to sometimes return the input,
670 this code needs to be reviewed. */
671 assert(result != input);
672
673 i = PyUnicode_AS_UNICODE(result);
674 end = i + PyUnicode_GET_SIZE(result);
675 o = PyUnicode_AS_UNICODE(result);
676
677 again:
678 while (i < end) {
679 for (index = 0; index < cskipped; index++) {
680 if (skipped[index] == i) {
681 /* *i character is skipped.
682 Remove from list. */
683 skipped[index] = skipped[cskipped-1];
684 cskipped--;
685 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000686 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000687 }
688 }
689 /* Hangul Composition. We don't need to check for <LV,T>
690 pairs, since we always have decomposed data. */
691 if (LBase <= *i && *i < (LBase+LCount) &&
692 i + 1 < end &&
693 VBase <= i[1] && i[1] <= (VBase+VCount)) {
694 int LIndex, VIndex;
695 LIndex = i[0] - LBase;
696 VIndex = i[1] - VBase;
697 code = SBase + (LIndex*VCount+VIndex)*TCount;
698 i+=2;
699 if (i < end &&
700 TBase <= *i && *i <= (TBase+TCount)) {
701 code += *i-TBase;
702 i++;
703 }
704 *o++ = code;
705 continue;
706 }
707
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000708 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000709 if (f == -1) {
710 *o++ = *i++;
711 continue;
712 }
713 /* Find next unblocked character. */
714 i1 = i+1;
715 comb = 0;
716 while (i1 < end) {
717 int comb1 = _getrecord_ex(*i1)->combining;
718 if (comb1 && comb == comb1) {
719 /* Character is blocked. */
720 i1++;
721 continue;
722 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000723 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724 /* *i1 cannot be combined with *i. If *i1
725 is a starter, we don't need to look further.
726 Otherwise, record the combining class. */
727 if (l == -1) {
728 not_combinable:
729 if (comb1 == 0)
730 break;
731 comb = comb1;
732 i1++;
733 continue;
734 }
735 index = f*TOTAL_LAST + l;
736 index1 = comp_index[index >> COMP_SHIFT];
737 code = comp_data[(index1<<COMP_SHIFT)+
738 (index&((1<<COMP_SHIFT)-1))];
739 if (code == 0)
740 goto not_combinable;
741
742 /* Replace the original character. */
743 *i = code;
744 /* Mark the second character unused. */
745 skipped[cskipped++] = i1;
746 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000747 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000748 if (f == -1)
749 break;
750 }
751 *o++ = *i++;
752 }
753 if (o != end)
754 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
755 return result;
756}
757
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000758PyDoc_STRVAR(unicodedata_normalize__doc__,
759"normalize(form, unistr)\n\
760\n\
761Return the normal form 'form' for the Unicode string unistr. Valid\n\
762values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
763
Martin v. Löwis677bde22002-11-23 22:08:15 +0000764static PyObject*
765unicodedata_normalize(PyObject *self, PyObject *args)
766{
767 char *form;
768 PyObject *input;
769
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000770 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000771 &form, &PyUnicode_Type, &input))
772 return NULL;
773
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000774 if (PyUnicode_GetSize(input) == 0) {
775 /* Special case empty input strings, since resizing
776 them later would cause internal errors. */
777 Py_INCREF(input);
778 return input;
779 }
780
Martin v. Löwis677bde22002-11-23 22:08:15 +0000781 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000782 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000783 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000784 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000785 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000786 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000787 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000788 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000789 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
790 return NULL;
791}
792
Fredrik Lundh06d12682001-01-24 07:59:11 +0000793/* -------------------------------------------------------------------- */
794/* unicode character name tables */
795
796/* data file generated by Tools/unicode/makeunicodedata.py */
797#include "unicodename_db.h"
798
799/* -------------------------------------------------------------------- */
800/* database code (cut and pasted from the unidb package) */
801
/* Case-insensitive hash of the first `len` bytes of `s`.
   Each byte is upper-cased and mixed in with multiplier `scale`; whenever
   the running value spills into bits 24-31, that byte is folded back into
   the low bits and the value is reduced to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() requires a value representable as unsigned char (or
           EOF); a plain char may be negative, so convert first. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
816
/* Name fragments used to spell Hangul syllable names.  Column 0 is
   indexed by the leading-consonant index, column 1 by the vowel index and
   column 2 by the trailing-consonant index; rows past the end of a
   column hold a null pointer. */
static char *hangul_syllables[][3] = {
    /* leading  vowel   trailing */
    { "G",      "A",    ""   },
    { "GG",     "AE",   "G"  },
    { "N",      "YA",   "GG" },
    { "D",      "YAE",  "GS" },
    { "DD",     "EO",   "N"  },
    { "R",      "E",    "NJ" },
    { "M",      "YEO",  "NH" },
    { "B",      "YE",   "D"  },
    { "BB",     "O",    "L"  },
    { "S",      "WA",   "LG" },
    { "SS",     "WAE",  "LM" },
    { "",       "OE",   "LB" },
    { "J",      "YO",   "LS" },
    { "JJ",     "U",    "LT" },
    { "C",      "WEO",  "LP" },
    { "K",      "WE",   "LH" },
    { "T",      "WI",   "M"  },
    { "P",      "YU",   "B"  },
    { "H",      "EU",   "BS" },
    { NULL,     "YI",   "S"  },
    { NULL,     "I",    "SS" },
    { NULL,     NULL,   "NG" },
    { NULL,     NULL,   "J"  },
    { NULL,     NULL,   "C"  },
    { NULL,     NULL,   "K"  },
    { NULL,     NULL,   "T"  },
    { NULL,     NULL,   "P"  },
    { NULL,     NULL,   "H"  }
};
847
Fredrik Lundh06d12682001-01-24 07:59:11 +0000848static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000849is_unified_ideograph(Py_UCS4 code)
850{
851 return (
852 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
853 (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
854 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
855}
856
857static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000858_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000859{
860 int offset;
861 int i;
862 int word;
863 unsigned char* w;
864
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000865 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000866 /* Hangul syllable. */
867 int SIndex = code - SBase;
868 int L = SIndex / NCount;
869 int V = (SIndex % NCount) / TCount;
870 int T = SIndex % TCount;
871
872 if (buflen < 27)
873 /* Worst case: HANGUL SYLLABLE <10chars>. */
874 return 0;
875 strcpy(buffer, "HANGUL SYLLABLE ");
876 buffer += 16;
877 strcpy(buffer, hangul_syllables[L][0]);
878 buffer += strlen(hangul_syllables[L][0]);
879 strcpy(buffer, hangul_syllables[V][1]);
880 buffer += strlen(hangul_syllables[V][1]);
881 strcpy(buffer, hangul_syllables[T][2]);
882 buffer += strlen(hangul_syllables[T][2]);
883 *buffer = '\0';
884 return 1;
885 }
886
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000887 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000888 if (buflen < 28)
889 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
890 return 0;
891 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
892 return 1;
893 }
894
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000895 if (code >= 0x110000)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000896 return 0;
897
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000898 if (self) {
899 const change_record *old = get_old_record(self, code);
900 if (old->category_changed == 0) {
901 /* unassigned */
902 return 0;
903 }
904 }
905
906
Fredrik Lundh06d12682001-01-24 07:59:11 +0000907 /* get offset into phrasebook */
908 offset = phrasebook_offset1[(code>>phrasebook_shift)];
909 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
910 (code&((1<<phrasebook_shift)-1))];
911 if (!offset)
912 return 0;
913
914 i = 0;
915
916 for (;;) {
917 /* get word index */
918 word = phrasebook[offset] - phrasebook_short;
919 if (word >= 0) {
920 word = (word << 8) + phrasebook[offset+1];
921 offset += 2;
922 } else
923 word = phrasebook[offset++];
924 if (i) {
925 if (i > buflen)
926 return 0; /* buffer overflow */
927 buffer[i++] = ' ';
928 }
929 /* copy word string from lexicon. the last character in the
930 word has bit 7 set. the last word in a string ends with
931 0x80 */
932 w = lexicon + lexicon_offset[word];
933 while (*w < 128) {
934 if (i >= buflen)
935 return 0; /* buffer overflow */
936 buffer[i++] = *w++;
937 }
938 if (i >= buflen)
939 return 0; /* buffer overflow */
940 buffer[i++] = *w & 127;
941 if (*w == 128)
942 break; /* end of word */
943 }
944
945 return 1;
946}
947
948static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000949_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000950{
951 /* check if code corresponds to the given name */
952 int i;
953 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000954 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000955 return 0;
956 for (i = 0; i < namelen; i++) {
957 if (toupper(name[i]) != buffer[i])
958 return 0;
959 }
960 return buffer[namelen] == '\0';
961}
962
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000963static void
964find_syllable(const char *str, int *len, int *pos, int count, int column)
965{
966 int i, len1;
967 *len = -1;
968 for (i = 0; i < count; i++) {
969 char *s = hangul_syllables[i][column];
970 len1 = strlen(s);
971 if (len1 <= *len)
972 continue;
973 if (strncmp(str, s, len1) == 0) {
974 *len = len1;
975 *pos = i;
976 }
977 }
978 if (*len == -1) {
979 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000980 }
981}
982
Fredrik Lundh06d12682001-01-24 07:59:11 +0000983static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000984_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000985{
986 unsigned int h, v;
987 unsigned int mask = code_size-1;
988 unsigned int i, incr;
989
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000990 /* Check for hangul syllables. */
991 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Thomas Wouters1e365b22006-03-01 21:58:30 +0000992 int len, L = -1, V = -1, T = -1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000993 const char *pos = name + 16;
994 find_syllable(pos, &len, &L, LCount, 0);
995 pos += len;
996 find_syllable(pos, &len, &V, VCount, 1);
997 pos += len;
998 find_syllable(pos, &len, &T, TCount, 2);
999 pos += len;
Martin v. Löwis8b291e22005-09-18 08:17:56 +00001000 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001001 *code = SBase + (L*VCount+V)*TCount + T;
1002 return 1;
1003 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001004 /* Otherwise, it's an illegal syllable name. */
1005 return 0;
1006 }
1007
1008 /* Check for unified ideographs. */
1009 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1010 /* Four or five hexdigits must follow. */
1011 v = 0;
1012 name += 22;
1013 namelen -= 22;
1014 if (namelen != 4 && namelen != 5)
1015 return 0;
1016 while (namelen--) {
1017 v *= 16;
1018 if (*name >= '0' && *name <= '9')
1019 v += *name - '0';
1020 else if (*name >= 'A' && *name <= 'F')
1021 v += *name - 'A' + 10;
1022 else
1023 return 0;
1024 name++;
1025 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001026 if (!is_unified_ideograph(v))
1027 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001028 *code = v;
1029 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001030 }
1031
Fredrik Lundh06d12682001-01-24 07:59:11 +00001032 /* the following is the same as python's dictionary lookup, with
1033 only minor changes. see the makeunicodedata script for more
1034 details */
1035
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001036 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001037 i = (~h) & mask;
1038 v = code_hash[i];
1039 if (!v)
1040 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001041 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001042 *code = v;
1043 return 1;
1044 }
1045 incr = (h ^ (h >> 3)) & mask;
1046 if (!incr)
1047 incr = mask;
1048 for (;;) {
1049 i = (i + incr) & mask;
1050 v = code_hash[i];
1051 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001052 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001053 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001054 *code = v;
1055 return 1;
1056 }
1057 incr = incr << 1;
1058 if (incr > mask)
1059 incr = incr ^ code_poly;
1060 }
1061}
1062
1063static const _PyUnicode_Name_CAPI hashAPI =
1064{
1065 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001066 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001067 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001068};
1069
1070/* -------------------------------------------------------------------- */
1071/* Python bindings */
1072
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001073PyDoc_STRVAR(unicodedata_name__doc__,
1074"name(unichr[, default])\n\
1075Returns the name assigned to the Unicode character unichr as a\n\
1076string. If no name is defined, default is returned, or, if not\n\
1077given, ValueError is raised.");
1078
Fredrik Lundh06d12682001-01-24 07:59:11 +00001079static PyObject *
1080unicodedata_name(PyObject* self, PyObject* args)
1081{
1082 char name[NAME_MAXLEN];
1083
1084 PyUnicodeObject* v;
1085 PyObject* defobj = NULL;
1086 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1087 return NULL;
1088
1089 if (PyUnicode_GET_SIZE(v) != 1) {
1090 PyErr_SetString(PyExc_TypeError,
1091 "need a single Unicode character as parameter");
1092 return NULL;
1093 }
1094
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001095 if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
1096 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001097 if (defobj == NULL) {
1098 PyErr_SetString(PyExc_ValueError, "no such name");
1099 return NULL;
1100 }
1101 else {
1102 Py_INCREF(defobj);
1103 return defobj;
1104 }
1105 }
1106
1107 return Py_BuildValue("s", name);
1108}
1109
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001110PyDoc_STRVAR(unicodedata_lookup__doc__,
1111"lookup(name)\n\
1112\n\
1113Look up character by name. If a character with the\n\
1114given name is found, return the corresponding Unicode\n\
1115character. If not found, KeyError is raised.");
1116
Fredrik Lundh06d12682001-01-24 07:59:11 +00001117static PyObject *
1118unicodedata_lookup(PyObject* self, PyObject* args)
1119{
1120 Py_UCS4 code;
1121 Py_UNICODE str[1];
1122
1123 char* name;
1124 int namelen;
1125 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1126 return NULL;
1127
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001128 if (!_getcode(self, name, namelen, &code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001129 char fmt[] = "undefined character name '%s'";
1130 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
1131 sprintf(buf, fmt, name);
1132 PyErr_SetString(PyExc_KeyError, buf);
1133 PyMem_FREE(buf);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001134 return NULL;
1135 }
1136
1137 str[0] = (Py_UNICODE) code;
1138 return PyUnicode_FromUnicode(str, 1);
1139}
1140
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001141/* XXX Add doc strings. */
1142
1143static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001144 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1145 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1146 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1147 {"category", unicodedata_category, METH_VARARGS,
1148 unicodedata_category__doc__},
1149 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1150 unicodedata_bidirectional__doc__},
1151 {"combining", unicodedata_combining, METH_VARARGS,
1152 unicodedata_combining__doc__},
1153 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1154 unicodedata_mirrored__doc__},
1155 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1156 unicodedata_east_asian_width__doc__},
1157 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1158 unicodedata_decomposition__doc__},
1159 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1160 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1161 {"normalize", unicodedata_normalize, METH_VARARGS,
1162 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001163 {NULL, NULL} /* sentinel */
1164};
1165
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001166
1167
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001168PyDoc_STRVAR(unicodedata_docstring,
1169"This module provides access to the Unicode Character Database which\n\
1170defines character properties for all Unicode characters. The data in\n\
1171this database is based on the UnicodeData.txt file version\n\
11723.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
1173\n\
1174The module uses the same names and symbols as defined by the\n\
1175UnicodeData File Format 3.2.0 (see\n\
Hye-Shik Chang4c560ea2005-06-04 07:31:48 +00001176http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001177
Mark Hammond62b1ab12002-07-23 06:31:15 +00001178PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001179initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001180{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001181 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001182
Fred Drakef585bef2001-03-03 19:41:55 +00001183 m = Py_InitModule3(
1184 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001185 if (!m)
1186 return;
1187
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001188 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1189
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001190 /* Previous versions */
1191 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1192 if (v != NULL)
1193 PyModule_AddObject(m, "db_3_2_0", v);
1194
Fredrik Lundh06d12682001-01-24 07:59:11 +00001195 /* Export C API */
1196 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001197 if (v != NULL)
1198 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001199}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001200
1201/*
1202Local variables:
1203c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001204indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001205End:
1206*/