blob: 760b7cfafb40776217c1a7cf49c200a0d715e616 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 4.1 database.

   Data was extracted from the Unicode 4.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One row of per-character properties.  Rows live in the generated
   table _PyUnicode_Database_Records and are looked up by
   _getrecord_ex().  The field order must not change, because the
   generated table initializes the fields positionally. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Per-character difference between the current database and a previous
   Unicode version.  In the code below, 0xFF in bidir_changed,
   category_changed or decimal_changed is treated as "unchanged", and
   category_changed == 0 is treated as "unassigned in the old version".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
39
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000040/* data file generated by Tools/unicode/makeunicodedata.py */
41#include "unicodedata_db.h"
42
43static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000044_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000045{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000047 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 index = 0;
49 else {
50 index = index1[(code>>SHIFT)];
51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
52 }
53
54 return &_PyUnicode_Database_Records[index];
55}
56
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000057/* ------------- Previous-version API ------------------------------------- */
58typedef struct previous_version {
59 PyObject_HEAD
60 const char *name;
61 const change_record* (*getrecord)(Py_UCS4);
62 Py_UCS4 (*normalization)(Py_UCS4);
63} PreviousDBVersion;
64
/* Fetch the change record of code point v from the PreviousDBVersion
   object self.  self must not be NULL. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion*)self)->getrecord)(v))
66
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000067static PyMemberDef DB_members[] = {
68 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
69 {NULL}
70};
71
Thomas Wouters89f507f2006-12-13 04:49:30 +000072/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000073static PyTypeObject UCD_Type;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000074
75static PyObject*
76new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
77 Py_UCS4 (*normalization)(Py_UCS4))
78{
79 PreviousDBVersion *self;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000080 self = PyObject_New(PreviousDBVersion, &UCD_Type);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000081 if (self == NULL)
82 return NULL;
83 self->name = name;
84 self->getrecord = getrecord;
85 self->normalization = normalization;
86 return (PyObject*)self;
87}
88
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000089
90static Py_UCS4 getuchar(PyUnicodeObject *obj)
91{
92 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
93
94 if (PyUnicode_GET_SIZE(obj) == 1)
95 return *v;
96#ifndef Py_UNICODE_WIDE
97 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
98 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
99 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
100 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
101#endif
102 PyErr_SetString(PyExc_TypeError,
103 "need a single Unicode character as parameter");
104 return (Py_UCS4)-1;
105}
106
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000107/* --- Module API --------------------------------------------------------- */
108
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000109PyDoc_STRVAR(unicodedata_decimal__doc__,
110"decimal(unichr[, default])\n\
111\n\
112Returns the decimal value assigned to the Unicode character unichr\n\
113as integer. If no such value is defined, default is returned, or, if\n\
114not given, ValueError is raised.");
115
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000116static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118{
119 PyUnicodeObject *v;
120 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000121 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000123 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124
Fredrik Lundh06d12682001-01-24 07:59:11 +0000125 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000126 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000127 c = getuchar(v);
128 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000130
131 if (self) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000132 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133 if (old->category_changed == 0) {
134 /* unassigned */
135 have_old = 1;
136 rc = -1;
137 }
138 else if (old->decimal_changed != 0xFF) {
139 have_old = 1;
140 rc = old->decimal_changed;
141 }
142 }
143
144 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000145 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000146 if (rc < 0) {
147 if (defobj == NULL) {
148 PyErr_SetString(PyExc_ValueError,
149 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000150 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000151 }
152 else {
153 Py_INCREF(defobj);
154 return defobj;
155 }
156 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000157 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158}
159
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000160PyDoc_STRVAR(unicodedata_digit__doc__,
161"digit(unichr[, default])\n\
162\n\
163Returns the digit value assigned to the Unicode character unichr as\n\
164integer. If no such value is defined, default is returned, or, if\n\
165not given, ValueError is raised.");
166
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000167static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169{
170 PyUnicodeObject *v;
171 PyObject *defobj = NULL;
172 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000173 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174
Fredrik Lundh06d12682001-01-24 07:59:11 +0000175 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000177 c = getuchar(v);
178 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 if (rc < 0) {
182 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000183 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185 }
186 else {
187 Py_INCREF(defobj);
188 return defobj;
189 }
190 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000191 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192}
193
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000194PyDoc_STRVAR(unicodedata_numeric__doc__,
195"numeric(unichr[, default])\n\
196\n\
197Returns the numeric value assigned to the Unicode character unichr\n\
198as float. If no such value is defined, default is returned, or, if\n\
199not given, ValueError is raised.");
200
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000202unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203{
204 PyUnicodeObject *v;
205 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000206 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000208 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209
Fredrik Lundh06d12682001-01-24 07:59:11 +0000210 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000212 c = getuchar(v);
213 if (c == (Py_UCS4)-1)
214 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000215
216 if (self) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000217 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218 if (old->category_changed == 0) {
219 /* unassigned */
220 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 rc = -1.0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000222 }
223 else if (old->decimal_changed != 0xFF) {
224 have_old = 1;
225 rc = old->decimal_changed;
226 }
227 }
228
229 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 if (rc == -1.0) {
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000232 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000233 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000234 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000235 }
236 else {
237 Py_INCREF(defobj);
238 return defobj;
239 }
240 }
241 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242}
243
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000244PyDoc_STRVAR(unicodedata_category__doc__,
245"category(unichr)\n\
246\n\
247Returns the general category assigned to the Unicode character\n\
248unichr as string.");
249
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252{
253 PyUnicodeObject *v;
254 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000255 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256
257 if (!PyArg_ParseTuple(args, "O!:category",
258 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000259 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000260 c = getuchar(v);
261 if (c == (Py_UCS4)-1)
262 return NULL;
263 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000264 if (self) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000265 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (old->category_changed != 0xFF)
267 index = old->category_changed;
268 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270}
271
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000272PyDoc_STRVAR(unicodedata_bidirectional__doc__,
273"bidirectional(unichr)\n\
274\n\
275Returns the bidirectional category assigned to the Unicode character\n\
276unichr as string. If no such value is defined, an empty string is\n\
277returned.");
278
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000279static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000280unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281{
282 PyUnicodeObject *v;
283 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000284 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285
286 if (!PyArg_ParseTuple(args, "O!:bidirectional",
287 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000288 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000289 c = getuchar(v);
290 if (c == (Py_UCS4)-1)
291 return NULL;
292 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000293 if (self) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000294 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000295 if (old->category_changed == 0)
296 index = 0; /* unassigned */
297 else if (old->bidir_changed != 0xFF)
298 index = old->bidir_changed;
299 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000300 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000301}
302
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000303PyDoc_STRVAR(unicodedata_combining__doc__,
304"combining(unichr)\n\
305\n\
306Returns the canonical combining class assigned to the Unicode\n\
307character unichr as integer. Returns 0 if no combining class is\n\
308defined.");
309
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000310static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000311unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312{
313 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000314 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000315 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316
317 if (!PyArg_ParseTuple(args, "O!:combining",
318 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000319 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000320 c = getuchar(v);
321 if (c == (Py_UCS4)-1)
322 return NULL;
323 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000324 if (self) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000325 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000326 if (old->category_changed == 0)
327 index = 0; /* unassigned */
328 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000329 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000330}
331
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000332PyDoc_STRVAR(unicodedata_mirrored__doc__,
333"mirrored(unichr)\n\
334\n\
335Returns the mirrored property assigned to the Unicode character\n\
336unichr as integer. Returns 1 if the character has been identified as\n\
337a \"mirrored\" character in bidirectional text, 0 otherwise.");
338
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000339static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000340unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341{
342 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000343 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000344 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000345
346 if (!PyArg_ParseTuple(args, "O!:mirrored",
347 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000348 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000349 c = getuchar(v);
350 if (c == (Py_UCS4)-1)
351 return NULL;
352 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000353 if (self) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 if (old->category_changed == 0)
356 index = 0; /* unassigned */
357 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000358 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000359}
360
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000361PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
362"east_asian_width(unichr)\n\
363\n\
364Returns the east asian width assigned to the Unicode character\n\
365unichr as string.");
366
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000367static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000368unicodedata_east_asian_width(PyObject *self, PyObject *args)
369{
370 PyUnicodeObject *v;
371 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000372 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000373
374 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
375 &PyUnicode_Type, &v))
376 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000377 c = getuchar(v);
378 if (c == (Py_UCS4)-1)
379 return NULL;
380 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000381 if (self) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000382 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000383 if (old->category_changed == 0)
384 index = 0; /* unassigned */
385 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000386 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000387}
388
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000389PyDoc_STRVAR(unicodedata_decomposition__doc__,
390"decomposition(unichr)\n\
391\n\
392Returns the character decomposition mapping assigned to the Unicode\n\
393character unichr as string. An empty string is returned in case no\n\
394such mapping is defined.");
395
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000396static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000397unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000398{
399 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000400 char decomp[256];
401 int code, index, count, i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000402 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000403 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000404
405 if (!PyArg_ParseTuple(args, "O!:decomposition",
406 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000407 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000408 c = getuchar(v);
409 if (c == (Py_UCS4)-1)
410 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000411
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000412 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000413
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000414 if (self) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000415 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000416 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000417 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000418 }
419
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000420 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000421 index = 0;
422 else {
423 index = decomp_index1[(code>>DECOMP_SHIFT)];
424 index = decomp_index2[(index<<DECOMP_SHIFT)+
425 (code&((1<<DECOMP_SHIFT)-1))];
426 }
427
Tim Peters69b83b12001-11-30 07:23:05 +0000428 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000429 is prefix code (from*/
430 count = decomp_data[index] >> 8;
431
432 /* XXX: could allocate the PyString up front instead
433 (strlen(prefix) + 5 * count + 1 bytes) */
434
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000435 /* Based on how index is calculated above and decomp_data is generated
436 from Tools/unicode/makeunicodedata.py, it should not be possible
437 to overflow decomp_prefix. */
438 prefix_index = decomp_data[index] & 255;
439 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
440
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000441 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000442 i = strlen(decomp_prefix[prefix_index]);
443 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444
445 while (count-- > 0) {
446 if (i)
447 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000448 assert((size_t)i < sizeof(decomp));
449 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
450 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000451 i += strlen(decomp + i);
452 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000453
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454 decomp[i] = '\0';
455
Walter Dörwald4254e762007-06-05 16:04:09 +0000456 return PyUnicode_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000457}
458
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000459static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000460get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000461{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000462 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000463 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000464 } else if (self && get_old_record(self, code)->category_changed==0) {
465 /* unassigned in old version */
466 *index = 0;
467 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000468 else {
469 *index = decomp_index1[(code>>DECOMP_SHIFT)];
470 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
471 (code&((1<<DECOMP_SHIFT)-1))];
472 }
473
474 /* high byte is number of hex bytes (usually one or two), low byte
475 is prefix code (from*/
476 *count = decomp_data[*index] >> 8;
477 *prefix = decomp_data[*index] & 255;
478
479 (*index)++;
480}
481
/* Constants for arithmetic Hangul syllable (de)composition, used by
   nfd_nfkd() and nfc_nfkc(). */
#define SBase   0xAC00  /* start of the precomposed syllable range */
#define LBase   0x1100  /* start of the L (first) jamo range */
#define VBase   0x1161  /* start of the V (second) jamo range */
#define TBase   0x11A7  /* T (third) jamo base; T == TBase means "none" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
491
492static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000493nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000494{
495 PyObject *result;
496 Py_UNICODE *i, *end, *o;
497 /* Longest decomposition in Unicode 3.2: U+FDFA */
498 Py_UNICODE stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000499 Py_ssize_t space, isize;
500 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000501 unsigned char prev, cur;
502
503 stackptr = 0;
504 isize = PyUnicode_GET_SIZE(input);
505 /* Overallocate atmost 10 characters. */
506 space = (isize > 10 ? 10 : isize) + isize;
507 result = PyUnicode_FromUnicode(NULL, space);
508 if (!result)
509 return NULL;
510 i = PyUnicode_AS_UNICODE(input);
511 end = i + isize;
512 o = PyUnicode_AS_UNICODE(result);
513
514 while (i < end) {
515 stack[stackptr++] = *i++;
516 while(stackptr) {
517 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000518 /* Hangul Decomposition adds three characters in
519 a single step, so we need atleast that much room. */
520 if (space < 3) {
Martin v. Löwis5b222132007-06-10 09:51:05 +0000521 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000522 space += 10;
523 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000524 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000525 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000526 }
527 /* Hangul Decomposition. */
528 if (SBase <= code && code < (SBase+SCount)) {
529 int SIndex = code - SBase;
530 int L = LBase + SIndex / NCount;
531 int V = VBase + (SIndex % NCount) / TCount;
532 int T = TBase + SIndex % TCount;
533 *o++ = L;
534 *o++ = V;
535 space -= 2;
536 if (T != TBase) {
537 *o++ = T;
538 space --;
539 }
540 continue;
541 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000542 /* normalization changes */
543 if (self) {
544 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
545 if (value != 0) {
546 stack[stackptr++] = value;
547 continue;
548 }
549 }
550
551 /* Other decompositions. */
552 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000553
554 /* Copy character if it is not decomposable, or has a
555 compatibility decomposition, but we do NFD. */
556 if (!count || (prefix && !k)) {
557 *o++ = code;
558 space--;
559 continue;
560 }
561 /* Copy decomposition onto the stack, in reverse
562 order. */
563 while(count) {
564 code = decomp_data[index + (--count)];
565 stack[stackptr++] = code;
566 }
567 }
568 }
569
570 /* Drop overallocation. Cannot fail. */
571 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
572
573 /* Sort canonically. */
574 i = PyUnicode_AS_UNICODE(result);
575 prev = _getrecord_ex(*i)->combining;
576 end = i + PyUnicode_GET_SIZE(result);
577 for (i++; i < end; i++) {
578 cur = _getrecord_ex(*i)->combining;
579 if (prev == 0 || cur == 0 || prev <= cur) {
580 prev = cur;
581 continue;
582 }
583 /* Non-canonical order. Need to switch *i with previous. */
584 o = i - 1;
585 while (1) {
586 Py_UNICODE tmp = o[1];
587 o[1] = o[0];
588 o[0] = tmp;
589 o--;
590 if (o < PyUnicode_AS_UNICODE(result))
591 break;
592 prev = _getrecord_ex(*o)->combining;
593 if (prev == 0 || prev <= cur)
594 break;
595 }
596 prev = _getrecord_ex(*i)->combining;
597 }
598 return result;
599}
600
601static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000602find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000603{
604 int index;
605 for (index = 0; nfc[index].start; index++) {
606 int start = nfc[index].start;
607 if (code < start)
608 return -1;
609 if (code <= start + nfc[index].count) {
610 int delta = code - start;
611 return nfc[index].index + delta;
612 }
613 }
614 return -1;
615}
616
617static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000618nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000619{
620 PyObject *result;
621 Py_UNICODE *i, *i1, *o, *end;
622 int f,l,index,index1,comb;
623 Py_UNICODE code;
624 Py_UNICODE *skipped[20];
625 int cskipped = 0;
626
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000627 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000628 if (!result)
629 return NULL;
630
631 /* We are going to modify result in-place.
632 If nfd_nfkd is changed to sometimes return the input,
633 this code needs to be reviewed. */
634 assert(result != input);
635
636 i = PyUnicode_AS_UNICODE(result);
637 end = i + PyUnicode_GET_SIZE(result);
638 o = PyUnicode_AS_UNICODE(result);
639
640 again:
641 while (i < end) {
642 for (index = 0; index < cskipped; index++) {
643 if (skipped[index] == i) {
644 /* *i character is skipped.
645 Remove from list. */
646 skipped[index] = skipped[cskipped-1];
647 cskipped--;
648 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000649 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000650 }
651 }
652 /* Hangul Composition. We don't need to check for <LV,T>
653 pairs, since we always have decomposed data. */
654 if (LBase <= *i && *i < (LBase+LCount) &&
655 i + 1 < end &&
656 VBase <= i[1] && i[1] <= (VBase+VCount)) {
657 int LIndex, VIndex;
658 LIndex = i[0] - LBase;
659 VIndex = i[1] - VBase;
660 code = SBase + (LIndex*VCount+VIndex)*TCount;
661 i+=2;
662 if (i < end &&
663 TBase <= *i && *i <= (TBase+TCount)) {
664 code += *i-TBase;
665 i++;
666 }
667 *o++ = code;
668 continue;
669 }
670
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000671 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000672 if (f == -1) {
673 *o++ = *i++;
674 continue;
675 }
676 /* Find next unblocked character. */
677 i1 = i+1;
678 comb = 0;
679 while (i1 < end) {
680 int comb1 = _getrecord_ex(*i1)->combining;
681 if (comb1 && comb == comb1) {
682 /* Character is blocked. */
683 i1++;
684 continue;
685 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000686 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000687 /* *i1 cannot be combined with *i. If *i1
688 is a starter, we don't need to look further.
689 Otherwise, record the combining class. */
690 if (l == -1) {
691 not_combinable:
692 if (comb1 == 0)
693 break;
694 comb = comb1;
695 i1++;
696 continue;
697 }
698 index = f*TOTAL_LAST + l;
699 index1 = comp_index[index >> COMP_SHIFT];
700 code = comp_data[(index1<<COMP_SHIFT)+
701 (index&((1<<COMP_SHIFT)-1))];
702 if (code == 0)
703 goto not_combinable;
704
705 /* Replace the original character. */
706 *i = code;
707 /* Mark the second character unused. */
708 skipped[cskipped++] = i1;
709 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000710 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000711 if (f == -1)
712 break;
713 }
714 *o++ = *i++;
715 }
716 if (o != end)
717 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
718 return result;
719}
720
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000721PyDoc_STRVAR(unicodedata_normalize__doc__,
722"normalize(form, unistr)\n\
723\n\
724Return the normal form 'form' for the Unicode string unistr. Valid\n\
725values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
726
Martin v. Löwis677bde22002-11-23 22:08:15 +0000727static PyObject*
728unicodedata_normalize(PyObject *self, PyObject *args)
729{
730 char *form;
731 PyObject *input;
732
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000733 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000734 &form, &PyUnicode_Type, &input))
735 return NULL;
736
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000737 if (PyUnicode_GetSize(input) == 0) {
738 /* Special case empty input strings, since resizing
739 them later would cause internal errors. */
740 Py_INCREF(input);
741 return input;
742 }
743
Martin v. Löwis677bde22002-11-23 22:08:15 +0000744 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000745 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000746 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000747 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000748 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000749 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000750 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000751 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000752 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
753 return NULL;
754}
755
Fredrik Lundh06d12682001-01-24 07:59:11 +0000756/* -------------------------------------------------------------------- */
757/* unicode character name tables */
758
759/* data file generated by Tools/unicode/makeunicodedata.py */
760#include "unicodename_db.h"
761
762/* -------------------------------------------------------------------- */
763/* database code (cut and pasted from the unidb package) */
764
/* Case-insensitive hash of the first len characters of s.

   Each character is upper-cased and mixed in as h = h*scale + ch.
   Whenever bits 24..31 of the hash become set, they are XORed into the
   low byte and the hash is masked back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
779
/* Name fragments for algorithmic Hangul syllable names.

   The name code indexes column 0 with L, column 1 with V and column 2
   with T.  Columns 0 and 1 have fewer entries than the table has rows;
   their unused cells are NULL. */
static char *hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
810
Fredrik Lundh06d12682001-01-24 07:59:11 +0000811static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000812is_unified_ideograph(Py_UCS4 code)
813{
814 return (
815 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwisc3509122006-03-11 12:16:23 +0000816 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000817 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
818}
819
820static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000821_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000822{
823 int offset;
824 int i;
825 int word;
826 unsigned char* w;
827
Martin v. Löwisc3509122006-03-11 12:16:23 +0000828 if (code >= 0x110000)
829 return 0;
830
831 if (self) {
832 const change_record *old = get_old_record(self, code);
833 if (old->category_changed == 0) {
834 /* unassigned */
835 return 0;
836 }
837 }
838
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000839 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000840 /* Hangul syllable. */
841 int SIndex = code - SBase;
842 int L = SIndex / NCount;
843 int V = (SIndex % NCount) / TCount;
844 int T = SIndex % TCount;
845
846 if (buflen < 27)
847 /* Worst case: HANGUL SYLLABLE <10chars>. */
848 return 0;
849 strcpy(buffer, "HANGUL SYLLABLE ");
850 buffer += 16;
851 strcpy(buffer, hangul_syllables[L][0]);
852 buffer += strlen(hangul_syllables[L][0]);
853 strcpy(buffer, hangul_syllables[V][1]);
854 buffer += strlen(hangul_syllables[V][1]);
855 strcpy(buffer, hangul_syllables[T][2]);
856 buffer += strlen(hangul_syllables[T][2]);
857 *buffer = '\0';
858 return 1;
859 }
860
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000861 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000862 if (buflen < 28)
863 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
864 return 0;
865 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
866 return 1;
867 }
868
Fredrik Lundh06d12682001-01-24 07:59:11 +0000869 /* get offset into phrasebook */
870 offset = phrasebook_offset1[(code>>phrasebook_shift)];
871 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
872 (code&((1<<phrasebook_shift)-1))];
873 if (!offset)
874 return 0;
875
876 i = 0;
877
878 for (;;) {
879 /* get word index */
880 word = phrasebook[offset] - phrasebook_short;
881 if (word >= 0) {
882 word = (word << 8) + phrasebook[offset+1];
883 offset += 2;
884 } else
885 word = phrasebook[offset++];
886 if (i) {
887 if (i > buflen)
888 return 0; /* buffer overflow */
889 buffer[i++] = ' ';
890 }
891 /* copy word string from lexicon. the last character in the
892 word has bit 7 set. the last word in a string ends with
893 0x80 */
894 w = lexicon + lexicon_offset[word];
895 while (*w < 128) {
896 if (i >= buflen)
897 return 0; /* buffer overflow */
898 buffer[i++] = *w++;
899 }
900 if (i >= buflen)
901 return 0; /* buffer overflow */
902 buffer[i++] = *w & 127;
903 if (*w == 128)
904 break; /* end of word */
905 }
906
907 return 1;
908}
909
910static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000911_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000912{
913 /* check if code corresponds to the given name */
914 int i;
915 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000916 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000917 return 0;
918 for (i = 0; i < namelen; i++) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000919 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000920 return 0;
921 }
922 return buffer[namelen] == '\0';
923}
924
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000925static void
926find_syllable(const char *str, int *len, int *pos, int count, int column)
927{
928 int i, len1;
929 *len = -1;
930 for (i = 0; i < count; i++) {
931 char *s = hangul_syllables[i][column];
932 len1 = strlen(s);
933 if (len1 <= *len)
934 continue;
935 if (strncmp(str, s, len1) == 0) {
936 *len = len1;
937 *pos = i;
938 }
939 }
940 if (*len == -1) {
941 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000942 }
943}
944
Fredrik Lundh06d12682001-01-24 07:59:11 +0000945static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000946_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000947{
948 unsigned int h, v;
949 unsigned int mask = code_size-1;
950 unsigned int i, incr;
951
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000952 /* Check for hangul syllables. */
953 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Thomas Wouters1e365b22006-03-01 21:58:30 +0000954 int len, L = -1, V = -1, T = -1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000955 const char *pos = name + 16;
956 find_syllable(pos, &len, &L, LCount, 0);
957 pos += len;
958 find_syllable(pos, &len, &V, VCount, 1);
959 pos += len;
960 find_syllable(pos, &len, &T, TCount, 2);
961 pos += len;
Martin v. Löwis8b291e22005-09-18 08:17:56 +0000962 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000963 *code = SBase + (L*VCount+V)*TCount + T;
964 return 1;
965 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000966 /* Otherwise, it's an illegal syllable name. */
967 return 0;
968 }
969
970 /* Check for unified ideographs. */
971 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
972 /* Four or five hexdigits must follow. */
973 v = 0;
974 name += 22;
975 namelen -= 22;
976 if (namelen != 4 && namelen != 5)
977 return 0;
978 while (namelen--) {
979 v *= 16;
980 if (*name >= '0' && *name <= '9')
981 v += *name - '0';
982 else if (*name >= 'A' && *name <= 'F')
983 v += *name - 'A' + 10;
984 else
985 return 0;
986 name++;
987 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000988 if (!is_unified_ideograph(v))
989 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000990 *code = v;
991 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000992 }
993
Fredrik Lundh06d12682001-01-24 07:59:11 +0000994 /* the following is the same as python's dictionary lookup, with
995 only minor changes. see the makeunicodedata script for more
996 details */
997
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000998 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000999 i = (~h) & mask;
1000 v = code_hash[i];
1001 if (!v)
1002 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001003 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001004 *code = v;
1005 return 1;
1006 }
1007 incr = (h ^ (h >> 3)) & mask;
1008 if (!incr)
1009 incr = mask;
1010 for (;;) {
1011 i = (i + incr) & mask;
1012 v = code_hash[i];
1013 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001014 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001015 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001016 *code = v;
1017 return 1;
1018 }
1019 incr = incr << 1;
1020 if (incr > mask)
1021 incr = incr ^ code_poly;
1022 }
1023}
1024
1025static const _PyUnicode_Name_CAPI hashAPI =
1026{
1027 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001028 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001029 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001030};
1031
1032/* -------------------------------------------------------------------- */
1033/* Python bindings */
1034
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001035PyDoc_STRVAR(unicodedata_name__doc__,
1036"name(unichr[, default])\n\
1037Returns the name assigned to the Unicode character unichr as a\n\
1038string. If no name is defined, default is returned, or, if not\n\
1039given, ValueError is raised.");
1040
Fredrik Lundh06d12682001-01-24 07:59:11 +00001041static PyObject *
1042unicodedata_name(PyObject* self, PyObject* args)
1043{
1044 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001045 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001046
1047 PyUnicodeObject* v;
1048 PyObject* defobj = NULL;
1049 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1050 return NULL;
1051
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001052 c = getuchar(v);
1053 if (c == (Py_UCS4)-1)
1054 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001055
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001056 if (!_getucname(self, c, name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001057 if (defobj == NULL) {
1058 PyErr_SetString(PyExc_ValueError, "no such name");
1059 return NULL;
1060 }
1061 else {
1062 Py_INCREF(defobj);
1063 return defobj;
1064 }
1065 }
1066
Walter Dörwald4254e762007-06-05 16:04:09 +00001067 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001068}
1069
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001070PyDoc_STRVAR(unicodedata_lookup__doc__,
1071"lookup(name)\n\
1072\n\
1073Look up character by name. If a character with the\n\
1074given name is found, return the corresponding Unicode\n\
1075character. If not found, KeyError is raised.");
1076
Fredrik Lundh06d12682001-01-24 07:59:11 +00001077static PyObject *
1078unicodedata_lookup(PyObject* self, PyObject* args)
1079{
1080 Py_UCS4 code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001081 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001082
1083 char* name;
1084 int namelen;
1085 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1086 return NULL;
1087
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001088 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001089 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1090 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001091 return NULL;
1092 }
1093
Guido van Rossum806c2462007-08-06 23:33:07 +00001094#ifndef Py_UNICODE_WIDE
1095 if (code >= 0x10000) {
1096 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1097 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1098 return PyUnicode_FromUnicode(str, 2);
1099 }
1100#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001101 str[0] = (Py_UNICODE) code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001102 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001103}
1104
/* Method table; it is used for the module functions and as the
   tp_methods of the UCD type. */
1106
1107static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001108 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1109 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1110 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1111 {"category", unicodedata_category, METH_VARARGS,
1112 unicodedata_category__doc__},
1113 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1114 unicodedata_bidirectional__doc__},
1115 {"combining", unicodedata_combining, METH_VARARGS,
1116 unicodedata_combining__doc__},
1117 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1118 unicodedata_mirrored__doc__},
1119 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1120 unicodedata_east_asian_width__doc__},
1121 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1122 unicodedata_decomposition__doc__},
1123 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1124 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1125 {"normalize", unicodedata_normalize, METH_VARARGS,
1126 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001127 {NULL, NULL} /* sentinel */
1128};
1129
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001130static PyTypeObject UCD_Type = {
1131 /* The ob_type field must be initialized in the module init function
1132 * to be portable to Windows without using C++. */
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001133 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001134 "unicodedata.UCD", /*tp_name*/
1135 sizeof(PreviousDBVersion), /*tp_basicsize*/
1136 0, /*tp_itemsize*/
1137 /* methods */
1138 (destructor)PyObject_Del, /*tp_dealloc*/
1139 0, /*tp_print*/
1140 0, /*tp_getattr*/
1141 0, /*tp_setattr*/
1142 0, /*tp_compare*/
1143 0, /*tp_repr*/
1144 0, /*tp_as_number*/
1145 0, /*tp_as_sequence*/
1146 0, /*tp_as_mapping*/
1147 0, /*tp_hash*/
1148 0, /*tp_call*/
1149 0, /*tp_str*/
1150 PyObject_GenericGetAttr,/*tp_getattro*/
1151 0, /*tp_setattro*/
1152 0, /*tp_as_buffer*/
1153 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1154 0, /*tp_doc*/
1155 0, /*tp_traverse*/
1156 0, /*tp_clear*/
1157 0, /*tp_richcompare*/
1158 0, /*tp_weaklistoffset*/
1159 0, /*tp_iter*/
1160 0, /*tp_iternext*/
1161 unicodedata_functions, /*tp_methods*/
1162 DB_members, /*tp_members*/
1163 0, /*tp_getset*/
1164 0, /*tp_base*/
1165 0, /*tp_dict*/
1166 0, /*tp_descr_get*/
1167 0, /*tp_descr_set*/
1168 0, /*tp_dictoffset*/
1169 0, /*tp_init*/
1170 0, /*tp_alloc*/
1171 0, /*tp_new*/
1172 0, /*tp_free*/
1173 0, /*tp_is_gc*/
1174};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001175
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001176PyDoc_STRVAR(unicodedata_docstring,
1177"This module provides access to the Unicode Character Database which\n\
1178defines character properties for all Unicode characters. The data in\n\
1179this database is based on the UnicodeData.txt file version\n\
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011804.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001181\n\
1182The module uses the same names and symbols as defined by the\n\
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001183UnicodeData File Format 4.1.0 (see\n\
1184http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001185
Mark Hammond62b1ab12002-07-23 06:31:15 +00001186PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001187initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001188{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001189 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001190
Christian Heimes90aa7642007-12-19 02:45:37 +00001191 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001192
Fred Drakef585bef2001-03-03 19:41:55 +00001193 m = Py_InitModule3(
1194 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001195 if (!m)
1196 return;
1197
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001198 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001199 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001200 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001201
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001202 /* Previous versions */
1203 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1204 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001205 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001206
Fredrik Lundh06d12682001-01-24 07:59:11 +00001207 /* Export C API */
1208 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001209 if (v != NULL)
1210 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001211}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001212
1213/*
1214Local variables:
1215c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001216indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001217End:
1218*/