blob: 1e4908fd9f07c749553610e24f54cdbb5640a57a [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 data base.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* Property record for one class of characters.  Instances live in the
   generated _PyUnicode_Database_Records table and are located through
   _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Per-code-point differences between the current database and an older
   Unicode version.  The accessors in this file treat 0xFF in the
   unsigned char fields as "same as current database", and
   category_changed == 0 as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* old bidirectional index */
    const unsigned char category_changed;   /* old category index */
    const unsigned char decimal_changed;    /* old decimal value */
    const int numeric_changed;
} change_record;
39
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000040/* data file generated by Tools/unicode/makeunicodedata.py */
41#include "unicodedata_db.h"
42
43static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000044_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000045{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000047 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 index = 0;
49 else {
50 index = index1[(code>>SHIFT)];
51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
52 }
53
54 return &_PyUnicode_Database_Records[index];
55}
56
Martin v. Löwis677bde22002-11-23 22:08:15 +000057static const _PyUnicode_DatabaseRecord*
58_getrecord(PyUnicodeObject* v)
59{
60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
61}
62
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000063/* ------------- Previous-version API ------------------------------------- */
64typedef struct previous_version {
65 PyObject_HEAD
66 const char *name;
67 const change_record* (*getrecord)(Py_UCS4);
68 Py_UCS4 (*normalization)(Py_UCS4);
69} PreviousDBVersion;
70
71#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
72
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000073static PyMemberDef DB_members[] = {
74 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
75 {NULL}
76};
77
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000078// forward declaration
79static PyTypeObject UCD_Type;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080
81static PyObject*
82new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
83 Py_UCS4 (*normalization)(Py_UCS4))
84{
85 PreviousDBVersion *self;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000086 self = PyObject_New(PreviousDBVersion, &UCD_Type);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000087 if (self == NULL)
88 return NULL;
89 self->name = name;
90 self->getrecord = getrecord;
91 self->normalization = normalization;
92 return (PyObject*)self;
93}
94
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000095/* --- Module API --------------------------------------------------------- */
96
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +000097PyDoc_STRVAR(unicodedata_decimal__doc__,
98"decimal(unichr[, default])\n\
99\n\
100Returns the decimal value assigned to the Unicode character unichr\n\
101as integer. If no such value is defined, default is returned, or, if\n\
102not given, ValueError is raised.");
103
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000104static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000105unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000106{
107 PyUnicodeObject *v;
108 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000109 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000110 long rc;
111
Fredrik Lundh06d12682001-01-24 07:59:11 +0000112 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000113 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000114 if (PyUnicode_GET_SIZE(v) != 1) {
115 PyErr_SetString(PyExc_TypeError,
116 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000119
120 if (self) {
121 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
122 if (old->category_changed == 0) {
123 /* unassigned */
124 have_old = 1;
125 rc = -1;
126 }
127 else if (old->decimal_changed != 0xFF) {
128 have_old = 1;
129 rc = old->decimal_changed;
130 }
131 }
132
133 if (!have_old)
134 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000135 if (rc < 0) {
136 if (defobj == NULL) {
137 PyErr_SetString(PyExc_ValueError,
138 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000139 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000140 }
141 else {
142 Py_INCREF(defobj);
143 return defobj;
144 }
145 }
146 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147}
148
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000149PyDoc_STRVAR(unicodedata_digit__doc__,
150"digit(unichr[, default])\n\
151\n\
152Returns the digit value assigned to the Unicode character unichr as\n\
153integer. If no such value is defined, default is returned, or, if\n\
154not given, ValueError is raised.");
155
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000157unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158{
159 PyUnicodeObject *v;
160 PyObject *defobj = NULL;
161 long rc;
162
Fredrik Lundh06d12682001-01-24 07:59:11 +0000163 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000164 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000165 if (PyUnicode_GET_SIZE(v) != 1) {
166 PyErr_SetString(PyExc_TypeError,
167 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169 }
170 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
171 if (rc < 0) {
172 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000173 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000174 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175 }
176 else {
177 Py_INCREF(defobj);
178 return defobj;
179 }
180 }
181 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182}
183
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000184PyDoc_STRVAR(unicodedata_numeric__doc__,
185"numeric(unichr[, default])\n\
186\n\
187Returns the numeric value assigned to the Unicode character unichr\n\
188as float. If no such value is defined, default is returned, or, if\n\
189not given, ValueError is raised.");
190
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000192unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193{
194 PyUnicodeObject *v;
195 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000196 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000197 double rc;
198
Fredrik Lundh06d12682001-01-24 07:59:11 +0000199 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000200 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 if (PyUnicode_GET_SIZE(v) != 1) {
202 PyErr_SetString(PyExc_TypeError,
203 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000206
207 if (self) {
208 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
209 if (old->category_changed == 0) {
210 /* unassigned */
211 have_old = 1;
212 rc = -1;
213 }
214 else if (old->decimal_changed != 0xFF) {
215 have_old = 1;
216 rc = old->decimal_changed;
217 }
218 }
219
220 if (!have_old)
221 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000222 if (rc < 0) {
223 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000224 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000225 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000226 }
227 else {
228 Py_INCREF(defobj);
229 return defobj;
230 }
231 }
232 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000233}
234
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000235PyDoc_STRVAR(unicodedata_category__doc__,
236"category(unichr)\n\
237\n\
238Returns the general category assigned to the Unicode character\n\
239unichr as string.");
240
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000242unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243{
244 PyUnicodeObject *v;
245 int index;
246
247 if (!PyArg_ParseTuple(args, "O!:category",
248 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000249 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250 if (PyUnicode_GET_SIZE(v) != 1) {
251 PyErr_SetString(PyExc_TypeError,
252 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000253 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000255 index = (int) _getrecord(v)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000256 if (self) {
257 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
258 if (old->category_changed != 0xFF)
259 index = old->category_changed;
260 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000261 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000262}
263
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000264PyDoc_STRVAR(unicodedata_bidirectional__doc__,
265"bidirectional(unichr)\n\
266\n\
267Returns the bidirectional category assigned to the Unicode character\n\
268unichr as string. If no such value is defined, an empty string is\n\
269returned.");
270
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000272unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000273{
274 PyUnicodeObject *v;
275 int index;
276
277 if (!PyArg_ParseTuple(args, "O!:bidirectional",
278 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000279 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280 if (PyUnicode_GET_SIZE(v) != 1) {
281 PyErr_SetString(PyExc_TypeError,
282 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000284 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000285 index = (int) _getrecord(v)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000286 if (self) {
287 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
288 if (old->category_changed == 0)
289 index = 0; /* unassigned */
290 else if (old->bidir_changed != 0xFF)
291 index = old->bidir_changed;
292 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000293 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294}
295
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000296PyDoc_STRVAR(unicodedata_combining__doc__,
297"combining(unichr)\n\
298\n\
299Returns the canonical combining class assigned to the Unicode\n\
300character unichr as integer. Returns 0 if no combining class is\n\
301defined.");
302
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000304unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000305{
306 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000307 int index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000308
309 if (!PyArg_ParseTuple(args, "O!:combining",
310 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000311 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312 if (PyUnicode_GET_SIZE(v) != 1) {
313 PyErr_SetString(PyExc_TypeError,
314 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000315 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000317 index = (int) _getrecord(v)->combining;
318 if (self) {
319 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
320 if (old->category_changed == 0)
321 index = 0; /* unassigned */
322 }
323 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000324}
325
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000326PyDoc_STRVAR(unicodedata_mirrored__doc__,
327"mirrored(unichr)\n\
328\n\
329Returns the mirrored property assigned to the Unicode character\n\
330unichr as integer. Returns 1 if the character has been identified as\n\
331a \"mirrored\" character in bidirectional text, 0 otherwise.");
332
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000333static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000334unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000335{
336 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000337 int index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000338
339 if (!PyArg_ParseTuple(args, "O!:mirrored",
340 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342 if (PyUnicode_GET_SIZE(v) != 1) {
343 PyErr_SetString(PyExc_TypeError,
344 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000345 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000347 index = (int) _getrecord(v)->mirrored;
348 if (self) {
349 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
350 if (old->category_changed == 0)
351 index = 0; /* unassigned */
352 }
353 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000354}
355
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000356PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
357"east_asian_width(unichr)\n\
358\n\
359Returns the east asian width assigned to the Unicode character\n\
360unichr as string.");
361
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000362static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000363unicodedata_east_asian_width(PyObject *self, PyObject *args)
364{
365 PyUnicodeObject *v;
366 int index;
367
368 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
369 &PyUnicode_Type, &v))
370 return NULL;
371 if (PyUnicode_GET_SIZE(v) != 1) {
372 PyErr_SetString(PyExc_TypeError,
373 "need a single Unicode character as parameter");
374 return NULL;
375 }
376 index = (int) _getrecord(v)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377 if (self) {
378 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
379 if (old->category_changed == 0)
380 index = 0; /* unassigned */
381 }
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000382 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
383}
384
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000385PyDoc_STRVAR(unicodedata_decomposition__doc__,
386"decomposition(unichr)\n\
387\n\
388Returns the character decomposition mapping assigned to the Unicode\n\
389character unichr as string. An empty string is returned in case no\n\
390such mapping is defined.");
391
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000392static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000393unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000394{
395 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000396 char decomp[256];
397 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000398
399 if (!PyArg_ParseTuple(args, "O!:decomposition",
400 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000401 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000402 if (PyUnicode_GET_SIZE(v) != 1) {
403 PyErr_SetString(PyExc_TypeError,
404 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000405 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000406 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000407
408 code = (int) *PyUnicode_AS_UNICODE(v);
409
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000410 if (self) {
411 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
412 if (old->category_changed == 0)
413 return PyString_FromString(""); /* unassigned */
414 }
415
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000416 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417 index = 0;
418 else {
419 index = decomp_index1[(code>>DECOMP_SHIFT)];
420 index = decomp_index2[(index<<DECOMP_SHIFT)+
421 (code&((1<<DECOMP_SHIFT)-1))];
422 }
423
Tim Peters69b83b12001-11-30 07:23:05 +0000424 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000425 is prefix code (from*/
426 count = decomp_data[index] >> 8;
427
428 /* XXX: could allocate the PyString up front instead
429 (strlen(prefix) + 5 * count + 1 bytes) */
430
431 /* copy prefix */
432 i = strlen(decomp_prefix[decomp_data[index] & 255]);
433 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
434
435 while (count-- > 0) {
436 if (i)
437 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000438 assert((size_t)i < sizeof(decomp));
439 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
440 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000441 i += strlen(decomp + i);
442 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000443
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444 decomp[i] = '\0';
445
446 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000447}
448
Martin v. Löwis677bde22002-11-23 22:08:15 +0000449void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000450get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000451{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000452 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000453 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000454 } else if (self && get_old_record(self, code)->category_changed==0) {
455 /* unassigned in old version */
456 *index = 0;
457 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000458 else {
459 *index = decomp_index1[(code>>DECOMP_SHIFT)];
460 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
461 (code&((1<<DECOMP_SHIFT)-1))];
462 }
463
464 /* high byte is number of hex bytes (usually one or two), low byte
465 is prefix code (from*/
466 *count = decomp_data[*index] >> 8;
467 *prefix = decomp_data[*index] & 255;
468
469 (*index)++;
470}
471
/* Constants of the arithmetic Hangul syllable mapping used by the
   normalization and character-name code in this file.  A syllable
   SBase + SIndex splits into L = LBase + SIndex / NCount,
   V = VBase + (SIndex % NCount) / TCount and T = TBase + SIndex % TCount. */
#define SBase   0xAC00              /* first syllable code point */
#define LBase   0x1100              /* base of the L part */
#define VBase   0x1161              /* base of the V part */
#define TBase   0x11A7              /* base of the T part; T == TBase means none */
#define LCount  19                  /* number of L values */
#define VCount  21                  /* number of V values */
#define TCount  28                  /* number of T values, including "none" */
#define NCount  (VCount*TCount)     /* syllables per L value */
#define SCount  (LCount*NCount)     /* total number of syllables */
481
482static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000483nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000484{
485 PyObject *result;
486 Py_UNICODE *i, *end, *o;
487 /* Longest decomposition in Unicode 3.2: U+FDFA */
488 Py_UNICODE stack[20];
Martin v. Löwis3c6e4182006-04-13 06:36:31 +0000489 Py_ssize_t space, isize;
490 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000491 unsigned char prev, cur;
492
493 stackptr = 0;
494 isize = PyUnicode_GET_SIZE(input);
495 /* Overallocate atmost 10 characters. */
496 space = (isize > 10 ? 10 : isize) + isize;
497 result = PyUnicode_FromUnicode(NULL, space);
498 if (!result)
499 return NULL;
500 i = PyUnicode_AS_UNICODE(input);
501 end = i + isize;
502 o = PyUnicode_AS_UNICODE(result);
503
504 while (i < end) {
505 stack[stackptr++] = *i++;
506 while(stackptr) {
507 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000508 /* Hangul Decomposition adds three characters in
509 a single step, so we need atleast that much room. */
510 if (space < 3) {
Martin v. Löwis3c6e4182006-04-13 06:36:31 +0000511 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000512 space += 10;
513 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000514 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000515 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000516 }
517 /* Hangul Decomposition. */
518 if (SBase <= code && code < (SBase+SCount)) {
519 int SIndex = code - SBase;
520 int L = LBase + SIndex / NCount;
521 int V = VBase + (SIndex % NCount) / TCount;
522 int T = TBase + SIndex % TCount;
523 *o++ = L;
524 *o++ = V;
525 space -= 2;
526 if (T != TBase) {
527 *o++ = T;
528 space --;
529 }
530 continue;
531 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000532 /* normalization changes */
533 if (self) {
534 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
535 if (value != 0) {
536 stack[stackptr++] = value;
537 continue;
538 }
539 }
540
541 /* Other decompositions. */
542 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000543
544 /* Copy character if it is not decomposable, or has a
545 compatibility decomposition, but we do NFD. */
546 if (!count || (prefix && !k)) {
547 *o++ = code;
548 space--;
549 continue;
550 }
551 /* Copy decomposition onto the stack, in reverse
552 order. */
553 while(count) {
554 code = decomp_data[index + (--count)];
555 stack[stackptr++] = code;
556 }
557 }
558 }
559
560 /* Drop overallocation. Cannot fail. */
561 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
562
563 /* Sort canonically. */
564 i = PyUnicode_AS_UNICODE(result);
565 prev = _getrecord_ex(*i)->combining;
566 end = i + PyUnicode_GET_SIZE(result);
567 for (i++; i < end; i++) {
568 cur = _getrecord_ex(*i)->combining;
569 if (prev == 0 || cur == 0 || prev <= cur) {
570 prev = cur;
571 continue;
572 }
573 /* Non-canonical order. Need to switch *i with previous. */
574 o = i - 1;
575 while (1) {
576 Py_UNICODE tmp = o[1];
577 o[1] = o[0];
578 o[0] = tmp;
579 o--;
580 if (o < PyUnicode_AS_UNICODE(result))
581 break;
582 prev = _getrecord_ex(*o)->combining;
583 if (prev == 0 || prev <= cur)
584 break;
585 }
586 prev = _getrecord_ex(*i)->combining;
587 }
588 return result;
589}
590
591static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000592find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000593{
594 int index;
595 for (index = 0; nfc[index].start; index++) {
596 int start = nfc[index].start;
597 if (code < start)
598 return -1;
599 if (code <= start + nfc[index].count) {
600 int delta = code - start;
601 return nfc[index].index + delta;
602 }
603 }
604 return -1;
605}
606
607static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000608nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000609{
610 PyObject *result;
611 Py_UNICODE *i, *i1, *o, *end;
612 int f,l,index,index1,comb;
613 Py_UNICODE code;
614 Py_UNICODE *skipped[20];
615 int cskipped = 0;
616
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000617 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000618 if (!result)
619 return NULL;
620
621 /* We are going to modify result in-place.
622 If nfd_nfkd is changed to sometimes return the input,
623 this code needs to be reviewed. */
624 assert(result != input);
625
626 i = PyUnicode_AS_UNICODE(result);
627 end = i + PyUnicode_GET_SIZE(result);
628 o = PyUnicode_AS_UNICODE(result);
629
630 again:
631 while (i < end) {
632 for (index = 0; index < cskipped; index++) {
633 if (skipped[index] == i) {
634 /* *i character is skipped.
635 Remove from list. */
636 skipped[index] = skipped[cskipped-1];
637 cskipped--;
638 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000639 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 }
641 }
642 /* Hangul Composition. We don't need to check for <LV,T>
643 pairs, since we always have decomposed data. */
644 if (LBase <= *i && *i < (LBase+LCount) &&
645 i + 1 < end &&
646 VBase <= i[1] && i[1] <= (VBase+VCount)) {
647 int LIndex, VIndex;
648 LIndex = i[0] - LBase;
649 VIndex = i[1] - VBase;
650 code = SBase + (LIndex*VCount+VIndex)*TCount;
651 i+=2;
652 if (i < end &&
653 TBase <= *i && *i <= (TBase+TCount)) {
654 code += *i-TBase;
655 i++;
656 }
657 *o++ = code;
658 continue;
659 }
660
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000661 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000662 if (f == -1) {
663 *o++ = *i++;
664 continue;
665 }
666 /* Find next unblocked character. */
667 i1 = i+1;
668 comb = 0;
669 while (i1 < end) {
670 int comb1 = _getrecord_ex(*i1)->combining;
671 if (comb1 && comb == comb1) {
672 /* Character is blocked. */
673 i1++;
674 continue;
675 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000676 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000677 /* *i1 cannot be combined with *i. If *i1
678 is a starter, we don't need to look further.
679 Otherwise, record the combining class. */
680 if (l == -1) {
681 not_combinable:
682 if (comb1 == 0)
683 break;
684 comb = comb1;
685 i1++;
686 continue;
687 }
688 index = f*TOTAL_LAST + l;
689 index1 = comp_index[index >> COMP_SHIFT];
690 code = comp_data[(index1<<COMP_SHIFT)+
691 (index&((1<<COMP_SHIFT)-1))];
692 if (code == 0)
693 goto not_combinable;
694
695 /* Replace the original character. */
696 *i = code;
697 /* Mark the second character unused. */
698 skipped[cskipped++] = i1;
699 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000700 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000701 if (f == -1)
702 break;
703 }
704 *o++ = *i++;
705 }
706 if (o != end)
707 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
708 return result;
709}
710
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000711PyDoc_STRVAR(unicodedata_normalize__doc__,
712"normalize(form, unistr)\n\
713\n\
714Return the normal form 'form' for the Unicode string unistr. Valid\n\
715values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
716
Martin v. Löwis677bde22002-11-23 22:08:15 +0000717static PyObject*
718unicodedata_normalize(PyObject *self, PyObject *args)
719{
720 char *form;
721 PyObject *input;
722
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000723 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724 &form, &PyUnicode_Type, &input))
725 return NULL;
726
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000727 if (PyUnicode_GetSize(input) == 0) {
728 /* Special case empty input strings, since resizing
729 them later would cause internal errors. */
730 Py_INCREF(input);
731 return input;
732 }
733
Martin v. Löwis677bde22002-11-23 22:08:15 +0000734 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000735 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000736 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000737 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000738 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000739 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000740 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000741 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
743 return NULL;
744}
745
Fredrik Lundh06d12682001-01-24 07:59:11 +0000746/* -------------------------------------------------------------------- */
747/* unicode character name tables */
748
749/* data file generated by Tools/unicode/makeunicodedata.py */
750#include "unicodename_db.h"
751
752/* -------------------------------------------------------------------- */
753/* database code (cut and pasted from the unidb package) */
754
/* Hash the first `len` characters of `s`, ignoring case (each character
   is upper-cased first).  `scale` is the multiplier applied per
   character.  The returned value always fits in 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long overflow;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        /* fold anything above bit 23 back into the low byte */
        overflow = hash & 0xff000000;
        if (overflow != 0)
            hash = (hash ^ ((overflow >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
769
/* Name fragments used to build "HANGUL SYLLABLE ..." character names.
   Row r holds, per column, the fragment for index r of that part:
   column 0 for the L part, column 1 for the V part, column 2 for the
   T part.  Columns shorter than the table are padded with NULL. */
static char *hangul_syllables[][3] = {
    /*  L      V      T   */
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
800
Fredrik Lundh06d12682001-01-24 07:59:11 +0000801static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000802is_unified_ideograph(Py_UCS4 code)
803{
804 return (
805 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwisc3509122006-03-11 12:16:23 +0000806 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000807 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
808}
809
810static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000811_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000812{
813 int offset;
814 int i;
815 int word;
816 unsigned char* w;
817
Martin v. Löwisc3509122006-03-11 12:16:23 +0000818 if (code >= 0x110000)
819 return 0;
820
821 if (self) {
822 const change_record *old = get_old_record(self, code);
823 if (old->category_changed == 0) {
824 /* unassigned */
825 return 0;
826 }
827 }
828
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000829 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000830 /* Hangul syllable. */
831 int SIndex = code - SBase;
832 int L = SIndex / NCount;
833 int V = (SIndex % NCount) / TCount;
834 int T = SIndex % TCount;
835
836 if (buflen < 27)
837 /* Worst case: HANGUL SYLLABLE <10chars>. */
838 return 0;
839 strcpy(buffer, "HANGUL SYLLABLE ");
840 buffer += 16;
841 strcpy(buffer, hangul_syllables[L][0]);
842 buffer += strlen(hangul_syllables[L][0]);
843 strcpy(buffer, hangul_syllables[V][1]);
844 buffer += strlen(hangul_syllables[V][1]);
845 strcpy(buffer, hangul_syllables[T][2]);
846 buffer += strlen(hangul_syllables[T][2]);
847 *buffer = '\0';
848 return 1;
849 }
850
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000851 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000852 if (buflen < 28)
853 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
854 return 0;
855 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
856 return 1;
857 }
858
Fredrik Lundh06d12682001-01-24 07:59:11 +0000859 /* get offset into phrasebook */
860 offset = phrasebook_offset1[(code>>phrasebook_shift)];
861 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
862 (code&((1<<phrasebook_shift)-1))];
863 if (!offset)
864 return 0;
865
866 i = 0;
867
868 for (;;) {
869 /* get word index */
870 word = phrasebook[offset] - phrasebook_short;
871 if (word >= 0) {
872 word = (word << 8) + phrasebook[offset+1];
873 offset += 2;
874 } else
875 word = phrasebook[offset++];
876 if (i) {
877 if (i > buflen)
878 return 0; /* buffer overflow */
879 buffer[i++] = ' ';
880 }
881 /* copy word string from lexicon. the last character in the
882 word has bit 7 set. the last word in a string ends with
883 0x80 */
884 w = lexicon + lexicon_offset[word];
885 while (*w < 128) {
886 if (i >= buflen)
887 return 0; /* buffer overflow */
888 buffer[i++] = *w++;
889 }
890 if (i >= buflen)
891 return 0; /* buffer overflow */
892 buffer[i++] = *w & 127;
893 if (*w == 128)
894 break; /* end of word */
895 }
896
897 return 1;
898}
899
900static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000901_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000902{
903 /* check if code corresponds to the given name */
904 int i;
905 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000906 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000907 return 0;
908 for (i = 0; i < namelen; i++) {
Neal Norwitz65c05b22006-04-10 02:17:47 +0000909 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000910 return 0;
911 }
912 return buffer[namelen] == '\0';
913}
914
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000915static void
916find_syllable(const char *str, int *len, int *pos, int count, int column)
917{
918 int i, len1;
919 *len = -1;
920 for (i = 0; i < count; i++) {
921 char *s = hangul_syllables[i][column];
922 len1 = strlen(s);
923 if (len1 <= *len)
924 continue;
925 if (strncmp(str, s, len1) == 0) {
926 *len = len1;
927 *pos = i;
928 }
929 }
930 if (*len == -1) {
931 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000932 }
933}
934
Fredrik Lundh06d12682001-01-24 07:59:11 +0000935static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000936_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000937{
938 unsigned int h, v;
939 unsigned int mask = code_size-1;
940 unsigned int i, incr;
941
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000942 /* Check for hangul syllables. */
943 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Thomas Wouters1e365b22006-03-01 21:58:30 +0000944 int len, L = -1, V = -1, T = -1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000945 const char *pos = name + 16;
946 find_syllable(pos, &len, &L, LCount, 0);
947 pos += len;
948 find_syllable(pos, &len, &V, VCount, 1);
949 pos += len;
950 find_syllable(pos, &len, &T, TCount, 2);
951 pos += len;
Martin v. Löwis8b291e22005-09-18 08:17:56 +0000952 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000953 *code = SBase + (L*VCount+V)*TCount + T;
954 return 1;
955 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000956 /* Otherwise, it's an illegal syllable name. */
957 return 0;
958 }
959
960 /* Check for unified ideographs. */
961 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
962 /* Four or five hexdigits must follow. */
963 v = 0;
964 name += 22;
965 namelen -= 22;
966 if (namelen != 4 && namelen != 5)
967 return 0;
968 while (namelen--) {
969 v *= 16;
970 if (*name >= '0' && *name <= '9')
971 v += *name - '0';
972 else if (*name >= 'A' && *name <= 'F')
973 v += *name - 'A' + 10;
974 else
975 return 0;
976 name++;
977 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000978 if (!is_unified_ideograph(v))
979 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000980 *code = v;
981 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000982 }
983
Fredrik Lundh06d12682001-01-24 07:59:11 +0000984 /* the following is the same as python's dictionary lookup, with
985 only minor changes. see the makeunicodedata script for more
986 details */
987
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000988 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000989 i = (~h) & mask;
990 v = code_hash[i];
991 if (!v)
992 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000993 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000994 *code = v;
995 return 1;
996 }
997 incr = (h ^ (h >> 3)) & mask;
998 if (!incr)
999 incr = mask;
1000 for (;;) {
1001 i = (i + incr) & mask;
1002 v = code_hash[i];
1003 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001004 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001005 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001006 *code = v;
1007 return 1;
1008 }
1009 incr = incr << 1;
1010 if (incr > mask)
1011 incr = incr ^ code_poly;
1012 }
1013}
1014
1015static const _PyUnicode_Name_CAPI hashAPI =
1016{
1017 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001018 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001019 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001020};
1021
1022/* -------------------------------------------------------------------- */
1023/* Python bindings */
1024
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001025PyDoc_STRVAR(unicodedata_name__doc__,
1026"name(unichr[, default])\n\
1027Returns the name assigned to the Unicode character unichr as a\n\
1028string. If no name is defined, default is returned, or, if not\n\
1029given, ValueError is raised.");
1030
Fredrik Lundh06d12682001-01-24 07:59:11 +00001031static PyObject *
1032unicodedata_name(PyObject* self, PyObject* args)
1033{
1034 char name[NAME_MAXLEN];
1035
1036 PyUnicodeObject* v;
1037 PyObject* defobj = NULL;
1038 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1039 return NULL;
1040
1041 if (PyUnicode_GET_SIZE(v) != 1) {
1042 PyErr_SetString(PyExc_TypeError,
1043 "need a single Unicode character as parameter");
1044 return NULL;
1045 }
1046
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001047 if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
1048 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001049 if (defobj == NULL) {
1050 PyErr_SetString(PyExc_ValueError, "no such name");
1051 return NULL;
1052 }
1053 else {
1054 Py_INCREF(defobj);
1055 return defobj;
1056 }
1057 }
1058
1059 return Py_BuildValue("s", name);
1060}
1061
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001062PyDoc_STRVAR(unicodedata_lookup__doc__,
1063"lookup(name)\n\
1064\n\
1065Look up character by name. If a character with the\n\
1066given name is found, return the corresponding Unicode\n\
1067character. If not found, KeyError is raised.");
1068
Fredrik Lundh06d12682001-01-24 07:59:11 +00001069static PyObject *
1070unicodedata_lookup(PyObject* self, PyObject* args)
1071{
1072 Py_UCS4 code;
1073 Py_UNICODE str[1];
1074
1075 char* name;
1076 int namelen;
1077 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1078 return NULL;
1079
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001080 if (!_getcode(self, name, namelen, &code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001081 char fmt[] = "undefined character name '%s'";
1082 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
1083 sprintf(buf, fmt, name);
1084 PyErr_SetString(PyExc_KeyError, buf);
1085 PyMem_FREE(buf);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001086 return NULL;
1087 }
1088
1089 str[0] = (Py_UNICODE) code;
1090 return PyUnicode_FromUnicode(str, 1);
1091}
1092
/* Method table used both for the module functions and for UCD objects;
   every entry below already has a doc string. */
1094
1095static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001096 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1097 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1098 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1099 {"category", unicodedata_category, METH_VARARGS,
1100 unicodedata_category__doc__},
1101 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1102 unicodedata_bidirectional__doc__},
1103 {"combining", unicodedata_combining, METH_VARARGS,
1104 unicodedata_combining__doc__},
1105 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1106 unicodedata_mirrored__doc__},
1107 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1108 unicodedata_east_asian_width__doc__},
1109 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1110 unicodedata_decomposition__doc__},
1111 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1112 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1113 {"normalize", unicodedata_normalize, METH_VARARGS,
1114 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001115 {NULL, NULL} /* sentinel */
1116};
1117
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001118static PyTypeObject UCD_Type = {
1119 /* The ob_type field must be initialized in the module init function
1120 * to be portable to Windows without using C++. */
1121 PyObject_HEAD_INIT(NULL)
1122 0, /*ob_size*/
1123 "unicodedata.UCD", /*tp_name*/
1124 sizeof(PreviousDBVersion), /*tp_basicsize*/
1125 0, /*tp_itemsize*/
1126 /* methods */
1127 (destructor)PyObject_Del, /*tp_dealloc*/
1128 0, /*tp_print*/
1129 0, /*tp_getattr*/
1130 0, /*tp_setattr*/
1131 0, /*tp_compare*/
1132 0, /*tp_repr*/
1133 0, /*tp_as_number*/
1134 0, /*tp_as_sequence*/
1135 0, /*tp_as_mapping*/
1136 0, /*tp_hash*/
1137 0, /*tp_call*/
1138 0, /*tp_str*/
1139 PyObject_GenericGetAttr,/*tp_getattro*/
1140 0, /*tp_setattro*/
1141 0, /*tp_as_buffer*/
1142 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1143 0, /*tp_doc*/
1144 0, /*tp_traverse*/
1145 0, /*tp_clear*/
1146 0, /*tp_richcompare*/
1147 0, /*tp_weaklistoffset*/
1148 0, /*tp_iter*/
1149 0, /*tp_iternext*/
1150 unicodedata_functions, /*tp_methods*/
1151 DB_members, /*tp_members*/
1152 0, /*tp_getset*/
1153 0, /*tp_base*/
1154 0, /*tp_dict*/
1155 0, /*tp_descr_get*/
1156 0, /*tp_descr_set*/
1157 0, /*tp_dictoffset*/
1158 0, /*tp_init*/
1159 0, /*tp_alloc*/
1160 0, /*tp_new*/
1161 0, /*tp_free*/
1162 0, /*tp_is_gc*/
1163};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001164
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001165PyDoc_STRVAR(unicodedata_docstring,
1166"This module provides access to the Unicode Character Database which\n\
1167defines character properties for all Unicode characters. The data in\n\
1168this database is based on the UnicodeData.txt file version\n\
11693.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
1170\n\
1171The module uses the same names and symbols as defined by the\n\
1172UnicodeData File Format 3.2.0 (see\n\
Hye-Shik Chang4c560ea2005-06-04 07:31:48 +00001173http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001174
Mark Hammond62b1ab12002-07-23 06:31:15 +00001175PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001176initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001177{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001178 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001179
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001180 UCD_Type.ob_type = &PyType_Type;
1181
Fred Drakef585bef2001-03-03 19:41:55 +00001182 m = Py_InitModule3(
1183 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001184 if (!m)
1185 return;
1186
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001187 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001188 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001189 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001190
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001191 /* Previous versions */
1192 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1193 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001194 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001195
Fredrik Lundh06d12682001-01-24 07:59:11 +00001196 /* Export C API */
1197 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001198 if (v != NULL)
1199 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001200}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001201
1202/*
1203Local variables:
1204c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001205indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001206End:
1207*/