blob: 5a4378babf1db331f137760c058e488804c62bdf [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Martin v. Löwis7d41e292002-11-23 12:22:32 +00003 unicodedata -- Provides access to the Unicode 3.2 data base.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Martin v. Löwis7d41e292002-11-23 12:22:32 +00005 Data was extracted from the Unicode 3.2 UnicodeData.txt file.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00006
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Martin v. Löwis7d41e292002-11-23 12:22:32 +00009 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000010
Fredrik Lundhcfcea492000-09-25 08:07:06 +000011 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000012
13 ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One entry of the generated per-character property table. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidth */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Per-character differences recorded for a previous database version.
   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
39
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000040/* data file generated by Tools/unicode/makeunicodedata.py */
41#include "unicodedata_db.h"
42
43static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000044_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000045{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000047 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 index = 0;
49 else {
50 index = index1[(code>>SHIFT)];
51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
52 }
53
54 return &_PyUnicode_Database_Records[index];
55}
56
Martin v. Löwis677bde22002-11-23 22:08:15 +000057static const _PyUnicode_DatabaseRecord*
58_getrecord(PyUnicodeObject* v)
59{
60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
61}
62
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000063/* ------------- Previous-version API ------------------------------------- */
64typedef struct previous_version {
65 PyObject_HEAD
66 const char *name;
67 const change_record* (*getrecord)(Py_UCS4);
68 Py_UCS4 (*normalization)(Py_UCS4);
69} PreviousDBVersion;
70
71#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
72
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000073static PyMemberDef DB_members[] = {
74 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
75 {NULL}
76};
77
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000078// forward declaration
79static PyTypeObject UCD_Type;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080
81static PyObject*
82new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
83 Py_UCS4 (*normalization)(Py_UCS4))
84{
85 PreviousDBVersion *self;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000086 self = PyObject_New(PreviousDBVersion, &UCD_Type);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000087 if (self == NULL)
88 return NULL;
89 self->name = name;
90 self->getrecord = getrecord;
91 self->normalization = normalization;
92 return (PyObject*)self;
93}
94
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000095/* --- Module API --------------------------------------------------------- */
96
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +000097PyDoc_STRVAR(unicodedata_decimal__doc__,
98"decimal(unichr[, default])\n\
99\n\
100Returns the decimal value assigned to the Unicode character unichr\n\
101as integer. If no such value is defined, default is returned, or, if\n\
102not given, ValueError is raised.");
103
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000104static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000105unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000106{
107 PyUnicodeObject *v;
108 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000109 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000110 long rc;
111
Fredrik Lundh06d12682001-01-24 07:59:11 +0000112 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000113 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000114 if (PyUnicode_GET_SIZE(v) != 1) {
115 PyErr_SetString(PyExc_TypeError,
116 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000119
120 if (self) {
121 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
122 if (old->category_changed == 0) {
123 /* unassigned */
124 have_old = 1;
125 rc = -1;
126 }
127 else if (old->decimal_changed != 0xFF) {
128 have_old = 1;
129 rc = old->decimal_changed;
130 }
131 }
132
133 if (!have_old)
134 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000135 if (rc < 0) {
136 if (defobj == NULL) {
137 PyErr_SetString(PyExc_ValueError,
138 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000139 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000140 }
141 else {
142 Py_INCREF(defobj);
143 return defobj;
144 }
145 }
146 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147}
148
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000149PyDoc_STRVAR(unicodedata_digit__doc__,
150"digit(unichr[, default])\n\
151\n\
152Returns the digit value assigned to the Unicode character unichr as\n\
153integer. If no such value is defined, default is returned, or, if\n\
154not given, ValueError is raised.");
155
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000157unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158{
159 PyUnicodeObject *v;
160 PyObject *defobj = NULL;
161 long rc;
162
Fredrik Lundh06d12682001-01-24 07:59:11 +0000163 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000164 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000165 if (PyUnicode_GET_SIZE(v) != 1) {
166 PyErr_SetString(PyExc_TypeError,
167 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169 }
170 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
171 if (rc < 0) {
172 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000173 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000174 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175 }
176 else {
177 Py_INCREF(defobj);
178 return defobj;
179 }
180 }
181 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182}
183
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000184PyDoc_STRVAR(unicodedata_numeric__doc__,
185"numeric(unichr[, default])\n\
186\n\
187Returns the numeric value assigned to the Unicode character unichr\n\
188as float. If no such value is defined, default is returned, or, if\n\
189not given, ValueError is raised.");
190
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000192unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193{
194 PyUnicodeObject *v;
195 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000196 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000197 double rc;
198
Fredrik Lundh06d12682001-01-24 07:59:11 +0000199 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000200 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 if (PyUnicode_GET_SIZE(v) != 1) {
202 PyErr_SetString(PyExc_TypeError,
203 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000206
207 if (self) {
208 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
209 if (old->category_changed == 0) {
210 /* unassigned */
211 have_old = 1;
212 rc = -1;
213 }
214 else if (old->decimal_changed != 0xFF) {
215 have_old = 1;
216 rc = old->decimal_changed;
217 }
218 }
219
220 if (!have_old)
221 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000222 if (rc < 0) {
223 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000224 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000225 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000226 }
227 else {
228 Py_INCREF(defobj);
229 return defobj;
230 }
231 }
232 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000233}
234
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000235PyDoc_STRVAR(unicodedata_category__doc__,
236"category(unichr)\n\
237\n\
238Returns the general category assigned to the Unicode character\n\
239unichr as string.");
240
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000242unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243{
244 PyUnicodeObject *v;
245 int index;
246
247 if (!PyArg_ParseTuple(args, "O!:category",
248 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000249 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250 if (PyUnicode_GET_SIZE(v) != 1) {
251 PyErr_SetString(PyExc_TypeError,
252 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000253 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000255 index = (int) _getrecord(v)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000256 if (self) {
257 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
258 if (old->category_changed != 0xFF)
259 index = old->category_changed;
260 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000261 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000262}
263
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000264PyDoc_STRVAR(unicodedata_bidirectional__doc__,
265"bidirectional(unichr)\n\
266\n\
267Returns the bidirectional category assigned to the Unicode character\n\
268unichr as string. If no such value is defined, an empty string is\n\
269returned.");
270
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000272unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000273{
274 PyUnicodeObject *v;
275 int index;
276
277 if (!PyArg_ParseTuple(args, "O!:bidirectional",
278 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000279 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280 if (PyUnicode_GET_SIZE(v) != 1) {
281 PyErr_SetString(PyExc_TypeError,
282 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000284 }
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000285 index = (int) _getrecord(v)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000286 if (self) {
287 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
288 if (old->category_changed == 0)
289 index = 0; /* unassigned */
290 else if (old->bidir_changed != 0xFF)
291 index = old->bidir_changed;
292 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000293 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294}
295
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000296PyDoc_STRVAR(unicodedata_combining__doc__,
297"combining(unichr)\n\
298\n\
299Returns the canonical combining class assigned to the Unicode\n\
300character unichr as integer. Returns 0 if no combining class is\n\
301defined.");
302
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000304unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000305{
306 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000307 int index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000308
309 if (!PyArg_ParseTuple(args, "O!:combining",
310 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000311 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312 if (PyUnicode_GET_SIZE(v) != 1) {
313 PyErr_SetString(PyExc_TypeError,
314 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000315 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000317 index = (int) _getrecord(v)->combining;
318 if (self) {
319 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
320 if (old->category_changed == 0)
321 index = 0; /* unassigned */
322 }
323 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000324}
325
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000326PyDoc_STRVAR(unicodedata_mirrored__doc__,
327"mirrored(unichr)\n\
328\n\
329Returns the mirrored property assigned to the Unicode character\n\
330unichr as integer. Returns 1 if the character has been identified as\n\
331a \"mirrored\" character in bidirectional text, 0 otherwise.");
332
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000333static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000334unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000335{
336 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000337 int index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000338
339 if (!PyArg_ParseTuple(args, "O!:mirrored",
340 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342 if (PyUnicode_GET_SIZE(v) != 1) {
343 PyErr_SetString(PyExc_TypeError,
344 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000345 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000347 index = (int) _getrecord(v)->mirrored;
348 if (self) {
349 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
350 if (old->category_changed == 0)
351 index = 0; /* unassigned */
352 }
353 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000354}
355
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000356PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
357"east_asian_width(unichr)\n\
358\n\
359Returns the east asian width assigned to the Unicode character\n\
360unichr as string.");
361
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000362static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000363unicodedata_east_asian_width(PyObject *self, PyObject *args)
364{
365 PyUnicodeObject *v;
366 int index;
367
368 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
369 &PyUnicode_Type, &v))
370 return NULL;
371 if (PyUnicode_GET_SIZE(v) != 1) {
372 PyErr_SetString(PyExc_TypeError,
373 "need a single Unicode character as parameter");
374 return NULL;
375 }
376 index = (int) _getrecord(v)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377 if (self) {
378 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
379 if (old->category_changed == 0)
380 index = 0; /* unassigned */
381 }
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000382 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
383}
384
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000385PyDoc_STRVAR(unicodedata_decomposition__doc__,
386"decomposition(unichr)\n\
387\n\
388Returns the character decomposition mapping assigned to the Unicode\n\
389character unichr as string. An empty string is returned in case no\n\
390such mapping is defined.");
391
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000392static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000393unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000394{
395 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000396 char decomp[256];
397 int code, index, count, i;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000398
399 if (!PyArg_ParseTuple(args, "O!:decomposition",
400 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000401 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000402 if (PyUnicode_GET_SIZE(v) != 1) {
403 PyErr_SetString(PyExc_TypeError,
404 "need a single Unicode character as parameter");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000405 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000406 }
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000407
408 code = (int) *PyUnicode_AS_UNICODE(v);
409
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000410 if (self) {
411 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
412 if (old->category_changed == 0)
413 return PyString_FromString(""); /* unassigned */
414 }
415
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000416 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417 index = 0;
418 else {
419 index = decomp_index1[(code>>DECOMP_SHIFT)];
420 index = decomp_index2[(index<<DECOMP_SHIFT)+
421 (code&((1<<DECOMP_SHIFT)-1))];
422 }
423
Tim Peters69b83b12001-11-30 07:23:05 +0000424 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000425 is prefix code (from*/
426 count = decomp_data[index] >> 8;
427
428 /* XXX: could allocate the PyString up front instead
429 (strlen(prefix) + 5 * count + 1 bytes) */
430
431 /* copy prefix */
432 i = strlen(decomp_prefix[decomp_data[index] & 255]);
433 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
434
435 while (count-- > 0) {
436 if (i)
437 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000438 assert((size_t)i < sizeof(decomp));
439 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
440 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000441 i += strlen(decomp + i);
442 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000443
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444 decomp[i] = '\0';
445
446 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000447}
448
Martin v. Löwis677bde22002-11-23 22:08:15 +0000449void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000450get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000451{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000452 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000453 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000454 } else if (self && get_old_record(self, code)->category_changed==0) {
455 /* unassigned in old version */
456 *index = 0;
457 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000458 else {
459 *index = decomp_index1[(code>>DECOMP_SHIFT)];
460 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
461 (code&((1<<DECOMP_SHIFT)-1))];
462 }
463
464 /* high byte is number of hex bytes (usually one or two), low byte
465 is prefix code (from*/
466 *count = decomp_data[*index] >> 8;
467 *prefix = decomp_data[*index] & 255;
468
469 (*index)++;
470}
471
/* Constants for arithmetic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first syllable code point */
#define LBase   0x1100              /* base of the L index */
#define VBase   0x1161              /* base of the V index */
#define TBase   0x11A7              /* base of the T index */
#define LCount  19                  /* number of L values */
#define VCount  21                  /* number of V values */
#define TCount  28                  /* number of T values */
#define NCount  (VCount*TCount)     /* syllables per L value */
#define SCount  (LCount*NCount)     /* total number of syllables */
481
482static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000483nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000484{
485 PyObject *result;
486 Py_UNICODE *i, *end, *o;
487 /* Longest decomposition in Unicode 3.2: U+FDFA */
488 Py_UNICODE stack[20];
489 int space, stackptr, isize;
490 int index, prefix, count;
491 unsigned char prev, cur;
492
493 stackptr = 0;
494 isize = PyUnicode_GET_SIZE(input);
495 /* Overallocate atmost 10 characters. */
496 space = (isize > 10 ? 10 : isize) + isize;
497 result = PyUnicode_FromUnicode(NULL, space);
498 if (!result)
499 return NULL;
500 i = PyUnicode_AS_UNICODE(input);
501 end = i + isize;
502 o = PyUnicode_AS_UNICODE(result);
503
504 while (i < end) {
505 stack[stackptr++] = *i++;
506 while(stackptr) {
507 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000508 /* Hangul Decomposition adds three characters in
509 a single step, so we need atleast that much room. */
510 if (space < 3) {
511 int newsize = PyString_GET_SIZE(result) + 10;
512 space += 10;
513 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000514 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000515 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000516 }
517 /* Hangul Decomposition. */
518 if (SBase <= code && code < (SBase+SCount)) {
519 int SIndex = code - SBase;
520 int L = LBase + SIndex / NCount;
521 int V = VBase + (SIndex % NCount) / TCount;
522 int T = TBase + SIndex % TCount;
523 *o++ = L;
524 *o++ = V;
525 space -= 2;
526 if (T != TBase) {
527 *o++ = T;
528 space --;
529 }
530 continue;
531 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000532 /* normalization changes */
533 if (self) {
534 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
535 if (value != 0) {
536 stack[stackptr++] = value;
537 continue;
538 }
539 }
540
541 /* Other decompositions. */
542 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000543
544 /* Copy character if it is not decomposable, or has a
545 compatibility decomposition, but we do NFD. */
546 if (!count || (prefix && !k)) {
547 *o++ = code;
548 space--;
549 continue;
550 }
551 /* Copy decomposition onto the stack, in reverse
552 order. */
553 while(count) {
554 code = decomp_data[index + (--count)];
555 stack[stackptr++] = code;
556 }
557 }
558 }
559
560 /* Drop overallocation. Cannot fail. */
561 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
562
563 /* Sort canonically. */
564 i = PyUnicode_AS_UNICODE(result);
565 prev = _getrecord_ex(*i)->combining;
566 end = i + PyUnicode_GET_SIZE(result);
567 for (i++; i < end; i++) {
568 cur = _getrecord_ex(*i)->combining;
569 if (prev == 0 || cur == 0 || prev <= cur) {
570 prev = cur;
571 continue;
572 }
573 /* Non-canonical order. Need to switch *i with previous. */
574 o = i - 1;
575 while (1) {
576 Py_UNICODE tmp = o[1];
577 o[1] = o[0];
578 o[0] = tmp;
579 o--;
580 if (o < PyUnicode_AS_UNICODE(result))
581 break;
582 prev = _getrecord_ex(*o)->combining;
583 if (prev == 0 || prev <= cur)
584 break;
585 }
586 prev = _getrecord_ex(*i)->combining;
587 }
588 return result;
589}
590
591static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000592find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000593{
594 int index;
595 for (index = 0; nfc[index].start; index++) {
596 int start = nfc[index].start;
597 if (code < start)
598 return -1;
599 if (code <= start + nfc[index].count) {
600 int delta = code - start;
601 return nfc[index].index + delta;
602 }
603 }
604 return -1;
605}
606
607static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000608nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000609{
610 PyObject *result;
611 Py_UNICODE *i, *i1, *o, *end;
612 int f,l,index,index1,comb;
613 Py_UNICODE code;
614 Py_UNICODE *skipped[20];
615 int cskipped = 0;
616
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000617 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000618 if (!result)
619 return NULL;
620
621 /* We are going to modify result in-place.
622 If nfd_nfkd is changed to sometimes return the input,
623 this code needs to be reviewed. */
624 assert(result != input);
625
626 i = PyUnicode_AS_UNICODE(result);
627 end = i + PyUnicode_GET_SIZE(result);
628 o = PyUnicode_AS_UNICODE(result);
629
630 again:
631 while (i < end) {
632 for (index = 0; index < cskipped; index++) {
633 if (skipped[index] == i) {
634 /* *i character is skipped.
635 Remove from list. */
636 skipped[index] = skipped[cskipped-1];
637 cskipped--;
638 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000639 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 }
641 }
642 /* Hangul Composition. We don't need to check for <LV,T>
643 pairs, since we always have decomposed data. */
644 if (LBase <= *i && *i < (LBase+LCount) &&
645 i + 1 < end &&
646 VBase <= i[1] && i[1] <= (VBase+VCount)) {
647 int LIndex, VIndex;
648 LIndex = i[0] - LBase;
649 VIndex = i[1] - VBase;
650 code = SBase + (LIndex*VCount+VIndex)*TCount;
651 i+=2;
652 if (i < end &&
653 TBase <= *i && *i <= (TBase+TCount)) {
654 code += *i-TBase;
655 i++;
656 }
657 *o++ = code;
658 continue;
659 }
660
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000661 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000662 if (f == -1) {
663 *o++ = *i++;
664 continue;
665 }
666 /* Find next unblocked character. */
667 i1 = i+1;
668 comb = 0;
669 while (i1 < end) {
670 int comb1 = _getrecord_ex(*i1)->combining;
671 if (comb1 && comb == comb1) {
672 /* Character is blocked. */
673 i1++;
674 continue;
675 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000676 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000677 /* *i1 cannot be combined with *i. If *i1
678 is a starter, we don't need to look further.
679 Otherwise, record the combining class. */
680 if (l == -1) {
681 not_combinable:
682 if (comb1 == 0)
683 break;
684 comb = comb1;
685 i1++;
686 continue;
687 }
688 index = f*TOTAL_LAST + l;
689 index1 = comp_index[index >> COMP_SHIFT];
690 code = comp_data[(index1<<COMP_SHIFT)+
691 (index&((1<<COMP_SHIFT)-1))];
692 if (code == 0)
693 goto not_combinable;
694
695 /* Replace the original character. */
696 *i = code;
697 /* Mark the second character unused. */
698 skipped[cskipped++] = i1;
699 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000700 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000701 if (f == -1)
702 break;
703 }
704 *o++ = *i++;
705 }
706 if (o != end)
707 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
708 return result;
709}
710
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000711PyDoc_STRVAR(unicodedata_normalize__doc__,
712"normalize(form, unistr)\n\
713\n\
714Return the normal form 'form' for the Unicode string unistr. Valid\n\
715values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
716
Martin v. Löwis677bde22002-11-23 22:08:15 +0000717static PyObject*
718unicodedata_normalize(PyObject *self, PyObject *args)
719{
720 char *form;
721 PyObject *input;
722
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000723 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724 &form, &PyUnicode_Type, &input))
725 return NULL;
726
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000727 if (PyUnicode_GetSize(input) == 0) {
728 /* Special case empty input strings, since resizing
729 them later would cause internal errors. */
730 Py_INCREF(input);
731 return input;
732 }
733
Martin v. Löwis677bde22002-11-23 22:08:15 +0000734 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000735 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000736 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000737 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000738 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000739 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000740 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000741 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
743 return NULL;
744}
745
Fredrik Lundh06d12682001-01-24 07:59:11 +0000746/* -------------------------------------------------------------------- */
747/* unicode character name tables */
748
749/* data file generated by Tools/unicode/makeunicodedata.py */
750#include "unicodename_db.h"
751
752/* -------------------------------------------------------------------- */
753/* database code (cut and pasted from the unidb package) */
754
/* Case-insensitive hash of the first len bytes of s.
   Each step multiplies the running hash by scale and adds the
   upper-cased byte; whenever bits 24-31 become set they are folded
   back into the low byte and the hash is reduced to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for unsigned char values and EOF;
           convert first so bytes >= 0x80 are safe where char is signed. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
769
/* Name fragments for Hangul syllables, one row per index.
   Column 0 is indexed by L, column 1 by V and column 2 by T;
   NULL marks rows beyond the end of a column. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
800
Fredrik Lundh06d12682001-01-24 07:59:11 +0000801static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000802is_unified_ideograph(Py_UCS4 code)
803{
804 return (
805 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
806 (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
807 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
808}
809
810static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000811_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000812{
813 int offset;
814 int i;
815 int word;
816 unsigned char* w;
817
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000818 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000819 /* Hangul syllable. */
820 int SIndex = code - SBase;
821 int L = SIndex / NCount;
822 int V = (SIndex % NCount) / TCount;
823 int T = SIndex % TCount;
824
825 if (buflen < 27)
826 /* Worst case: HANGUL SYLLABLE <10chars>. */
827 return 0;
828 strcpy(buffer, "HANGUL SYLLABLE ");
829 buffer += 16;
830 strcpy(buffer, hangul_syllables[L][0]);
831 buffer += strlen(hangul_syllables[L][0]);
832 strcpy(buffer, hangul_syllables[V][1]);
833 buffer += strlen(hangul_syllables[V][1]);
834 strcpy(buffer, hangul_syllables[T][2]);
835 buffer += strlen(hangul_syllables[T][2]);
836 *buffer = '\0';
837 return 1;
838 }
839
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000840 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000841 if (buflen < 28)
842 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
843 return 0;
844 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
845 return 1;
846 }
847
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000848 if (code >= 0x110000)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000849 return 0;
850
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000851 if (self) {
852 const change_record *old = get_old_record(self, code);
853 if (old->category_changed == 0) {
854 /* unassigned */
855 return 0;
856 }
857 }
858
859
Fredrik Lundh06d12682001-01-24 07:59:11 +0000860 /* get offset into phrasebook */
861 offset = phrasebook_offset1[(code>>phrasebook_shift)];
862 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
863 (code&((1<<phrasebook_shift)-1))];
864 if (!offset)
865 return 0;
866
867 i = 0;
868
869 for (;;) {
870 /* get word index */
871 word = phrasebook[offset] - phrasebook_short;
872 if (word >= 0) {
873 word = (word << 8) + phrasebook[offset+1];
874 offset += 2;
875 } else
876 word = phrasebook[offset++];
877 if (i) {
878 if (i > buflen)
879 return 0; /* buffer overflow */
880 buffer[i++] = ' ';
881 }
882 /* copy word string from lexicon. the last character in the
883 word has bit 7 set. the last word in a string ends with
884 0x80 */
885 w = lexicon + lexicon_offset[word];
886 while (*w < 128) {
887 if (i >= buflen)
888 return 0; /* buffer overflow */
889 buffer[i++] = *w++;
890 }
891 if (i >= buflen)
892 return 0; /* buffer overflow */
893 buffer[i++] = *w & 127;
894 if (*w == 128)
895 break; /* end of word */
896 }
897
898 return 1;
899}
900
901static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000902_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000903{
904 /* check if code corresponds to the given name */
905 int i;
906 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000907 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000908 return 0;
909 for (i = 0; i < namelen; i++) {
910 if (toupper(name[i]) != buffer[i])
911 return 0;
912 }
913 return buffer[namelen] == '\0';
914}
915
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000916static void
917find_syllable(const char *str, int *len, int *pos, int count, int column)
918{
919 int i, len1;
920 *len = -1;
921 for (i = 0; i < count; i++) {
922 char *s = hangul_syllables[i][column];
923 len1 = strlen(s);
924 if (len1 <= *len)
925 continue;
926 if (strncmp(str, s, len1) == 0) {
927 *len = len1;
928 *pos = i;
929 }
930 }
931 if (*len == -1) {
932 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000933 }
934}
935
Fredrik Lundh06d12682001-01-24 07:59:11 +0000936static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000937_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000938{
939 unsigned int h, v;
940 unsigned int mask = code_size-1;
941 unsigned int i, incr;
942
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000943 /* Check for hangul syllables. */
944 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Thomas Wouters1e365b22006-03-01 21:58:30 +0000945 int len, L = -1, V = -1, T = -1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000946 const char *pos = name + 16;
947 find_syllable(pos, &len, &L, LCount, 0);
948 pos += len;
949 find_syllable(pos, &len, &V, VCount, 1);
950 pos += len;
951 find_syllable(pos, &len, &T, TCount, 2);
952 pos += len;
Martin v. Löwis8b291e22005-09-18 08:17:56 +0000953 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000954 *code = SBase + (L*VCount+V)*TCount + T;
955 return 1;
956 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000957 /* Otherwise, it's an illegal syllable name. */
958 return 0;
959 }
960
961 /* Check for unified ideographs. */
962 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
963 /* Four or five hexdigits must follow. */
964 v = 0;
965 name += 22;
966 namelen -= 22;
967 if (namelen != 4 && namelen != 5)
968 return 0;
969 while (namelen--) {
970 v *= 16;
971 if (*name >= '0' && *name <= '9')
972 v += *name - '0';
973 else if (*name >= 'A' && *name <= 'F')
974 v += *name - 'A' + 10;
975 else
976 return 0;
977 name++;
978 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000979 if (!is_unified_ideograph(v))
980 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000981 *code = v;
982 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000983 }
984
Fredrik Lundh06d12682001-01-24 07:59:11 +0000985 /* the following is the same as python's dictionary lookup, with
986 only minor changes. see the makeunicodedata script for more
987 details */
988
Fredrik Lundhb95896b2001-02-18 22:06:17 +0000989 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +0000990 i = (~h) & mask;
991 v = code_hash[i];
992 if (!v)
993 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000994 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000995 *code = v;
996 return 1;
997 }
998 incr = (h ^ (h >> 3)) & mask;
999 if (!incr)
1000 incr = mask;
1001 for (;;) {
1002 i = (i + incr) & mask;
1003 v = code_hash[i];
1004 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001005 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001006 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001007 *code = v;
1008 return 1;
1009 }
1010 incr = incr << 1;
1011 if (incr > mask)
1012 incr = incr ^ code_poly;
1013 }
1014}
1015
1016static const _PyUnicode_Name_CAPI hashAPI =
1017{
1018 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001019 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001020 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001021};
1022
1023/* -------------------------------------------------------------------- */
1024/* Python bindings */
1025
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001026PyDoc_STRVAR(unicodedata_name__doc__,
1027"name(unichr[, default])\n\
1028Returns the name assigned to the Unicode character unichr as a\n\
1029string. If no name is defined, default is returned, or, if not\n\
1030given, ValueError is raised.");
1031
Fredrik Lundh06d12682001-01-24 07:59:11 +00001032static PyObject *
1033unicodedata_name(PyObject* self, PyObject* args)
1034{
1035 char name[NAME_MAXLEN];
1036
1037 PyUnicodeObject* v;
1038 PyObject* defobj = NULL;
1039 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1040 return NULL;
1041
1042 if (PyUnicode_GET_SIZE(v) != 1) {
1043 PyErr_SetString(PyExc_TypeError,
1044 "need a single Unicode character as parameter");
1045 return NULL;
1046 }
1047
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001048 if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
1049 name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001050 if (defobj == NULL) {
1051 PyErr_SetString(PyExc_ValueError, "no such name");
1052 return NULL;
1053 }
1054 else {
1055 Py_INCREF(defobj);
1056 return defobj;
1057 }
1058 }
1059
1060 return Py_BuildValue("s", name);
1061}
1062
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001063PyDoc_STRVAR(unicodedata_lookup__doc__,
1064"lookup(name)\n\
1065\n\
1066Look up character by name. If a character with the\n\
1067given name is found, return the corresponding Unicode\n\
1068character. If not found, KeyError is raised.");
1069
Fredrik Lundh06d12682001-01-24 07:59:11 +00001070static PyObject *
1071unicodedata_lookup(PyObject* self, PyObject* args)
1072{
1073 Py_UCS4 code;
1074 Py_UNICODE str[1];
1075
1076 char* name;
1077 int namelen;
1078 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1079 return NULL;
1080
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001081 if (!_getcode(self, name, namelen, &code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001082 char fmt[] = "undefined character name '%s'";
1083 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
1084 sprintf(buf, fmt, name);
1085 PyErr_SetString(PyExc_KeyError, buf);
1086 PyMem_FREE(buf);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001087 return NULL;
1088 }
1089
1090 str[0] = (Py_UNICODE) code;
1091 return PyUnicode_FromUnicode(str, 1);
1092}
1093
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001094/* XXX Add doc strings. */
1095
1096static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001097 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1098 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1099 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1100 {"category", unicodedata_category, METH_VARARGS,
1101 unicodedata_category__doc__},
1102 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1103 unicodedata_bidirectional__doc__},
1104 {"combining", unicodedata_combining, METH_VARARGS,
1105 unicodedata_combining__doc__},
1106 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1107 unicodedata_mirrored__doc__},
1108 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1109 unicodedata_east_asian_width__doc__},
1110 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1111 unicodedata_decomposition__doc__},
1112 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1113 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1114 {"normalize", unicodedata_normalize, METH_VARARGS,
1115 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001116 {NULL, NULL} /* sentinel */
1117};
1118
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001119static PyTypeObject UCD_Type = {
1120 /* The ob_type field must be initialized in the module init function
1121 * to be portable to Windows without using C++. */
1122 PyObject_HEAD_INIT(NULL)
1123 0, /*ob_size*/
1124 "unicodedata.UCD", /*tp_name*/
1125 sizeof(PreviousDBVersion), /*tp_basicsize*/
1126 0, /*tp_itemsize*/
1127 /* methods */
1128 (destructor)PyObject_Del, /*tp_dealloc*/
1129 0, /*tp_print*/
1130 0, /*tp_getattr*/
1131 0, /*tp_setattr*/
1132 0, /*tp_compare*/
1133 0, /*tp_repr*/
1134 0, /*tp_as_number*/
1135 0, /*tp_as_sequence*/
1136 0, /*tp_as_mapping*/
1137 0, /*tp_hash*/
1138 0, /*tp_call*/
1139 0, /*tp_str*/
1140 PyObject_GenericGetAttr,/*tp_getattro*/
1141 0, /*tp_setattro*/
1142 0, /*tp_as_buffer*/
1143 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1144 0, /*tp_doc*/
1145 0, /*tp_traverse*/
1146 0, /*tp_clear*/
1147 0, /*tp_richcompare*/
1148 0, /*tp_weaklistoffset*/
1149 0, /*tp_iter*/
1150 0, /*tp_iternext*/
1151 unicodedata_functions, /*tp_methods*/
1152 DB_members, /*tp_members*/
1153 0, /*tp_getset*/
1154 0, /*tp_base*/
1155 0, /*tp_dict*/
1156 0, /*tp_descr_get*/
1157 0, /*tp_descr_set*/
1158 0, /*tp_dictoffset*/
1159 0, /*tp_init*/
1160 0, /*tp_alloc*/
1161 0, /*tp_new*/
1162 0, /*tp_free*/
1163 0, /*tp_is_gc*/
1164};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001165
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001166PyDoc_STRVAR(unicodedata_docstring,
1167"This module provides access to the Unicode Character Database which\n\
1168defines character properties for all Unicode characters. The data in\n\
1169this database is based on the UnicodeData.txt file version\n\
11703.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
1171\n\
1172The module uses the same names and symbols as defined by the\n\
1173UnicodeData File Format 3.2.0 (see\n\
Hye-Shik Chang4c560ea2005-06-04 07:31:48 +00001174http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001175
Mark Hammond62b1ab12002-07-23 06:31:15 +00001176PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001177initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001178{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001179 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001180
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001181 UCD_Type.ob_type = &PyType_Type;
1182
Fred Drakef585bef2001-03-03 19:41:55 +00001183 m = Py_InitModule3(
1184 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001185 if (!m)
1186 return;
1187
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001188 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001189 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001190 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001191
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001192 /* Previous versions */
1193 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1194 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001195 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001196
Fredrik Lundh06d12682001-01-24 07:59:11 +00001197 /* Export C API */
1198 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001199 if (v != NULL)
1200 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001201}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001202
1203/*
1204Local variables:
1205c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001206indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001207End:
1208*/