/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 4.1 data base.

   Data was extracted from the Unicode 4.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

/* Per-character property record.  Instances live in the generated
   _PyUnicode_Database_Records table and are looked up by _getrecord_ex().
   NOTE(review): the generated table presumably initializes these fields
   positionally, so do not reorder them without regenerating the data. */
typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidthNames */
} _PyUnicode_DatabaseRecord;

/* Differences of one character between the current database and a previous
   Unicode version.  In the code below, 0xFF in bidir_changed,
   category_changed or decimal_changed means "unchanged", and
   category_changed == 0 is treated as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;

Fredrik Lundh7b7dd102001-01-21 22:41:08 +000040/* data file generated by Tools/unicode/makeunicodedata.py */
41#include "unicodedata_db.h"
42
43static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000044_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000045{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000047 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 index = 0;
49 else {
50 index = index1[(code>>SHIFT)];
51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
52 }
53
54 return &_PyUnicode_Database_Records[index];
55}
56
/* ------------- Previous-version API ------------------------------------- */
58typedef struct previous_version {
59 PyObject_HEAD
60 const char *name;
61 const change_record* (*getrecord)(Py_UCS4);
62 Py_UCS4 (*normalization)(Py_UCS4);
63} PreviousDBVersion;
64
/* Fetch the change record of code point `v` from a PreviousDBVersion object.
   `self` must already be known to be a UCD object (see UCD_Check). */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))

Martin v. Löwis480f1bb2006-03-09 23:38:20 +000067static PyMemberDef DB_members[] = {
68 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
69 {NULL}
70};
71
Thomas Wouters89f507f2006-12-13 04:49:30 +000072/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000073static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000074#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000075
76static PyObject*
77new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
78 Py_UCS4 (*normalization)(Py_UCS4))
79{
80 PreviousDBVersion *self;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000081 self = PyObject_New(PreviousDBVersion, &UCD_Type);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000082 if (self == NULL)
83 return NULL;
84 self->name = name;
85 self->getrecord = getrecord;
86 self->normalization = normalization;
87 return (PyObject*)self;
88}
89
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000090
91static Py_UCS4 getuchar(PyUnicodeObject *obj)
92{
93 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
94
95 if (PyUnicode_GET_SIZE(obj) == 1)
96 return *v;
97#ifndef Py_UNICODE_WIDE
98 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
99 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
100 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
101 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
102#endif
103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
106}
107
/* --- Module API --------------------------------------------------------- */

Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000110PyDoc_STRVAR(unicodedata_decimal__doc__,
111"decimal(unichr[, default])\n\
112\n\
113Returns the decimal value assigned to the Unicode character unichr\n\
114as integer. If no such value is defined, default is returned, or, if\n\
115not given, ValueError is raised.");
116
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000117static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000118unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119{
120 PyUnicodeObject *v;
121 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000122 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000123 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000124 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125
Fredrik Lundh06d12682001-01-24 07:59:11 +0000126 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000127 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000128 c = getuchar(v);
129 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000130 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000131
Martin v. Löwis1a214512008-06-11 05:26:20 +0000132 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000133 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134 if (old->category_changed == 0) {
135 /* unassigned */
136 have_old = 1;
137 rc = -1;
138 }
139 else if (old->decimal_changed != 0xFF) {
140 have_old = 1;
141 rc = old->decimal_changed;
142 }
143 }
144
145 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000146 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147 if (rc < 0) {
148 if (defobj == NULL) {
149 PyErr_SetString(PyExc_ValueError,
150 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000152 }
153 else {
154 Py_INCREF(defobj);
155 return defobj;
156 }
157 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000158 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159}
160
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000161PyDoc_STRVAR(unicodedata_digit__doc__,
162"digit(unichr[, default])\n\
163\n\
164Returns the digit value assigned to the Unicode character unichr as\n\
165integer. If no such value is defined, default is returned, or, if\n\
166not given, ValueError is raised.");
167
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170{
171 PyUnicodeObject *v;
172 PyObject *defobj = NULL;
173 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000174 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175
Fredrik Lundh06d12682001-01-24 07:59:11 +0000176 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000177 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000178 c = getuchar(v);
179 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000181 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
183 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000184 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000186 }
187 else {
188 Py_INCREF(defobj);
189 return defobj;
190 }
191 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000192 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193}
194
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000195PyDoc_STRVAR(unicodedata_numeric__doc__,
196"numeric(unichr[, default])\n\
197\n\
198Returns the numeric value assigned to the Unicode character unichr\n\
199as float. If no such value is defined, default is returned, or, if\n\
200not given, ValueError is raised.");
201
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000203unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204{
205 PyUnicodeObject *v;
206 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000207 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000209 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210
Fredrik Lundh06d12682001-01-24 07:59:11 +0000211 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000212 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000213 c = getuchar(v);
214 if (c == (Py_UCS4)-1)
215 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000216
Martin v. Löwis1a214512008-06-11 05:26:20 +0000217 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000218 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000219 if (old->category_changed == 0) {
220 /* unassigned */
221 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000222 rc = -1.0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000223 }
224 else if (old->decimal_changed != 0xFF) {
225 have_old = 1;
226 rc = old->decimal_changed;
227 }
228 }
229
230 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000231 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 if (rc == -1.0) {
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000233 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000234 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000235 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000236 }
237 else {
238 Py_INCREF(defobj);
239 return defobj;
240 }
241 }
242 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243}
244
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000245PyDoc_STRVAR(unicodedata_category__doc__,
246"category(unichr)\n\
247\n\
248Returns the general category assigned to the Unicode character\n\
249unichr as string.");
250
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000251static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000252unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253{
254 PyUnicodeObject *v;
255 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000256 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000257
258 if (!PyArg_ParseTuple(args, "O!:category",
259 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000260 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000261 c = getuchar(v);
262 if (c == (Py_UCS4)-1)
263 return NULL;
264 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000265 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000266 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000267 if (old->category_changed != 0xFF)
268 index = old->category_changed;
269 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000270 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271}
272
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000273PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274"bidirectional(unichr)\n\
275\n\
276Returns the bidirectional category assigned to the Unicode character\n\
277unichr as string. If no such value is defined, an empty string is\n\
278returned.");
279
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000281unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282{
283 PyUnicodeObject *v;
284 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000285 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286
287 if (!PyArg_ParseTuple(args, "O!:bidirectional",
288 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000289 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000290 c = getuchar(v);
291 if (c == (Py_UCS4)-1)
292 return NULL;
293 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000294 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000295 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000296 if (old->category_changed == 0)
297 index = 0; /* unassigned */
298 else if (old->bidir_changed != 0xFF)
299 index = old->bidir_changed;
300 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000301 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000302}
303
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000304PyDoc_STRVAR(unicodedata_combining__doc__,
305"combining(unichr)\n\
306\n\
307Returns the canonical combining class assigned to the Unicode\n\
308character unichr as integer. Returns 0 if no combining class is\n\
309defined.");
310
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000311static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000312unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313{
314 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000315 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000316 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317
318 if (!PyArg_ParseTuple(args, "O!:combining",
319 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000320 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000321 c = getuchar(v);
322 if (c == (Py_UCS4)-1)
323 return NULL;
324 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000325 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000326 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 if (old->category_changed == 0)
328 index = 0; /* unassigned */
329 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000330 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000331}
332
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000333PyDoc_STRVAR(unicodedata_mirrored__doc__,
334"mirrored(unichr)\n\
335\n\
336Returns the mirrored property assigned to the Unicode character\n\
337unichr as integer. Returns 1 if the character has been identified as\n\
338a \"mirrored\" character in bidirectional text, 0 otherwise.");
339
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000340static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342{
343 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000345 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346
347 if (!PyArg_ParseTuple(args, "O!:mirrored",
348 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000349 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000350 c = getuchar(v);
351 if (c == (Py_UCS4)-1)
352 return NULL;
353 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000354 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000355 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
358 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000359 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000360}
361
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000362PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
363"east_asian_width(unichr)\n\
364\n\
365Returns the east asian width assigned to the Unicode character\n\
366unichr as string.");
367
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000368static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000369unicodedata_east_asian_width(PyObject *self, PyObject *args)
370{
371 PyUnicodeObject *v;
372 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000373 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000374
375 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
376 &PyUnicode_Type, &v))
377 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000378 c = getuchar(v);
379 if (c == (Py_UCS4)-1)
380 return NULL;
381 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000382 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000383 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000384 if (old->category_changed == 0)
385 index = 0; /* unassigned */
386 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000387 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000388}
389
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000390PyDoc_STRVAR(unicodedata_decomposition__doc__,
391"decomposition(unichr)\n\
392\n\
393Returns the character decomposition mapping assigned to the Unicode\n\
394character unichr as string. An empty string is returned in case no\n\
395such mapping is defined.");
396
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000397static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000398unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000399{
400 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000401 char decomp[256];
402 int code, index, count, i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000403 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000404 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000405
406 if (!PyArg_ParseTuple(args, "O!:decomposition",
407 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000408 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000409 c = getuchar(v);
410 if (c == (Py_UCS4)-1)
411 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000412
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000413 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000414
Martin v. Löwis1a214512008-06-11 05:26:20 +0000415 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000416 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000417 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000418 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000419 }
420
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000421 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000422 index = 0;
423 else {
424 index = decomp_index1[(code>>DECOMP_SHIFT)];
425 index = decomp_index2[(index<<DECOMP_SHIFT)+
426 (code&((1<<DECOMP_SHIFT)-1))];
427 }
428
Tim Peters69b83b12001-11-30 07:23:05 +0000429 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000430 is prefix code (from*/
431 count = decomp_data[index] >> 8;
432
433 /* XXX: could allocate the PyString up front instead
434 (strlen(prefix) + 5 * count + 1 bytes) */
435
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000436 /* Based on how index is calculated above and decomp_data is generated
437 from Tools/unicode/makeunicodedata.py, it should not be possible
438 to overflow decomp_prefix. */
439 prefix_index = decomp_data[index] & 255;
440 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
441
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000442 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000443 i = strlen(decomp_prefix[prefix_index]);
444 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445
446 while (count-- > 0) {
447 if (i)
448 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000449 assert((size_t)i < sizeof(decomp));
450 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
451 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000452 i += strlen(decomp + i);
453 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000454
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455 decomp[i] = '\0';
456
Walter Dörwald4254e762007-06-05 16:04:09 +0000457 return PyUnicode_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000458}
459
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000460static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000461get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000462{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000463 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464 *index = 0;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000465 } else if (self && UCD_Check(self) &&
466 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000467 /* unassigned in old version */
468 *index = 0;
469 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000470 else {
471 *index = decomp_index1[(code>>DECOMP_SHIFT)];
472 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
473 (code&((1<<DECOMP_SHIFT)-1))];
474 }
475
476 /* high byte is number of hex bytes (usually one or two), low byte
477 is prefix code (from*/
478 *count = decomp_data[*index] >> 8;
479 *prefix = decomp_data[*index] & 255;
480
481 (*index)++;
482}
483
/* Constants for arithmetic Hangul syllable (de)composition, used by
   nfd_nfkd() and nfc_nfkc().  Note that TBase is one below the first
   trailing consonant: a T index of 0 means "no trailing consonant". */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

494static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000495nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000496{
497 PyObject *result;
498 Py_UNICODE *i, *end, *o;
499 /* Longest decomposition in Unicode 3.2: U+FDFA */
500 Py_UNICODE stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000501 Py_ssize_t space, isize;
502 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000503 unsigned char prev, cur;
504
505 stackptr = 0;
506 isize = PyUnicode_GET_SIZE(input);
507 /* Overallocate atmost 10 characters. */
508 space = (isize > 10 ? 10 : isize) + isize;
509 result = PyUnicode_FromUnicode(NULL, space);
510 if (!result)
511 return NULL;
512 i = PyUnicode_AS_UNICODE(input);
513 end = i + isize;
514 o = PyUnicode_AS_UNICODE(result);
515
516 while (i < end) {
517 stack[stackptr++] = *i++;
518 while(stackptr) {
519 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000520 /* Hangul Decomposition adds three characters in
521 a single step, so we need atleast that much room. */
522 if (space < 3) {
Martin v. Löwis5b222132007-06-10 09:51:05 +0000523 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000524 space += 10;
525 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000526 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000527 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000528 }
529 /* Hangul Decomposition. */
530 if (SBase <= code && code < (SBase+SCount)) {
531 int SIndex = code - SBase;
532 int L = LBase + SIndex / NCount;
533 int V = VBase + (SIndex % NCount) / TCount;
534 int T = TBase + SIndex % TCount;
535 *o++ = L;
536 *o++ = V;
537 space -= 2;
538 if (T != TBase) {
539 *o++ = T;
540 space --;
541 }
542 continue;
543 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000544 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000545 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000546 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
547 if (value != 0) {
548 stack[stackptr++] = value;
549 continue;
550 }
551 }
552
553 /* Other decompositions. */
554 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000555
556 /* Copy character if it is not decomposable, or has a
557 compatibility decomposition, but we do NFD. */
558 if (!count || (prefix && !k)) {
559 *o++ = code;
560 space--;
561 continue;
562 }
563 /* Copy decomposition onto the stack, in reverse
564 order. */
565 while(count) {
566 code = decomp_data[index + (--count)];
567 stack[stackptr++] = code;
568 }
569 }
570 }
571
572 /* Drop overallocation. Cannot fail. */
573 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
574
575 /* Sort canonically. */
576 i = PyUnicode_AS_UNICODE(result);
577 prev = _getrecord_ex(*i)->combining;
578 end = i + PyUnicode_GET_SIZE(result);
579 for (i++; i < end; i++) {
580 cur = _getrecord_ex(*i)->combining;
581 if (prev == 0 || cur == 0 || prev <= cur) {
582 prev = cur;
583 continue;
584 }
585 /* Non-canonical order. Need to switch *i with previous. */
586 o = i - 1;
587 while (1) {
588 Py_UNICODE tmp = o[1];
589 o[1] = o[0];
590 o[0] = tmp;
591 o--;
592 if (o < PyUnicode_AS_UNICODE(result))
593 break;
594 prev = _getrecord_ex(*o)->combining;
595 if (prev == 0 || prev <= cur)
596 break;
597 }
598 prev = _getrecord_ex(*i)->combining;
599 }
600 return result;
601}
602
603static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000604find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000605{
606 int index;
607 for (index = 0; nfc[index].start; index++) {
608 int start = nfc[index].start;
609 if (code < start)
610 return -1;
611 if (code <= start + nfc[index].count) {
612 int delta = code - start;
613 return nfc[index].index + delta;
614 }
615 }
616 return -1;
617}
618
619static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000620nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000621{
622 PyObject *result;
623 Py_UNICODE *i, *i1, *o, *end;
624 int f,l,index,index1,comb;
625 Py_UNICODE code;
626 Py_UNICODE *skipped[20];
627 int cskipped = 0;
628
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000629 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000630 if (!result)
631 return NULL;
632
633 /* We are going to modify result in-place.
634 If nfd_nfkd is changed to sometimes return the input,
635 this code needs to be reviewed. */
636 assert(result != input);
637
638 i = PyUnicode_AS_UNICODE(result);
639 end = i + PyUnicode_GET_SIZE(result);
640 o = PyUnicode_AS_UNICODE(result);
641
642 again:
643 while (i < end) {
644 for (index = 0; index < cskipped; index++) {
645 if (skipped[index] == i) {
646 /* *i character is skipped.
647 Remove from list. */
648 skipped[index] = skipped[cskipped-1];
649 cskipped--;
650 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000651 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652 }
653 }
654 /* Hangul Composition. We don't need to check for <LV,T>
655 pairs, since we always have decomposed data. */
656 if (LBase <= *i && *i < (LBase+LCount) &&
657 i + 1 < end &&
658 VBase <= i[1] && i[1] <= (VBase+VCount)) {
659 int LIndex, VIndex;
660 LIndex = i[0] - LBase;
661 VIndex = i[1] - VBase;
662 code = SBase + (LIndex*VCount+VIndex)*TCount;
663 i+=2;
664 if (i < end &&
665 TBase <= *i && *i <= (TBase+TCount)) {
666 code += *i-TBase;
667 i++;
668 }
669 *o++ = code;
670 continue;
671 }
672
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000673 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000674 if (f == -1) {
675 *o++ = *i++;
676 continue;
677 }
678 /* Find next unblocked character. */
679 i1 = i+1;
680 comb = 0;
681 while (i1 < end) {
682 int comb1 = _getrecord_ex(*i1)->combining;
683 if (comb1 && comb == comb1) {
684 /* Character is blocked. */
685 i1++;
686 continue;
687 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000688 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000689 /* *i1 cannot be combined with *i. If *i1
690 is a starter, we don't need to look further.
691 Otherwise, record the combining class. */
692 if (l == -1) {
693 not_combinable:
694 if (comb1 == 0)
695 break;
696 comb = comb1;
697 i1++;
698 continue;
699 }
700 index = f*TOTAL_LAST + l;
701 index1 = comp_index[index >> COMP_SHIFT];
702 code = comp_data[(index1<<COMP_SHIFT)+
703 (index&((1<<COMP_SHIFT)-1))];
704 if (code == 0)
705 goto not_combinable;
706
707 /* Replace the original character. */
708 *i = code;
709 /* Mark the second character unused. */
710 skipped[cskipped++] = i1;
711 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000712 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000713 if (f == -1)
714 break;
715 }
716 *o++ = *i++;
717 }
718 if (o != end)
719 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
720 return result;
721}
722
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000723PyDoc_STRVAR(unicodedata_normalize__doc__,
724"normalize(form, unistr)\n\
725\n\
726Return the normal form 'form' for the Unicode string unistr. Valid\n\
727values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
728
Martin v. Löwis677bde22002-11-23 22:08:15 +0000729static PyObject*
730unicodedata_normalize(PyObject *self, PyObject *args)
731{
732 char *form;
733 PyObject *input;
734
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000735 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000736 &form, &PyUnicode_Type, &input))
737 return NULL;
738
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000739 if (PyUnicode_GetSize(input) == 0) {
740 /* Special case empty input strings, since resizing
741 them later would cause internal errors. */
742 Py_INCREF(input);
743 return input;
744 }
745
Martin v. Löwis677bde22002-11-23 22:08:15 +0000746 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000747 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000748 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000749 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000750 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000751 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000752 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000753 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
755 return NULL;
756}
757
Fredrik Lundh06d12682001-01-24 07:59:11 +0000758/* -------------------------------------------------------------------- */
759/* unicode character name tables */
760
761/* data file generated by Tools/unicode/makeunicodedata.py */
762#include "unicodename_db.h"
763
764/* -------------------------------------------------------------------- */
765/* database code (cut and pasted from the unidb package) */
766
/* Case-insensitive hash over the first len bytes of s.

   Each byte is upper-cased and mixed in as hash * scale + byte.  Whenever
   any of bits 24..31 become set, they are folded back into the low byte
   and the value is reduced to 24 bits again. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
781
/* Short names of the Hangul syllable components.  Each row holds, in
   order, the strings used for column 0, column 1 and column 2 of a
   syllable name; the columns have different lengths, so the unused
   trailing slots of the shorter columns are NULL. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
812
Fredrik Lundh06d12682001-01-24 07:59:11 +0000813static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000814is_unified_ideograph(Py_UCS4 code)
815{
816 return (
817 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwisc3509122006-03-11 12:16:23 +0000818 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000819 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
820}
821
822static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000823_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000824{
825 int offset;
826 int i;
827 int word;
828 unsigned char* w;
829
Martin v. Löwisc3509122006-03-11 12:16:23 +0000830 if (code >= 0x110000)
831 return 0;
832
Martin v. Löwis1a214512008-06-11 05:26:20 +0000833 if (self && UCD_Check(self)) {
Martin v. Löwisc3509122006-03-11 12:16:23 +0000834 const change_record *old = get_old_record(self, code);
835 if (old->category_changed == 0) {
836 /* unassigned */
837 return 0;
838 }
839 }
840
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000841 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000842 /* Hangul syllable. */
843 int SIndex = code - SBase;
844 int L = SIndex / NCount;
845 int V = (SIndex % NCount) / TCount;
846 int T = SIndex % TCount;
847
848 if (buflen < 27)
849 /* Worst case: HANGUL SYLLABLE <10chars>. */
850 return 0;
851 strcpy(buffer, "HANGUL SYLLABLE ");
852 buffer += 16;
853 strcpy(buffer, hangul_syllables[L][0]);
854 buffer += strlen(hangul_syllables[L][0]);
855 strcpy(buffer, hangul_syllables[V][1]);
856 buffer += strlen(hangul_syllables[V][1]);
857 strcpy(buffer, hangul_syllables[T][2]);
858 buffer += strlen(hangul_syllables[T][2]);
859 *buffer = '\0';
860 return 1;
861 }
862
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000863 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000864 if (buflen < 28)
865 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
866 return 0;
867 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
868 return 1;
869 }
870
Fredrik Lundh06d12682001-01-24 07:59:11 +0000871 /* get offset into phrasebook */
872 offset = phrasebook_offset1[(code>>phrasebook_shift)];
873 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
874 (code&((1<<phrasebook_shift)-1))];
875 if (!offset)
876 return 0;
877
878 i = 0;
879
880 for (;;) {
881 /* get word index */
882 word = phrasebook[offset] - phrasebook_short;
883 if (word >= 0) {
884 word = (word << 8) + phrasebook[offset+1];
885 offset += 2;
886 } else
887 word = phrasebook[offset++];
888 if (i) {
889 if (i > buflen)
890 return 0; /* buffer overflow */
891 buffer[i++] = ' ';
892 }
893 /* copy word string from lexicon. the last character in the
894 word has bit 7 set. the last word in a string ends with
895 0x80 */
896 w = lexicon + lexicon_offset[word];
897 while (*w < 128) {
898 if (i >= buflen)
899 return 0; /* buffer overflow */
900 buffer[i++] = *w++;
901 }
902 if (i >= buflen)
903 return 0; /* buffer overflow */
904 buffer[i++] = *w & 127;
905 if (*w == 128)
906 break; /* end of word */
907 }
908
909 return 1;
910}
911
912static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000913_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000914{
915 /* check if code corresponds to the given name */
916 int i;
917 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000918 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000919 return 0;
920 for (i = 0; i < namelen; i++) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000921 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000922 return 0;
923 }
924 return buffer[namelen] == '\0';
925}
926
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000927static void
928find_syllable(const char *str, int *len, int *pos, int count, int column)
929{
930 int i, len1;
931 *len = -1;
932 for (i = 0; i < count; i++) {
933 char *s = hangul_syllables[i][column];
934 len1 = strlen(s);
935 if (len1 <= *len)
936 continue;
937 if (strncmp(str, s, len1) == 0) {
938 *len = len1;
939 *pos = i;
940 }
941 }
942 if (*len == -1) {
943 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000944 }
945}
946
Fredrik Lundh06d12682001-01-24 07:59:11 +0000947static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000948_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000949{
950 unsigned int h, v;
951 unsigned int mask = code_size-1;
952 unsigned int i, incr;
953
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000954 /* Check for hangul syllables. */
955 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Thomas Wouters1e365b22006-03-01 21:58:30 +0000956 int len, L = -1, V = -1, T = -1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000957 const char *pos = name + 16;
958 find_syllable(pos, &len, &L, LCount, 0);
959 pos += len;
960 find_syllable(pos, &len, &V, VCount, 1);
961 pos += len;
962 find_syllable(pos, &len, &T, TCount, 2);
963 pos += len;
Martin v. Löwis8b291e22005-09-18 08:17:56 +0000964 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000965 *code = SBase + (L*VCount+V)*TCount + T;
966 return 1;
967 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000968 /* Otherwise, it's an illegal syllable name. */
969 return 0;
970 }
971
972 /* Check for unified ideographs. */
973 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
974 /* Four or five hexdigits must follow. */
975 v = 0;
976 name += 22;
977 namelen -= 22;
978 if (namelen != 4 && namelen != 5)
979 return 0;
980 while (namelen--) {
981 v *= 16;
982 if (*name >= '0' && *name <= '9')
983 v += *name - '0';
984 else if (*name >= 'A' && *name <= 'F')
985 v += *name - 'A' + 10;
986 else
987 return 0;
988 name++;
989 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000990 if (!is_unified_ideograph(v))
991 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000992 *code = v;
993 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000994 }
995
Fredrik Lundh06d12682001-01-24 07:59:11 +0000996 /* the following is the same as python's dictionary lookup, with
997 only minor changes. see the makeunicodedata script for more
998 details */
999
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001000 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001001 i = (~h) & mask;
1002 v = code_hash[i];
1003 if (!v)
1004 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001005 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001006 *code = v;
1007 return 1;
1008 }
1009 incr = (h ^ (h >> 3)) & mask;
1010 if (!incr)
1011 incr = mask;
1012 for (;;) {
1013 i = (i + incr) & mask;
1014 v = code_hash[i];
1015 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001016 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001017 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001018 *code = v;
1019 return 1;
1020 }
1021 incr = incr << 1;
1022 if (incr > mask)
1023 incr = incr ^ code_poly;
1024 }
1025}
1026
1027static const _PyUnicode_Name_CAPI hashAPI =
1028{
1029 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001030 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001031 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001032};
1033
1034/* -------------------------------------------------------------------- */
1035/* Python bindings */
1036
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001037PyDoc_STRVAR(unicodedata_name__doc__,
1038"name(unichr[, default])\n\
1039Returns the name assigned to the Unicode character unichr as a\n\
1040string. If no name is defined, default is returned, or, if not\n\
1041given, ValueError is raised.");
1042
Fredrik Lundh06d12682001-01-24 07:59:11 +00001043static PyObject *
1044unicodedata_name(PyObject* self, PyObject* args)
1045{
1046 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001047 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001048
1049 PyUnicodeObject* v;
1050 PyObject* defobj = NULL;
1051 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1052 return NULL;
1053
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001054 c = getuchar(v);
1055 if (c == (Py_UCS4)-1)
1056 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001057
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001058 if (!_getucname(self, c, name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001059 if (defobj == NULL) {
1060 PyErr_SetString(PyExc_ValueError, "no such name");
1061 return NULL;
1062 }
1063 else {
1064 Py_INCREF(defobj);
1065 return defobj;
1066 }
1067 }
1068
Walter Dörwald4254e762007-06-05 16:04:09 +00001069 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001070}
1071
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001072PyDoc_STRVAR(unicodedata_lookup__doc__,
1073"lookup(name)\n\
1074\n\
1075Look up character by name. If a character with the\n\
1076given name is found, return the corresponding Unicode\n\
1077character. If not found, KeyError is raised.");
1078
Fredrik Lundh06d12682001-01-24 07:59:11 +00001079static PyObject *
1080unicodedata_lookup(PyObject* self, PyObject* args)
1081{
1082 Py_UCS4 code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001083 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001084
1085 char* name;
1086 int namelen;
1087 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1088 return NULL;
1089
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001090 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001091 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1092 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001093 return NULL;
1094 }
1095
Guido van Rossum806c2462007-08-06 23:33:07 +00001096#ifndef Py_UNICODE_WIDE
1097 if (code >= 0x10000) {
1098 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1099 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1100 return PyUnicode_FromUnicode(str, 2);
1101 }
1102#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001103 str[0] = (Py_UNICODE) code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001104 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001105}
1106
/* Method table; used both as the module's function list and as the
   method list of UCD_Type. */
1108
1109static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001110 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1111 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1112 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1113 {"category", unicodedata_category, METH_VARARGS,
1114 unicodedata_category__doc__},
1115 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1116 unicodedata_bidirectional__doc__},
1117 {"combining", unicodedata_combining, METH_VARARGS,
1118 unicodedata_combining__doc__},
1119 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1120 unicodedata_mirrored__doc__},
1121 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1122 unicodedata_east_asian_width__doc__},
1123 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1124 unicodedata_decomposition__doc__},
1125 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1126 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1127 {"normalize", unicodedata_normalize, METH_VARARGS,
1128 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001129 {NULL, NULL} /* sentinel */
1130};
1131
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001132static PyTypeObject UCD_Type = {
1133 /* The ob_type field must be initialized in the module init function
1134 * to be portable to Windows without using C++. */
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001135 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001136 "unicodedata.UCD", /*tp_name*/
1137 sizeof(PreviousDBVersion), /*tp_basicsize*/
1138 0, /*tp_itemsize*/
1139 /* methods */
1140 (destructor)PyObject_Del, /*tp_dealloc*/
1141 0, /*tp_print*/
1142 0, /*tp_getattr*/
1143 0, /*tp_setattr*/
1144 0, /*tp_compare*/
1145 0, /*tp_repr*/
1146 0, /*tp_as_number*/
1147 0, /*tp_as_sequence*/
1148 0, /*tp_as_mapping*/
1149 0, /*tp_hash*/
1150 0, /*tp_call*/
1151 0, /*tp_str*/
1152 PyObject_GenericGetAttr,/*tp_getattro*/
1153 0, /*tp_setattro*/
1154 0, /*tp_as_buffer*/
1155 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1156 0, /*tp_doc*/
1157 0, /*tp_traverse*/
1158 0, /*tp_clear*/
1159 0, /*tp_richcompare*/
1160 0, /*tp_weaklistoffset*/
1161 0, /*tp_iter*/
1162 0, /*tp_iternext*/
1163 unicodedata_functions, /*tp_methods*/
1164 DB_members, /*tp_members*/
1165 0, /*tp_getset*/
1166 0, /*tp_base*/
1167 0, /*tp_dict*/
1168 0, /*tp_descr_get*/
1169 0, /*tp_descr_set*/
1170 0, /*tp_dictoffset*/
1171 0, /*tp_init*/
1172 0, /*tp_alloc*/
1173 0, /*tp_new*/
1174 0, /*tp_free*/
1175 0, /*tp_is_gc*/
1176};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001177
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001178PyDoc_STRVAR(unicodedata_docstring,
1179"This module provides access to the Unicode Character Database which\n\
1180defines character properties for all Unicode characters. The data in\n\
1181this database is based on the UnicodeData.txt file version\n\
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011824.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001183\n\
1184The module uses the same names and symbols as defined by the\n\
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001185UnicodeData File Format 4.1.0 (see\n\
1186http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001187
Martin v. Löwis1a214512008-06-11 05:26:20 +00001188
1189static struct PyModuleDef unicodedatamodule = {
1190 PyModuleDef_HEAD_INIT,
1191 "unicodedata",
1192 unicodedata_docstring,
1193 -1,
1194 unicodedata_functions,
1195 NULL,
1196 NULL,
1197 NULL,
1198 NULL
1199};
1200
Mark Hammond62b1ab12002-07-23 06:31:15 +00001201PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001202PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001203{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001204 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001205
Christian Heimes90aa7642007-12-19 02:45:37 +00001206 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001207
Martin v. Löwis1a214512008-06-11 05:26:20 +00001208 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001209 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001210 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001211
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001212 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001213 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001214 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001215
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001216 /* Previous versions */
1217 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1218 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001219 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001220
Fredrik Lundh06d12682001-01-24 07:59:11 +00001221 /* Export C API */
1222 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001223 if (v != NULL)
1224 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001225 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001226}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001227
1228/*
1229Local variables:
1230c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001231indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001232End:
1233*/