blob: ad77651a26c4fce17f918b61d4eaac990071017a [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.1 data base.

   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One entry of the per-character property table; _getrecord_ex() returns
   a pointer to one of these for a given code point. */
typedef struct {
    /* general category: index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* bidirectional class: index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* east asian width: index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Per-character differences between the current database and an older
   Unicode version.  In the byte-sized fields the accessors in this file
   treat 0xFF as "unchanged"; category_changed == 0 is treated as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* old bidirectional index */
    const unsigned char category_changed;   /* old category index */
    const unsigned char decimal_changed;    /* old decimal value */
    const unsigned char mirrored_changed;   /* old mirrored flag */
    const int numeric_changed;
} change_record;
40
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000041/* data file generated by Tools/unicode/makeunicodedata.py */
42#include "unicodedata_db.h"
43
44static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000045_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000048 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 index = 0;
50 else {
51 index = index1[(code>>SHIFT)];
52 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
53 }
54
55 return &_PyUnicode_Database_Records[index];
56}
57
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000058/* ------------- Previous-version API ------------------------------------- */
59typedef struct previous_version {
60 PyObject_HEAD
61 const char *name;
62 const change_record* (*getrecord)(Py_UCS4);
63 Py_UCS4 (*normalization)(Py_UCS4);
64} PreviousDBVersion;
65
66#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068static PyMemberDef DB_members[] = {
69 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
70 {NULL}
71};
72
Walter Dörwald6fc23822006-11-09 16:23:26 +000073/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000074static PyTypeObject UCD_Type;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000075
76static PyObject*
77new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
78 Py_UCS4 (*normalization)(Py_UCS4))
79{
80 PreviousDBVersion *self;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000081 self = PyObject_New(PreviousDBVersion, &UCD_Type);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000082 if (self == NULL)
83 return NULL;
84 self->name = name;
85 self->getrecord = getrecord;
86 self->normalization = normalization;
87 return (PyObject*)self;
88}
89
Walter Dörwalda2a89a82008-06-02 20:36:03 +000090
91static Py_UCS4 getuchar(PyUnicodeObject *obj)
92{
93 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
94
95 if (PyUnicode_GET_SIZE(obj) == 1)
96 return *v;
97#ifndef Py_UNICODE_WIDE
98 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
99 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
100 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
101 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
102#endif
103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
106}
107
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000108/* --- Module API --------------------------------------------------------- */
109
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000110PyDoc_STRVAR(unicodedata_decimal__doc__,
111"decimal(unichr[, default])\n\
112\n\
113Returns the decimal value assigned to the Unicode character unichr\n\
114as integer. If no such value is defined, default is returned, or, if\n\
115not given, ValueError is raised.");
116
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000117static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000118unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119{
120 PyUnicodeObject *v;
121 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000122 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000123 long rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000124 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125
Fredrik Lundh06d12682001-01-24 07:59:11 +0000126 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000127 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000128 c = getuchar(v);
129 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000130 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000131
132 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000133 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134 if (old->category_changed == 0) {
135 /* unassigned */
136 have_old = 1;
137 rc = -1;
138 }
139 else if (old->decimal_changed != 0xFF) {
140 have_old = 1;
141 rc = old->decimal_changed;
142 }
143 }
144
145 if (!have_old)
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000146 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147 if (rc < 0) {
148 if (defobj == NULL) {
149 PyErr_SetString(PyExc_ValueError,
150 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000152 }
153 else {
154 Py_INCREF(defobj);
155 return defobj;
156 }
157 }
158 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159}
160
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000161PyDoc_STRVAR(unicodedata_digit__doc__,
162"digit(unichr[, default])\n\
163\n\
164Returns the digit value assigned to the Unicode character unichr as\n\
165integer. If no such value is defined, default is returned, or, if\n\
166not given, ValueError is raised.");
167
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170{
171 PyUnicodeObject *v;
172 PyObject *defobj = NULL;
173 long rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000174 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175
Fredrik Lundh06d12682001-01-24 07:59:11 +0000176 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000177 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000178 c = getuchar(v);
179 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000181 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
183 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000184 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000186 }
187 else {
188 Py_INCREF(defobj);
189 return defobj;
190 }
191 }
192 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193}
194
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000195PyDoc_STRVAR(unicodedata_numeric__doc__,
196"numeric(unichr[, default])\n\
197\n\
198Returns the numeric value assigned to the Unicode character unichr\n\
199as float. If no such value is defined, default is returned, or, if\n\
200not given, ValueError is raised.");
201
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000203unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204{
205 PyUnicodeObject *v;
206 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000207 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208 double rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000209 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210
Fredrik Lundh06d12682001-01-24 07:59:11 +0000211 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000212 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000213 c = getuchar(v);
214 if (c == (Py_UCS4)-1)
215 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000216
217 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000218 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000219 if (old->category_changed == 0) {
220 /* unassigned */
221 have_old = 1;
Martin v. Löwisd004fc82006-05-27 08:36:52 +0000222 rc = -1.0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000223 }
224 else if (old->decimal_changed != 0xFF) {
225 have_old = 1;
226 rc = old->decimal_changed;
227 }
228 }
229
230 if (!have_old)
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000231 rc = Py_UNICODE_TONUMERIC(c);
Martin v. Löwisd004fc82006-05-27 08:36:52 +0000232 if (rc == -1.0) {
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000233 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000234 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000235 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000236 }
237 else {
238 Py_INCREF(defobj);
239 return defobj;
240 }
241 }
242 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243}
244
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000245PyDoc_STRVAR(unicodedata_category__doc__,
246"category(unichr)\n\
247\n\
248Returns the general category assigned to the Unicode character\n\
249unichr as string.");
250
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000251static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000252unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253{
254 PyUnicodeObject *v;
255 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000256 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000257
258 if (!PyArg_ParseTuple(args, "O!:category",
259 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000260 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000261 c = getuchar(v);
262 if (c == (Py_UCS4)-1)
263 return NULL;
264 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000265 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000266 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000267 if (old->category_changed != 0xFF)
268 index = old->category_changed;
269 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000270 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271}
272
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000273PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274"bidirectional(unichr)\n\
275\n\
276Returns the bidirectional category assigned to the Unicode character\n\
277unichr as string. If no such value is defined, an empty string is\n\
278returned.");
279
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000281unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282{
283 PyUnicodeObject *v;
284 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000285 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286
287 if (!PyArg_ParseTuple(args, "O!:bidirectional",
288 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000289 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000290 c = getuchar(v);
291 if (c == (Py_UCS4)-1)
292 return NULL;
293 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000294 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000295 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000296 if (old->category_changed == 0)
297 index = 0; /* unassigned */
298 else if (old->bidir_changed != 0xFF)
299 index = old->bidir_changed;
300 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000301 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000302}
303
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000304PyDoc_STRVAR(unicodedata_combining__doc__,
305"combining(unichr)\n\
306\n\
307Returns the canonical combining class assigned to the Unicode\n\
308character unichr as integer. Returns 0 if no combining class is\n\
309defined.");
310
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000311static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000312unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313{
314 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000315 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000316 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317
318 if (!PyArg_ParseTuple(args, "O!:combining",
319 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000320 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000321 c = getuchar(v);
322 if (c == (Py_UCS4)-1)
323 return NULL;
324 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000325 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000326 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 if (old->category_changed == 0)
328 index = 0; /* unassigned */
329 }
330 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000331}
332
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000333PyDoc_STRVAR(unicodedata_mirrored__doc__,
334"mirrored(unichr)\n\
335\n\
336Returns the mirrored property assigned to the Unicode character\n\
337unichr as integer. Returns 1 if the character has been identified as\n\
338a \"mirrored\" character in bidirectional text, 0 otherwise.");
339
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000340static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342{
343 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000345 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346
347 if (!PyArg_ParseTuple(args, "O!:mirrored",
348 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000349 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000350 c = getuchar(v);
351 if (c == (Py_UCS4)-1)
352 return NULL;
353 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000354 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000355 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000358 else if (old->mirrored_changed != 0xFF)
359 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000360 }
361 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000362}
363
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000364PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365"east_asian_width(unichr)\n\
366\n\
367Returns the east asian width assigned to the Unicode character\n\
368unichr as string.");
369
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000370static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371unicodedata_east_asian_width(PyObject *self, PyObject *args)
372{
373 PyUnicodeObject *v;
374 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000375 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000376
377 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
378 &PyUnicode_Type, &v))
379 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000380 c = getuchar(v);
381 if (c == (Py_UCS4)-1)
382 return NULL;
383 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000384 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000385 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000386 if (old->category_changed == 0)
387 index = 0; /* unassigned */
388 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000389 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000390}
391
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000392PyDoc_STRVAR(unicodedata_decomposition__doc__,
393"decomposition(unichr)\n\
394\n\
395Returns the character decomposition mapping assigned to the Unicode\n\
396character unichr as string. An empty string is returned in case no\n\
397such mapping is defined.");
398
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000399static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000400unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000401{
402 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000403 char decomp[256];
404 int code, index, count, i;
Neal Norwitz37f694f2006-07-27 04:04:50 +0000405 unsigned int prefix_index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000406 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407
408 if (!PyArg_ParseTuple(args, "O!:decomposition",
409 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000410 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000411 c = getuchar(v);
412 if (c == (Py_UCS4)-1)
413 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000414
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000415 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000417 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000418 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000419 if (old->category_changed == 0)
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000420 return PyString_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 }
422
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000423 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000424 index = 0;
425 else {
426 index = decomp_index1[(code>>DECOMP_SHIFT)];
427 index = decomp_index2[(index<<DECOMP_SHIFT)+
428 (code&((1<<DECOMP_SHIFT)-1))];
429 }
430
Tim Peters69b83b12001-11-30 07:23:05 +0000431 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000432 is prefix code (from*/
433 count = decomp_data[index] >> 8;
434
435 /* XXX: could allocate the PyString up front instead
436 (strlen(prefix) + 5 * count + 1 bytes) */
437
Neal Norwitz37f694f2006-07-27 04:04:50 +0000438 /* Based on how index is calculated above and decomp_data is generated
439 from Tools/unicode/makeunicodedata.py, it should not be possible
440 to overflow decomp_prefix. */
441 prefix_index = decomp_data[index] & 255;
442 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
443
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444 /* copy prefix */
Neal Norwitz37f694f2006-07-27 04:04:50 +0000445 i = strlen(decomp_prefix[prefix_index]);
446 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447
448 while (count-- > 0) {
449 if (i)
450 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000451 assert((size_t)i < sizeof(decomp));
452 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
453 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454 i += strlen(decomp + i);
455 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000456
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000457 decomp[i] = '\0';
458
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000459 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000460}
461
Neal Norwitz88c97842006-04-17 00:36:29 +0000462static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000463get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000465 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000466 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000467 } else if (self && get_old_record(self, code)->category_changed==0) {
468 /* unassigned in old version */
469 *index = 0;
470 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000471 else {
472 *index = decomp_index1[(code>>DECOMP_SHIFT)];
473 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
474 (code&((1<<DECOMP_SHIFT)-1))];
475 }
476
477 /* high byte is number of hex bytes (usually one or two), low byte
478 is prefix code (from*/
479 *count = decomp_data[*index] >> 8;
480 *prefix = decomp_data[*index] & 255;
481
482 (*index)++;
483}
484
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of syllables */
494
495static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000496nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000497{
498 PyObject *result;
499 Py_UNICODE *i, *end, *o;
500 /* Longest decomposition in Unicode 3.2: U+FDFA */
501 Py_UNICODE stack[20];
Martin v. Löwis3c6e4182006-04-13 06:36:31 +0000502 Py_ssize_t space, isize;
503 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000504 unsigned char prev, cur;
505
506 stackptr = 0;
507 isize = PyUnicode_GET_SIZE(input);
508 /* Overallocate atmost 10 characters. */
509 space = (isize > 10 ? 10 : isize) + isize;
510 result = PyUnicode_FromUnicode(NULL, space);
511 if (!result)
512 return NULL;
513 i = PyUnicode_AS_UNICODE(input);
514 end = i + isize;
515 o = PyUnicode_AS_UNICODE(result);
516
517 while (i < end) {
518 stack[stackptr++] = *i++;
519 while(stackptr) {
520 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000521 /* Hangul Decomposition adds three characters in
522 a single step, so we need atleast that much room. */
523 if (space < 3) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000524 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000525 space += 10;
526 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000527 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000528 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000529 }
530 /* Hangul Decomposition. */
531 if (SBase <= code && code < (SBase+SCount)) {
532 int SIndex = code - SBase;
533 int L = LBase + SIndex / NCount;
534 int V = VBase + (SIndex % NCount) / TCount;
535 int T = TBase + SIndex % TCount;
536 *o++ = L;
537 *o++ = V;
538 space -= 2;
539 if (T != TBase) {
540 *o++ = T;
541 space --;
542 }
543 continue;
544 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000545 /* normalization changes */
546 if (self) {
547 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
548 if (value != 0) {
549 stack[stackptr++] = value;
550 continue;
551 }
552 }
553
554 /* Other decompositions. */
555 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000556
557 /* Copy character if it is not decomposable, or has a
558 compatibility decomposition, but we do NFD. */
559 if (!count || (prefix && !k)) {
560 *o++ = code;
561 space--;
562 continue;
563 }
564 /* Copy decomposition onto the stack, in reverse
565 order. */
566 while(count) {
567 code = decomp_data[index + (--count)];
568 stack[stackptr++] = code;
569 }
570 }
571 }
572
573 /* Drop overallocation. Cannot fail. */
574 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
575
576 /* Sort canonically. */
577 i = PyUnicode_AS_UNICODE(result);
578 prev = _getrecord_ex(*i)->combining;
579 end = i + PyUnicode_GET_SIZE(result);
580 for (i++; i < end; i++) {
581 cur = _getrecord_ex(*i)->combining;
582 if (prev == 0 || cur == 0 || prev <= cur) {
583 prev = cur;
584 continue;
585 }
586 /* Non-canonical order. Need to switch *i with previous. */
587 o = i - 1;
588 while (1) {
589 Py_UNICODE tmp = o[1];
590 o[1] = o[0];
591 o[0] = tmp;
592 o--;
593 if (o < PyUnicode_AS_UNICODE(result))
594 break;
595 prev = _getrecord_ex(*o)->combining;
596 if (prev == 0 || prev <= cur)
597 break;
598 }
599 prev = _getrecord_ex(*i)->combining;
600 }
601 return result;
602}
603
604static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000605find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000606{
607 int index;
608 for (index = 0; nfc[index].start; index++) {
609 int start = nfc[index].start;
610 if (code < start)
611 return -1;
612 if (code <= start + nfc[index].count) {
613 int delta = code - start;
614 return nfc[index].index + delta;
615 }
616 }
617 return -1;
618}
619
620static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000621nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622{
623 PyObject *result;
624 Py_UNICODE *i, *i1, *o, *end;
625 int f,l,index,index1,comb;
626 Py_UNICODE code;
627 Py_UNICODE *skipped[20];
628 int cskipped = 0;
629
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000630 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000631 if (!result)
632 return NULL;
633
634 /* We are going to modify result in-place.
635 If nfd_nfkd is changed to sometimes return the input,
636 this code needs to be reviewed. */
637 assert(result != input);
638
639 i = PyUnicode_AS_UNICODE(result);
640 end = i + PyUnicode_GET_SIZE(result);
641 o = PyUnicode_AS_UNICODE(result);
642
643 again:
644 while (i < end) {
645 for (index = 0; index < cskipped; index++) {
646 if (skipped[index] == i) {
647 /* *i character is skipped.
648 Remove from list. */
649 skipped[index] = skipped[cskipped-1];
650 cskipped--;
651 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000652 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000653 }
654 }
655 /* Hangul Composition. We don't need to check for <LV,T>
656 pairs, since we always have decomposed data. */
657 if (LBase <= *i && *i < (LBase+LCount) &&
658 i + 1 < end &&
659 VBase <= i[1] && i[1] <= (VBase+VCount)) {
660 int LIndex, VIndex;
661 LIndex = i[0] - LBase;
662 VIndex = i[1] - VBase;
663 code = SBase + (LIndex*VCount+VIndex)*TCount;
664 i+=2;
665 if (i < end &&
666 TBase <= *i && *i <= (TBase+TCount)) {
667 code += *i-TBase;
668 i++;
669 }
670 *o++ = code;
671 continue;
672 }
673
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000674 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000675 if (f == -1) {
676 *o++ = *i++;
677 continue;
678 }
679 /* Find next unblocked character. */
680 i1 = i+1;
681 comb = 0;
682 while (i1 < end) {
683 int comb1 = _getrecord_ex(*i1)->combining;
684 if (comb1 && comb == comb1) {
685 /* Character is blocked. */
686 i1++;
687 continue;
688 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000689 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000690 /* *i1 cannot be combined with *i. If *i1
691 is a starter, we don't need to look further.
692 Otherwise, record the combining class. */
693 if (l == -1) {
694 not_combinable:
695 if (comb1 == 0)
696 break;
697 comb = comb1;
698 i1++;
699 continue;
700 }
701 index = f*TOTAL_LAST + l;
702 index1 = comp_index[index >> COMP_SHIFT];
703 code = comp_data[(index1<<COMP_SHIFT)+
704 (index&((1<<COMP_SHIFT)-1))];
705 if (code == 0)
706 goto not_combinable;
707
708 /* Replace the original character. */
709 *i = code;
710 /* Mark the second character unused. */
711 skipped[cskipped++] = i1;
712 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000713 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000714 if (f == -1)
715 break;
716 }
717 *o++ = *i++;
718 }
719 if (o != end)
720 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
721 return result;
722}
723
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000724PyDoc_STRVAR(unicodedata_normalize__doc__,
725"normalize(form, unistr)\n\
726\n\
727Return the normal form 'form' for the Unicode string unistr. Valid\n\
728values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
729
Martin v. Löwis677bde22002-11-23 22:08:15 +0000730static PyObject*
731unicodedata_normalize(PyObject *self, PyObject *args)
732{
733 char *form;
734 PyObject *input;
735
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000736 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000737 &form, &PyUnicode_Type, &input))
738 return NULL;
739
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000740 if (PyUnicode_GetSize(input) == 0) {
741 /* Special case empty input strings, since resizing
742 them later would cause internal errors. */
743 Py_INCREF(input);
744 return input;
745 }
746
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000748 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000749 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000750 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000751 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000752 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000753 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000754 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000755 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
756 return NULL;
757}
758
Fredrik Lundh06d12682001-01-24 07:59:11 +0000759/* -------------------------------------------------------------------- */
760/* unicode character name tables */
761
762/* data file generated by Tools/unicode/makeunicodedata.py */
763#include "unicodename_db.h"
764
765/* -------------------------------------------------------------------- */
766/* database code (cut and pasted from the unidb package) */
767
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and mixed in with a multiply-by-`scale`
   step.  Whenever bits 24..31 of the running value become non-zero
   they are XOR-folded into the low byte and the value is masked back
   to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
782
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Row i holds, per column, the fragment with index i:
     column 0 - first part  (indexed by L in this file)
     column 1 - second part (indexed by V)
     column 2 - third part  (indexed by T)
   Columns 0 and 1 have fewer entries than column 2; their unused
   trailing slots are NULL. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
813
Fredrik Lundh06d12682001-01-24 07:59:11 +0000814static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000815is_unified_ideograph(Py_UCS4 code)
816{
817 return (
818 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwisc3509122006-03-11 12:16:23 +0000819 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000820 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
821}
822
823static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000824_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000825{
826 int offset;
827 int i;
828 int word;
829 unsigned char* w;
830
Martin v. Löwisc3509122006-03-11 12:16:23 +0000831 if (code >= 0x110000)
832 return 0;
833
834 if (self) {
835 const change_record *old = get_old_record(self, code);
836 if (old->category_changed == 0) {
837 /* unassigned */
838 return 0;
839 }
840 }
841
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000842 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000843 /* Hangul syllable. */
844 int SIndex = code - SBase;
845 int L = SIndex / NCount;
846 int V = (SIndex % NCount) / TCount;
847 int T = SIndex % TCount;
848
849 if (buflen < 27)
850 /* Worst case: HANGUL SYLLABLE <10chars>. */
851 return 0;
852 strcpy(buffer, "HANGUL SYLLABLE ");
853 buffer += 16;
854 strcpy(buffer, hangul_syllables[L][0]);
855 buffer += strlen(hangul_syllables[L][0]);
856 strcpy(buffer, hangul_syllables[V][1]);
857 buffer += strlen(hangul_syllables[V][1]);
858 strcpy(buffer, hangul_syllables[T][2]);
859 buffer += strlen(hangul_syllables[T][2]);
860 *buffer = '\0';
861 return 1;
862 }
863
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000864 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000865 if (buflen < 28)
866 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
867 return 0;
868 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
869 return 1;
870 }
871
Fredrik Lundh06d12682001-01-24 07:59:11 +0000872 /* get offset into phrasebook */
873 offset = phrasebook_offset1[(code>>phrasebook_shift)];
874 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
875 (code&((1<<phrasebook_shift)-1))];
876 if (!offset)
877 return 0;
878
879 i = 0;
880
881 for (;;) {
882 /* get word index */
883 word = phrasebook[offset] - phrasebook_short;
884 if (word >= 0) {
885 word = (word << 8) + phrasebook[offset+1];
886 offset += 2;
887 } else
888 word = phrasebook[offset++];
889 if (i) {
890 if (i > buflen)
891 return 0; /* buffer overflow */
892 buffer[i++] = ' ';
893 }
894 /* copy word string from lexicon. the last character in the
895 word has bit 7 set. the last word in a string ends with
896 0x80 */
897 w = lexicon + lexicon_offset[word];
898 while (*w < 128) {
899 if (i >= buflen)
900 return 0; /* buffer overflow */
901 buffer[i++] = *w++;
902 }
903 if (i >= buflen)
904 return 0; /* buffer overflow */
905 buffer[i++] = *w & 127;
906 if (*w == 128)
907 break; /* end of word */
908 }
909
910 return 1;
911}
912
913static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000914_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000915{
916 /* check if code corresponds to the given name */
917 int i;
918 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000919 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000920 return 0;
921 for (i = 0; i < namelen; i++) {
Neal Norwitz65c05b22006-04-10 02:17:47 +0000922 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000923 return 0;
924 }
925 return buffer[namelen] == '\0';
926}
927
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000928static void
929find_syllable(const char *str, int *len, int *pos, int count, int column)
930{
931 int i, len1;
932 *len = -1;
933 for (i = 0; i < count; i++) {
934 char *s = hangul_syllables[i][column];
935 len1 = strlen(s);
936 if (len1 <= *len)
937 continue;
938 if (strncmp(str, s, len1) == 0) {
939 *len = len1;
940 *pos = i;
941 }
942 }
943 if (*len == -1) {
944 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000945 }
946}
947
Fredrik Lundh06d12682001-01-24 07:59:11 +0000948static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000949_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000950{
951 unsigned int h, v;
952 unsigned int mask = code_size-1;
953 unsigned int i, incr;
954
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000955 /* Check for hangul syllables. */
956 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Thomas Wouters1e365b22006-03-01 21:58:30 +0000957 int len, L = -1, V = -1, T = -1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000958 const char *pos = name + 16;
959 find_syllable(pos, &len, &L, LCount, 0);
960 pos += len;
961 find_syllable(pos, &len, &V, VCount, 1);
962 pos += len;
963 find_syllable(pos, &len, &T, TCount, 2);
964 pos += len;
Martin v. Löwis8b291e22005-09-18 08:17:56 +0000965 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000966 *code = SBase + (L*VCount+V)*TCount + T;
967 return 1;
968 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000969 /* Otherwise, it's an illegal syllable name. */
970 return 0;
971 }
972
973 /* Check for unified ideographs. */
974 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
975 /* Four or five hexdigits must follow. */
976 v = 0;
977 name += 22;
978 namelen -= 22;
979 if (namelen != 4 && namelen != 5)
980 return 0;
981 while (namelen--) {
982 v *= 16;
983 if (*name >= '0' && *name <= '9')
984 v += *name - '0';
985 else if (*name >= 'A' && *name <= 'F')
986 v += *name - 'A' + 10;
987 else
988 return 0;
989 name++;
990 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000991 if (!is_unified_ideograph(v))
992 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000993 *code = v;
994 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000995 }
996
Fredrik Lundh06d12682001-01-24 07:59:11 +0000997 /* the following is the same as python's dictionary lookup, with
998 only minor changes. see the makeunicodedata script for more
999 details */
1000
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001001 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001002 i = (~h) & mask;
1003 v = code_hash[i];
1004 if (!v)
1005 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001006 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001007 *code = v;
1008 return 1;
1009 }
1010 incr = (h ^ (h >> 3)) & mask;
1011 if (!incr)
1012 incr = mask;
1013 for (;;) {
1014 i = (i + incr) & mask;
1015 v = code_hash[i];
1016 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001017 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001018 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001019 *code = v;
1020 return 1;
1021 }
1022 incr = incr << 1;
1023 if (incr > mask)
1024 incr = incr ^ code_poly;
1025 }
1026}
1027
1028static const _PyUnicode_Name_CAPI hashAPI =
1029{
1030 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001031 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001032 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001033};
1034
1035/* -------------------------------------------------------------------- */
1036/* Python bindings */
1037
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001038PyDoc_STRVAR(unicodedata_name__doc__,
1039"name(unichr[, default])\n\
1040Returns the name assigned to the Unicode character unichr as a\n\
1041string. If no name is defined, default is returned, or, if not\n\
1042given, ValueError is raised.");
1043
Fredrik Lundh06d12682001-01-24 07:59:11 +00001044static PyObject *
1045unicodedata_name(PyObject* self, PyObject* args)
1046{
1047 char name[NAME_MAXLEN];
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001048 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001049
1050 PyUnicodeObject* v;
1051 PyObject* defobj = NULL;
1052 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1053 return NULL;
1054
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001055 c = getuchar(v);
1056 if (c == (Py_UCS4)-1)
1057 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001058
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001059 if (!_getucname(self, c, name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001060 if (defobj == NULL) {
1061 PyErr_SetString(PyExc_ValueError, "no such name");
1062 return NULL;
1063 }
1064 else {
1065 Py_INCREF(defobj);
1066 return defobj;
1067 }
1068 }
1069
1070 return Py_BuildValue("s", name);
1071}
1072
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001073PyDoc_STRVAR(unicodedata_lookup__doc__,
1074"lookup(name)\n\
1075\n\
1076Look up character by name. If a character with the\n\
1077given name is found, return the corresponding Unicode\n\
1078character. If not found, KeyError is raised.");
1079
Fredrik Lundh06d12682001-01-24 07:59:11 +00001080static PyObject *
1081unicodedata_lookup(PyObject* self, PyObject* args)
1082{
1083 Py_UCS4 code;
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001084 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001085
1086 char* name;
1087 int namelen;
1088 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1089 return NULL;
1090
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001091 if (!_getcode(self, name, namelen, &code)) {
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001092 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1093 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001094 return NULL;
1095 }
1096
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001097#ifndef Py_UNICODE_WIDE
1098 if (code >= 0x10000) {
1099 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1100 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1101 return PyUnicode_FromUnicode(str, 2);
1102 }
1103#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001104 str[0] = (Py_UNICODE) code;
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001105 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001106}
1107
/* Function table: registered both as the module-level functions and as
   the methods of UCD objects. */
1109
1110static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001111 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1112 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1113 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1114 {"category", unicodedata_category, METH_VARARGS,
1115 unicodedata_category__doc__},
1116 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1117 unicodedata_bidirectional__doc__},
1118 {"combining", unicodedata_combining, METH_VARARGS,
1119 unicodedata_combining__doc__},
1120 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1121 unicodedata_mirrored__doc__},
1122 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1123 unicodedata_east_asian_width__doc__},
1124 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1125 unicodedata_decomposition__doc__},
1126 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1127 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1128 {"normalize", unicodedata_normalize, METH_VARARGS,
1129 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001130 {NULL, NULL} /* sentinel */
1131};
1132
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001133static PyTypeObject UCD_Type = {
1134 /* The ob_type field must be initialized in the module init function
1135 * to be portable to Windows without using C++. */
Martin v. Löwis68192102007-07-21 06:55:02 +00001136 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001137 "unicodedata.UCD", /*tp_name*/
1138 sizeof(PreviousDBVersion), /*tp_basicsize*/
1139 0, /*tp_itemsize*/
1140 /* methods */
1141 (destructor)PyObject_Del, /*tp_dealloc*/
1142 0, /*tp_print*/
1143 0, /*tp_getattr*/
1144 0, /*tp_setattr*/
1145 0, /*tp_compare*/
1146 0, /*tp_repr*/
1147 0, /*tp_as_number*/
1148 0, /*tp_as_sequence*/
1149 0, /*tp_as_mapping*/
1150 0, /*tp_hash*/
1151 0, /*tp_call*/
1152 0, /*tp_str*/
1153 PyObject_GenericGetAttr,/*tp_getattro*/
1154 0, /*tp_setattro*/
1155 0, /*tp_as_buffer*/
1156 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1157 0, /*tp_doc*/
1158 0, /*tp_traverse*/
1159 0, /*tp_clear*/
1160 0, /*tp_richcompare*/
1161 0, /*tp_weaklistoffset*/
1162 0, /*tp_iter*/
1163 0, /*tp_iternext*/
1164 unicodedata_functions, /*tp_methods*/
1165 DB_members, /*tp_members*/
1166 0, /*tp_getset*/
1167 0, /*tp_base*/
1168 0, /*tp_dict*/
1169 0, /*tp_descr_get*/
1170 0, /*tp_descr_set*/
1171 0, /*tp_dictoffset*/
1172 0, /*tp_init*/
1173 0, /*tp_alloc*/
1174 0, /*tp_new*/
1175 0, /*tp_free*/
1176 0, /*tp_is_gc*/
1177};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001178
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001179PyDoc_STRVAR(unicodedata_docstring,
1180"This module provides access to the Unicode Character Database which\n\
1181defines character properties for all Unicode characters. The data in\n\
1182this database is based on the UnicodeData.txt file version\n\
Martin v. Löwis24329ba2008-09-10 13:38:12 +000011835.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001184\n\
1185The module uses the same names and symbols as defined by the\n\
Martin v. Löwis24329ba2008-09-10 13:38:12 +00001186UnicodeData File Format 5.1.0 (see\n\
1187http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001188
Mark Hammond62b1ab12002-07-23 06:31:15 +00001189PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001190initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001191{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001192 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001193
Christian Heimese93237d2007-12-19 02:37:44 +00001194 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001195
Fred Drakef585bef2001-03-03 19:41:55 +00001196 m = Py_InitModule3(
1197 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001198 if (!m)
1199 return;
1200
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001201 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001202 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001203 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001204
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001205 /* Previous versions */
1206 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1207 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001208 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001209
Fredrik Lundh06d12682001-01-24 07:59:11 +00001210 /* Export C API */
1211 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001212 if (v != NULL)
1213 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001214}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001215
1216/*
1217Local variables:
1218c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001219indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001220End:
1221*/