blob: 12bda093f7369a8803327f650049d1fd82791d32 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.1 data base.

   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One property record; records are stored in _PyUnicode_Database_Records
   and selected per code point by _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Per-code-point differences of a previous database version relative to
   the current one.  In the unsigned char fields the value 0xFF is tested
   by the accessors below as "no override"; category_changed == 0 is
   treated as "unassigned in the old version".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;
40
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000041/* data file generated by Tools/unicode/makeunicodedata.py */
42#include "unicodedata_db.h"
43
44static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000045_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000048 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 index = 0;
50 else {
51 index = index1[(code>>SHIFT)];
52 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
53 }
54
55 return &_PyUnicode_Database_Records[index];
56}
57
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000058/* ------------- Previous-version API ------------------------------------- */
59typedef struct previous_version {
60 PyObject_HEAD
61 const char *name;
62 const change_record* (*getrecord)(Py_UCS4);
63 Py_UCS4 (*normalization)(Py_UCS4);
64} PreviousDBVersion;
65
66#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068static PyMemberDef DB_members[] = {
69 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
70 {NULL}
71};
72
Thomas Wouters89f507f2006-12-13 04:49:30 +000073/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000074static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000075#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076
77static PyObject*
78new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
79 Py_UCS4 (*normalization)(Py_UCS4))
80{
81 PreviousDBVersion *self;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000082 self = PyObject_New(PreviousDBVersion, &UCD_Type);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000083 if (self == NULL)
84 return NULL;
85 self->name = name;
86 self->getrecord = getrecord;
87 self->normalization = normalization;
88 return (PyObject*)self;
89}
90
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000091
92static Py_UCS4 getuchar(PyUnicodeObject *obj)
93{
94 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
95
96 if (PyUnicode_GET_SIZE(obj) == 1)
97 return *v;
98#ifndef Py_UNICODE_WIDE
99 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
100 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
101 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
102 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
103#endif
104 PyErr_SetString(PyExc_TypeError,
105 "need a single Unicode character as parameter");
106 return (Py_UCS4)-1;
107}
108
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000109/* --- Module API --------------------------------------------------------- */
110
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000111PyDoc_STRVAR(unicodedata_decimal__doc__,
112"decimal(unichr[, default])\n\
113\n\
114Returns the decimal value assigned to the Unicode character unichr\n\
115as integer. If no such value is defined, default is returned, or, if\n\
116not given, ValueError is raised.");
117
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000119unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000120{
121 PyUnicodeObject *v;
122 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000123 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000125 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000126
Fredrik Lundh06d12682001-01-24 07:59:11 +0000127 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000128 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000129 c = getuchar(v);
130 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000131 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000132
Martin v. Löwis1a214512008-06-11 05:26:20 +0000133 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000134 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000135 if (old->category_changed == 0) {
136 /* unassigned */
137 have_old = 1;
138 rc = -1;
139 }
140 else if (old->decimal_changed != 0xFF) {
141 have_old = 1;
142 rc = old->decimal_changed;
143 }
144 }
145
146 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000147 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000148 if (rc < 0) {
149 if (defobj == NULL) {
150 PyErr_SetString(PyExc_ValueError,
151 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000152 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000153 }
154 else {
155 Py_INCREF(defobj);
156 return defobj;
157 }
158 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000159 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000160}
161
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000162PyDoc_STRVAR(unicodedata_digit__doc__,
163"digit(unichr[, default])\n\
164\n\
165Returns the digit value assigned to the Unicode character unichr as\n\
166integer. If no such value is defined, default is returned, or, if\n\
167not given, ValueError is raised.");
168
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000170unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171{
172 PyUnicodeObject *v;
173 PyObject *defobj = NULL;
174 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000175 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000176
Fredrik Lundh06d12682001-01-24 07:59:11 +0000177 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000178 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000179 c = getuchar(v);
180 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000181 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000182 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000183 if (rc < 0) {
184 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000185 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000186 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000187 }
188 else {
189 Py_INCREF(defobj);
190 return defobj;
191 }
192 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000193 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000194}
195
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000196PyDoc_STRVAR(unicodedata_numeric__doc__,
197"numeric(unichr[, default])\n\
198\n\
199Returns the numeric value assigned to the Unicode character unichr\n\
200as float. If no such value is defined, default is returned, or, if\n\
201not given, ValueError is raised.");
202
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205{
206 PyUnicodeObject *v;
207 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000208 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000210 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000211
Fredrik Lundh06d12682001-01-24 07:59:11 +0000212 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000213 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000214 c = getuchar(v);
215 if (c == (Py_UCS4)-1)
216 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000217
Martin v. Löwis1a214512008-06-11 05:26:20 +0000218 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000219 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000220 if (old->category_changed == 0) {
221 /* unassigned */
222 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223 rc = -1.0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000224 }
225 else if (old->decimal_changed != 0xFF) {
226 have_old = 1;
227 rc = old->decimal_changed;
228 }
229 }
230
231 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000232 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233 if (rc == -1.0) {
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000234 if (defobj == NULL) {
Fredrik Lundh06d12682001-01-24 07:59:11 +0000235 PyErr_SetString(PyExc_ValueError, "not a numeric character");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000236 return NULL;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000237 }
238 else {
239 Py_INCREF(defobj);
240 return defobj;
241 }
242 }
243 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000244}
245
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000246PyDoc_STRVAR(unicodedata_category__doc__,
247"category(unichr)\n\
248\n\
249Returns the general category assigned to the Unicode character\n\
250unichr as string.");
251
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000253unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254{
255 PyUnicodeObject *v;
256 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000257 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000258
259 if (!PyArg_ParseTuple(args, "O!:category",
260 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000261 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000262 c = getuchar(v);
263 if (c == (Py_UCS4)-1)
264 return NULL;
265 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000266 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000267 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000268 if (old->category_changed != 0xFF)
269 index = old->category_changed;
270 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000271 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000272}
273
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000274PyDoc_STRVAR(unicodedata_bidirectional__doc__,
275"bidirectional(unichr)\n\
276\n\
277Returns the bidirectional category assigned to the Unicode character\n\
278unichr as string. If no such value is defined, an empty string is\n\
279returned.");
280
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000282unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000283{
284 PyUnicodeObject *v;
285 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000286 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287
288 if (!PyArg_ParseTuple(args, "O!:bidirectional",
289 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000290 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000291 c = getuchar(v);
292 if (c == (Py_UCS4)-1)
293 return NULL;
294 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000295 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000296 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000297 if (old->category_changed == 0)
298 index = 0; /* unassigned */
299 else if (old->bidir_changed != 0xFF)
300 index = old->bidir_changed;
301 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000302 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303}
304
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000305PyDoc_STRVAR(unicodedata_combining__doc__,
306"combining(unichr)\n\
307\n\
308Returns the canonical combining class assigned to the Unicode\n\
309character unichr as integer. Returns 0 if no combining class is\n\
310defined.");
311
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000313unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000314{
315 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000316 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000317 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000318
319 if (!PyArg_ParseTuple(args, "O!:combining",
320 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000321 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000322 c = getuchar(v);
323 if (c == (Py_UCS4)-1)
324 return NULL;
325 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000326 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000327 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000328 if (old->category_changed == 0)
329 index = 0; /* unassigned */
330 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000331 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000332}
333
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000334PyDoc_STRVAR(unicodedata_mirrored__doc__,
335"mirrored(unichr)\n\
336\n\
337Returns the mirrored property assigned to the Unicode character\n\
338unichr as integer. Returns 1 if the character has been identified as\n\
339a \"mirrored\" character in bidirectional text, 0 otherwise.");
340
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000342unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000343{
344 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000345 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000346 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000347
348 if (!PyArg_ParseTuple(args, "O!:mirrored",
349 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000350 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000351 c = getuchar(v);
352 if (c == (Py_UCS4)-1)
353 return NULL;
354 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000355 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000356 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000362 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363}
364
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000365PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
366"east_asian_width(unichr)\n\
367\n\
368Returns the east asian width assigned to the Unicode character\n\
369unichr as string.");
370
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000371static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000372unicodedata_east_asian_width(PyObject *self, PyObject *args)
373{
374 PyUnicodeObject *v;
375 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000376 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000377
378 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
379 &PyUnicode_Type, &v))
380 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000381 c = getuchar(v);
382 if (c == (Py_UCS4)-1)
383 return NULL;
384 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000385 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000386 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000387 if (old->category_changed == 0)
388 index = 0; /* unassigned */
389 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000390 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000391}
392
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000393PyDoc_STRVAR(unicodedata_decomposition__doc__,
394"decomposition(unichr)\n\
395\n\
396Returns the character decomposition mapping assigned to the Unicode\n\
397character unichr as string. An empty string is returned in case no\n\
398such mapping is defined.");
399
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000400static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000401unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000402{
403 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000404 char decomp[256];
405 int code, index, count, i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000406 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000407 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408
409 if (!PyArg_ParseTuple(args, "O!:decomposition",
410 &PyUnicode_Type, &v))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000411 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000412 c = getuchar(v);
413 if (c == (Py_UCS4)-1)
414 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000416 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417
Martin v. Löwis1a214512008-06-11 05:26:20 +0000418 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000419 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000420 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000421 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000422 }
423
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000424 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000425 index = 0;
426 else {
427 index = decomp_index1[(code>>DECOMP_SHIFT)];
428 index = decomp_index2[(index<<DECOMP_SHIFT)+
429 (code&((1<<DECOMP_SHIFT)-1))];
430 }
431
Tim Peters69b83b12001-11-30 07:23:05 +0000432 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000433 is prefix code (from*/
434 count = decomp_data[index] >> 8;
435
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
438
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index = decomp_data[index] & 255;
443 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
444
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000446 i = strlen(decomp_prefix[prefix_index]);
447 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448
449 while (count-- > 0) {
450 if (i)
451 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000452 assert((size_t)i < sizeof(decomp));
453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455 i += strlen(decomp + i);
456 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000457
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000458 decomp[i] = '\0';
459
Walter Dörwald4254e762007-06-05 16:04:09 +0000460 return PyUnicode_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000461}
462
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000463static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000464get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000465{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000466 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000467 *index = 0;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000468 } else if (self && UCD_Check(self) &&
469 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000470 /* unassigned in old version */
471 *index = 0;
472 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000473 else {
474 *index = decomp_index1[(code>>DECOMP_SHIFT)];
475 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
476 (code&((1<<DECOMP_SHIFT)-1))];
477 }
478
479 /* high byte is number of hex bytes (usually one or two), low byte
480 is prefix code (from*/
481 *count = decomp_data[*index] >> 8;
482 *prefix = decomp_data[*index] & 255;
483
484 (*index)++;
485}
486
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00  /* first precomposed syllable */
#define LBase   0x1100  /* first leading consonant */
#define VBase   0x1161  /* first vowel */
#define TBase   0x11A7  /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
496
497static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000498nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000499{
500 PyObject *result;
501 Py_UNICODE *i, *end, *o;
502 /* Longest decomposition in Unicode 3.2: U+FDFA */
503 Py_UNICODE stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000504 Py_ssize_t space, isize;
505 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 unsigned char prev, cur;
507
508 stackptr = 0;
509 isize = PyUnicode_GET_SIZE(input);
510 /* Overallocate atmost 10 characters. */
511 space = (isize > 10 ? 10 : isize) + isize;
512 result = PyUnicode_FromUnicode(NULL, space);
513 if (!result)
514 return NULL;
515 i = PyUnicode_AS_UNICODE(input);
516 end = i + isize;
517 o = PyUnicode_AS_UNICODE(result);
518
519 while (i < end) {
520 stack[stackptr++] = *i++;
521 while(stackptr) {
522 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000523 /* Hangul Decomposition adds three characters in
524 a single step, so we need atleast that much room. */
525 if (space < 3) {
Martin v. Löwis5b222132007-06-10 09:51:05 +0000526 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000527 space += 10;
528 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000529 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000530 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000531 }
532 /* Hangul Decomposition. */
533 if (SBase <= code && code < (SBase+SCount)) {
534 int SIndex = code - SBase;
535 int L = LBase + SIndex / NCount;
536 int V = VBase + (SIndex % NCount) / TCount;
537 int T = TBase + SIndex % TCount;
538 *o++ = L;
539 *o++ = V;
540 space -= 2;
541 if (T != TBase) {
542 *o++ = T;
543 space --;
544 }
545 continue;
546 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000547 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000548 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000549 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
550 if (value != 0) {
551 stack[stackptr++] = value;
552 continue;
553 }
554 }
555
556 /* Other decompositions. */
557 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000558
559 /* Copy character if it is not decomposable, or has a
560 compatibility decomposition, but we do NFD. */
561 if (!count || (prefix && !k)) {
562 *o++ = code;
563 space--;
564 continue;
565 }
566 /* Copy decomposition onto the stack, in reverse
567 order. */
568 while(count) {
569 code = decomp_data[index + (--count)];
570 stack[stackptr++] = code;
571 }
572 }
573 }
574
575 /* Drop overallocation. Cannot fail. */
576 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
577
578 /* Sort canonically. */
579 i = PyUnicode_AS_UNICODE(result);
580 prev = _getrecord_ex(*i)->combining;
581 end = i + PyUnicode_GET_SIZE(result);
582 for (i++; i < end; i++) {
583 cur = _getrecord_ex(*i)->combining;
584 if (prev == 0 || cur == 0 || prev <= cur) {
585 prev = cur;
586 continue;
587 }
588 /* Non-canonical order. Need to switch *i with previous. */
589 o = i - 1;
590 while (1) {
591 Py_UNICODE tmp = o[1];
592 o[1] = o[0];
593 o[0] = tmp;
594 o--;
595 if (o < PyUnicode_AS_UNICODE(result))
596 break;
597 prev = _getrecord_ex(*o)->combining;
598 if (prev == 0 || prev <= cur)
599 break;
600 }
601 prev = _getrecord_ex(*i)->combining;
602 }
603 return result;
604}
605
606static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000607find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608{
609 int index;
610 for (index = 0; nfc[index].start; index++) {
611 int start = nfc[index].start;
612 if (code < start)
613 return -1;
614 if (code <= start + nfc[index].count) {
615 int delta = code - start;
616 return nfc[index].index + delta;
617 }
618 }
619 return -1;
620}
621
622static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000623nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000624{
625 PyObject *result;
626 Py_UNICODE *i, *i1, *o, *end;
627 int f,l,index,index1,comb;
628 Py_UNICODE code;
629 Py_UNICODE *skipped[20];
630 int cskipped = 0;
631
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000632 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000633 if (!result)
634 return NULL;
635
636 /* We are going to modify result in-place.
637 If nfd_nfkd is changed to sometimes return the input,
638 this code needs to be reviewed. */
639 assert(result != input);
640
641 i = PyUnicode_AS_UNICODE(result);
642 end = i + PyUnicode_GET_SIZE(result);
643 o = PyUnicode_AS_UNICODE(result);
644
645 again:
646 while (i < end) {
647 for (index = 0; index < cskipped; index++) {
648 if (skipped[index] == i) {
649 /* *i character is skipped.
650 Remove from list. */
651 skipped[index] = skipped[cskipped-1];
652 cskipped--;
653 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000654 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000655 }
656 }
657 /* Hangul Composition. We don't need to check for <LV,T>
658 pairs, since we always have decomposed data. */
659 if (LBase <= *i && *i < (LBase+LCount) &&
660 i + 1 < end &&
661 VBase <= i[1] && i[1] <= (VBase+VCount)) {
662 int LIndex, VIndex;
663 LIndex = i[0] - LBase;
664 VIndex = i[1] - VBase;
665 code = SBase + (LIndex*VCount+VIndex)*TCount;
666 i+=2;
667 if (i < end &&
668 TBase <= *i && *i <= (TBase+TCount)) {
669 code += *i-TBase;
670 i++;
671 }
672 *o++ = code;
673 continue;
674 }
675
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000676 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000677 if (f == -1) {
678 *o++ = *i++;
679 continue;
680 }
681 /* Find next unblocked character. */
682 i1 = i+1;
683 comb = 0;
684 while (i1 < end) {
685 int comb1 = _getrecord_ex(*i1)->combining;
686 if (comb1 && comb == comb1) {
687 /* Character is blocked. */
688 i1++;
689 continue;
690 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000691 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 /* *i1 cannot be combined with *i. If *i1
693 is a starter, we don't need to look further.
694 Otherwise, record the combining class. */
695 if (l == -1) {
696 not_combinable:
697 if (comb1 == 0)
698 break;
699 comb = comb1;
700 i1++;
701 continue;
702 }
703 index = f*TOTAL_LAST + l;
704 index1 = comp_index[index >> COMP_SHIFT];
705 code = comp_data[(index1<<COMP_SHIFT)+
706 (index&((1<<COMP_SHIFT)-1))];
707 if (code == 0)
708 goto not_combinable;
709
710 /* Replace the original character. */
711 *i = code;
712 /* Mark the second character unused. */
713 skipped[cskipped++] = i1;
714 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000715 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000716 if (f == -1)
717 break;
718 }
719 *o++ = *i++;
720 }
721 if (o != end)
722 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
723 return result;
724}
725
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000726PyDoc_STRVAR(unicodedata_normalize__doc__,
727"normalize(form, unistr)\n\
728\n\
729Return the normal form 'form' for the Unicode string unistr. Valid\n\
730values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
731
Martin v. Löwis677bde22002-11-23 22:08:15 +0000732static PyObject*
733unicodedata_normalize(PyObject *self, PyObject *args)
734{
735 char *form;
736 PyObject *input;
737
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000738 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000739 &form, &PyUnicode_Type, &input))
740 return NULL;
741
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000742 if (PyUnicode_GetSize(input) == 0) {
743 /* Special case empty input strings, since resizing
744 them later would cause internal errors. */
745 Py_INCREF(input);
746 return input;
747 }
748
Martin v. Löwis677bde22002-11-23 22:08:15 +0000749 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000750 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000751 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000752 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000753 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000754 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000755 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000756 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000757 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
758 return NULL;
759}
760
Fredrik Lundh06d12682001-01-24 07:59:11 +0000761/* -------------------------------------------------------------------- */
762/* unicode character name tables */
763
764/* data file generated by Tools/unicode/makeunicodedata.py */
765#include "unicodename_db.h"
766
767/* -------------------------------------------------------------------- */
768/* database code (cut and pasted from the unidb package) */
769
/* Compute the case-insensitive hash of the first `len` bytes of `s`.

   Each byte is folded to upper case and mixed in as h = h*scale + byte;
   whenever bits 24..31 become set they are XORed back into the low byte
   and the value is truncated to 24 bits.  The result is therefore always
   <= 0x00ffffff.

   Only ASCII 'a'..'z' are folded.  toupper() depends on the current
   locale, so using it here could make the hash of an all-ASCII name
   differ from one computed over its upper-case spelling. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long h = 0;
    int i;

    for (i = 0; i < len; i++) {
        unsigned char ch = (unsigned char) s[i];
        unsigned long ix;

        if (ch >= 'a' && ch <= 'z')
            ch = (unsigned char) (ch - ('a' - 'A'));

        h = (h * scale) + ch;
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
784
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.

   _getucname() indexes column 0 by L, column 1 by V and column 2 by T.
   The columns have different lengths (19, 21 and 28 entries); the unused
   tail of the shorter columns is NULL.  Note that an empty string is a
   valid fragment (row 11 of column 0 and row 0 of column 2). */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
815
Fredrik Lundh06d12682001-01-24 07:59:11 +0000816static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000817is_unified_ideograph(Py_UCS4 code)
818{
819 return (
820 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwisc3509122006-03-11 12:16:23 +0000821 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000822 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
823}
824
825static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000826_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000827{
828 int offset;
829 int i;
830 int word;
831 unsigned char* w;
832
Martin v. Löwisc3509122006-03-11 12:16:23 +0000833 if (code >= 0x110000)
834 return 0;
835
Martin v. Löwis1a214512008-06-11 05:26:20 +0000836 if (self && UCD_Check(self)) {
Martin v. Löwisc3509122006-03-11 12:16:23 +0000837 const change_record *old = get_old_record(self, code);
838 if (old->category_changed == 0) {
839 /* unassigned */
840 return 0;
841 }
842 }
843
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000844 if (SBase <= code && code < SBase+SCount) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000845 /* Hangul syllable. */
846 int SIndex = code - SBase;
847 int L = SIndex / NCount;
848 int V = (SIndex % NCount) / TCount;
849 int T = SIndex % TCount;
850
851 if (buflen < 27)
852 /* Worst case: HANGUL SYLLABLE <10chars>. */
853 return 0;
854 strcpy(buffer, "HANGUL SYLLABLE ");
855 buffer += 16;
856 strcpy(buffer, hangul_syllables[L][0]);
857 buffer += strlen(hangul_syllables[L][0]);
858 strcpy(buffer, hangul_syllables[V][1]);
859 buffer += strlen(hangul_syllables[V][1]);
860 strcpy(buffer, hangul_syllables[T][2]);
861 buffer += strlen(hangul_syllables[T][2]);
862 *buffer = '\0';
863 return 1;
864 }
865
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000866 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000867 if (buflen < 28)
868 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
869 return 0;
870 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
871 return 1;
872 }
873
Fredrik Lundh06d12682001-01-24 07:59:11 +0000874 /* get offset into phrasebook */
875 offset = phrasebook_offset1[(code>>phrasebook_shift)];
876 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
877 (code&((1<<phrasebook_shift)-1))];
878 if (!offset)
879 return 0;
880
881 i = 0;
882
883 for (;;) {
884 /* get word index */
885 word = phrasebook[offset] - phrasebook_short;
886 if (word >= 0) {
887 word = (word << 8) + phrasebook[offset+1];
888 offset += 2;
889 } else
890 word = phrasebook[offset++];
891 if (i) {
892 if (i > buflen)
893 return 0; /* buffer overflow */
894 buffer[i++] = ' ';
895 }
896 /* copy word string from lexicon. the last character in the
897 word has bit 7 set. the last word in a string ends with
898 0x80 */
899 w = lexicon + lexicon_offset[word];
900 while (*w < 128) {
901 if (i >= buflen)
902 return 0; /* buffer overflow */
903 buffer[i++] = *w++;
904 }
905 if (i >= buflen)
906 return 0; /* buffer overflow */
907 buffer[i++] = *w & 127;
908 if (*w == 128)
909 break; /* end of word */
910 }
911
912 return 1;
913}
914
915static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000916_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000917{
918 /* check if code corresponds to the given name */
919 int i;
920 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000921 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000922 return 0;
923 for (i = 0; i < namelen; i++) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000924 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000925 return 0;
926 }
927 return buffer[namelen] == '\0';
928}
929
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000930static void
931find_syllable(const char *str, int *len, int *pos, int count, int column)
932{
933 int i, len1;
934 *len = -1;
935 for (i = 0; i < count; i++) {
936 char *s = hangul_syllables[i][column];
937 len1 = strlen(s);
938 if (len1 <= *len)
939 continue;
940 if (strncmp(str, s, len1) == 0) {
941 *len = len1;
942 *pos = i;
943 }
944 }
945 if (*len == -1) {
946 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000947 }
948}
949
Fredrik Lundh06d12682001-01-24 07:59:11 +0000950static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000951_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000952{
953 unsigned int h, v;
954 unsigned int mask = code_size-1;
955 unsigned int i, incr;
956
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000957 /* Check for hangul syllables. */
958 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Thomas Wouters1e365b22006-03-01 21:58:30 +0000959 int len, L = -1, V = -1, T = -1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000960 const char *pos = name + 16;
961 find_syllable(pos, &len, &L, LCount, 0);
962 pos += len;
963 find_syllable(pos, &len, &V, VCount, 1);
964 pos += len;
965 find_syllable(pos, &len, &T, TCount, 2);
966 pos += len;
Martin v. Löwis8b291e22005-09-18 08:17:56 +0000967 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000968 *code = SBase + (L*VCount+V)*TCount + T;
969 return 1;
970 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000971 /* Otherwise, it's an illegal syllable name. */
972 return 0;
973 }
974
975 /* Check for unified ideographs. */
976 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
977 /* Four or five hexdigits must follow. */
978 v = 0;
979 name += 22;
980 namelen -= 22;
981 if (namelen != 4 && namelen != 5)
982 return 0;
983 while (namelen--) {
984 v *= 16;
985 if (*name >= '0' && *name <= '9')
986 v += *name - '0';
987 else if (*name >= 'A' && *name <= 'F')
988 v += *name - 'A' + 10;
989 else
990 return 0;
991 name++;
992 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000993 if (!is_unified_ideograph(v))
994 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000995 *code = v;
996 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000997 }
998
Fredrik Lundh06d12682001-01-24 07:59:11 +0000999 /* the following is the same as python's dictionary lookup, with
1000 only minor changes. see the makeunicodedata script for more
1001 details */
1002
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001003 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001004 i = (~h) & mask;
1005 v = code_hash[i];
1006 if (!v)
1007 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001008 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001009 *code = v;
1010 return 1;
1011 }
1012 incr = (h ^ (h >> 3)) & mask;
1013 if (!incr)
1014 incr = mask;
1015 for (;;) {
1016 i = (i + incr) & mask;
1017 v = code_hash[i];
1018 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001019 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001020 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001021 *code = v;
1022 return 1;
1023 }
1024 incr = incr << 1;
1025 if (incr > mask)
1026 incr = incr ^ code_poly;
1027 }
1028}
1029
1030static const _PyUnicode_Name_CAPI hashAPI =
1031{
1032 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001033 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001034 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001035};
1036
1037/* -------------------------------------------------------------------- */
1038/* Python bindings */
1039
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001040PyDoc_STRVAR(unicodedata_name__doc__,
1041"name(unichr[, default])\n\
1042Returns the name assigned to the Unicode character unichr as a\n\
1043string. If no name is defined, default is returned, or, if not\n\
1044given, ValueError is raised.");
1045
Fredrik Lundh06d12682001-01-24 07:59:11 +00001046static PyObject *
1047unicodedata_name(PyObject* self, PyObject* args)
1048{
1049 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001050 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001051
1052 PyUnicodeObject* v;
1053 PyObject* defobj = NULL;
1054 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1055 return NULL;
1056
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001057 c = getuchar(v);
1058 if (c == (Py_UCS4)-1)
1059 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001060
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001061 if (!_getucname(self, c, name, sizeof(name))) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001062 if (defobj == NULL) {
1063 PyErr_SetString(PyExc_ValueError, "no such name");
1064 return NULL;
1065 }
1066 else {
1067 Py_INCREF(defobj);
1068 return defobj;
1069 }
1070 }
1071
Walter Dörwald4254e762007-06-05 16:04:09 +00001072 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001073}
1074
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001075PyDoc_STRVAR(unicodedata_lookup__doc__,
1076"lookup(name)\n\
1077\n\
1078Look up character by name. If a character with the\n\
1079given name is found, return the corresponding Unicode\n\
1080character. If not found, KeyError is raised.");
1081
Fredrik Lundh06d12682001-01-24 07:59:11 +00001082static PyObject *
1083unicodedata_lookup(PyObject* self, PyObject* args)
1084{
1085 Py_UCS4 code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001086 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001087
1088 char* name;
1089 int namelen;
1090 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1091 return NULL;
1092
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001093 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001094 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1095 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001096 return NULL;
1097 }
1098
Guido van Rossum806c2462007-08-06 23:33:07 +00001099#ifndef Py_UNICODE_WIDE
1100 if (code >= 0x10000) {
1101 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1102 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1103 return PyUnicode_FromUnicode(str, 2);
1104 }
1105#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001106 str[0] = (Py_UNICODE) code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001107 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001108}
1109
/* Functions exposed both at module level and as methods of UCD objects. */
1111
1112static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001113 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1114 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1115 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1116 {"category", unicodedata_category, METH_VARARGS,
1117 unicodedata_category__doc__},
1118 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1119 unicodedata_bidirectional__doc__},
1120 {"combining", unicodedata_combining, METH_VARARGS,
1121 unicodedata_combining__doc__},
1122 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1123 unicodedata_mirrored__doc__},
1124 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1125 unicodedata_east_asian_width__doc__},
1126 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1127 unicodedata_decomposition__doc__},
1128 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1129 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1130 {"normalize", unicodedata_normalize, METH_VARARGS,
1131 unicodedata_normalize__doc__},
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001132 {NULL, NULL} /* sentinel */
1133};
1134
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001135static PyTypeObject UCD_Type = {
1136 /* The ob_type field must be initialized in the module init function
1137 * to be portable to Windows without using C++. */
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001138 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001139 "unicodedata.UCD", /*tp_name*/
1140 sizeof(PreviousDBVersion), /*tp_basicsize*/
1141 0, /*tp_itemsize*/
1142 /* methods */
1143 (destructor)PyObject_Del, /*tp_dealloc*/
1144 0, /*tp_print*/
1145 0, /*tp_getattr*/
1146 0, /*tp_setattr*/
1147 0, /*tp_compare*/
1148 0, /*tp_repr*/
1149 0, /*tp_as_number*/
1150 0, /*tp_as_sequence*/
1151 0, /*tp_as_mapping*/
1152 0, /*tp_hash*/
1153 0, /*tp_call*/
1154 0, /*tp_str*/
1155 PyObject_GenericGetAttr,/*tp_getattro*/
1156 0, /*tp_setattro*/
1157 0, /*tp_as_buffer*/
1158 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1159 0, /*tp_doc*/
1160 0, /*tp_traverse*/
1161 0, /*tp_clear*/
1162 0, /*tp_richcompare*/
1163 0, /*tp_weaklistoffset*/
1164 0, /*tp_iter*/
1165 0, /*tp_iternext*/
1166 unicodedata_functions, /*tp_methods*/
1167 DB_members, /*tp_members*/
1168 0, /*tp_getset*/
1169 0, /*tp_base*/
1170 0, /*tp_dict*/
1171 0, /*tp_descr_get*/
1172 0, /*tp_descr_set*/
1173 0, /*tp_dictoffset*/
1174 0, /*tp_init*/
1175 0, /*tp_alloc*/
1176 0, /*tp_new*/
1177 0, /*tp_free*/
1178 0, /*tp_is_gc*/
1179};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001180
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001181PyDoc_STRVAR(unicodedata_docstring,
1182"This module provides access to the Unicode Character Database which\n\
1183defines character properties for all Unicode characters. The data in\n\
1184this database is based on the UnicodeData.txt file version\n\
Martin v. Löwis93cbca32008-09-10 14:08:48 +000011855.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001186\n\
1187The module uses the same names and symbols as defined by the\n\
Martin v. Löwis93cbca32008-09-10 14:08:48 +00001188UnicodeData File Format 5.1.0 (see\n\
1189http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001190
Martin v. Löwis1a214512008-06-11 05:26:20 +00001191
1192static struct PyModuleDef unicodedatamodule = {
1193 PyModuleDef_HEAD_INIT,
1194 "unicodedata",
1195 unicodedata_docstring,
1196 -1,
1197 unicodedata_functions,
1198 NULL,
1199 NULL,
1200 NULL,
1201 NULL
1202};
1203
Mark Hammond62b1ab12002-07-23 06:31:15 +00001204PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001205PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001206{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001207 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001208
Christian Heimes90aa7642007-12-19 02:45:37 +00001209 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001210
Martin v. Löwis1a214512008-06-11 05:26:20 +00001211 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001212 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001213 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001214
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001215 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001216 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001217 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001218
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001219 /* Previous versions */
1220 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1221 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001222 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001223
Fredrik Lundh06d12682001-01-24 07:59:11 +00001224 /* Export C API */
1225 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001226 if (v != NULL)
1227 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001228 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001229}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001230
1231/*
1232Local variables:
1233c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001234indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001235End:
1236*/