blob: b4af29071af56bb4d0be316ddf525838b2dbb684 [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melottiae735a72010-03-22 23:07:32 +00003 unicodedata -- Provides access to the Unicode 5.2 data base.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melottiae735a72010-03-22 23:07:32 +00005 Data was extracted from the Unicode 5.2 UnicodeData.txt file.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00006
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Martin v. Löwis7d41e292002-11-23 12:22:32 +00009 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000010
Fredrik Lundhcfcea492000-09-25 08:07:06 +000011 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000012
13 ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One row of the character property table.  Rows are looked up by code
   point through _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* packed quick-check bits, two per normal form; see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Differences of one code point between the current database and an
   older database version.  For the unsigned char fields the callers in
   this file treat 0xFF as "no change, use the current value". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    /* callers additionally treat 0 here as "unassigned in the old version" */
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000042/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000046_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000049 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000059/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
/* Look up the change_record of code point v in the old database version
   wrapped by self (which must be a PreviousDBVersion object).  The
   argument is parenthesized so the cast applies to the whole expression. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion*)(self))->getrecord)(v))
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069static PyMemberDef DB_members[] = {
Antoine Pitrouc83ea132010-05-09 14:46:46 +000070 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 {NULL}
72};
73
Walter Dörwald6fc23822006-11-09 16:23:26 +000074/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000075static PyTypeObject UCD_Type;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076
77static PyObject*
78new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
79 Py_UCS4 (*normalization)(Py_UCS4))
80{
Antoine Pitrouc83ea132010-05-09 14:46:46 +000081 PreviousDBVersion *self;
82 self = PyObject_New(PreviousDBVersion, &UCD_Type);
83 if (self == NULL)
84 return NULL;
85 self->name = name;
86 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000087 self->normalization = normalization;
Antoine Pitrouc83ea132010-05-09 14:46:46 +000088 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089}
90
Walter Dörwalda2a89a82008-06-02 20:36:03 +000091
92static Py_UCS4 getuchar(PyUnicodeObject *obj)
93{
94 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
95
96 if (PyUnicode_GET_SIZE(obj) == 1)
Antoine Pitrouc83ea132010-05-09 14:46:46 +000097 return *v;
Walter Dörwalda2a89a82008-06-02 20:36:03 +000098#ifndef Py_UNICODE_WIDE
99 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
100 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
101 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000102 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000103#endif
104 PyErr_SetString(PyExc_TypeError,
105 "need a single Unicode character as parameter");
106 return (Py_UCS4)-1;
107}
108
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000109/* --- Module API --------------------------------------------------------- */
110
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000111PyDoc_STRVAR(unicodedata_decimal__doc__,
112"decimal(unichr[, default])\n\
113\n\
114Returns the decimal value assigned to the Unicode character unichr\n\
115as integer. If no such value is defined, default is returned, or, if\n\
116not given, ValueError is raised.");
117
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000119unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000120{
121 PyUnicodeObject *v;
122 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000123 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124 long rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000125 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000126
Fredrik Lundh06d12682001-01-24 07:59:11 +0000127 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000128 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000129 c = getuchar(v);
130 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000131 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000132
133 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000134 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000135 if (old->category_changed == 0) {
136 /* unassigned */
137 have_old = 1;
138 rc = -1;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000139 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000140 else if (old->decimal_changed != 0xFF) {
141 have_old = 1;
142 rc = old->decimal_changed;
143 }
144 }
145
146 if (!have_old)
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000147 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000148 if (rc < 0) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000149 if (defobj == NULL) {
150 PyErr_SetString(PyExc_ValueError,
151 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000152 return NULL;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000153 }
154 else {
155 Py_INCREF(defobj);
156 return defobj;
157 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158 }
159 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000160}
161
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000162PyDoc_STRVAR(unicodedata_digit__doc__,
163"digit(unichr[, default])\n\
164\n\
165Returns the digit value assigned to the Unicode character unichr as\n\
166integer. If no such value is defined, default is returned, or, if\n\
167not given, ValueError is raised.");
168
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000170unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171{
172 PyUnicodeObject *v;
173 PyObject *defobj = NULL;
174 long rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000175 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000176
Fredrik Lundh06d12682001-01-24 07:59:11 +0000177 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000178 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000179 c = getuchar(v);
180 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000181 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000182 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000183 if (rc < 0) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000184 if (defobj == NULL) {
185 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000186 return NULL;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000187 }
188 else {
189 Py_INCREF(defobj);
190 return defobj;
191 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192 }
193 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000194}
195
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000196PyDoc_STRVAR(unicodedata_numeric__doc__,
197"numeric(unichr[, default])\n\
198\n\
199Returns the numeric value assigned to the Unicode character unichr\n\
200as float. If no such value is defined, default is returned, or, if\n\
201not given, ValueError is raised.");
202
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205{
206 PyUnicodeObject *v;
207 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000208 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209 double rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000210 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000211
Fredrik Lundh06d12682001-01-24 07:59:11 +0000212 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000213 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000214 c = getuchar(v);
215 if (c == (Py_UCS4)-1)
216 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000217
218 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000219 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000220 if (old->category_changed == 0) {
221 /* unassigned */
222 have_old = 1;
Martin v. Löwisd004fc82006-05-27 08:36:52 +0000223 rc = -1.0;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000224 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225 else if (old->decimal_changed != 0xFF) {
226 have_old = 1;
227 rc = old->decimal_changed;
228 }
229 }
230
231 if (!have_old)
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000232 rc = Py_UNICODE_TONUMERIC(c);
Martin v. Löwisd004fc82006-05-27 08:36:52 +0000233 if (rc == -1.0) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000234 if (defobj == NULL) {
235 PyErr_SetString(PyExc_ValueError, "not a numeric character");
236 return NULL;
237 }
238 else {
239 Py_INCREF(defobj);
240 return defobj;
241 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242 }
243 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000244}
245
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000246PyDoc_STRVAR(unicodedata_category__doc__,
247"category(unichr)\n\
248\n\
249Returns the general category assigned to the Unicode character\n\
250unichr as string.");
251
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000253unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254{
255 PyUnicodeObject *v;
256 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000257 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000258
259 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000260 &PyUnicode_Type, &v))
261 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000262 c = getuchar(v);
263 if (c == (Py_UCS4)-1)
264 return NULL;
265 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000267 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000268 if (old->category_changed != 0xFF)
269 index = old->category_changed;
270 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000271 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000272}
273
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000274PyDoc_STRVAR(unicodedata_bidirectional__doc__,
275"bidirectional(unichr)\n\
276\n\
277Returns the bidirectional category assigned to the Unicode character\n\
278unichr as string. If no such value is defined, an empty string is\n\
279returned.");
280
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000282unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000283{
284 PyUnicodeObject *v;
285 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000286 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287
288 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000289 &PyUnicode_Type, &v))
290 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000291 c = getuchar(v);
292 if (c == (Py_UCS4)-1)
293 return NULL;
294 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000295 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000296 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000297 if (old->category_changed == 0)
298 index = 0; /* unassigned */
299 else if (old->bidir_changed != 0xFF)
300 index = old->bidir_changed;
301 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000302 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303}
304
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000305PyDoc_STRVAR(unicodedata_combining__doc__,
306"combining(unichr)\n\
307\n\
308Returns the canonical combining class assigned to the Unicode\n\
309character unichr as integer. Returns 0 if no combining class is\n\
310defined.");
311
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000313unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000314{
315 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000316 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000317 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000318
319 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000320 &PyUnicode_Type, &v))
321 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000322 c = getuchar(v);
323 if (c == (Py_UCS4)-1)
324 return NULL;
325 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000326 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000327 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000328 if (old->category_changed == 0)
329 index = 0; /* unassigned */
330 }
331 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000332}
333
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000334PyDoc_STRVAR(unicodedata_mirrored__doc__,
335"mirrored(unichr)\n\
336\n\
337Returns the mirrored property assigned to the Unicode character\n\
338unichr as integer. Returns 1 if the character has been identified as\n\
339a \"mirrored\" character in bidirectional text, 0 otherwise.");
340
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000342unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000343{
344 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000345 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000346 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000347
348 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000349 &PyUnicode_Type, &v))
350 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000351 c = getuchar(v);
352 if (c == (Py_UCS4)-1)
353 return NULL;
354 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000356 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 }
362 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363}
364
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000365PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
366"east_asian_width(unichr)\n\
367\n\
368Returns the east asian width assigned to the Unicode character\n\
369unichr as string.");
370
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000371static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000372unicodedata_east_asian_width(PyObject *self, PyObject *args)
373{
374 PyUnicodeObject *v;
375 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000376 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000377
378 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000379 &PyUnicode_Type, &v))
380 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000381 c = getuchar(v);
382 if (c == (Py_UCS4)-1)
383 return NULL;
384 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000385 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000386 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000387 if (old->category_changed == 0)
388 index = 0; /* unassigned */
389 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000390 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000391}
392
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000393PyDoc_STRVAR(unicodedata_decomposition__doc__,
394"decomposition(unichr)\n\
395\n\
396Returns the character decomposition mapping assigned to the Unicode\n\
397character unichr as string. An empty string is returned in case no\n\
398such mapping is defined.");
399
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000400static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000401unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000402{
403 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000404 char decomp[256];
405 int code, index, count, i;
Neal Norwitz37f694f2006-07-27 04:04:50 +0000406 unsigned int prefix_index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000407 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408
409 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000410 &PyUnicode_Type, &v))
411 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000412 c = getuchar(v);
413 if (c == (Py_UCS4)-1)
414 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000416 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000418 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000419 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000420 if (old->category_changed == 0)
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000421 return PyString_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000422 }
423
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000424 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000425 index = 0;
426 else {
427 index = decomp_index1[(code>>DECOMP_SHIFT)];
428 index = decomp_index2[(index<<DECOMP_SHIFT)+
429 (code&((1<<DECOMP_SHIFT)-1))];
430 }
431
Tim Peters69b83b12001-11-30 07:23:05 +0000432 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000433 is prefix code (from*/
434 count = decomp_data[index] >> 8;
435
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
438
Neal Norwitz37f694f2006-07-27 04:04:50 +0000439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index = decomp_data[index] & 255;
443 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
444
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445 /* copy prefix */
Neal Norwitz37f694f2006-07-27 04:04:50 +0000446 i = strlen(decomp_prefix[prefix_index]);
447 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448
449 while (count-- > 0) {
450 if (i)
451 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000452 assert((size_t)i < sizeof(decomp));
453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455 i += strlen(decomp + i);
456 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000457
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000458 decomp[i] = '\0';
459
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000460 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000461}
462
Neal Norwitz88c97842006-04-17 00:36:29 +0000463static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000464get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000465{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000466 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000467 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000468 } else if (self && get_old_record(self, code)->category_changed==0) {
469 /* unassigned in old version */
470 *index = 0;
471 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000472 else {
473 *index = decomp_index1[(code>>DECOMP_SHIFT)];
474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475 (code&((1<<DECOMP_SHIFT)-1))];
476 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000477
Martin v. Löwis677bde22002-11-23 22:08:15 +0000478 /* high byte is number of hex bytes (usually one or two), low byte
479 is prefix code (from*/
480 *count = decomp_data[*index] >> 8;
481 *prefix = decomp_data[*index] & 255;
482
483 (*index)++;
484}
485
486#define SBase 0xAC00
487#define LBase 0x1100
488#define VBase 0x1161
489#define TBase 0x11A7
490#define LCount 19
491#define VCount 21
492#define TCount 28
493#define NCount (VCount*TCount)
494#define SCount (LCount*NCount)
495
496static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000497nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000498{
499 PyObject *result;
500 Py_UNICODE *i, *end, *o;
501 /* Longest decomposition in Unicode 3.2: U+FDFA */
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000502 Py_UNICODE stack[20];
Martin v. Löwis3c6e4182006-04-13 06:36:31 +0000503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 unsigned char prev, cur;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000506
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 stackptr = 0;
508 isize = PyUnicode_GET_SIZE(input);
509 /* Overallocate atmost 10 characters. */
510 space = (isize > 10 ? 10 : isize) + isize;
511 result = PyUnicode_FromUnicode(NULL, space);
512 if (!result)
513 return NULL;
514 i = PyUnicode_AS_UNICODE(input);
515 end = i + isize;
516 o = PyUnicode_AS_UNICODE(result);
517
518 while (i < end) {
519 stack[stackptr++] = *i++;
520 while(stackptr) {
521 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000522 /* Hangul Decomposition adds three characters in
523 a single step, so we need atleast that much room. */
524 if (space < 3) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000525 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 space += 10;
527 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000528 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000530 }
531 /* Hangul Decomposition. */
532 if (SBase <= code && code < (SBase+SCount)) {
533 int SIndex = code - SBase;
534 int L = LBase + SIndex / NCount;
535 int V = VBase + (SIndex % NCount) / TCount;
536 int T = TBase + SIndex % TCount;
537 *o++ = L;
538 *o++ = V;
539 space -= 2;
540 if (T != TBase) {
541 *o++ = T;
542 space --;
543 }
544 continue;
545 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000546 /* normalization changes */
547 if (self) {
548 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
549 if (value != 0) {
550 stack[stackptr++] = value;
551 continue;
552 }
553 }
554
555 /* Other decompositions. */
556 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000557
558 /* Copy character if it is not decomposable, or has a
559 compatibility decomposition, but we do NFD. */
560 if (!count || (prefix && !k)) {
561 *o++ = code;
562 space--;
563 continue;
564 }
565 /* Copy decomposition onto the stack, in reverse
566 order. */
567 while(count) {
568 code = decomp_data[index + (--count)];
569 stack[stackptr++] = code;
570 }
571 }
572 }
573
574 /* Drop overallocation. Cannot fail. */
575 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
576
577 /* Sort canonically. */
578 i = PyUnicode_AS_UNICODE(result);
579 prev = _getrecord_ex(*i)->combining;
580 end = i + PyUnicode_GET_SIZE(result);
581 for (i++; i < end; i++) {
582 cur = _getrecord_ex(*i)->combining;
583 if (prev == 0 || cur == 0 || prev <= cur) {
584 prev = cur;
585 continue;
586 }
587 /* Non-canonical order. Need to switch *i with previous. */
588 o = i - 1;
589 while (1) {
590 Py_UNICODE tmp = o[1];
591 o[1] = o[0];
592 o[0] = tmp;
593 o--;
594 if (o < PyUnicode_AS_UNICODE(result))
595 break;
596 prev = _getrecord_ex(*o)->combining;
597 if (prev == 0 || prev <= cur)
598 break;
599 }
600 prev = _getrecord_ex(*i)->combining;
601 }
602 return result;
603}
604
605static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000606find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000607{
608 int index;
609 for (index = 0; nfc[index].start; index++) {
610 int start = nfc[index].start;
611 if (code < start)
612 return -1;
613 if (code <= start + nfc[index].count) {
614 int delta = code - start;
615 return nfc[index].index + delta;
616 }
617 }
618 return -1;
619}
620
621static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000622nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000623{
624 PyObject *result;
625 Py_UNICODE *i, *i1, *o, *end;
626 int f,l,index,index1,comb;
627 Py_UNICODE code;
628 Py_UNICODE *skipped[20];
629 int cskipped = 0;
630
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000631 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000632 if (!result)
633 return NULL;
634
635 /* We are going to modify result in-place.
636 If nfd_nfkd is changed to sometimes return the input,
637 this code needs to be reviewed. */
638 assert(result != input);
639
640 i = PyUnicode_AS_UNICODE(result);
641 end = i + PyUnicode_GET_SIZE(result);
642 o = PyUnicode_AS_UNICODE(result);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000643
Martin v. Löwis677bde22002-11-23 22:08:15 +0000644 again:
645 while (i < end) {
646 for (index = 0; index < cskipped; index++) {
647 if (skipped[index] == i) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000648 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000649 Remove from list. */
650 skipped[index] = skipped[cskipped-1];
651 cskipped--;
652 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000653 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 }
655 }
656 /* Hangul Composition. We don't need to check for <LV,T>
657 pairs, since we always have decomposed data. */
658 if (LBase <= *i && *i < (LBase+LCount) &&
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000659 i + 1 < end &&
Martin v. Löwis677bde22002-11-23 22:08:15 +0000660 VBase <= i[1] && i[1] <= (VBase+VCount)) {
661 int LIndex, VIndex;
662 LIndex = i[0] - LBase;
663 VIndex = i[1] - VBase;
664 code = SBase + (LIndex*VCount+VIndex)*TCount;
665 i+=2;
666 if (i < end &&
667 TBase <= *i && *i <= (TBase+TCount)) {
668 code += *i-TBase;
669 i++;
670 }
671 *o++ = code;
672 continue;
673 }
674
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000675 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000676 if (f == -1) {
677 *o++ = *i++;
678 continue;
679 }
680 /* Find next unblocked character. */
681 i1 = i+1;
682 comb = 0;
683 while (i1 < end) {
684 int comb1 = _getrecord_ex(*i1)->combining;
Victor Stinner7c924ec2010-03-04 12:09:33 +0000685 if (comb && (comb1 == 0 || comb == comb1)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000686 /* Character is blocked. */
687 i1++;
688 continue;
689 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000690 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000691 /* *i1 cannot be combined with *i. If *i1
692 is a starter, we don't need to look further.
693 Otherwise, record the combining class. */
694 if (l == -1) {
695 not_combinable:
696 if (comb1 == 0)
697 break;
698 comb = comb1;
699 i1++;
700 continue;
701 }
702 index = f*TOTAL_LAST + l;
703 index1 = comp_index[index >> COMP_SHIFT];
704 code = comp_data[(index1<<COMP_SHIFT)+
705 (index&((1<<COMP_SHIFT)-1))];
706 if (code == 0)
707 goto not_combinable;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000708
Martin v. Löwis677bde22002-11-23 22:08:15 +0000709 /* Replace the original character. */
710 *i = code;
711 /* Mark the second character unused. */
712 skipped[cskipped++] = i1;
713 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000714 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000715 if (f == -1)
716 break;
717 }
718 *o++ = *i++;
719 }
720 if (o != end)
721 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
722 return result;
723}
Antoine Pitroue988e282009-04-27 21:53:26 +0000724
725/* Return 1 if the input is certainly normalized, 0 if it might not be. */
726static int
727is_normalized(PyObject *self, PyObject *input, int nfc, int k)
728{
729 Py_UNICODE *i, *end;
730 unsigned char prev_combining = 0, quickcheck_mask;
731
732 /* An older version of the database is requested, quickchecks must be
733 disabled. */
734 if (self != NULL)
735 return 0;
736
737 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
738 as described in http://unicode.org/reports/tr15/#Annex8. */
739 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
740
741 i = PyUnicode_AS_UNICODE(input);
742 end = i + PyUnicode_GET_SIZE(input);
743 while (i < end) {
744 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
745 unsigned char combining = record->combining;
746 unsigned char quickcheck = record->normalization_quick_check;
747
748 if (quickcheck & quickcheck_mask)
749 return 0; /* this string might need normalization */
750 if (combining && prev_combining > combining)
751 return 0; /* non-canonical sort order, not normalized */
752 prev_combining = combining;
753 }
754 return 1; /* certainly normalized */
755}
756
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000757PyDoc_STRVAR(unicodedata_normalize__doc__,
758"normalize(form, unistr)\n\
759\n\
760Return the normal form 'form' for the Unicode string unistr. Valid\n\
761values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
762
Martin v. Löwis677bde22002-11-23 22:08:15 +0000763static PyObject*
764unicodedata_normalize(PyObject *self, PyObject *args)
765{
766 char *form;
767 PyObject *input;
768
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000769 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000770 &form, &PyUnicode_Type, &input))
771 return NULL;
772
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000773 if (PyUnicode_GetSize(input) == 0) {
774 /* Special case empty input strings, since resizing
775 them later would cause internal errors. */
776 Py_INCREF(input);
777 return input;
778 }
779
Antoine Pitroue988e282009-04-27 21:53:26 +0000780 if (strcmp(form, "NFC") == 0) {
781 if (is_normalized(self, input, 1, 0)) {
782 Py_INCREF(input);
783 return input;
784 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000785 return nfc_nfkc(self, input, 0);
Antoine Pitroue988e282009-04-27 21:53:26 +0000786 }
787 if (strcmp(form, "NFKC") == 0) {
788 if (is_normalized(self, input, 1, 1)) {
789 Py_INCREF(input);
790 return input;
791 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000792 return nfc_nfkc(self, input, 1);
Antoine Pitroue988e282009-04-27 21:53:26 +0000793 }
794 if (strcmp(form, "NFD") == 0) {
795 if (is_normalized(self, input, 0, 0)) {
796 Py_INCREF(input);
797 return input;
798 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000799 return nfd_nfkd(self, input, 0);
Antoine Pitroue988e282009-04-27 21:53:26 +0000800 }
801 if (strcmp(form, "NFKD") == 0) {
802 if (is_normalized(self, input, 0, 1)) {
803 Py_INCREF(input);
804 return input;
805 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000806 return nfd_nfkd(self, input, 1);
Antoine Pitroue988e282009-04-27 21:53:26 +0000807 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000808 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
809 return NULL;
810}
811
Fredrik Lundh06d12682001-01-24 07:59:11 +0000812/* -------------------------------------------------------------------- */
813/* unicode character name tables */
814
815/* data file generated by Tools/unicode/makeunicodedata.py */
816#include "unicodename_db.h"
817
818/* -------------------------------------------------------------------- */
819/* database code (cut and pasted from the unidb package) */
820
/* Case-insensitive hash of the first len bytes of s, multiplied through
   with scale at every step.  The result is folded so that it always fits
   into 24 bits.

   Only the ASCII letters a-z are folded to upper case, and this is done
   without toupper(): toupper() follows the current C locale, so a hash
   computed under some locales could disagree with the generated table. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        unsigned char ch = (unsigned char) s[i];
        if (ch >= 'a' && ch <= 'z')
            ch = (unsigned char) (ch - 'a' + 'A');
        h = (h * scale) + ch;
        /* fold the top byte back into the low 24 bits */
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
835
/* Short names of the Hangul jamo, one row per index.
   Column 0: leading consonants, column 1: vowels, column 2: trailing
   consonants.  The columns have different lengths; NULL marks rows past
   the end of a column. */
static char *hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
866
Fredrik Lundh06d12682001-01-24 07:59:11 +0000867static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000868is_unified_ideograph(Py_UCS4 code)
869{
870 return (
871 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwise03c7782010-11-22 10:53:46 +0000872 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
873 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
874 (0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000875}
876
877static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000878_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000879{
880 int offset;
881 int i;
882 int word;
883 unsigned char* w;
884
Martin v. Löwisc3509122006-03-11 12:16:23 +0000885 if (code >= 0x110000)
886 return 0;
887
888 if (self) {
889 const change_record *old = get_old_record(self, code);
890 if (old->category_changed == 0) {
891 /* unassigned */
892 return 0;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000893 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000894 }
895
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000896 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000897 /* Hangul syllable. */
898 int SIndex = code - SBase;
899 int L = SIndex / NCount;
900 int V = (SIndex % NCount) / TCount;
901 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000902
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000903 if (buflen < 27)
904 /* Worst case: HANGUL SYLLABLE <10chars>. */
905 return 0;
906 strcpy(buffer, "HANGUL SYLLABLE ");
907 buffer += 16;
908 strcpy(buffer, hangul_syllables[L][0]);
909 buffer += strlen(hangul_syllables[L][0]);
910 strcpy(buffer, hangul_syllables[V][1]);
911 buffer += strlen(hangul_syllables[V][1]);
912 strcpy(buffer, hangul_syllables[T][2]);
913 buffer += strlen(hangul_syllables[T][2]);
914 *buffer = '\0';
915 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000916 }
917
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000918 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000919 if (buflen < 28)
920 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
921 return 0;
922 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
923 return 1;
924 }
925
Fredrik Lundh06d12682001-01-24 07:59:11 +0000926 /* get offset into phrasebook */
927 offset = phrasebook_offset1[(code>>phrasebook_shift)];
928 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
929 (code&((1<<phrasebook_shift)-1))];
930 if (!offset)
931 return 0;
932
933 i = 0;
934
935 for (;;) {
936 /* get word index */
937 word = phrasebook[offset] - phrasebook_short;
938 if (word >= 0) {
939 word = (word << 8) + phrasebook[offset+1];
940 offset += 2;
941 } else
942 word = phrasebook[offset++];
943 if (i) {
944 if (i > buflen)
945 return 0; /* buffer overflow */
946 buffer[i++] = ' ';
947 }
948 /* copy word string from lexicon. the last character in the
949 word has bit 7 set. the last word in a string ends with
950 0x80 */
951 w = lexicon + lexicon_offset[word];
952 while (*w < 128) {
953 if (i >= buflen)
954 return 0; /* buffer overflow */
955 buffer[i++] = *w++;
956 }
957 if (i >= buflen)
958 return 0; /* buffer overflow */
959 buffer[i++] = *w & 127;
960 if (*w == 128)
961 break; /* end of word */
962 }
963
964 return 1;
965}
966
967static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000968_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000969{
970 /* check if code corresponds to the given name */
971 int i;
972 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000973 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000974 return 0;
975 for (i = 0; i < namelen; i++) {
Neal Norwitz65c05b22006-04-10 02:17:47 +0000976 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000977 return 0;
978 }
979 return buffer[namelen] == '\0';
980}
981
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000982static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000983find_syllable(const char *str, int *len, int *pos, int count, int column)
984{
985 int i, len1;
986 *len = -1;
987 for (i = 0; i < count; i++) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000988 char *s = hangul_syllables[i][column];
989 len1 = strlen(s);
990 if (len1 <= *len)
991 continue;
992 if (strncmp(str, s, len1) == 0) {
993 *len = len1;
994 *pos = i;
995 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000996 }
997 if (*len == -1) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000998 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000999 }
1000}
1001
Fredrik Lundh06d12682001-01-24 07:59:11 +00001002static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001003_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001004{
1005 unsigned int h, v;
1006 unsigned int mask = code_size-1;
1007 unsigned int i, incr;
1008
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001009 /* Check for hangul syllables. */
1010 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001011 int len, L = -1, V = -1, T = -1;
1012 const char *pos = name + 16;
1013 find_syllable(pos, &len, &L, LCount, 0);
1014 pos += len;
1015 find_syllable(pos, &len, &V, VCount, 1);
1016 pos += len;
1017 find_syllable(pos, &len, &T, TCount, 2);
1018 pos += len;
1019 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1020 *code = SBase + (L*VCount+V)*TCount + T;
1021 return 1;
1022 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001023 /* Otherwise, it's an illegal syllable name. */
1024 return 0;
1025 }
1026
1027 /* Check for unified ideographs. */
1028 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1029 /* Four or five hexdigits must follow. */
1030 v = 0;
1031 name += 22;
1032 namelen -= 22;
1033 if (namelen != 4 && namelen != 5)
1034 return 0;
1035 while (namelen--) {
1036 v *= 16;
1037 if (*name >= '0' && *name <= '9')
1038 v += *name - '0';
1039 else if (*name >= 'A' && *name <= 'F')
1040 v += *name - 'A' + 10;
1041 else
1042 return 0;
1043 name++;
1044 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001045 if (!is_unified_ideograph(v))
1046 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001047 *code = v;
1048 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001049 }
1050
Fredrik Lundh06d12682001-01-24 07:59:11 +00001051 /* the following is the same as python's dictionary lookup, with
1052 only minor changes. see the makeunicodedata script for more
1053 details */
1054
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001055 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001056 i = (~h) & mask;
1057 v = code_hash[i];
1058 if (!v)
1059 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001060 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001061 *code = v;
1062 return 1;
1063 }
1064 incr = (h ^ (h >> 3)) & mask;
1065 if (!incr)
1066 incr = mask;
1067 for (;;) {
1068 i = (i + incr) & mask;
1069 v = code_hash[i];
1070 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001071 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001072 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001073 *code = v;
1074 return 1;
1075 }
1076 incr = incr << 1;
1077 if (incr > mask)
1078 incr = incr ^ code_poly;
1079 }
1080}
1081
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001082static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001083{
1084 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001085 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001086 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001087};
1088
1089/* -------------------------------------------------------------------- */
1090/* Python bindings */
1091
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001092PyDoc_STRVAR(unicodedata_name__doc__,
1093"name(unichr[, default])\n\
1094Returns the name assigned to the Unicode character unichr as a\n\
1095string. If no name is defined, default is returned, or, if not\n\
1096given, ValueError is raised.");
1097
Fredrik Lundh06d12682001-01-24 07:59:11 +00001098static PyObject *
1099unicodedata_name(PyObject* self, PyObject* args)
1100{
1101 char name[NAME_MAXLEN];
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001102 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001103
1104 PyUnicodeObject* v;
1105 PyObject* defobj = NULL;
1106 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1107 return NULL;
1108
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001109 c = getuchar(v);
1110 if (c == (Py_UCS4)-1)
1111 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001112
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001113 if (!_getucname(self, c, name, sizeof(name))) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001114 if (defobj == NULL) {
1115 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001116 return NULL;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001117 }
1118 else {
1119 Py_INCREF(defobj);
1120 return defobj;
1121 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001122 }
1123
1124 return Py_BuildValue("s", name);
1125}
1126
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001127PyDoc_STRVAR(unicodedata_lookup__doc__,
1128"lookup(name)\n\
1129\n\
1130Look up character by name. If a character with the\n\
1131given name is found, return the corresponding Unicode\n\
1132character. If not found, KeyError is raised.");
1133
Fredrik Lundh06d12682001-01-24 07:59:11 +00001134static PyObject *
1135unicodedata_lookup(PyObject* self, PyObject* args)
1136{
1137 Py_UCS4 code;
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001138 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001139
1140 char* name;
1141 int namelen;
1142 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1143 return NULL;
1144
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001145 if (!_getcode(self, name, namelen, &code)) {
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001146 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1147 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001148 return NULL;
1149 }
1150
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001151#ifndef Py_UNICODE_WIDE
1152 if (code >= 0x10000) {
1153 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1154 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1155 return PyUnicode_FromUnicode(str, 2);
1156 }
1157#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001158 str[0] = (Py_UNICODE) code;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001159 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001160}
1161
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001162/* XXX Add doc strings. */
1163
1164static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001165 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1166 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1167 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1168 {"category", unicodedata_category, METH_VARARGS,
1169 unicodedata_category__doc__},
1170 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1171 unicodedata_bidirectional__doc__},
1172 {"combining", unicodedata_combining, METH_VARARGS,
1173 unicodedata_combining__doc__},
1174 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1175 unicodedata_mirrored__doc__},
1176 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1177 unicodedata_east_asian_width__doc__},
1178 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1179 unicodedata_decomposition__doc__},
1180 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1181 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1182 {"normalize", unicodedata_normalize, METH_VARARGS,
1183 unicodedata_normalize__doc__},
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001184 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001185};
1186
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001187static PyTypeObject UCD_Type = {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001188 /* The ob_type field must be initialized in the module init function
1189 * to be portable to Windows without using C++. */
1190 PyVarObject_HEAD_INIT(NULL, 0)
1191 "unicodedata.UCD", /*tp_name*/
1192 sizeof(PreviousDBVersion), /*tp_basicsize*/
1193 0, /*tp_itemsize*/
1194 /* methods */
1195 (destructor)PyObject_Del, /*tp_dealloc*/
1196 0, /*tp_print*/
1197 0, /*tp_getattr*/
1198 0, /*tp_setattr*/
1199 0, /*tp_compare*/
1200 0, /*tp_repr*/
1201 0, /*tp_as_number*/
1202 0, /*tp_as_sequence*/
1203 0, /*tp_as_mapping*/
1204 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001205 0, /*tp_call*/
1206 0, /*tp_str*/
1207 PyObject_GenericGetAttr,/*tp_getattro*/
1208 0, /*tp_setattro*/
1209 0, /*tp_as_buffer*/
1210 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1211 0, /*tp_doc*/
1212 0, /*tp_traverse*/
1213 0, /*tp_clear*/
1214 0, /*tp_richcompare*/
1215 0, /*tp_weaklistoffset*/
1216 0, /*tp_iter*/
1217 0, /*tp_iternext*/
1218 unicodedata_functions, /*tp_methods*/
1219 DB_members, /*tp_members*/
1220 0, /*tp_getset*/
1221 0, /*tp_base*/
1222 0, /*tp_dict*/
1223 0, /*tp_descr_get*/
1224 0, /*tp_descr_set*/
1225 0, /*tp_dictoffset*/
1226 0, /*tp_init*/
1227 0, /*tp_alloc*/
1228 0, /*tp_new*/
1229 0, /*tp_free*/
1230 0, /*tp_is_gc*/
1231};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001232
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001233PyDoc_STRVAR(unicodedata_docstring,
1234"This module provides access to the Unicode Character Database which\n\
1235defines character properties for all Unicode characters. The data in\n\
1236this database is based on the UnicodeData.txt file version\n\
Ezio Melottiae735a72010-03-22 23:07:32 +000012375.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001238\n\
1239The module uses the same names and symbols as defined by the\n\
Ezio Melotti0d0b80b2010-03-23 00:38:12 +00001240UnicodeData File Format 5.2.0 (see\n\
1241http://www.unicode.org/reports/tr44/tr44-4.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001242
Mark Hammond62b1ab12002-07-23 06:31:15 +00001243PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001244initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001245{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001246 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001247
Christian Heimese93237d2007-12-19 02:37:44 +00001248 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001249
Fred Drakef585bef2001-03-03 19:41:55 +00001250 m = Py_InitModule3(
1251 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001252 if (!m)
1253 return;
1254
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001255 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001256 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001257 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001258
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001259 /* Previous versions */
1260 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1261 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001262 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001263
Fredrik Lundh06d12682001-01-24 07:59:11 +00001264 /* Export C API */
Larry Hastings402b73f2010-03-25 00:54:54 +00001265 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001266 if (v != NULL)
1267 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001268}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001269
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001270/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001271Local variables:
1272c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001273indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001274End:
1275*/