/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.2 data base.

   Data was extracted from the Unicode 5.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One row of per-character properties.  Several fields are small indexes
   into name tables rather than the property values themselves. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* quick-check bits, two per normal form; see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Per-character delta describing how an older database version differs
   from the current one.  For the unsigned char fields the callers in this
   file treat 0xFF as "no change"; category_changed == 0 is treated as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* replacement bidirectional index */
    const unsigned char category_changed;   /* replacement category index */
    const unsigned char decimal_changed;    /* replacement decimal value */
    const unsigned char mirrored_changed;   /* replacement mirrored flag */
    const double numeric_changed;
} change_record;
41
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000042/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000046_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000049 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000059/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
67#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069static PyMemberDef DB_members[] = {
Antoine Pitrouc83ea132010-05-09 14:46:46 +000070 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 {NULL}
72};
73
Walter Dörwald6fc23822006-11-09 16:23:26 +000074/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000075static PyTypeObject UCD_Type;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076
77static PyObject*
78new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
79 Py_UCS4 (*normalization)(Py_UCS4))
80{
Antoine Pitrouc83ea132010-05-09 14:46:46 +000081 PreviousDBVersion *self;
82 self = PyObject_New(PreviousDBVersion, &UCD_Type);
83 if (self == NULL)
84 return NULL;
85 self->name = name;
86 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000087 self->normalization = normalization;
Antoine Pitrouc83ea132010-05-09 14:46:46 +000088 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089}
90
Walter Dörwalda2a89a82008-06-02 20:36:03 +000091
92static Py_UCS4 getuchar(PyUnicodeObject *obj)
93{
94 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
95
96 if (PyUnicode_GET_SIZE(obj) == 1)
Antoine Pitrouc83ea132010-05-09 14:46:46 +000097 return *v;
Walter Dörwalda2a89a82008-06-02 20:36:03 +000098#ifndef Py_UNICODE_WIDE
99 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
100 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
101 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000102 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000103#endif
104 PyErr_SetString(PyExc_TypeError,
105 "need a single Unicode character as parameter");
106 return (Py_UCS4)-1;
107}
108
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000109/* --- Module API --------------------------------------------------------- */
110
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000111PyDoc_STRVAR(unicodedata_decimal__doc__,
112"decimal(unichr[, default])\n\
113\n\
114Returns the decimal value assigned to the Unicode character unichr\n\
115as integer. If no such value is defined, default is returned, or, if\n\
116not given, ValueError is raised.");
117
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000119unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000120{
121 PyUnicodeObject *v;
122 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000123 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124 long rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000125 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000126
Fredrik Lundh06d12682001-01-24 07:59:11 +0000127 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000128 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000129 c = getuchar(v);
130 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000131 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000132
133 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000134 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000135 if (old->category_changed == 0) {
136 /* unassigned */
137 have_old = 1;
138 rc = -1;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000139 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000140 else if (old->decimal_changed != 0xFF) {
141 have_old = 1;
142 rc = old->decimal_changed;
143 }
144 }
145
146 if (!have_old)
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000147 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000148 if (rc < 0) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000149 if (defobj == NULL) {
150 PyErr_SetString(PyExc_ValueError,
151 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000152 return NULL;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000153 }
154 else {
155 Py_INCREF(defobj);
156 return defobj;
157 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158 }
159 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000160}
161
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000162PyDoc_STRVAR(unicodedata_digit__doc__,
163"digit(unichr[, default])\n\
164\n\
165Returns the digit value assigned to the Unicode character unichr as\n\
166integer. If no such value is defined, default is returned, or, if\n\
167not given, ValueError is raised.");
168
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000170unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171{
172 PyUnicodeObject *v;
173 PyObject *defobj = NULL;
174 long rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000175 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000176
Fredrik Lundh06d12682001-01-24 07:59:11 +0000177 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000178 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000179 c = getuchar(v);
180 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000181 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000182 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000183 if (rc < 0) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000184 if (defobj == NULL) {
185 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000186 return NULL;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000187 }
188 else {
189 Py_INCREF(defobj);
190 return defobj;
191 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192 }
193 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000194}
195
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000196PyDoc_STRVAR(unicodedata_numeric__doc__,
197"numeric(unichr[, default])\n\
198\n\
199Returns the numeric value assigned to the Unicode character unichr\n\
200as float. If no such value is defined, default is returned, or, if\n\
201not given, ValueError is raised.");
202
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000204unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205{
206 PyUnicodeObject *v;
207 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000208 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209 double rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000210 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000211
Fredrik Lundh06d12682001-01-24 07:59:11 +0000212 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000213 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000214 c = getuchar(v);
215 if (c == (Py_UCS4)-1)
216 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000217
218 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000219 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000220 if (old->category_changed == 0) {
221 /* unassigned */
222 have_old = 1;
Martin v. Löwisd004fc82006-05-27 08:36:52 +0000223 rc = -1.0;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000224 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225 else if (old->decimal_changed != 0xFF) {
226 have_old = 1;
227 rc = old->decimal_changed;
228 }
229 }
230
231 if (!have_old)
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000232 rc = Py_UNICODE_TONUMERIC(c);
Martin v. Löwisd004fc82006-05-27 08:36:52 +0000233 if (rc == -1.0) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000234 if (defobj == NULL) {
235 PyErr_SetString(PyExc_ValueError, "not a numeric character");
236 return NULL;
237 }
238 else {
239 Py_INCREF(defobj);
240 return defobj;
241 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242 }
243 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000244}
245
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000246PyDoc_STRVAR(unicodedata_category__doc__,
247"category(unichr)\n\
248\n\
249Returns the general category assigned to the Unicode character\n\
250unichr as string.");
251
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000253unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254{
255 PyUnicodeObject *v;
256 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000257 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000258
259 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000260 &PyUnicode_Type, &v))
261 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000262 c = getuchar(v);
263 if (c == (Py_UCS4)-1)
264 return NULL;
265 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000267 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000268 if (old->category_changed != 0xFF)
269 index = old->category_changed;
270 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000271 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000272}
273
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000274PyDoc_STRVAR(unicodedata_bidirectional__doc__,
275"bidirectional(unichr)\n\
276\n\
Ezio Melotti67c563e2012-12-14 20:12:25 +0200277Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000278unichr as string. If no such value is defined, an empty string is\n\
279returned.");
280
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000282unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000283{
284 PyUnicodeObject *v;
285 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000286 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287
288 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000289 &PyUnicode_Type, &v))
290 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000291 c = getuchar(v);
292 if (c == (Py_UCS4)-1)
293 return NULL;
294 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000295 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000296 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000297 if (old->category_changed == 0)
298 index = 0; /* unassigned */
299 else if (old->bidir_changed != 0xFF)
300 index = old->bidir_changed;
301 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000302 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303}
304
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000305PyDoc_STRVAR(unicodedata_combining__doc__,
306"combining(unichr)\n\
307\n\
308Returns the canonical combining class assigned to the Unicode\n\
309character unichr as integer. Returns 0 if no combining class is\n\
310defined.");
311
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000313unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000314{
315 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000316 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000317 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000318
319 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000320 &PyUnicode_Type, &v))
321 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000322 c = getuchar(v);
323 if (c == (Py_UCS4)-1)
324 return NULL;
325 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000326 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000327 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000328 if (old->category_changed == 0)
329 index = 0; /* unassigned */
330 }
331 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000332}
333
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000334PyDoc_STRVAR(unicodedata_mirrored__doc__,
335"mirrored(unichr)\n\
336\n\
337Returns the mirrored property assigned to the Unicode character\n\
338unichr as integer. Returns 1 if the character has been identified as\n\
339a \"mirrored\" character in bidirectional text, 0 otherwise.");
340
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000342unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000343{
344 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000345 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000346 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000347
348 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000349 &PyUnicode_Type, &v))
350 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000351 c = getuchar(v);
352 if (c == (Py_UCS4)-1)
353 return NULL;
354 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000356 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 }
362 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363}
364
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000365PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
366"east_asian_width(unichr)\n\
367\n\
368Returns the east asian width assigned to the Unicode character\n\
369unichr as string.");
370
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000371static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000372unicodedata_east_asian_width(PyObject *self, PyObject *args)
373{
374 PyUnicodeObject *v;
375 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000376 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000377
378 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000379 &PyUnicode_Type, &v))
380 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000381 c = getuchar(v);
382 if (c == (Py_UCS4)-1)
383 return NULL;
384 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000385 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000386 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000387 if (old->category_changed == 0)
388 index = 0; /* unassigned */
389 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000390 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000391}
392
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000393PyDoc_STRVAR(unicodedata_decomposition__doc__,
394"decomposition(unichr)\n\
395\n\
396Returns the character decomposition mapping assigned to the Unicode\n\
397character unichr as string. An empty string is returned in case no\n\
398such mapping is defined.");
399
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000400static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000401unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000402{
403 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000404 char decomp[256];
405 int code, index, count, i;
Neal Norwitz37f694f2006-07-27 04:04:50 +0000406 unsigned int prefix_index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000407 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408
409 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000410 &PyUnicode_Type, &v))
411 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000412 c = getuchar(v);
413 if (c == (Py_UCS4)-1)
414 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000416 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000418 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000419 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000420 if (old->category_changed == 0)
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000421 return PyString_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000422 }
423
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000424 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000425 index = 0;
426 else {
427 index = decomp_index1[(code>>DECOMP_SHIFT)];
428 index = decomp_index2[(index<<DECOMP_SHIFT)+
429 (code&((1<<DECOMP_SHIFT)-1))];
430 }
431
Tim Peters69b83b12001-11-30 07:23:05 +0000432 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000433 is prefix code (from*/
434 count = decomp_data[index] >> 8;
435
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
438
Neal Norwitz37f694f2006-07-27 04:04:50 +0000439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index = decomp_data[index] & 255;
443 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
444
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445 /* copy prefix */
Neal Norwitz37f694f2006-07-27 04:04:50 +0000446 i = strlen(decomp_prefix[prefix_index]);
447 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448
449 while (count-- > 0) {
450 if (i)
451 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000452 assert((size_t)i < sizeof(decomp));
453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455 i += strlen(decomp + i);
456 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000457
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000458 decomp[i] = '\0';
459
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000460 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000461}
462
Neal Norwitz88c97842006-04-17 00:36:29 +0000463static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000464get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000465{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000466 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000467 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000468 } else if (self && get_old_record(self, code)->category_changed==0) {
469 /* unassigned in old version */
470 *index = 0;
471 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000472 else {
473 *index = decomp_index1[(code>>DECOMP_SHIFT)];
474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475 (code&((1<<DECOMP_SHIFT)-1))];
476 }
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000477
Martin v. Löwis677bde22002-11-23 22:08:15 +0000478 /* high byte is number of hex bytes (usually one or two), low byte
479 is prefix code (from*/
480 *count = decomp_data[*index] >> 8;
481 *prefix = decomp_data[*index] & 255;
482
483 (*index)++;
484}
485
/* Constants for arithmetic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one below the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total precomposed syllables */
495
496static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000497nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000498{
499 PyObject *result;
500 Py_UNICODE *i, *end, *o;
501 /* Longest decomposition in Unicode 3.2: U+FDFA */
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000502 Py_UNICODE stack[20];
Martin v. Löwis3c6e4182006-04-13 06:36:31 +0000503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 unsigned char prev, cur;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000506
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 stackptr = 0;
508 isize = PyUnicode_GET_SIZE(input);
Benjamin Petersonb027c6c2015-03-02 11:17:05 -0500509 space = isize;
Ezio Melotti6d0f0f22013-08-26 01:31:30 +0300510 /* Overallocate at most 10 characters. */
Benjamin Petersonb027c6c2015-03-02 11:17:05 -0500511 if (space > 10) {
512 if (space <= PY_SSIZE_T_MAX - 10)
513 space += 10;
514 }
515 else {
516 space *= 2;
517 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000518 result = PyUnicode_FromUnicode(NULL, space);
519 if (!result)
520 return NULL;
521 i = PyUnicode_AS_UNICODE(input);
522 end = i + isize;
523 o = PyUnicode_AS_UNICODE(result);
524
525 while (i < end) {
526 stack[stackptr++] = *i++;
527 while(stackptr) {
528 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 /* Hangul Decomposition adds three characters in
Ezio Melotti419e23c2013-08-17 16:56:09 +0300530 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000531 if (space < 3) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000532 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000533 space += 10;
534 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000535 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000536 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000537 }
538 /* Hangul Decomposition. */
539 if (SBase <= code && code < (SBase+SCount)) {
540 int SIndex = code - SBase;
541 int L = LBase + SIndex / NCount;
542 int V = VBase + (SIndex % NCount) / TCount;
543 int T = TBase + SIndex % TCount;
544 *o++ = L;
545 *o++ = V;
546 space -= 2;
547 if (T != TBase) {
548 *o++ = T;
549 space --;
550 }
551 continue;
552 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000553 /* normalization changes */
554 if (self) {
555 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
556 if (value != 0) {
557 stack[stackptr++] = value;
558 continue;
559 }
560 }
561
562 /* Other decompositions. */
563 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000564
565 /* Copy character if it is not decomposable, or has a
566 compatibility decomposition, but we do NFD. */
567 if (!count || (prefix && !k)) {
568 *o++ = code;
569 space--;
570 continue;
571 }
572 /* Copy decomposition onto the stack, in reverse
573 order. */
574 while(count) {
575 code = decomp_data[index + (--count)];
576 stack[stackptr++] = code;
577 }
578 }
579 }
580
581 /* Drop overallocation. Cannot fail. */
582 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
583
584 /* Sort canonically. */
585 i = PyUnicode_AS_UNICODE(result);
586 prev = _getrecord_ex(*i)->combining;
587 end = i + PyUnicode_GET_SIZE(result);
588 for (i++; i < end; i++) {
589 cur = _getrecord_ex(*i)->combining;
590 if (prev == 0 || cur == 0 || prev <= cur) {
591 prev = cur;
592 continue;
593 }
594 /* Non-canonical order. Need to switch *i with previous. */
595 o = i - 1;
596 while (1) {
597 Py_UNICODE tmp = o[1];
598 o[1] = o[0];
599 o[0] = tmp;
600 o--;
601 if (o < PyUnicode_AS_UNICODE(result))
602 break;
603 prev = _getrecord_ex(*o)->combining;
604 if (prev == 0 || prev <= cur)
605 break;
606 }
607 prev = _getrecord_ex(*i)->combining;
608 }
609 return result;
610}
611
612static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000613find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000614{
615 int index;
616 for (index = 0; nfc[index].start; index++) {
617 int start = nfc[index].start;
618 if (code < start)
619 return -1;
620 if (code <= start + nfc[index].count) {
621 int delta = code - start;
622 return nfc[index].index + delta;
623 }
624 }
625 return -1;
626}
627
628static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000629nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000630{
631 PyObject *result;
632 Py_UNICODE *i, *i1, *o, *end;
633 int f,l,index,index1,comb;
634 Py_UNICODE code;
635 Py_UNICODE *skipped[20];
636 int cskipped = 0;
637
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000638 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000639 if (!result)
640 return NULL;
641
642 /* We are going to modify result in-place.
643 If nfd_nfkd is changed to sometimes return the input,
644 this code needs to be reviewed. */
645 assert(result != input);
646
647 i = PyUnicode_AS_UNICODE(result);
648 end = i + PyUnicode_GET_SIZE(result);
649 o = PyUnicode_AS_UNICODE(result);
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000650
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651 again:
652 while (i < end) {
653 for (index = 0; index < cskipped; index++) {
654 if (skipped[index] == i) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000655 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000656 Remove from list. */
657 skipped[index] = skipped[cskipped-1];
658 cskipped--;
659 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000660 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000661 }
662 }
663 /* Hangul Composition. We don't need to check for <LV,T>
664 pairs, since we always have decomposed data. */
665 if (LBase <= *i && *i < (LBase+LCount) &&
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000666 i + 1 < end &&
Xiang Zhang1889c4c2018-06-15 21:26:55 +0800667 VBase <= i[1] && i[1] < (VBase+VCount)) {
668 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
669 and V character is a modern vowel (0x1161 ~ 0x1175). */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000670 int LIndex, VIndex;
671 LIndex = i[0] - LBase;
672 VIndex = i[1] - VBase;
673 code = SBase + (LIndex*VCount+VIndex)*TCount;
674 i+=2;
675 if (i < end &&
Xiang Zhang1889c4c2018-06-15 21:26:55 +0800676 TBase < *i && *i < (TBase+TCount)) {
677 /* check T character is a modern trailing consonant
678 (0x11A8 ~ 0x11C2). */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000679 code += *i-TBase;
680 i++;
681 }
682 *o++ = code;
683 continue;
684 }
685
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000686 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000687 if (f == -1) {
688 *o++ = *i++;
689 continue;
690 }
691 /* Find next unblocked character. */
692 i1 = i+1;
693 comb = 0;
694 while (i1 < end) {
695 int comb1 = _getrecord_ex(*i1)->combining;
Alexander Belopolskydce6cf32010-12-28 15:47:56 +0000696 if (comb) {
697 if (comb1 == 0)
698 break;
699 if (comb >= comb1) {
700 /* Character is blocked. */
701 i1++;
702 continue;
703 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000704 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000705 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000706 /* *i1 cannot be combined with *i. If *i1
707 is a starter, we don't need to look further.
708 Otherwise, record the combining class. */
709 if (l == -1) {
710 not_combinable:
711 if (comb1 == 0)
712 break;
713 comb = comb1;
714 i1++;
715 continue;
716 }
717 index = f*TOTAL_LAST + l;
718 index1 = comp_index[index >> COMP_SHIFT];
719 code = comp_data[(index1<<COMP_SHIFT)+
720 (index&((1<<COMP_SHIFT)-1))];
721 if (code == 0)
722 goto not_combinable;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000723
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724 /* Replace the original character. */
725 *i = code;
726 /* Mark the second character unused. */
Alexander Belopolskydce6cf32010-12-28 15:47:56 +0000727 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000728 skipped[cskipped++] = i1;
729 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000730 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000731 if (f == -1)
732 break;
733 }
734 *o++ = *i++;
735 }
736 if (o != end)
737 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
738 return result;
739}
Antoine Pitroue988e282009-04-27 21:53:26 +0000740
741/* Return 1 if the input is certainly normalized, 0 if it might not be. */
742static int
743is_normalized(PyObject *self, PyObject *input, int nfc, int k)
744{
745 Py_UNICODE *i, *end;
746 unsigned char prev_combining = 0, quickcheck_mask;
747
748 /* An older version of the database is requested, quickchecks must be
749 disabled. */
750 if (self != NULL)
751 return 0;
752
753 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
754 as described in http://unicode.org/reports/tr15/#Annex8. */
755 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
756
757 i = PyUnicode_AS_UNICODE(input);
758 end = i + PyUnicode_GET_SIZE(input);
759 while (i < end) {
760 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
761 unsigned char combining = record->combining;
762 unsigned char quickcheck = record->normalization_quick_check;
763
764 if (quickcheck & quickcheck_mask)
765 return 0; /* this string might need normalization */
766 if (combining && prev_combining > combining)
767 return 0; /* non-canonical sort order, not normalized */
768 prev_combining = combining;
769 }
770 return 1; /* certainly normalized */
771}
772
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000773PyDoc_STRVAR(unicodedata_normalize__doc__,
774"normalize(form, unistr)\n\
775\n\
776Return the normal form 'form' for the Unicode string unistr. Valid\n\
777values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
778
Martin v. Löwis677bde22002-11-23 22:08:15 +0000779static PyObject*
780unicodedata_normalize(PyObject *self, PyObject *args)
781{
782 char *form;
783 PyObject *input;
784
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000785 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000786 &form, &PyUnicode_Type, &input))
787 return NULL;
788
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000789 if (PyUnicode_GetSize(input) == 0) {
790 /* Special case empty input strings, since resizing
791 them later would cause internal errors. */
792 Py_INCREF(input);
793 return input;
794 }
795
Antoine Pitroue988e282009-04-27 21:53:26 +0000796 if (strcmp(form, "NFC") == 0) {
797 if (is_normalized(self, input, 1, 0)) {
798 Py_INCREF(input);
799 return input;
800 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000801 return nfc_nfkc(self, input, 0);
Antoine Pitroue988e282009-04-27 21:53:26 +0000802 }
803 if (strcmp(form, "NFKC") == 0) {
804 if (is_normalized(self, input, 1, 1)) {
805 Py_INCREF(input);
806 return input;
807 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000808 return nfc_nfkc(self, input, 1);
Antoine Pitroue988e282009-04-27 21:53:26 +0000809 }
810 if (strcmp(form, "NFD") == 0) {
811 if (is_normalized(self, input, 0, 0)) {
812 Py_INCREF(input);
813 return input;
814 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000815 return nfd_nfkd(self, input, 0);
Antoine Pitroue988e282009-04-27 21:53:26 +0000816 }
817 if (strcmp(form, "NFKD") == 0) {
818 if (is_normalized(self, input, 0, 1)) {
819 Py_INCREF(input);
820 return input;
821 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000822 return nfd_nfkd(self, input, 1);
Antoine Pitroue988e282009-04-27 21:53:26 +0000823 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000824 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
825 return NULL;
826}
827
Fredrik Lundh06d12682001-01-24 07:59:11 +0000828/* -------------------------------------------------------------------- */
829/* unicode character name tables */
830
831/* data file generated by Tools/unicode/makeunicodedata.py */
832#include "unicodename_db.h"
833
834/* -------------------------------------------------------------------- */
835/* database code (cut and pasted from the unidb package) */
836
/* Case-insensitive hash of the first `len` bytes of `s`, using `scale`
   as the multiplier.  Whenever bits 24..31 become set they are folded
   back into the low byte and the value is cut down to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    unsigned long high;
    int pos;

    for (pos = 0; pos < len; pos++) {
        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
851
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Each row holds one fragment per column; the columns are indexed
   independently of each other and have different lengths, so the unused
   tail of the shorter columns is padded with null pointers. */
static char *hangul_syllables[][3] = {
    /* column 0   column 1   column 2 */
    { "G",        "A",       ""   },    /*  0 */
    { "GG",       "AE",      "G"  },    /*  1 */
    { "N",        "YA",      "GG" },    /*  2 */
    { "D",        "YAE",     "GS" },    /*  3 */
    { "DD",       "EO",      "N"  },    /*  4 */
    { "R",        "E",       "NJ" },    /*  5 */
    { "M",        "YEO",     "NH" },    /*  6 */
    { "B",        "YE",      "D"  },    /*  7 */
    { "BB",       "O",       "L"  },    /*  8 */
    { "S",        "WA",      "LG" },    /*  9 */
    { "SS",       "WAE",     "LM" },    /* 10 */
    { "",         "OE",      "LB" },    /* 11 */
    { "J",        "YO",      "LS" },    /* 12 */
    { "JJ",       "U",       "LT" },    /* 13 */
    { "C",        "WEO",     "LP" },    /* 14 */
    { "K",        "WE",      "LH" },    /* 15 */
    { "T",        "WI",      "M"  },    /* 16 */
    { "P",        "YU",      "B"  },    /* 17 */
    { "H",        "EU",      "BS" },    /* 18 */
    { NULL,       "YI",      "S"  },    /* 19 */
    { NULL,       "I",       "SS" },    /* 20 */
    { NULL,       NULL,      "NG" },    /* 21 */
    { NULL,       NULL,      "J"  },    /* 22 */
    { NULL,       NULL,      "C"  },    /* 23 */
    { NULL,       NULL,      "K"  },    /* 24 */
    { NULL,       NULL,      "T"  },    /* 25 */
    { NULL,       NULL,      "P"  },    /* 26 */
    { NULL,       NULL,      "H"  }     /* 27 */
};
882
Fredrik Lundh06d12682001-01-24 07:59:11 +0000883static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000884is_unified_ideograph(Py_UCS4 code)
885{
886 return (
887 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwise03c7782010-11-22 10:53:46 +0000888 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
889 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
890 (0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000891}
892
893static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000894_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000895{
896 int offset;
897 int i;
898 int word;
899 unsigned char* w;
900
Martin v. Löwisc3509122006-03-11 12:16:23 +0000901 if (code >= 0x110000)
902 return 0;
903
904 if (self) {
905 const change_record *old = get_old_record(self, code);
906 if (old->category_changed == 0) {
907 /* unassigned */
908 return 0;
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000909 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000910 }
911
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000912 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000913 /* Hangul syllable. */
914 int SIndex = code - SBase;
915 int L = SIndex / NCount;
916 int V = (SIndex % NCount) / TCount;
917 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000918
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000919 if (buflen < 27)
920 /* Worst case: HANGUL SYLLABLE <10chars>. */
921 return 0;
922 strcpy(buffer, "HANGUL SYLLABLE ");
923 buffer += 16;
924 strcpy(buffer, hangul_syllables[L][0]);
925 buffer += strlen(hangul_syllables[L][0]);
926 strcpy(buffer, hangul_syllables[V][1]);
927 buffer += strlen(hangul_syllables[V][1]);
928 strcpy(buffer, hangul_syllables[T][2]);
929 buffer += strlen(hangul_syllables[T][2]);
930 *buffer = '\0';
931 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000932 }
933
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000934 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000935 if (buflen < 28)
936 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
937 return 0;
938 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
939 return 1;
940 }
941
Fredrik Lundh06d12682001-01-24 07:59:11 +0000942 /* get offset into phrasebook */
943 offset = phrasebook_offset1[(code>>phrasebook_shift)];
944 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
945 (code&((1<<phrasebook_shift)-1))];
946 if (!offset)
947 return 0;
948
949 i = 0;
950
951 for (;;) {
952 /* get word index */
953 word = phrasebook[offset] - phrasebook_short;
954 if (word >= 0) {
955 word = (word << 8) + phrasebook[offset+1];
956 offset += 2;
957 } else
958 word = phrasebook[offset++];
959 if (i) {
960 if (i > buflen)
961 return 0; /* buffer overflow */
962 buffer[i++] = ' ';
963 }
964 /* copy word string from lexicon. the last character in the
965 word has bit 7 set. the last word in a string ends with
966 0x80 */
967 w = lexicon + lexicon_offset[word];
968 while (*w < 128) {
969 if (i >= buflen)
970 return 0; /* buffer overflow */
971 buffer[i++] = *w++;
972 }
973 if (i >= buflen)
974 return 0; /* buffer overflow */
975 buffer[i++] = *w & 127;
976 if (*w == 128)
977 break; /* end of word */
978 }
979
980 return 1;
981}
982
983static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000984_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000985{
986 /* check if code corresponds to the given name */
987 int i;
988 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000989 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000990 return 0;
991 for (i = 0; i < namelen; i++) {
Antoine Pitrou44b3b542011-10-04 13:55:37 +0200992 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000993 return 0;
994 }
995 return buffer[namelen] == '\0';
996}
997
Antoine Pitrouc83ea132010-05-09 14:46:46 +0000998static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000999find_syllable(const char *str, int *len, int *pos, int count, int column)
1000{
1001 int i, len1;
1002 *len = -1;
1003 for (i = 0; i < count; i++) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001004 char *s = hangul_syllables[i][column];
1005 len1 = strlen(s);
1006 if (len1 <= *len)
1007 continue;
1008 if (strncmp(str, s, len1) == 0) {
1009 *len = len1;
1010 *pos = i;
1011 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001012 }
1013 if (*len == -1) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001014 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001015 }
1016}
1017
Fredrik Lundh06d12682001-01-24 07:59:11 +00001018static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001019_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001020{
1021 unsigned int h, v;
1022 unsigned int mask = code_size-1;
1023 unsigned int i, incr;
1024
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001025 /* Check for hangul syllables. */
1026 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001027 int len, L = -1, V = -1, T = -1;
1028 const char *pos = name + 16;
1029 find_syllable(pos, &len, &L, LCount, 0);
1030 pos += len;
1031 find_syllable(pos, &len, &V, VCount, 1);
1032 pos += len;
1033 find_syllable(pos, &len, &T, TCount, 2);
1034 pos += len;
1035 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1036 *code = SBase + (L*VCount+V)*TCount + T;
1037 return 1;
1038 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001039 /* Otherwise, it's an illegal syllable name. */
1040 return 0;
1041 }
1042
1043 /* Check for unified ideographs. */
1044 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1045 /* Four or five hexdigits must follow. */
1046 v = 0;
1047 name += 22;
1048 namelen -= 22;
1049 if (namelen != 4 && namelen != 5)
1050 return 0;
1051 while (namelen--) {
1052 v *= 16;
1053 if (*name >= '0' && *name <= '9')
1054 v += *name - '0';
1055 else if (*name >= 'A' && *name <= 'F')
1056 v += *name - 'A' + 10;
1057 else
1058 return 0;
1059 name++;
1060 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001061 if (!is_unified_ideograph(v))
1062 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001063 *code = v;
1064 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001065 }
1066
Fredrik Lundh06d12682001-01-24 07:59:11 +00001067 /* the following is the same as python's dictionary lookup, with
1068 only minor changes. see the makeunicodedata script for more
1069 details */
1070
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001071 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001072 i = (~h) & mask;
1073 v = code_hash[i];
1074 if (!v)
1075 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001076 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001077 *code = v;
1078 return 1;
1079 }
1080 incr = (h ^ (h >> 3)) & mask;
1081 if (!incr)
1082 incr = mask;
1083 for (;;) {
1084 i = (i + incr) & mask;
1085 v = code_hash[i];
1086 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001087 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001088 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001089 *code = v;
1090 return 1;
1091 }
1092 incr = incr << 1;
1093 if (incr > mask)
1094 incr = incr ^ code_poly;
1095 }
1096}
1097
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001098static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001099{
1100 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001101 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001102 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001103};
1104
1105/* -------------------------------------------------------------------- */
1106/* Python bindings */
1107
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001108PyDoc_STRVAR(unicodedata_name__doc__,
1109"name(unichr[, default])\n\
1110Returns the name assigned to the Unicode character unichr as a\n\
1111string. If no name is defined, default is returned, or, if not\n\
1112given, ValueError is raised.");
1113
Fredrik Lundh06d12682001-01-24 07:59:11 +00001114static PyObject *
1115unicodedata_name(PyObject* self, PyObject* args)
1116{
1117 char name[NAME_MAXLEN];
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001118 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001119
1120 PyUnicodeObject* v;
1121 PyObject* defobj = NULL;
1122 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1123 return NULL;
1124
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001125 c = getuchar(v);
1126 if (c == (Py_UCS4)-1)
1127 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001128
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001129 if (!_getucname(self, c, name, sizeof(name))) {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001130 if (defobj == NULL) {
1131 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001132 return NULL;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001133 }
1134 else {
1135 Py_INCREF(defobj);
1136 return defobj;
1137 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001138 }
1139
1140 return Py_BuildValue("s", name);
1141}
1142
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001143PyDoc_STRVAR(unicodedata_lookup__doc__,
1144"lookup(name)\n\
1145\n\
1146Look up character by name. If a character with the\n\
1147given name is found, return the corresponding Unicode\n\
1148character. If not found, KeyError is raised.");
1149
Fredrik Lundh06d12682001-01-24 07:59:11 +00001150static PyObject *
1151unicodedata_lookup(PyObject* self, PyObject* args)
1152{
1153 Py_UCS4 code;
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001154 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001155
1156 char* name;
1157 int namelen;
1158 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1159 return NULL;
1160
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001161 if (!_getcode(self, name, namelen, &code)) {
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001162 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1163 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001164 return NULL;
1165 }
1166
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001167#ifndef Py_UNICODE_WIDE
1168 if (code >= 0x10000) {
1169 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1170 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1171 return PyUnicode_FromUnicode(str, 2);
1172 }
1173#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001174 str[0] = (Py_UNICODE) code;
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001175 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001176}
1177
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001178/* XXX Add doc strings. */
1179
1180static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001181 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1182 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1183 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1184 {"category", unicodedata_category, METH_VARARGS,
1185 unicodedata_category__doc__},
1186 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1187 unicodedata_bidirectional__doc__},
1188 {"combining", unicodedata_combining, METH_VARARGS,
1189 unicodedata_combining__doc__},
1190 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1191 unicodedata_mirrored__doc__},
1192 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1193 unicodedata_east_asian_width__doc__},
1194 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1195 unicodedata_decomposition__doc__},
1196 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1197 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1198 {"normalize", unicodedata_normalize, METH_VARARGS,
1199 unicodedata_normalize__doc__},
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001200 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001201};
1202
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001203static PyTypeObject UCD_Type = {
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001204 /* The ob_type field must be initialized in the module init function
1205 * to be portable to Windows without using C++. */
1206 PyVarObject_HEAD_INIT(NULL, 0)
1207 "unicodedata.UCD", /*tp_name*/
1208 sizeof(PreviousDBVersion), /*tp_basicsize*/
1209 0, /*tp_itemsize*/
1210 /* methods */
1211 (destructor)PyObject_Del, /*tp_dealloc*/
1212 0, /*tp_print*/
1213 0, /*tp_getattr*/
1214 0, /*tp_setattr*/
1215 0, /*tp_compare*/
1216 0, /*tp_repr*/
1217 0, /*tp_as_number*/
1218 0, /*tp_as_sequence*/
1219 0, /*tp_as_mapping*/
1220 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001221 0, /*tp_call*/
1222 0, /*tp_str*/
1223 PyObject_GenericGetAttr,/*tp_getattro*/
1224 0, /*tp_setattro*/
1225 0, /*tp_as_buffer*/
1226 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1227 0, /*tp_doc*/
1228 0, /*tp_traverse*/
1229 0, /*tp_clear*/
1230 0, /*tp_richcompare*/
1231 0, /*tp_weaklistoffset*/
1232 0, /*tp_iter*/
1233 0, /*tp_iternext*/
1234 unicodedata_functions, /*tp_methods*/
1235 DB_members, /*tp_members*/
1236 0, /*tp_getset*/
1237 0, /*tp_base*/
1238 0, /*tp_dict*/
1239 0, /*tp_descr_get*/
1240 0, /*tp_descr_set*/
1241 0, /*tp_dictoffset*/
1242 0, /*tp_init*/
1243 0, /*tp_alloc*/
1244 0, /*tp_new*/
1245 0, /*tp_free*/
1246 0, /*tp_is_gc*/
1247};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001248
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001249PyDoc_STRVAR(unicodedata_docstring,
1250"This module provides access to the Unicode Character Database which\n\
1251defines character properties for all Unicode characters. The data in\n\
1252this database is based on the UnicodeData.txt file version\n\
Ezio Melottiae735a72010-03-22 23:07:32 +000012535.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001254\n\
1255The module uses the same names and symbols as defined by the\n\
Ezio Melotti0d0b80b2010-03-23 00:38:12 +00001256UnicodeData File Format 5.2.0 (see\n\
1257http://www.unicode.org/reports/tr44/tr44-4.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001258
Mark Hammond62b1ab12002-07-23 06:31:15 +00001259PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001260initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001261{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001262 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001263
Christian Heimese93237d2007-12-19 02:37:44 +00001264 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001265
Fred Drakef585bef2001-03-03 19:41:55 +00001266 m = Py_InitModule3(
1267 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001268 if (!m)
1269 return;
1270
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001271 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001272 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001273 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001274
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001275 /* Previous versions */
1276 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1277 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001278 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001279
Fredrik Lundh06d12682001-01-24 07:59:11 +00001280 /* Export C API */
Larry Hastings402b73f2010-03-25 00:54:54 +00001281 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001282 if (v != NULL)
1283 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001284}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001285
Antoine Pitrouc83ea132010-05-09 14:46:46 +00001286/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001287Local variables:
1288c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001289indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001290End:
1291*/