blob: 03b869093a0555854fc53f97b2edfda1ba5aaf43 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.1 data base.

   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One property record per distinct combination of character properties.
   String-valued properties are stored as small indices into name tables.
   The field order must not change: the generated data tables initialise
   these records positionally. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Differences between the current database and an older Unicode version
   for a single code point.  The code in this file treats 0xFF in an
   unsigned char field as "unchanged" and category_changed == 0 as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;
40
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000041/* data file generated by Tools/unicode/makeunicodedata.py */
42#include "unicodedata_db.h"
43
44static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000045_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000046{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000048 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 index = 0;
50 else {
51 index = index1[(code>>SHIFT)];
52 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
53 }
54
55 return &_PyUnicode_Database_Records[index];
56}
57
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000058/* ------------- Previous-version API ------------------------------------- */
59typedef struct previous_version {
60 PyObject_HEAD
61 const char *name;
62 const change_record* (*getrecord)(Py_UCS4);
63 Py_UCS4 (*normalization)(Py_UCS4);
64} PreviousDBVersion;
65
/* Fetch the change record of code point `v` from the previous-version
   database object `self` by calling its getrecord callback.  `self` is
   parenthesised so that an expression argument is cast as a whole. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068static PyMemberDef DB_members[] = {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +000069 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000070 {NULL}
71};
72
Walter Dörwald6fc23822006-11-09 16:23:26 +000073/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000074static PyTypeObject UCD_Type;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000075
76static PyObject*
77new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
78 Py_UCS4 (*normalization)(Py_UCS4))
79{
Antoine Pitrouc7c96a92010-05-09 15:15:40 +000080 PreviousDBVersion *self;
81 self = PyObject_New(PreviousDBVersion, &UCD_Type);
82 if (self == NULL)
83 return NULL;
84 self->name = name;
85 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000086 self->normalization = normalization;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +000087 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088}
89
Walter Dörwalda2a89a82008-06-02 20:36:03 +000090
91static Py_UCS4 getuchar(PyUnicodeObject *obj)
92{
93 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
94
95 if (PyUnicode_GET_SIZE(obj) == 1)
Antoine Pitrouc7c96a92010-05-09 15:15:40 +000096 return *v;
Walter Dörwalda2a89a82008-06-02 20:36:03 +000097#ifndef Py_UNICODE_WIDE
98 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
99 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
100 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000101 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000102#endif
103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
106}
107
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000108/* --- Module API --------------------------------------------------------- */
109
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000110PyDoc_STRVAR(unicodedata_decimal__doc__,
111"decimal(unichr[, default])\n\
112\n\
113Returns the decimal value assigned to the Unicode character unichr\n\
114as integer. If no such value is defined, default is returned, or, if\n\
115not given, ValueError is raised.");
116
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000117static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000118unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119{
120 PyUnicodeObject *v;
121 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000122 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000123 long rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000124 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125
Fredrik Lundh06d12682001-01-24 07:59:11 +0000126 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000127 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000128 c = getuchar(v);
129 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000130 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000131
132 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000133 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134 if (old->category_changed == 0) {
135 /* unassigned */
136 have_old = 1;
137 rc = -1;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000138 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000139 else if (old->decimal_changed != 0xFF) {
140 have_old = 1;
141 rc = old->decimal_changed;
142 }
143 }
144
145 if (!have_old)
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000146 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147 if (rc < 0) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000148 if (defobj == NULL) {
149 PyErr_SetString(PyExc_ValueError,
150 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 return NULL;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000152 }
153 else {
154 Py_INCREF(defobj);
155 return defobj;
156 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157 }
158 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159}
160
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000161PyDoc_STRVAR(unicodedata_digit__doc__,
162"digit(unichr[, default])\n\
163\n\
164Returns the digit value assigned to the Unicode character unichr as\n\
165integer. If no such value is defined, default is returned, or, if\n\
166not given, ValueError is raised.");
167
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170{
171 PyUnicodeObject *v;
172 PyObject *defobj = NULL;
173 long rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000174 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175
Fredrik Lundh06d12682001-01-24 07:59:11 +0000176 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000177 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000178 c = getuchar(v);
179 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000181 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000183 if (defobj == NULL) {
184 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000186 }
187 else {
188 Py_INCREF(defobj);
189 return defobj;
190 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191 }
192 return PyInt_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193}
194
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000195PyDoc_STRVAR(unicodedata_numeric__doc__,
196"numeric(unichr[, default])\n\
197\n\
198Returns the numeric value assigned to the Unicode character unichr\n\
199as float. If no such value is defined, default is returned, or, if\n\
200not given, ValueError is raised.");
201
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000203unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204{
205 PyUnicodeObject *v;
206 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000207 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208 double rc;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000209 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210
Fredrik Lundh06d12682001-01-24 07:59:11 +0000211 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000212 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000213 c = getuchar(v);
214 if (c == (Py_UCS4)-1)
215 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000216
217 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000218 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000219 if (old->category_changed == 0) {
220 /* unassigned */
221 have_old = 1;
Martin v. Löwisd004fc82006-05-27 08:36:52 +0000222 rc = -1.0;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000223 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000224 else if (old->decimal_changed != 0xFF) {
225 have_old = 1;
226 rc = old->decimal_changed;
227 }
228 }
229
230 if (!have_old)
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000231 rc = Py_UNICODE_TONUMERIC(c);
Martin v. Löwisd004fc82006-05-27 08:36:52 +0000232 if (rc == -1.0) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000233 if (defobj == NULL) {
234 PyErr_SetString(PyExc_ValueError, "not a numeric character");
235 return NULL;
236 }
237 else {
238 Py_INCREF(defobj);
239 return defobj;
240 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241 }
242 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243}
244
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000245PyDoc_STRVAR(unicodedata_category__doc__,
246"category(unichr)\n\
247\n\
248Returns the general category assigned to the Unicode character\n\
249unichr as string.");
250
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000251static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000252unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253{
254 PyUnicodeObject *v;
255 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000256 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000257
258 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000259 &PyUnicode_Type, &v))
260 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000261 c = getuchar(v);
262 if (c == (Py_UCS4)-1)
263 return NULL;
264 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000265 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000266 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000267 if (old->category_changed != 0xFF)
268 index = old->category_changed;
269 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000270 return PyString_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271}
272
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000273PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274"bidirectional(unichr)\n\
275\n\
276Returns the bidirectional category assigned to the Unicode character\n\
277unichr as string. If no such value is defined, an empty string is\n\
278returned.");
279
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000281unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282{
283 PyUnicodeObject *v;
284 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000285 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286
287 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000288 &PyUnicode_Type, &v))
289 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000290 c = getuchar(v);
291 if (c == (Py_UCS4)-1)
292 return NULL;
293 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000294 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000295 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000296 if (old->category_changed == 0)
297 index = 0; /* unassigned */
298 else if (old->bidir_changed != 0xFF)
299 index = old->bidir_changed;
300 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000301 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000302}
303
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000304PyDoc_STRVAR(unicodedata_combining__doc__,
305"combining(unichr)\n\
306\n\
307Returns the canonical combining class assigned to the Unicode\n\
308character unichr as integer. Returns 0 if no combining class is\n\
309defined.");
310
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000311static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000312unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313{
314 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000315 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000316 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317
318 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000319 &PyUnicode_Type, &v))
320 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000321 c = getuchar(v);
322 if (c == (Py_UCS4)-1)
323 return NULL;
324 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000325 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000326 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 if (old->category_changed == 0)
328 index = 0; /* unassigned */
329 }
330 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000331}
332
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000333PyDoc_STRVAR(unicodedata_mirrored__doc__,
334"mirrored(unichr)\n\
335\n\
336Returns the mirrored property assigned to the Unicode character\n\
337unichr as integer. Returns 1 if the character has been identified as\n\
338a \"mirrored\" character in bidirectional text, 0 otherwise.");
339
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000340static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342{
343 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000345 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346
347 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000348 &PyUnicode_Type, &v))
349 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000350 c = getuchar(v);
351 if (c == (Py_UCS4)-1)
352 return NULL;
353 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000354 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000355 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
Martin v. Löwis24329ba2008-09-10 13:38:12 +0000358 else if (old->mirrored_changed != 0xFF)
359 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000360 }
361 return PyInt_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000362}
363
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000364PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365"east_asian_width(unichr)\n\
366\n\
367Returns the east asian width assigned to the Unicode character\n\
368unichr as string.");
369
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000370static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371unicodedata_east_asian_width(PyObject *self, PyObject *args)
372{
373 PyUnicodeObject *v;
374 int index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000375 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000376
377 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000378 &PyUnicode_Type, &v))
379 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000380 c = getuchar(v);
381 if (c == (Py_UCS4)-1)
382 return NULL;
383 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000384 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000385 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000386 if (old->category_changed == 0)
387 index = 0; /* unassigned */
388 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000389 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000390}
391
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000392PyDoc_STRVAR(unicodedata_decomposition__doc__,
393"decomposition(unichr)\n\
394\n\
395Returns the character decomposition mapping assigned to the Unicode\n\
396character unichr as string. An empty string is returned in case no\n\
397such mapping is defined.");
398
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000399static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000400unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000401{
402 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000403 char decomp[256];
404 int code, index, count, i;
Neal Norwitz37f694f2006-07-27 04:04:50 +0000405 unsigned int prefix_index;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000406 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407
408 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000409 &PyUnicode_Type, &v))
410 return NULL;
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000411 c = getuchar(v);
412 if (c == (Py_UCS4)-1)
413 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000414
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000415 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000417 if (self) {
Walter Dörwalda2a89a82008-06-02 20:36:03 +0000418 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000419 if (old->category_changed == 0)
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000420 return PyString_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 }
422
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000423 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000424 index = 0;
425 else {
426 index = decomp_index1[(code>>DECOMP_SHIFT)];
427 index = decomp_index2[(index<<DECOMP_SHIFT)+
428 (code&((1<<DECOMP_SHIFT)-1))];
429 }
430
Tim Peters69b83b12001-11-30 07:23:05 +0000431 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000432 is prefix code (from*/
433 count = decomp_data[index] >> 8;
434
435 /* XXX: could allocate the PyString up front instead
436 (strlen(prefix) + 5 * count + 1 bytes) */
437
Neal Norwitz37f694f2006-07-27 04:04:50 +0000438 /* Based on how index is calculated above and decomp_data is generated
439 from Tools/unicode/makeunicodedata.py, it should not be possible
440 to overflow decomp_prefix. */
441 prefix_index = decomp_data[index] & 255;
442 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
443
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444 /* copy prefix */
Neal Norwitz37f694f2006-07-27 04:04:50 +0000445 i = strlen(decomp_prefix[prefix_index]);
446 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447
448 while (count-- > 0) {
449 if (i)
450 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000451 assert((size_t)i < sizeof(decomp));
452 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
453 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454 i += strlen(decomp + i);
455 }
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000456
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000457 decomp[i] = '\0';
458
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000459 return PyString_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000460}
461
Neal Norwitz88c97842006-04-17 00:36:29 +0000462static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000463get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000465 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000466 *index = 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000467 } else if (self && get_old_record(self, code)->category_changed==0) {
468 /* unassigned in old version */
469 *index = 0;
470 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000471 else {
472 *index = decomp_index1[(code>>DECOMP_SHIFT)];
473 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
474 (code&((1<<DECOMP_SHIFT)-1))];
475 }
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000476
Martin v. Löwis677bde22002-11-23 22:08:15 +0000477 /* high byte is number of hex bytes (usually one or two), low byte
478 is prefix code (from*/
479 *count = decomp_data[*index] >> 8;
480 *prefix = decomp_data[*index] & 255;
481
482 (*index)++;
483}
484
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total precomposed syllables */
494
495static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000496nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000497{
498 PyObject *result;
499 Py_UNICODE *i, *end, *o;
500 /* Longest decomposition in Unicode 3.2: U+FDFA */
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000501 Py_UNICODE stack[20];
Martin v. Löwis3c6e4182006-04-13 06:36:31 +0000502 Py_ssize_t space, isize;
503 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000504 unsigned char prev, cur;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000505
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 stackptr = 0;
507 isize = PyUnicode_GET_SIZE(input);
508 /* Overallocate atmost 10 characters. */
509 space = (isize > 10 ? 10 : isize) + isize;
510 result = PyUnicode_FromUnicode(NULL, space);
511 if (!result)
512 return NULL;
513 i = PyUnicode_AS_UNICODE(input);
514 end = i + isize;
515 o = PyUnicode_AS_UNICODE(result);
516
517 while (i < end) {
518 stack[stackptr++] = *i++;
519 while(stackptr) {
520 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000521 /* Hangul Decomposition adds three characters in
522 a single step, so we need atleast that much room. */
523 if (space < 3) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000524 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000525 space += 10;
526 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000527 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000528 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000529 }
530 /* Hangul Decomposition. */
531 if (SBase <= code && code < (SBase+SCount)) {
532 int SIndex = code - SBase;
533 int L = LBase + SIndex / NCount;
534 int V = VBase + (SIndex % NCount) / TCount;
535 int T = TBase + SIndex % TCount;
536 *o++ = L;
537 *o++ = V;
538 space -= 2;
539 if (T != TBase) {
540 *o++ = T;
541 space --;
542 }
543 continue;
544 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000545 /* normalization changes */
546 if (self) {
547 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
548 if (value != 0) {
549 stack[stackptr++] = value;
550 continue;
551 }
552 }
553
554 /* Other decompositions. */
555 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000556
557 /* Copy character if it is not decomposable, or has a
558 compatibility decomposition, but we do NFD. */
559 if (!count || (prefix && !k)) {
560 *o++ = code;
561 space--;
562 continue;
563 }
564 /* Copy decomposition onto the stack, in reverse
565 order. */
566 while(count) {
567 code = decomp_data[index + (--count)];
568 stack[stackptr++] = code;
569 }
570 }
571 }
572
573 /* Drop overallocation. Cannot fail. */
574 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
575
576 /* Sort canonically. */
577 i = PyUnicode_AS_UNICODE(result);
578 prev = _getrecord_ex(*i)->combining;
579 end = i + PyUnicode_GET_SIZE(result);
580 for (i++; i < end; i++) {
581 cur = _getrecord_ex(*i)->combining;
582 if (prev == 0 || cur == 0 || prev <= cur) {
583 prev = cur;
584 continue;
585 }
586 /* Non-canonical order. Need to switch *i with previous. */
587 o = i - 1;
588 while (1) {
589 Py_UNICODE tmp = o[1];
590 o[1] = o[0];
591 o[0] = tmp;
592 o--;
593 if (o < PyUnicode_AS_UNICODE(result))
594 break;
595 prev = _getrecord_ex(*o)->combining;
596 if (prev == 0 || prev <= cur)
597 break;
598 }
599 prev = _getrecord_ex(*i)->combining;
600 }
601 return result;
602}
603
604static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000605find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000606{
607 int index;
608 for (index = 0; nfc[index].start; index++) {
609 int start = nfc[index].start;
610 if (code < start)
611 return -1;
612 if (code <= start + nfc[index].count) {
613 int delta = code - start;
614 return nfc[index].index + delta;
615 }
616 }
617 return -1;
618}
619
620static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000621nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622{
623 PyObject *result;
624 Py_UNICODE *i, *i1, *o, *end;
625 int f,l,index,index1,comb;
626 Py_UNICODE code;
627 Py_UNICODE *skipped[20];
628 int cskipped = 0;
629
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000630 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000631 if (!result)
632 return NULL;
633
634 /* We are going to modify result in-place.
635 If nfd_nfkd is changed to sometimes return the input,
636 this code needs to be reviewed. */
637 assert(result != input);
638
639 i = PyUnicode_AS_UNICODE(result);
640 end = i + PyUnicode_GET_SIZE(result);
641 o = PyUnicode_AS_UNICODE(result);
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000642
Martin v. Löwis677bde22002-11-23 22:08:15 +0000643 again:
644 while (i < end) {
645 for (index = 0; index < cskipped; index++) {
646 if (skipped[index] == i) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000647 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000648 Remove from list. */
649 skipped[index] = skipped[cskipped-1];
650 cskipped--;
651 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000652 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000653 }
654 }
655 /* Hangul Composition. We don't need to check for <LV,T>
656 pairs, since we always have decomposed data. */
657 if (LBase <= *i && *i < (LBase+LCount) &&
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000658 i + 1 < end &&
Martin v. Löwis677bde22002-11-23 22:08:15 +0000659 VBase <= i[1] && i[1] <= (VBase+VCount)) {
660 int LIndex, VIndex;
661 LIndex = i[0] - LBase;
662 VIndex = i[1] - VBase;
663 code = SBase + (LIndex*VCount+VIndex)*TCount;
664 i+=2;
665 if (i < end &&
666 TBase <= *i && *i <= (TBase+TCount)) {
667 code += *i-TBase;
668 i++;
669 }
670 *o++ = code;
671 continue;
672 }
673
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000674 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000675 if (f == -1) {
676 *o++ = *i++;
677 continue;
678 }
679 /* Find next unblocked character. */
680 i1 = i+1;
681 comb = 0;
682 while (i1 < end) {
683 int comb1 = _getrecord_ex(*i1)->combining;
Alexander Belopolsky893c3542010-12-28 16:15:08 +0000684 if (comb) {
685 if (comb1 == 0)
686 break;
687 if (comb >= comb1) {
688 /* Character is blocked. */
689 i1++;
690 continue;
691 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000693 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 /* *i1 cannot be combined with *i. If *i1
695 is a starter, we don't need to look further.
696 Otherwise, record the combining class. */
697 if (l == -1) {
698 not_combinable:
699 if (comb1 == 0)
700 break;
701 comb = comb1;
702 i1++;
703 continue;
704 }
705 index = f*TOTAL_LAST + l;
706 index1 = comp_index[index >> COMP_SHIFT];
707 code = comp_data[(index1<<COMP_SHIFT)+
708 (index&((1<<COMP_SHIFT)-1))];
709 if (code == 0)
710 goto not_combinable;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000711
Martin v. Löwis677bde22002-11-23 22:08:15 +0000712 /* Replace the original character. */
713 *i = code;
714 /* Mark the second character unused. */
Alexander Belopolsky893c3542010-12-28 16:15:08 +0000715 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000716 skipped[cskipped++] = i1;
717 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000718 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000719 if (f == -1)
720 break;
721 }
722 *o++ = *i++;
723 }
724 if (o != end)
725 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
726 return result;
727}
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000728
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000729PyDoc_STRVAR(unicodedata_normalize__doc__,
730"normalize(form, unistr)\n\
731\n\
732Return the normal form 'form' for the Unicode string unistr. Valid\n\
733values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
734
Martin v. Löwis677bde22002-11-23 22:08:15 +0000735static PyObject*
736unicodedata_normalize(PyObject *self, PyObject *args)
737{
738 char *form;
739 PyObject *input;
740
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000741 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 &form, &PyUnicode_Type, &input))
743 return NULL;
744
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000745 if (PyUnicode_GetSize(input) == 0) {
746 /* Special case empty input strings, since resizing
747 them later would cause internal errors. */
748 Py_INCREF(input);
749 return input;
750 }
751
Martin v. Löwis677bde22002-11-23 22:08:15 +0000752 if (strcmp(form, "NFC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000753 return nfc_nfkc(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 if (strcmp(form, "NFKC") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000755 return nfc_nfkc(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000756 if (strcmp(form, "NFD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000757 return nfd_nfkd(self, input, 0);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000758 if (strcmp(form, "NFKD") == 0)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000759 return nfd_nfkd(self, input, 1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000760 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
761 return NULL;
762}
763
Fredrik Lundh06d12682001-01-24 07:59:11 +0000764/* -------------------------------------------------------------------- */
765/* unicode character name tables */
766
767/* data file generated by Tools/unicode/makeunicodedata.py */
768#include "unicodename_db.h"
769
770/* -------------------------------------------------------------------- */
771/* database code (cut and pasted from the unidb package) */
772
/* Case-insensitive hash of the first `len` bytes of `s`.  Each byte is
   upper-cased and mixed in with the multiplier `scale`; whenever any of
   bits 24-31 become set they are folded back into the low byte and the
   value is cut down to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top != 0)
            hash = (hash ^ ((top >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
787
/* Name fragments for Hangul syllables.  Each row holds one fragment per
   column: column 0 is indexed by L, column 1 by V and column 2 by T (see
   _getucname and _getcode).  The columns have different lengths; unused
   trailing slots are NULL. */
static char *hangul_syllables[][3] = {
    /*  L      V      T   */
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
818
Fredrik Lundh06d12682001-01-24 07:59:11 +0000819static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000820is_unified_ideograph(Py_UCS4 code)
821{
822 return (
823 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Martin v. Löwisc3509122006-03-11 12:16:23 +0000824 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000825 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
826}
827
828static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000829_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000830{
831 int offset;
832 int i;
833 int word;
834 unsigned char* w;
835
Martin v. Löwisc3509122006-03-11 12:16:23 +0000836 if (code >= 0x110000)
837 return 0;
838
839 if (self) {
840 const change_record *old = get_old_record(self, code);
841 if (old->category_changed == 0) {
842 /* unassigned */
843 return 0;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000844 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000845 }
846
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000847 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000848 /* Hangul syllable. */
849 int SIndex = code - SBase;
850 int L = SIndex / NCount;
851 int V = (SIndex % NCount) / TCount;
852 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000853
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000854 if (buflen < 27)
855 /* Worst case: HANGUL SYLLABLE <10chars>. */
856 return 0;
857 strcpy(buffer, "HANGUL SYLLABLE ");
858 buffer += 16;
859 strcpy(buffer, hangul_syllables[L][0]);
860 buffer += strlen(hangul_syllables[L][0]);
861 strcpy(buffer, hangul_syllables[V][1]);
862 buffer += strlen(hangul_syllables[V][1]);
863 strcpy(buffer, hangul_syllables[T][2]);
864 buffer += strlen(hangul_syllables[T][2]);
865 *buffer = '\0';
866 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000867 }
868
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000869 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000870 if (buflen < 28)
871 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
872 return 0;
873 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
874 return 1;
875 }
876
Fredrik Lundh06d12682001-01-24 07:59:11 +0000877 /* get offset into phrasebook */
878 offset = phrasebook_offset1[(code>>phrasebook_shift)];
879 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
880 (code&((1<<phrasebook_shift)-1))];
881 if (!offset)
882 return 0;
883
884 i = 0;
885
886 for (;;) {
887 /* get word index */
888 word = phrasebook[offset] - phrasebook_short;
889 if (word >= 0) {
890 word = (word << 8) + phrasebook[offset+1];
891 offset += 2;
892 } else
893 word = phrasebook[offset++];
894 if (i) {
895 if (i > buflen)
896 return 0; /* buffer overflow */
897 buffer[i++] = ' ';
898 }
899 /* copy word string from lexicon. the last character in the
900 word has bit 7 set. the last word in a string ends with
901 0x80 */
902 w = lexicon + lexicon_offset[word];
903 while (*w < 128) {
904 if (i >= buflen)
905 return 0; /* buffer overflow */
906 buffer[i++] = *w++;
907 }
908 if (i >= buflen)
909 return 0; /* buffer overflow */
910 buffer[i++] = *w & 127;
911 if (*w == 128)
912 break; /* end of word */
913 }
914
915 return 1;
916}
917
918static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000919_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000920{
921 /* check if code corresponds to the given name */
922 int i;
923 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000924 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000925 return 0;
926 for (i = 0; i < namelen; i++) {
Neal Norwitz65c05b22006-04-10 02:17:47 +0000927 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000928 return 0;
929 }
930 return buffer[namelen] == '\0';
931}
932
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000933static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000934find_syllable(const char *str, int *len, int *pos, int count, int column)
935{
936 int i, len1;
937 *len = -1;
938 for (i = 0; i < count; i++) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000939 char *s = hangul_syllables[i][column];
940 len1 = strlen(s);
941 if (len1 <= *len)
942 continue;
943 if (strncmp(str, s, len1) == 0) {
944 *len = len1;
945 *pos = i;
946 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000947 }
948 if (*len == -1) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000949 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000950 }
951}
952
Fredrik Lundh06d12682001-01-24 07:59:11 +0000953static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000954_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000955{
956 unsigned int h, v;
957 unsigned int mask = code_size-1;
958 unsigned int i, incr;
959
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000960 /* Check for hangul syllables. */
961 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +0000962 int len, L = -1, V = -1, T = -1;
963 const char *pos = name + 16;
964 find_syllable(pos, &len, &L, LCount, 0);
965 pos += len;
966 find_syllable(pos, &len, &V, VCount, 1);
967 pos += len;
968 find_syllable(pos, &len, &T, TCount, 2);
969 pos += len;
970 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
971 *code = SBase + (L*VCount+V)*TCount + T;
972 return 1;
973 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000974 /* Otherwise, it's an illegal syllable name. */
975 return 0;
976 }
977
978 /* Check for unified ideographs. */
979 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
980 /* Four or five hexdigits must follow. */
981 v = 0;
982 name += 22;
983 namelen -= 22;
984 if (namelen != 4 && namelen != 5)
985 return 0;
986 while (namelen--) {
987 v *= 16;
988 if (*name >= '0' && *name <= '9')
989 v += *name - '0';
990 else if (*name >= 'A' && *name <= 'F')
991 v += *name - 'A' + 10;
992 else
993 return 0;
994 name++;
995 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000996 if (!is_unified_ideograph(v))
997 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000998 *code = v;
999 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001000 }
1001
Fredrik Lundh06d12682001-01-24 07:59:11 +00001002 /* the following is the same as python's dictionary lookup, with
1003 only minor changes. see the makeunicodedata script for more
1004 details */
1005
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001006 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001007 i = (~h) & mask;
1008 v = code_hash[i];
1009 if (!v)
1010 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001011 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001012 *code = v;
1013 return 1;
1014 }
1015 incr = (h ^ (h >> 3)) & mask;
1016 if (!incr)
1017 incr = mask;
1018 for (;;) {
1019 i = (i + incr) & mask;
1020 v = code_hash[i];
1021 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001022 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001023 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001024 *code = v;
1025 return 1;
1026 }
1027 incr = incr << 1;
1028 if (incr > mask)
1029 incr = incr ^ code_poly;
1030 }
1031}
1032
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001033static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001034{
1035 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001036 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001037 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001038};
1039
1040/* -------------------------------------------------------------------- */
1041/* Python bindings */
1042
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001043PyDoc_STRVAR(unicodedata_name__doc__,
1044"name(unichr[, default])\n\
1045Returns the name assigned to the Unicode character unichr as a\n\
1046string. If no name is defined, default is returned, or, if not\n\
1047given, ValueError is raised.");
1048
Fredrik Lundh06d12682001-01-24 07:59:11 +00001049static PyObject *
1050unicodedata_name(PyObject* self, PyObject* args)
1051{
1052 char name[NAME_MAXLEN];
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001053 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001054
1055 PyUnicodeObject* v;
1056 PyObject* defobj = NULL;
1057 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1058 return NULL;
1059
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001060 c = getuchar(v);
1061 if (c == (Py_UCS4)-1)
1062 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001063
Walter Dörwalda2a89a82008-06-02 20:36:03 +00001064 if (!_getucname(self, c, name, sizeof(name))) {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001065 if (defobj == NULL) {
1066 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001067 return NULL;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001068 }
1069 else {
1070 Py_INCREF(defobj);
1071 return defobj;
1072 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001073 }
1074
1075 return Py_BuildValue("s", name);
1076}
1077
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001078PyDoc_STRVAR(unicodedata_lookup__doc__,
1079"lookup(name)\n\
1080\n\
1081Look up character by name. If a character with the\n\
1082given name is found, return the corresponding Unicode\n\
1083character. If not found, KeyError is raised.");
1084
Fredrik Lundh06d12682001-01-24 07:59:11 +00001085static PyObject *
1086unicodedata_lookup(PyObject* self, PyObject* args)
1087{
1088 Py_UCS4 code;
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001089 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001090
1091 char* name;
1092 int namelen;
1093 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1094 return NULL;
1095
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001096 if (!_getcode(self, name, namelen, &code)) {
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001097 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1098 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001099 return NULL;
1100 }
1101
Martin v. Löwisf1e0b3f2007-07-28 07:03:05 +00001102#ifndef Py_UNICODE_WIDE
1103 if (code >= 0x10000) {
1104 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1105 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1106 return PyUnicode_FromUnicode(str, 2);
1107 }
1108#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001109 str[0] = (Py_UNICODE) code;
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001110 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001111}
1112
/* Function table: used both for the module-level functions and as the
   method table of UCD objects. */
1114
1115static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001116 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1117 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1118 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1119 {"category", unicodedata_category, METH_VARARGS,
1120 unicodedata_category__doc__},
1121 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1122 unicodedata_bidirectional__doc__},
1123 {"combining", unicodedata_combining, METH_VARARGS,
1124 unicodedata_combining__doc__},
1125 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1126 unicodedata_mirrored__doc__},
1127 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1128 unicodedata_east_asian_width__doc__},
1129 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1130 unicodedata_decomposition__doc__},
1131 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1132 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1133 {"normalize", unicodedata_normalize, METH_VARARGS,
1134 unicodedata_normalize__doc__},
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001135 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001136};
1137
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001138static PyTypeObject UCD_Type = {
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001139 /* The ob_type field must be initialized in the module init function
1140 * to be portable to Windows without using C++. */
1141 PyVarObject_HEAD_INIT(NULL, 0)
1142 "unicodedata.UCD", /*tp_name*/
1143 sizeof(PreviousDBVersion), /*tp_basicsize*/
1144 0, /*tp_itemsize*/
1145 /* methods */
1146 (destructor)PyObject_Del, /*tp_dealloc*/
1147 0, /*tp_print*/
1148 0, /*tp_getattr*/
1149 0, /*tp_setattr*/
1150 0, /*tp_compare*/
1151 0, /*tp_repr*/
1152 0, /*tp_as_number*/
1153 0, /*tp_as_sequence*/
1154 0, /*tp_as_mapping*/
1155 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001156 0, /*tp_call*/
1157 0, /*tp_str*/
1158 PyObject_GenericGetAttr,/*tp_getattro*/
1159 0, /*tp_setattro*/
1160 0, /*tp_as_buffer*/
1161 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1162 0, /*tp_doc*/
1163 0, /*tp_traverse*/
1164 0, /*tp_clear*/
1165 0, /*tp_richcompare*/
1166 0, /*tp_weaklistoffset*/
1167 0, /*tp_iter*/
1168 0, /*tp_iternext*/
1169 unicodedata_functions, /*tp_methods*/
1170 DB_members, /*tp_members*/
1171 0, /*tp_getset*/
1172 0, /*tp_base*/
1173 0, /*tp_dict*/
1174 0, /*tp_descr_get*/
1175 0, /*tp_descr_set*/
1176 0, /*tp_dictoffset*/
1177 0, /*tp_init*/
1178 0, /*tp_alloc*/
1179 0, /*tp_new*/
1180 0, /*tp_free*/
1181 0, /*tp_is_gc*/
1182};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001183
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001184PyDoc_STRVAR(unicodedata_docstring,
1185"This module provides access to the Unicode Character Database which\n\
1186defines character properties for all Unicode characters. The data in\n\
1187this database is based on the UnicodeData.txt file version\n\
Martin v. Löwis24329ba2008-09-10 13:38:12 +000011885.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001189\n\
1190The module uses the same names and symbols as defined by the\n\
Martin v. Löwis24329ba2008-09-10 13:38:12 +00001191UnicodeData File Format 5.1.0 (see\n\
1192http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001193
Mark Hammond62b1ab12002-07-23 06:31:15 +00001194PyMODINIT_FUNC
Thomas Woutersf3f33dc2000-07-21 06:00:07 +00001195initunicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001196{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001197 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001198
Christian Heimese93237d2007-12-19 02:37:44 +00001199 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001200
Fred Drakef585bef2001-03-03 19:41:55 +00001201 m = Py_InitModule3(
1202 "unicodedata", unicodedata_functions, unicodedata_docstring);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001203 if (!m)
1204 return;
1205
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001206 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001207 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001208 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001209
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001210 /* Previous versions */
1211 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1212 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001213 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001214
Fredrik Lundh06d12682001-01-24 07:59:11 +00001215 /* Export C API */
1216 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001217 if (v != NULL)
1218 PyModule_AddObject(m, "ucnhash_CAPI", v);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001219}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001220
Antoine Pitrouc7c96a92010-05-09 15:15:40 +00001221/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001222Local variables:
1223c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001224indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001225End:
1226*/