blob: 1dacf881a0a8eba07d620a4538ea7d913579fb42 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.2 data base.

   Data was extracted from the Unicode 5.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One row of per-character properties.  Field order matters: the rows in
   _PyUnicode_Database_Records are looked up by index in _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Per-character differences between a previous database version and the
   current one.  In the accessors below, 0xFF in a byte field is treated as
   "not changed", and category_changed == 0 as "unassigned in the old
   version".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000042/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000046_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000049 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000059/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
/* Fetch the change record of code point v from the PreviousDBVersion
   object self.  Both arguments are parenthesized so that expression
   arguments are cast and passed as a whole. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 {NULL}
72};
73
Thomas Wouters89f507f2006-12-13 04:49:30 +000074/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000075static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000076#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000077
78static PyObject*
79new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
80 Py_UCS4 (*normalization)(Py_UCS4))
81{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 PreviousDBVersion *self;
83 self = PyObject_New(PreviousDBVersion, &UCD_Type);
84 if (self == NULL)
85 return NULL;
86 self->name = name;
87 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090}
91
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000092
93static Py_UCS4 getuchar(PyUnicodeObject *obj)
94{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020095 if (PyUnicode_READY(obj))
96 return (Py_UCS4)-1;
97 if (PyUnicode_GET_LENGTH(obj) == 1) {
98 if (PyUnicode_READY(obj))
99 return (Py_UCS4)-1;
100 return PyUnicode_READ_CHAR(obj, 0);
101 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000102 PyErr_SetString(PyExc_TypeError,
103 "need a single Unicode character as parameter");
104 return (Py_UCS4)-1;
105}
106
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000107/* --- Module API --------------------------------------------------------- */
108
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000109PyDoc_STRVAR(unicodedata_decimal__doc__,
110"decimal(unichr[, default])\n\
111\n\
112Returns the decimal value assigned to the Unicode character unichr\n\
113as integer. If no such value is defined, default is returned, or, if\n\
114not given, ValueError is raised.");
115
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000116static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118{
119 PyUnicodeObject *v;
120 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000121 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000123 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124
Fredrik Lundh06d12682001-01-24 07:59:11 +0000125 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000126 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000127 c = getuchar(v);
128 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000130
Martin v. Löwis1a214512008-06-11 05:26:20 +0000131 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000132 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133 if (old->category_changed == 0) {
134 /* unassigned */
135 have_old = 1;
136 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000138 else if (old->decimal_changed != 0xFF) {
139 have_old = 1;
140 rc = old->decimal_changed;
141 }
142 }
143
144 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000145 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000146 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 if (defobj == NULL) {
148 PyErr_SetString(PyExc_ValueError,
149 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000150 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 }
152 else {
153 Py_INCREF(defobj);
154 return defobj;
155 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000157 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158}
159
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000160PyDoc_STRVAR(unicodedata_digit__doc__,
161"digit(unichr[, default])\n\
162\n\
163Returns the digit value assigned to the Unicode character unichr as\n\
164integer. If no such value is defined, default is returned, or, if\n\
165not given, ValueError is raised.");
166
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000167static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169{
170 PyUnicodeObject *v;
171 PyObject *defobj = NULL;
172 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000173 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174
Fredrik Lundh06d12682001-01-24 07:59:11 +0000175 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000177 c = getuchar(v);
178 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 if (defobj == NULL) {
183 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 }
186 else {
187 Py_INCREF(defobj);
188 return defobj;
189 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000191 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192}
193
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000194PyDoc_STRVAR(unicodedata_numeric__doc__,
195"numeric(unichr[, default])\n\
196\n\
197Returns the numeric value assigned to the Unicode character unichr\n\
198as float. If no such value is defined, default is returned, or, if\n\
199not given, ValueError is raised.");
200
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000202unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203{
204 PyUnicodeObject *v;
205 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000206 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000208 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209
Fredrik Lundh06d12682001-01-24 07:59:11 +0000210 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000212 c = getuchar(v);
213 if (c == (Py_UCS4)-1)
214 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000215
Martin v. Löwis1a214512008-06-11 05:26:20 +0000216 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000217 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218 if (old->category_changed == 0) {
219 /* unassigned */
220 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000223 else if (old->decimal_changed != 0xFF) {
224 have_old = 1;
225 rc = old->decimal_changed;
226 }
227 }
228
229 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 if (defobj == NULL) {
233 PyErr_SetString(PyExc_ValueError, "not a numeric character");
234 return NULL;
235 }
236 else {
237 Py_INCREF(defobj);
238 return defobj;
239 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000240 }
241 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242}
243
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000244PyDoc_STRVAR(unicodedata_category__doc__,
245"category(unichr)\n\
246\n\
247Returns the general category assigned to the Unicode character\n\
248unichr as string.");
249
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252{
253 PyUnicodeObject *v;
254 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000255 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256
257 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 &PyUnicode_Type, &v))
259 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000260 c = getuchar(v);
261 if (c == (Py_UCS4)-1)
262 return NULL;
263 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000264 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000265 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (old->category_changed != 0xFF)
267 index = old->category_changed;
268 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270}
271
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000272PyDoc_STRVAR(unicodedata_bidirectional__doc__,
273"bidirectional(unichr)\n\
274\n\
275Returns the bidirectional category assigned to the Unicode character\n\
276unichr as string. If no such value is defined, an empty string is\n\
277returned.");
278
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000279static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000280unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281{
282 PyUnicodeObject *v;
283 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000284 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285
286 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 &PyUnicode_Type, &v))
288 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000289 c = getuchar(v);
290 if (c == (Py_UCS4)-1)
291 return NULL;
292 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000293 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000294 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000295 if (old->category_changed == 0)
296 index = 0; /* unassigned */
297 else if (old->bidir_changed != 0xFF)
298 index = old->bidir_changed;
299 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000300 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000301}
302
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000303PyDoc_STRVAR(unicodedata_combining__doc__,
304"combining(unichr)\n\
305\n\
306Returns the canonical combining class assigned to the Unicode\n\
307character unichr as integer. Returns 0 if no combining class is\n\
308defined.");
309
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000310static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000311unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312{
313 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000314 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000315 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316
317 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 &PyUnicode_Type, &v))
319 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000320 c = getuchar(v);
321 if (c == (Py_UCS4)-1)
322 return NULL;
323 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000324 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000325 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000326 if (old->category_changed == 0)
327 index = 0; /* unassigned */
328 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000329 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000330}
331
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000332PyDoc_STRVAR(unicodedata_mirrored__doc__,
333"mirrored(unichr)\n\
334\n\
335Returns the mirrored property assigned to the Unicode character\n\
336unichr as integer. Returns 1 if the character has been identified as\n\
337a \"mirrored\" character in bidirectional text, 0 otherwise.");
338
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000339static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000340unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341{
342 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000343 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000344 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000345
346 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000347 &PyUnicode_Type, &v))
348 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000349 c = getuchar(v);
350 if (c == (Py_UCS4)-1)
351 return NULL;
352 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000353 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 if (old->category_changed == 0)
356 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000357 else if (old->mirrored_changed != 0xFF)
358 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000359 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000360 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000361}
362
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000363PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
364"east_asian_width(unichr)\n\
365\n\
366Returns the east asian width assigned to the Unicode character\n\
367unichr as string.");
368
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000369static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000370unicodedata_east_asian_width(PyObject *self, PyObject *args)
371{
372 PyUnicodeObject *v;
373 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000374 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000375
376 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 &PyUnicode_Type, &v))
378 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000379 c = getuchar(v);
380 if (c == (Py_UCS4)-1)
381 return NULL;
382 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000383 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000384 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000385 if (old->category_changed == 0)
386 index = 0; /* unassigned */
387 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000388 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000389}
390
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000391PyDoc_STRVAR(unicodedata_decomposition__doc__,
392"decomposition(unichr)\n\
393\n\
394Returns the character decomposition mapping assigned to the Unicode\n\
395character unichr as string. An empty string is returned in case no\n\
396such mapping is defined.");
397
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000398static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000399unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000400{
401 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000402 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000403 int code, index, count;
404 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000405 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000406 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407
408 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000409 &PyUnicode_Type, &v))
410 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000411 c = getuchar(v);
412 if (c == (Py_UCS4)-1)
413 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000414
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000415 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Martin v. Löwis1a214512008-06-11 05:26:20 +0000417 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000418 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000419 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000420 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 }
422
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000423 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000424 index = 0;
425 else {
426 index = decomp_index1[(code>>DECOMP_SHIFT)];
427 index = decomp_index2[(index<<DECOMP_SHIFT)+
428 (code&((1<<DECOMP_SHIFT)-1))];
429 }
430
Tim Peters69b83b12001-11-30 07:23:05 +0000431 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000432 is prefix code (from*/
433 count = decomp_data[index] >> 8;
434
435 /* XXX: could allocate the PyString up front instead
436 (strlen(prefix) + 5 * count + 1 bytes) */
437
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000438 /* Based on how index is calculated above and decomp_data is generated
439 from Tools/unicode/makeunicodedata.py, it should not be possible
440 to overflow decomp_prefix. */
441 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200442 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000443
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000445 i = strlen(decomp_prefix[prefix_index]);
446 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447
448 while (count-- > 0) {
449 if (i)
450 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000451 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000452 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
453 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454 i += strlen(decomp + i);
455 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000456 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000457}
458
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000459static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000460get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000461{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000462 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000463 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000465 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000466 /* unassigned in old version */
467 *index = 0;
468 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000469 else {
470 *index = decomp_index1[(code>>DECOMP_SHIFT)];
471 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
472 (code&((1<<DECOMP_SHIFT)-1))];
473 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474
Martin v. Löwis677bde22002-11-23 22:08:15 +0000475 /* high byte is number of hex bytes (usually one or two), low byte
476 is prefix code (from*/
477 *count = decomp_data[*index] >> 8;
478 *prefix = decomp_data[*index] & 255;
479
480 (*index)++;
481}
482
/* Constants for arithmetic Hangul syllable (de)composition. */

/* first code point of each range */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7

/* number of code points in each range */
#define LCount  19
#define VCount  21
#define TCount  28

/* derived sizes */
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
492
493static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000494nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000495{
496 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200497 Py_UCS4 *output;
498 Py_ssize_t i, o, osize;
499 int kind;
500 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000501 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200502 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200508 isize = PyUnicode_GET_LENGTH(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000509 /* Overallocate atmost 10 characters. */
510 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200511 osize = space;
512 output = PyMem_Malloc(space * sizeof(Py_UCS4));
513 if (!output) {
514 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000515 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200516 }
517 i = o = 0;
518 kind = PyUnicode_KIND(input);
519 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000520
Martin v. Löwis22970662011-09-29 13:39:38 +0200521 while (i < isize) {
522 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000523 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200524 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000525 /* Hangul Decomposition adds three characters in
526 a single step, so we need atleast that much room. */
527 if (space < 3) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200528 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 space += 10;
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
531 if (output == NULL) {
532 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000533 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200534 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000535 }
536 /* Hangul Decomposition. */
537 if (SBase <= code && code < (SBase+SCount)) {
538 int SIndex = code - SBase;
539 int L = LBase + SIndex / NCount;
540 int V = VBase + (SIndex % NCount) / TCount;
541 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200542 output[o++] = L;
543 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000544 space -= 2;
545 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200546 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000547 space --;
548 }
549 continue;
550 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000551 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000552 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000553 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
554 if (value != 0) {
555 stack[stackptr++] = value;
556 continue;
557 }
558 }
559
560 /* Other decompositions. */
561 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000562
563 /* Copy character if it is not decomposable, or has a
564 compatibility decomposition, but we do NFD. */
565 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200566 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000567 space--;
568 continue;
569 }
570 /* Copy decomposition onto the stack, in reverse
571 order. */
572 while(count) {
573 code = decomp_data[index + (--count)];
574 stack[stackptr++] = code;
575 }
576 }
577 }
578
Martin v. Löwis22970662011-09-29 13:39:38 +0200579 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
580 output, o);
581 PyMem_Free(output);
582 if (!result)
583 return NULL;
584 /* result is guaranteed to be ready, as it is compact. */
585 kind = PyUnicode_KIND(result);
586 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000587
588 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200589 i = 0;
590 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
591 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
592 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000593 if (prev == 0 || cur == 0 || prev <= cur) {
594 prev = cur;
595 continue;
596 }
597 /* Non-canonical order. Need to switch *i with previous. */
598 o = i - 1;
599 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200600 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
601 PyUnicode_WRITE(kind, data, o+1,
602 PyUnicode_READ(kind, data, o));
603 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000604 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200605 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000606 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200607 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608 if (prev == 0 || prev <= cur)
609 break;
610 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200611 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000612 }
613 return result;
614}
615
616static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200617find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000618{
619 int index;
620 for (index = 0; nfc[index].start; index++) {
621 int start = nfc[index].start;
622 if (code < start)
623 return -1;
624 if (code <= start + nfc[index].count) {
625 int delta = code - start;
626 return nfc[index].index + delta;
627 }
628 }
629 return -1;
630}
631
632static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000633nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634{
635 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200636 int kind;
637 void *data;
638 Py_UCS4 *output;
639 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200641 Py_UCS4 code;
642 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000643 int cskipped = 0;
644
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000645 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646 if (!result)
647 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200648 /* result will be "ready". */
649 kind = PyUnicode_KIND(result);
650 data = PyUnicode_DATA(result);
651 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652
Martin v. Löwis22970662011-09-29 13:39:38 +0200653 /* We allocate a buffer for the output.
654 If we find that we made no changes, we still return
655 the NFD result. */
656 output = PyMem_Malloc(len * sizeof(Py_UCS4));
657 if (!output) {
658 PyErr_NoMemory();
659 Py_DECREF(result);
660 return 0;
661 }
662 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663
Martin v. Löwis677bde22002-11-23 22:08:15 +0000664 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200665 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666 for (index = 0; index < cskipped; index++) {
667 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000668 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000669 Remove from list. */
670 skipped[index] = skipped[cskipped-1];
671 cskipped--;
672 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000673 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000674 }
675 }
676 /* Hangul Composition. We don't need to check for <LV,T>
677 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200678 code = PyUnicode_READ(kind, data, i);
679 if (LBase <= code && code < (LBase+LCount) &&
680 i + 1 < len &&
681 VBase <= PyUnicode_READ(kind, data, i+1) &&
682 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000683 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200684 LIndex = code - LBase;
685 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000686 code = SBase + (LIndex*VCount+VIndex)*TCount;
687 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200688 if (i < len &&
689 TBase <= PyUnicode_READ(kind, data, i) &&
690 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
691 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 i++;
693 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200694 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000695 continue;
696 }
697
Martin v. Löwis22970662011-09-29 13:39:38 +0200698 /* code is still input[i] here */
699 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000700 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200701 output[o++] = code;
702 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000703 continue;
704 }
705 /* Find next unblocked character. */
706 i1 = i+1;
707 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200708 /* output base character for now; might be updated later. */
709 output[o] = PyUnicode_READ(kind, data, i);
710 while (i1 < len) {
711 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
712 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000713 if (comb) {
714 if (comb1 == 0)
715 break;
716 if (comb >= comb1) {
717 /* Character is blocked. */
718 i1++;
719 continue;
720 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200722 l = find_nfc_index(self, nfc_last, code1);
723 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724 is a starter, we don't need to look further.
725 Otherwise, record the combining class. */
726 if (l == -1) {
727 not_combinable:
728 if (comb1 == 0)
729 break;
730 comb = comb1;
731 i1++;
732 continue;
733 }
734 index = f*TOTAL_LAST + l;
735 index1 = comp_index[index >> COMP_SHIFT];
736 code = comp_data[(index1<<COMP_SHIFT)+
737 (index&((1<<COMP_SHIFT)-1))];
738 if (code == 0)
739 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740
Martin v. Löwis677bde22002-11-23 22:08:15 +0000741 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200742 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000744 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 skipped[cskipped++] = i1;
746 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200747 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000748 if (f == -1)
749 break;
750 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200751 /* Output character was already written.
752 Just advance the indices. */
753 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200755 if (o == len) {
756 /* No changes. Return original string. */
757 PyMem_Free(output);
758 return result;
759 }
760 Py_DECREF(result);
761 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
762 output, o);
763 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000764 return result;
765}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000766
767/* Return 1 if the input is certainly normalized, 0 if it might not be. */
768static int
769is_normalized(PyObject *self, PyObject *input, int nfc, int k)
770{
Martin v. Löwis22970662011-09-29 13:39:38 +0200771 Py_ssize_t i, len;
772 int kind;
773 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000774 unsigned char prev_combining = 0, quickcheck_mask;
775
776 /* An older version of the database is requested, quickchecks must be
777 disabled. */
778 if (self && UCD_Check(self))
779 return 0;
780
781 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
782 as described in http://unicode.org/reports/tr15/#Annex8. */
783 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
784
Martin v. Löwis22970662011-09-29 13:39:38 +0200785 i = 0;
786 kind = PyUnicode_KIND(input);
787 data = PyUnicode_DATA(input);
788 len = PyUnicode_GET_LENGTH(input);
789 while (i < len) {
790 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
791 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000792 unsigned char combining = record->combining;
793 unsigned char quickcheck = record->normalization_quick_check;
794
795 if (quickcheck & quickcheck_mask)
796 return 0; /* this string might need normalization */
797 if (combining && prev_combining > combining)
798 return 0; /* non-canonical sort order, not normalized */
799 prev_combining = combining;
800 }
801 return 1; /* certainly normalized */
802}
803
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000804PyDoc_STRVAR(unicodedata_normalize__doc__,
805"normalize(form, unistr)\n\
806\n\
807Return the normal form 'form' for the Unicode string unistr. Valid\n\
808values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
809
Martin v. Löwis677bde22002-11-23 22:08:15 +0000810static PyObject*
811unicodedata_normalize(PyObject *self, PyObject *args)
812{
813 char *form;
814 PyObject *input;
815
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000816 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000817 &form, &PyUnicode_Type, &input))
818 return NULL;
819
Martin v. Löwis22970662011-09-29 13:39:38 +0200820 if (PyUnicode_READY(input) == -1)
821 return NULL;
822
823 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000824 /* Special case empty input strings, since resizing
825 them later would cause internal errors. */
826 Py_INCREF(input);
827 return input;
828 }
829
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000830 if (strcmp(form, "NFC") == 0) {
831 if (is_normalized(self, input, 1, 0)) {
832 Py_INCREF(input);
833 return input;
834 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000835 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000836 }
837 if (strcmp(form, "NFKC") == 0) {
838 if (is_normalized(self, input, 1, 1)) {
839 Py_INCREF(input);
840 return input;
841 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000842 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000843 }
844 if (strcmp(form, "NFD") == 0) {
845 if (is_normalized(self, input, 0, 0)) {
846 Py_INCREF(input);
847 return input;
848 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000849 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000850 }
851 if (strcmp(form, "NFKD") == 0) {
852 if (is_normalized(self, input, 0, 1)) {
853 Py_INCREF(input);
854 return input;
855 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000856 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000857 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000858 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
859 return NULL;
860}
861
Fredrik Lundh06d12682001-01-24 07:59:11 +0000862/* -------------------------------------------------------------------- */
863/* unicode character name tables */
864
865/* data file generated by Tools/unicode/makeunicodedata.py */
866#include "unicodename_db.h"
867
868/* -------------------------------------------------------------------- */
869/* database code (cut and pasted from the unidb package) */
870
/* Case-insensitive hash of the first len bytes of s.

   Each byte is folded to upper case (ASCII letters only) and mixed into
   the running value as h*scale + byte; whenever bits 24..31 become
   non-zero they are folded back into the low byte so that the result
   stays below 2**24.

   The folding is done by hand rather than with toupper() because
   toupper() depends on the current C locale and would then produce a
   different hash for the same name in some locales. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        unsigned char c = (unsigned char) s[i];
        if (c >= 'a' && c <= 'z')
            c = (unsigned char) (c - 'a' + 'A');
        h = (h * scale) + c;
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
885
/* Name fragments used to spell Hangul syllable names.  Each row holds, in
   order, a leading-consonant, a vowel and a trailing-consonant fragment;
   the three columns have different lengths, so the shorter ones are
   padded with NULL. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
916
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000917/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000918static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000919is_unified_ideograph(Py_UCS4 code)
920{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000921 return
922 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
923 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
924 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
925 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
926 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000927}
928
929static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000930_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000931{
932 int offset;
933 int i;
934 int word;
935 unsigned char* w;
936
Martin v. Löwisc3509122006-03-11 12:16:23 +0000937 if (code >= 0x110000)
938 return 0;
939
Martin v. Löwis1a214512008-06-11 05:26:20 +0000940 if (self && UCD_Check(self)) {
Martin v. Löwisc3509122006-03-11 12:16:23 +0000941 const change_record *old = get_old_record(self, code);
942 if (old->category_changed == 0) {
943 /* unassigned */
944 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000945 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000946 }
947
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000948 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000949 /* Hangul syllable. */
950 int SIndex = code - SBase;
951 int L = SIndex / NCount;
952 int V = (SIndex % NCount) / TCount;
953 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000954
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000955 if (buflen < 27)
956 /* Worst case: HANGUL SYLLABLE <10chars>. */
957 return 0;
958 strcpy(buffer, "HANGUL SYLLABLE ");
959 buffer += 16;
960 strcpy(buffer, hangul_syllables[L][0]);
961 buffer += strlen(hangul_syllables[L][0]);
962 strcpy(buffer, hangul_syllables[V][1]);
963 buffer += strlen(hangul_syllables[V][1]);
964 strcpy(buffer, hangul_syllables[T][2]);
965 buffer += strlen(hangul_syllables[T][2]);
966 *buffer = '\0';
967 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000968 }
969
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000970 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000971 if (buflen < 28)
972 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
973 return 0;
974 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
975 return 1;
976 }
977
Fredrik Lundh06d12682001-01-24 07:59:11 +0000978 /* get offset into phrasebook */
979 offset = phrasebook_offset1[(code>>phrasebook_shift)];
980 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
981 (code&((1<<phrasebook_shift)-1))];
982 if (!offset)
983 return 0;
984
985 i = 0;
986
987 for (;;) {
988 /* get word index */
989 word = phrasebook[offset] - phrasebook_short;
990 if (word >= 0) {
991 word = (word << 8) + phrasebook[offset+1];
992 offset += 2;
993 } else
994 word = phrasebook[offset++];
995 if (i) {
996 if (i > buflen)
997 return 0; /* buffer overflow */
998 buffer[i++] = ' ';
999 }
1000 /* copy word string from lexicon. the last character in the
1001 word has bit 7 set. the last word in a string ends with
1002 0x80 */
1003 w = lexicon + lexicon_offset[word];
1004 while (*w < 128) {
1005 if (i >= buflen)
1006 return 0; /* buffer overflow */
1007 buffer[i++] = *w++;
1008 }
1009 if (i >= buflen)
1010 return 0; /* buffer overflow */
1011 buffer[i++] = *w & 127;
1012 if (*w == 128)
1013 break; /* end of word */
1014 }
1015
1016 return 1;
1017}
1018
1019static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001020_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001021{
1022 /* check if code corresponds to the given name */
1023 int i;
1024 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001025 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001026 return 0;
1027 for (i = 0; i < namelen; i++) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001028 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001029 return 0;
1030 }
1031 return buffer[namelen] == '\0';
1032}
1033
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001034static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001035find_syllable(const char *str, int *len, int *pos, int count, int column)
1036{
1037 int i, len1;
1038 *len = -1;
1039 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001040 char *s = hangul_syllables[i][column];
1041 len1 = strlen(s);
1042 if (len1 <= *len)
1043 continue;
1044 if (strncmp(str, s, len1) == 0) {
1045 *len = len1;
1046 *pos = i;
1047 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001048 }
1049 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001050 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001051 }
1052}
1053
Fredrik Lundh06d12682001-01-24 07:59:11 +00001054static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001055_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001056{
1057 unsigned int h, v;
1058 unsigned int mask = code_size-1;
1059 unsigned int i, incr;
1060
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001061 /* Check for hangul syllables. */
1062 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063 int len, L = -1, V = -1, T = -1;
1064 const char *pos = name + 16;
1065 find_syllable(pos, &len, &L, LCount, 0);
1066 pos += len;
1067 find_syllable(pos, &len, &V, VCount, 1);
1068 pos += len;
1069 find_syllable(pos, &len, &T, TCount, 2);
1070 pos += len;
1071 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1072 *code = SBase + (L*VCount+V)*TCount + T;
1073 return 1;
1074 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001075 /* Otherwise, it's an illegal syllable name. */
1076 return 0;
1077 }
1078
1079 /* Check for unified ideographs. */
1080 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1081 /* Four or five hexdigits must follow. */
1082 v = 0;
1083 name += 22;
1084 namelen -= 22;
1085 if (namelen != 4 && namelen != 5)
1086 return 0;
1087 while (namelen--) {
1088 v *= 16;
1089 if (*name >= '0' && *name <= '9')
1090 v += *name - '0';
1091 else if (*name >= 'A' && *name <= 'F')
1092 v += *name - 'A' + 10;
1093 else
1094 return 0;
1095 name++;
1096 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001097 if (!is_unified_ideograph(v))
1098 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001099 *code = v;
1100 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001101 }
1102
Fredrik Lundh06d12682001-01-24 07:59:11 +00001103 /* the following is the same as python's dictionary lookup, with
1104 only minor changes. see the makeunicodedata script for more
1105 details */
1106
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001107 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001108 i = (~h) & mask;
1109 v = code_hash[i];
1110 if (!v)
1111 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001112 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001113 *code = v;
1114 return 1;
1115 }
1116 incr = (h ^ (h >> 3)) & mask;
1117 if (!incr)
1118 incr = mask;
1119 for (;;) {
1120 i = (i + incr) & mask;
1121 v = code_hash[i];
1122 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001123 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001124 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001125 *code = v;
1126 return 1;
1127 }
1128 incr = incr << 1;
1129 if (incr > mask)
1130 incr = incr ^ code_poly;
1131 }
1132}
1133
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001134static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001135{
1136 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001137 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001138 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001139};
1140
1141/* -------------------------------------------------------------------- */
1142/* Python bindings */
1143
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001144PyDoc_STRVAR(unicodedata_name__doc__,
1145"name(unichr[, default])\n\
1146Returns the name assigned to the Unicode character unichr as a\n\
1147string. If no name is defined, default is returned, or, if not\n\
1148given, ValueError is raised.");
1149
Fredrik Lundh06d12682001-01-24 07:59:11 +00001150static PyObject *
1151unicodedata_name(PyObject* self, PyObject* args)
1152{
1153 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001154 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001155
1156 PyUnicodeObject* v;
1157 PyObject* defobj = NULL;
1158 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1159 return NULL;
1160
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001161 c = getuchar(v);
1162 if (c == (Py_UCS4)-1)
1163 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001164
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001165 if (!_getucname(self, c, name, sizeof(name))) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001166 if (defobj == NULL) {
1167 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001168 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001169 }
1170 else {
1171 Py_INCREF(defobj);
1172 return defobj;
1173 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001174 }
1175
Walter Dörwald4254e762007-06-05 16:04:09 +00001176 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001177}
1178
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001179PyDoc_STRVAR(unicodedata_lookup__doc__,
1180"lookup(name)\n\
1181\n\
1182Look up character by name. If a character with the\n\
1183given name is found, return the corresponding Unicode\n\
1184character. If not found, KeyError is raised.");
1185
Fredrik Lundh06d12682001-01-24 07:59:11 +00001186static PyObject *
1187unicodedata_lookup(PyObject* self, PyObject* args)
1188{
1189 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001190
1191 char* name;
1192 int namelen;
1193 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1194 return NULL;
1195
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001196 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001197 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1198 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001199 return NULL;
1200 }
1201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001203}
1204
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001205/* XXX Add doc strings. */
1206
1207static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001208 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1209 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1210 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1211 {"category", unicodedata_category, METH_VARARGS,
1212 unicodedata_category__doc__},
1213 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1214 unicodedata_bidirectional__doc__},
1215 {"combining", unicodedata_combining, METH_VARARGS,
1216 unicodedata_combining__doc__},
1217 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1218 unicodedata_mirrored__doc__},
1219 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1220 unicodedata_east_asian_width__doc__},
1221 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1222 unicodedata_decomposition__doc__},
1223 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1224 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1225 {"normalize", unicodedata_normalize, METH_VARARGS,
1226 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001227 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001228};
1229
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001230static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001231 /* The ob_type field must be initialized in the module init function
1232 * to be portable to Windows without using C++. */
1233 PyVarObject_HEAD_INIT(NULL, 0)
1234 "unicodedata.UCD", /*tp_name*/
1235 sizeof(PreviousDBVersion), /*tp_basicsize*/
1236 0, /*tp_itemsize*/
1237 /* methods */
1238 (destructor)PyObject_Del, /*tp_dealloc*/
1239 0, /*tp_print*/
1240 0, /*tp_getattr*/
1241 0, /*tp_setattr*/
1242 0, /*tp_reserved*/
1243 0, /*tp_repr*/
1244 0, /*tp_as_number*/
1245 0, /*tp_as_sequence*/
1246 0, /*tp_as_mapping*/
1247 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001248 0, /*tp_call*/
1249 0, /*tp_str*/
1250 PyObject_GenericGetAttr,/*tp_getattro*/
1251 0, /*tp_setattro*/
1252 0, /*tp_as_buffer*/
1253 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1254 0, /*tp_doc*/
1255 0, /*tp_traverse*/
1256 0, /*tp_clear*/
1257 0, /*tp_richcompare*/
1258 0, /*tp_weaklistoffset*/
1259 0, /*tp_iter*/
1260 0, /*tp_iternext*/
1261 unicodedata_functions, /*tp_methods*/
1262 DB_members, /*tp_members*/
1263 0, /*tp_getset*/
1264 0, /*tp_base*/
1265 0, /*tp_dict*/
1266 0, /*tp_descr_get*/
1267 0, /*tp_descr_set*/
1268 0, /*tp_dictoffset*/
1269 0, /*tp_init*/
1270 0, /*tp_alloc*/
1271 0, /*tp_new*/
1272 0, /*tp_free*/
1273 0, /*tp_is_gc*/
1274};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001275
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001276PyDoc_STRVAR(unicodedata_docstring,
1277"This module provides access to the Unicode Character Database which\n\
1278defines character properties for all Unicode characters. The data in\n\
1279this database is based on the UnicodeData.txt file version\n\
Ezio Melotti4c5475d2010-03-22 23:16:42 +000012805.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001281\n\
1282The module uses the same names and symbols as defined by the\n\
Ezio Melottid96b2f22010-03-23 00:39:22 +00001283UnicodeData File Format 5.2.0 (see\n\
1284http://www.unicode.org/reports/tr44/tr44-4.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001285
Martin v. Löwis1a214512008-06-11 05:26:20 +00001286
1287static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 PyModuleDef_HEAD_INIT,
1289 "unicodedata",
1290 unicodedata_docstring,
1291 -1,
1292 unicodedata_functions,
1293 NULL,
1294 NULL,
1295 NULL,
1296 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001297};
1298
Mark Hammond62b1ab12002-07-23 06:31:15 +00001299PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001300PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001301{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001302 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001303
Christian Heimes90aa7642007-12-19 02:45:37 +00001304 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001305
Martin v. Löwis1a214512008-06-11 05:26:20 +00001306 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001307 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001308 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001309
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001310 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001311 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001312 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001313
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001314 /* Previous versions */
1315 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1316 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001317 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001318
Fredrik Lundh06d12682001-01-24 07:59:11 +00001319 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001320 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001321 if (v != NULL)
1322 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001323 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001324}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001325
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001326/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001327Local variables:
1328c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001329indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001330End:
1331*/