blob: 82a711fd34a15fbaf3660d1c9ed278e1046679cc [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti4c5475d2010-03-22 23:16:42 +00003 unicodedata -- Provides access to the Unicode 5.2 data base.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti4c5475d2010-03-22 23:16:42 +00005 Data was extracted from the Unicode 5.2 UnicodeData.txt file.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00006
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +00009 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000010
Fredrik Lundhcfcea492000-09-25 08:07:06 +000011 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000012
13 ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One record of per-character properties.  _getrecord_ex() returns a
   pointer to one of these for a given code point. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Per-character differences between an earlier database version and the
   current one.  The accessors in this file treat 0xFF in a byte field as
   "fall back to the current database" and category_changed == 0 as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000042/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000046_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000049 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000059/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
67#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 {NULL}
72};
73
Thomas Wouters89f507f2006-12-13 04:49:30 +000074/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000075static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000076#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000077
78static PyObject*
79new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
80 Py_UCS4 (*normalization)(Py_UCS4))
81{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 PreviousDBVersion *self;
83 self = PyObject_New(PreviousDBVersion, &UCD_Type);
84 if (self == NULL)
85 return NULL;
86 self->name = name;
87 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090}
91
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000092
93static Py_UCS4 getuchar(PyUnicodeObject *obj)
94{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020095 if (PyUnicode_READY(obj))
96 return (Py_UCS4)-1;
97 if (PyUnicode_GET_LENGTH(obj) == 1) {
98 if (PyUnicode_READY(obj))
99 return (Py_UCS4)-1;
100 return PyUnicode_READ_CHAR(obj, 0);
101 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000102 PyErr_SetString(PyExc_TypeError,
103 "need a single Unicode character as parameter");
104 return (Py_UCS4)-1;
105}
106
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000107/* --- Module API --------------------------------------------------------- */
108
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000109PyDoc_STRVAR(unicodedata_decimal__doc__,
110"decimal(unichr[, default])\n\
111\n\
112Returns the decimal value assigned to the Unicode character unichr\n\
113as integer. If no such value is defined, default is returned, or, if\n\
114not given, ValueError is raised.");
115
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000116static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118{
119 PyUnicodeObject *v;
120 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000121 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000123 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124
Fredrik Lundh06d12682001-01-24 07:59:11 +0000125 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000126 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000127 c = getuchar(v);
128 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000130
Martin v. Löwis1a214512008-06-11 05:26:20 +0000131 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000132 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133 if (old->category_changed == 0) {
134 /* unassigned */
135 have_old = 1;
136 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000138 else if (old->decimal_changed != 0xFF) {
139 have_old = 1;
140 rc = old->decimal_changed;
141 }
142 }
143
144 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000145 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000146 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 if (defobj == NULL) {
148 PyErr_SetString(PyExc_ValueError,
149 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000150 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 }
152 else {
153 Py_INCREF(defobj);
154 return defobj;
155 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000157 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158}
159
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000160PyDoc_STRVAR(unicodedata_digit__doc__,
161"digit(unichr[, default])\n\
162\n\
163Returns the digit value assigned to the Unicode character unichr as\n\
164integer. If no such value is defined, default is returned, or, if\n\
165not given, ValueError is raised.");
166
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000167static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169{
170 PyUnicodeObject *v;
171 PyObject *defobj = NULL;
172 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000173 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174
Fredrik Lundh06d12682001-01-24 07:59:11 +0000175 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000177 c = getuchar(v);
178 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 if (defobj == NULL) {
183 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 }
186 else {
187 Py_INCREF(defobj);
188 return defobj;
189 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000191 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192}
193
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000194PyDoc_STRVAR(unicodedata_numeric__doc__,
195"numeric(unichr[, default])\n\
196\n\
197Returns the numeric value assigned to the Unicode character unichr\n\
198as float. If no such value is defined, default is returned, or, if\n\
199not given, ValueError is raised.");
200
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000202unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203{
204 PyUnicodeObject *v;
205 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000206 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000208 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209
Fredrik Lundh06d12682001-01-24 07:59:11 +0000210 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000212 c = getuchar(v);
213 if (c == (Py_UCS4)-1)
214 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000215
Martin v. Löwis1a214512008-06-11 05:26:20 +0000216 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000217 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218 if (old->category_changed == 0) {
219 /* unassigned */
220 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000223 else if (old->decimal_changed != 0xFF) {
224 have_old = 1;
225 rc = old->decimal_changed;
226 }
227 }
228
229 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 if (defobj == NULL) {
233 PyErr_SetString(PyExc_ValueError, "not a numeric character");
234 return NULL;
235 }
236 else {
237 Py_INCREF(defobj);
238 return defobj;
239 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000240 }
241 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242}
243
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000244PyDoc_STRVAR(unicodedata_category__doc__,
245"category(unichr)\n\
246\n\
247Returns the general category assigned to the Unicode character\n\
248unichr as string.");
249
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252{
253 PyUnicodeObject *v;
254 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000255 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256
257 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 &PyUnicode_Type, &v))
259 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000260 c = getuchar(v);
261 if (c == (Py_UCS4)-1)
262 return NULL;
263 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000264 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000265 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (old->category_changed != 0xFF)
267 index = old->category_changed;
268 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270}
271
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000272PyDoc_STRVAR(unicodedata_bidirectional__doc__,
273"bidirectional(unichr)\n\
274\n\
275Returns the bidirectional category assigned to the Unicode character\n\
276unichr as string. If no such value is defined, an empty string is\n\
277returned.");
278
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000279static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000280unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281{
282 PyUnicodeObject *v;
283 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000284 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285
286 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 &PyUnicode_Type, &v))
288 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000289 c = getuchar(v);
290 if (c == (Py_UCS4)-1)
291 return NULL;
292 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000293 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000294 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000295 if (old->category_changed == 0)
296 index = 0; /* unassigned */
297 else if (old->bidir_changed != 0xFF)
298 index = old->bidir_changed;
299 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000300 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000301}
302
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000303PyDoc_STRVAR(unicodedata_combining__doc__,
304"combining(unichr)\n\
305\n\
306Returns the canonical combining class assigned to the Unicode\n\
307character unichr as integer. Returns 0 if no combining class is\n\
308defined.");
309
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000310static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000311unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312{
313 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000314 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000315 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316
317 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 &PyUnicode_Type, &v))
319 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000320 c = getuchar(v);
321 if (c == (Py_UCS4)-1)
322 return NULL;
323 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000324 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000325 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000326 if (old->category_changed == 0)
327 index = 0; /* unassigned */
328 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000329 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000330}
331
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000332PyDoc_STRVAR(unicodedata_mirrored__doc__,
333"mirrored(unichr)\n\
334\n\
335Returns the mirrored property assigned to the Unicode character\n\
336unichr as integer. Returns 1 if the character has been identified as\n\
337a \"mirrored\" character in bidirectional text, 0 otherwise.");
338
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000339static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000340unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341{
342 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000343 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000344 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000345
346 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000347 &PyUnicode_Type, &v))
348 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000349 c = getuchar(v);
350 if (c == (Py_UCS4)-1)
351 return NULL;
352 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000353 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 if (old->category_changed == 0)
356 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000357 else if (old->mirrored_changed != 0xFF)
358 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000359 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000360 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000361}
362
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000363PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
364"east_asian_width(unichr)\n\
365\n\
366Returns the east asian width assigned to the Unicode character\n\
367unichr as string.");
368
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000369static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000370unicodedata_east_asian_width(PyObject *self, PyObject *args)
371{
372 PyUnicodeObject *v;
373 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000374 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000375
376 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 &PyUnicode_Type, &v))
378 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000379 c = getuchar(v);
380 if (c == (Py_UCS4)-1)
381 return NULL;
382 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000383 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000384 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000385 if (old->category_changed == 0)
386 index = 0; /* unassigned */
387 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000388 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000389}
390
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000391PyDoc_STRVAR(unicodedata_decomposition__doc__,
392"decomposition(unichr)\n\
393\n\
394Returns the character decomposition mapping assigned to the Unicode\n\
395character unichr as string. An empty string is returned in case no\n\
396such mapping is defined.");
397
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000398static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000399unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000400{
401 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000402 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000403 int code, index, count;
404 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000405 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000406 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407
408 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000409 &PyUnicode_Type, &v))
410 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000411 c = getuchar(v);
412 if (c == (Py_UCS4)-1)
413 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000414
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000415 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Martin v. Löwis1a214512008-06-11 05:26:20 +0000417 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000418 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000419 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000420 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 }
422
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000423 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000424 index = 0;
425 else {
426 index = decomp_index1[(code>>DECOMP_SHIFT)];
427 index = decomp_index2[(index<<DECOMP_SHIFT)+
428 (code&((1<<DECOMP_SHIFT)-1))];
429 }
430
Tim Peters69b83b12001-11-30 07:23:05 +0000431 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000432 is prefix code (from*/
433 count = decomp_data[index] >> 8;
434
435 /* XXX: could allocate the PyString up front instead
436 (strlen(prefix) + 5 * count + 1 bytes) */
437
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000438 /* Based on how index is calculated above and decomp_data is generated
439 from Tools/unicode/makeunicodedata.py, it should not be possible
440 to overflow decomp_prefix. */
441 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200442 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000443
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000445 i = strlen(decomp_prefix[prefix_index]);
446 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447
448 while (count-- > 0) {
449 if (i)
450 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000451 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000452 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
453 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454 i += strlen(decomp + i);
455 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000456 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000457}
458
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000459static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000460get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000461{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000462 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000463 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000465 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000466 /* unassigned in old version */
467 *index = 0;
468 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000469 else {
470 *index = decomp_index1[(code>>DECOMP_SHIFT)];
471 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
472 (code&((1<<DECOMP_SHIFT)-1))];
473 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474
Martin v. Löwis677bde22002-11-23 22:08:15 +0000475 /* high byte is number of hex bytes (usually one or two), low byte
476 is prefix code (from*/
477 *count = decomp_data[*index] >> 8;
478 *prefix = decomp_data[*index] & 255;
479
480 (*index)++;
481}
482
/* Constants for the arithmetic Hangul syllable (de)composition used by
   the normalization code below. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* base of the leading consonants */
#define VBase   0x1161              /* base of the vowels */
#define TBase   0x11A7              /* base of the trailing consonants;
                                       T == TBase means "no trailing" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of syllables */
492
493static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000494nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000495{
496 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200497 Py_UCS4 *output;
498 Py_ssize_t i, o, osize;
499 int kind;
500 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000501 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200502 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200508 isize = PyUnicode_GET_LENGTH(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000509 /* Overallocate atmost 10 characters. */
510 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200511 osize = space;
512 output = PyMem_Malloc(space * sizeof(Py_UCS4));
513 if (!output) {
514 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000515 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200516 }
517 i = o = 0;
518 kind = PyUnicode_KIND(input);
519 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000520
Martin v. Löwis22970662011-09-29 13:39:38 +0200521 while (i < isize) {
522 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000523 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200524 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000525 /* Hangul Decomposition adds three characters in
526 a single step, so we need atleast that much room. */
527 if (space < 3) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200528 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 space += 10;
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
531 if (output == NULL) {
532 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000533 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200534 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000535 }
536 /* Hangul Decomposition. */
537 if (SBase <= code && code < (SBase+SCount)) {
538 int SIndex = code - SBase;
539 int L = LBase + SIndex / NCount;
540 int V = VBase + (SIndex % NCount) / TCount;
541 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200542 output[o++] = L;
543 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000544 space -= 2;
545 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200546 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000547 space --;
548 }
549 continue;
550 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000551 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000552 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000553 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
554 if (value != 0) {
555 stack[stackptr++] = value;
556 continue;
557 }
558 }
559
560 /* Other decompositions. */
561 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000562
563 /* Copy character if it is not decomposable, or has a
564 compatibility decomposition, but we do NFD. */
565 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200566 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000567 space--;
568 continue;
569 }
570 /* Copy decomposition onto the stack, in reverse
571 order. */
572 while(count) {
573 code = decomp_data[index + (--count)];
574 stack[stackptr++] = code;
575 }
576 }
577 }
578
Martin v. Löwis22970662011-09-29 13:39:38 +0200579 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
580 output, o);
581 PyMem_Free(output);
582 if (!result)
583 return NULL;
584 /* result is guaranteed to be ready, as it is compact. */
585 kind = PyUnicode_KIND(result);
586 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000587
588 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200589 i = 0;
590 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
591 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
592 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000593 if (prev == 0 || cur == 0 || prev <= cur) {
594 prev = cur;
595 continue;
596 }
597 /* Non-canonical order. Need to switch *i with previous. */
598 o = i - 1;
599 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200600 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
601 PyUnicode_WRITE(kind, data, o+1,
602 PyUnicode_READ(kind, data, o));
603 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000604 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200605 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000606 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200607 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608 if (prev == 0 || prev <= cur)
609 break;
610 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200611 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000612 }
613 return result;
614}
615
616static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200617find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000618{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200619 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200621 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622 if (code < start)
623 return -1;
624 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200625 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000626 return nfc[index].index + delta;
627 }
628 }
629 return -1;
630}
631
632static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000633nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634{
635 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200636 int kind;
637 void *data;
638 Py_UCS4 *output;
639 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200641 Py_UCS4 code;
642 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000643 int cskipped = 0;
644
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000645 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646 if (!result)
647 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200648 /* result will be "ready". */
649 kind = PyUnicode_KIND(result);
650 data = PyUnicode_DATA(result);
651 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652
Martin v. Löwis22970662011-09-29 13:39:38 +0200653 /* We allocate a buffer for the output.
654 If we find that we made no changes, we still return
655 the NFD result. */
656 output = PyMem_Malloc(len * sizeof(Py_UCS4));
657 if (!output) {
658 PyErr_NoMemory();
659 Py_DECREF(result);
660 return 0;
661 }
662 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663
Martin v. Löwis677bde22002-11-23 22:08:15 +0000664 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200665 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666 for (index = 0; index < cskipped; index++) {
667 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000668 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000669 Remove from list. */
670 skipped[index] = skipped[cskipped-1];
671 cskipped--;
672 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000673 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000674 }
675 }
676 /* Hangul Composition. We don't need to check for <LV,T>
677 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200678 code = PyUnicode_READ(kind, data, i);
679 if (LBase <= code && code < (LBase+LCount) &&
680 i + 1 < len &&
681 VBase <= PyUnicode_READ(kind, data, i+1) &&
682 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000683 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200684 LIndex = code - LBase;
685 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000686 code = SBase + (LIndex*VCount+VIndex)*TCount;
687 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200688 if (i < len &&
689 TBase <= PyUnicode_READ(kind, data, i) &&
690 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
691 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 i++;
693 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200694 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000695 continue;
696 }
697
Martin v. Löwis22970662011-09-29 13:39:38 +0200698 /* code is still input[i] here */
699 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000700 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200701 output[o++] = code;
702 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000703 continue;
704 }
705 /* Find next unblocked character. */
706 i1 = i+1;
707 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200708 /* output base character for now; might be updated later. */
709 output[o] = PyUnicode_READ(kind, data, i);
710 while (i1 < len) {
711 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
712 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000713 if (comb) {
714 if (comb1 == 0)
715 break;
716 if (comb >= comb1) {
717 /* Character is blocked. */
718 i1++;
719 continue;
720 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200722 l = find_nfc_index(self, nfc_last, code1);
723 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724 is a starter, we don't need to look further.
725 Otherwise, record the combining class. */
726 if (l == -1) {
727 not_combinable:
728 if (comb1 == 0)
729 break;
730 comb = comb1;
731 i1++;
732 continue;
733 }
734 index = f*TOTAL_LAST + l;
735 index1 = comp_index[index >> COMP_SHIFT];
736 code = comp_data[(index1<<COMP_SHIFT)+
737 (index&((1<<COMP_SHIFT)-1))];
738 if (code == 0)
739 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740
Martin v. Löwis677bde22002-11-23 22:08:15 +0000741 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200742 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000744 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 skipped[cskipped++] = i1;
746 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200747 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000748 if (f == -1)
749 break;
750 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200751 /* Output character was already written.
752 Just advance the indices. */
753 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200755 if (o == len) {
756 /* No changes. Return original string. */
757 PyMem_Free(output);
758 return result;
759 }
760 Py_DECREF(result);
761 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
762 output, o);
763 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000764 return result;
765}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000766
767/* Return 1 if the input is certainly normalized, 0 if it might not be. */
768static int
769is_normalized(PyObject *self, PyObject *input, int nfc, int k)
770{
Martin v. Löwis22970662011-09-29 13:39:38 +0200771 Py_ssize_t i, len;
772 int kind;
773 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000774 unsigned char prev_combining = 0, quickcheck_mask;
775
776 /* An older version of the database is requested, quickchecks must be
777 disabled. */
778 if (self && UCD_Check(self))
779 return 0;
780
781 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
782 as described in http://unicode.org/reports/tr15/#Annex8. */
783 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
784
Martin v. Löwis22970662011-09-29 13:39:38 +0200785 i = 0;
786 kind = PyUnicode_KIND(input);
787 data = PyUnicode_DATA(input);
788 len = PyUnicode_GET_LENGTH(input);
789 while (i < len) {
790 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
791 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000792 unsigned char combining = record->combining;
793 unsigned char quickcheck = record->normalization_quick_check;
794
795 if (quickcheck & quickcheck_mask)
796 return 0; /* this string might need normalization */
797 if (combining && prev_combining > combining)
798 return 0; /* non-canonical sort order, not normalized */
799 prev_combining = combining;
800 }
801 return 1; /* certainly normalized */
802}
803
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000804PyDoc_STRVAR(unicodedata_normalize__doc__,
805"normalize(form, unistr)\n\
806\n\
807Return the normal form 'form' for the Unicode string unistr. Valid\n\
808values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
809
Martin v. Löwis677bde22002-11-23 22:08:15 +0000810static PyObject*
811unicodedata_normalize(PyObject *self, PyObject *args)
812{
813 char *form;
814 PyObject *input;
815
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000816 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000817 &form, &PyUnicode_Type, &input))
818 return NULL;
819
Martin v. Löwis22970662011-09-29 13:39:38 +0200820 if (PyUnicode_READY(input) == -1)
821 return NULL;
822
823 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000824 /* Special case empty input strings, since resizing
825 them later would cause internal errors. */
826 Py_INCREF(input);
827 return input;
828 }
829
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000830 if (strcmp(form, "NFC") == 0) {
831 if (is_normalized(self, input, 1, 0)) {
832 Py_INCREF(input);
833 return input;
834 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000835 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000836 }
837 if (strcmp(form, "NFKC") == 0) {
838 if (is_normalized(self, input, 1, 1)) {
839 Py_INCREF(input);
840 return input;
841 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000842 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000843 }
844 if (strcmp(form, "NFD") == 0) {
845 if (is_normalized(self, input, 0, 0)) {
846 Py_INCREF(input);
847 return input;
848 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000849 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000850 }
851 if (strcmp(form, "NFKD") == 0) {
852 if (is_normalized(self, input, 0, 1)) {
853 Py_INCREF(input);
854 return input;
855 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000856 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000857 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000858 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
859 return NULL;
860}
861
Fredrik Lundh06d12682001-01-24 07:59:11 +0000862/* -------------------------------------------------------------------- */
863/* unicode character name tables */
864
865/* data file generated by Tools/unicode/makeunicodedata.py */
866#include "unicodename_db.h"
867
868/* -------------------------------------------------------------------- */
869/* database code (cut and pasted from the unidb package) */
870
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased before being mixed in with multiplier scale;
   whenever bits 24..31 become set they are folded back into the low byte
   and the value is masked to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int pos;
    unsigned long hash = 0;

    for (pos = 0; pos < len; pos++) {
        unsigned long high;

        hash = (hash * scale) +
               (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
885
/* Name fragments used to build and parse Hangul syllable names.
   Each row holds, by column: [0] leading consonant, [1] vowel,
   [2] trailing consonant.  The columns have different lengths; unused
   cells are null pointers.  An empty string is a valid (silent) fragment. */
static char *hangul_syllables[][3] = {
    /* lead   vowel   trail */
    { "G",    "A",    ""   },
    { "GG",   "AE",   "G"  },
    { "N",    "YA",   "GG" },
    { "D",    "YAE",  "GS" },
    { "DD",   "EO",   "N"  },
    { "R",    "E",    "NJ" },
    { "M",    "YEO",  "NH" },
    { "B",    "YE",   "D"  },
    { "BB",   "O",    "L"  },
    { "S",    "WA",   "LG" },
    { "SS",   "WAE",  "LM" },
    { "",     "OE",   "LB" },
    { "J",    "YO",   "LS" },
    { "JJ",   "U",    "LT" },
    { "C",    "WEO",  "LP" },
    { "K",    "WE",   "LH" },
    { "T",    "WI",   "M"  },
    { "P",    "YU",   "B"  },
    { "H",    "EU",   "BS" },
    { 0,      "YI",   "S"  },
    { 0,      "I",    "SS" },
    { 0,      0,      "NG" },
    { 0,      0,      "J"  },
    { 0,      0,      "C"  },
    { 0,      0,      "K"  },
    { 0,      0,      "T"  },
    { 0,      0,      "P"  },
    { 0,      0,      "H"  }
};
916
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000917/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000918static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000919is_unified_ideograph(Py_UCS4 code)
920{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000921 return
922 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
923 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
924 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
925 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
926 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000927}
928
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is fully parenthesized so that expressions such as
 * IS_ALIAS(a | b) are evaluated as a whole; note that it is still
 * evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
934
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000935static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300936_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
937 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000938{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300939 /* Find the name associated with the given codepoint.
940 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
941 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000942 int offset;
943 int i;
944 int word;
945 unsigned char* w;
946
Martin v. Löwisc3509122006-03-11 12:16:23 +0000947 if (code >= 0x110000)
948 return 0;
949
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300950 /* XXX should we just skip all the codepoints in the PUAs here? */
951 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
952 return 0;
953
Martin v. Löwis1a214512008-06-11 05:26:20 +0000954 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300955 /* in 3.2.0 there are no aliases and named sequences */
956 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
957 return 0;
Martin v. Löwisc3509122006-03-11 12:16:23 +0000958 const change_record *old = get_old_record(self, code);
959 if (old->category_changed == 0) {
960 /* unassigned */
961 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000962 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000963 }
964
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000965 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000966 /* Hangul syllable. */
967 int SIndex = code - SBase;
968 int L = SIndex / NCount;
969 int V = (SIndex % NCount) / TCount;
970 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000971
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000972 if (buflen < 27)
973 /* Worst case: HANGUL SYLLABLE <10chars>. */
974 return 0;
975 strcpy(buffer, "HANGUL SYLLABLE ");
976 buffer += 16;
977 strcpy(buffer, hangul_syllables[L][0]);
978 buffer += strlen(hangul_syllables[L][0]);
979 strcpy(buffer, hangul_syllables[V][1]);
980 buffer += strlen(hangul_syllables[V][1]);
981 strcpy(buffer, hangul_syllables[T][2]);
982 buffer += strlen(hangul_syllables[T][2]);
983 *buffer = '\0';
984 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000985 }
986
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000987 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000988 if (buflen < 28)
989 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
990 return 0;
991 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
992 return 1;
993 }
994
Fredrik Lundh06d12682001-01-24 07:59:11 +0000995 /* get offset into phrasebook */
996 offset = phrasebook_offset1[(code>>phrasebook_shift)];
997 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
998 (code&((1<<phrasebook_shift)-1))];
999 if (!offset)
1000 return 0;
1001
1002 i = 0;
1003
1004 for (;;) {
1005 /* get word index */
1006 word = phrasebook[offset] - phrasebook_short;
1007 if (word >= 0) {
1008 word = (word << 8) + phrasebook[offset+1];
1009 offset += 2;
1010 } else
1011 word = phrasebook[offset++];
1012 if (i) {
1013 if (i > buflen)
1014 return 0; /* buffer overflow */
1015 buffer[i++] = ' ';
1016 }
1017 /* copy word string from lexicon. the last character in the
1018 word has bit 7 set. the last word in a string ends with
1019 0x80 */
1020 w = lexicon + lexicon_offset[word];
1021 while (*w < 128) {
1022 if (i >= buflen)
1023 return 0; /* buffer overflow */
1024 buffer[i++] = *w++;
1025 }
1026 if (i >= buflen)
1027 return 0; /* buffer overflow */
1028 buffer[i++] = *w & 127;
1029 if (*w == 128)
1030 break; /* end of word */
1031 }
1032
1033 return 1;
1034}
1035
1036static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001037_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001038{
1039 /* check if code corresponds to the given name */
1040 int i;
1041 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001042 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001043 return 0;
1044 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001045 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001046 return 0;
1047 }
1048 return buffer[namelen] == '\0';
1049}
1050
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001052find_syllable(const char *str, int *len, int *pos, int count, int column)
1053{
1054 int i, len1;
1055 *len = -1;
1056 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001058 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001059 if (len1 <= *len)
1060 continue;
1061 if (strncmp(str, s, len1) == 0) {
1062 *len = len1;
1063 *pos = i;
1064 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001065 }
1066 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001067 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001068 }
1069}
1070
Fredrik Lundh06d12682001-01-24 07:59:11 +00001071static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001072_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001073{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001074 /* check if named sequences are allowed */
1075 if (!with_named_seq && IS_NAMED_SEQ(cp))
1076 return 0;
1077 /* if the codepoint is in the PUA range that we use for aliases,
1078 * convert it to obtain the right codepoint */
1079 if (IS_ALIAS(cp))
1080 *code = name_aliases[cp-aliases_start];
1081 else
1082 *code = cp;
1083 return 1;
1084}
1085
1086static int
1087_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1088 int with_named_seq)
1089{
1090 /* Return the codepoint associated with the given name.
1091 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1092 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1093 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001094 unsigned int h, v;
1095 unsigned int mask = code_size-1;
1096 unsigned int i, incr;
1097
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001098 /* Check for hangul syllables. */
1099 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001100 int len, L = -1, V = -1, T = -1;
1101 const char *pos = name + 16;
1102 find_syllable(pos, &len, &L, LCount, 0);
1103 pos += len;
1104 find_syllable(pos, &len, &V, VCount, 1);
1105 pos += len;
1106 find_syllable(pos, &len, &T, TCount, 2);
1107 pos += len;
1108 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1109 *code = SBase + (L*VCount+V)*TCount + T;
1110 return 1;
1111 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001112 /* Otherwise, it's an illegal syllable name. */
1113 return 0;
1114 }
1115
1116 /* Check for unified ideographs. */
1117 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1118 /* Four or five hexdigits must follow. */
1119 v = 0;
1120 name += 22;
1121 namelen -= 22;
1122 if (namelen != 4 && namelen != 5)
1123 return 0;
1124 while (namelen--) {
1125 v *= 16;
1126 if (*name >= '0' && *name <= '9')
1127 v += *name - '0';
1128 else if (*name >= 'A' && *name <= 'F')
1129 v += *name - 'A' + 10;
1130 else
1131 return 0;
1132 name++;
1133 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001134 if (!is_unified_ideograph(v))
1135 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001136 *code = v;
1137 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001138 }
1139
Fredrik Lundh06d12682001-01-24 07:59:11 +00001140 /* the following is the same as python's dictionary lookup, with
1141 only minor changes. see the makeunicodedata script for more
1142 details */
1143
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001144 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001145 i = (~h) & mask;
1146 v = code_hash[i];
1147 if (!v)
1148 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001149 if (_cmpname(self, v, name, namelen))
1150 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001151 incr = (h ^ (h >> 3)) & mask;
1152 if (!incr)
1153 incr = mask;
1154 for (;;) {
1155 i = (i + incr) & mask;
1156 v = code_hash[i];
1157 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001158 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001159 if (_cmpname(self, v, name, namelen))
1160 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001161 incr = incr << 1;
1162 if (incr > mask)
1163 incr = incr ^ code_poly;
1164 }
1165}
1166
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001167static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001168{
1169 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001170 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001171 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001172};
1173
1174/* -------------------------------------------------------------------- */
1175/* Python bindings */
1176
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001177PyDoc_STRVAR(unicodedata_name__doc__,
1178"name(unichr[, default])\n\
1179Returns the name assigned to the Unicode character unichr as a\n\
1180string. If no name is defined, default is returned, or, if not\n\
1181given, ValueError is raised.");
1182
Fredrik Lundh06d12682001-01-24 07:59:11 +00001183static PyObject *
1184unicodedata_name(PyObject* self, PyObject* args)
1185{
1186 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001187 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001188
1189 PyUnicodeObject* v;
1190 PyObject* defobj = NULL;
1191 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1192 return NULL;
1193
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001194 c = getuchar(v);
1195 if (c == (Py_UCS4)-1)
1196 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001197
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001198 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001199 if (defobj == NULL) {
1200 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001201 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001202 }
1203 else {
1204 Py_INCREF(defobj);
1205 return defobj;
1206 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001207 }
1208
Walter Dörwald4254e762007-06-05 16:04:09 +00001209 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001210}
1211
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001212PyDoc_STRVAR(unicodedata_lookup__doc__,
1213"lookup(name)\n\
1214\n\
1215Look up character by name. If a character with the\n\
1216given name is found, return the corresponding Unicode\n\
1217character. If not found, KeyError is raised.");
1218
Fredrik Lundh06d12682001-01-24 07:59:11 +00001219static PyObject *
1220unicodedata_lookup(PyObject* self, PyObject* args)
1221{
1222 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001223
1224 char* name;
1225 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001226 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001227 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1228 return NULL;
1229
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001230 if (!_getcode(self, name, namelen, &code, 1)) {
1231 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001232 return NULL;
1233 }
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001234 // check if code is in the PUA range that we use for named sequences
1235 // and convert it
1236 if (IS_NAMED_SEQ(code)) {
1237 index = code-named_sequences_start;
1238 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1239 named_sequences[index].seq,
1240 named_sequences[index].seqlen);
1241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001243}
1244
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001245/* XXX Add doc strings. */
1246
1247static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001248 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1249 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1250 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1251 {"category", unicodedata_category, METH_VARARGS,
1252 unicodedata_category__doc__},
1253 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1254 unicodedata_bidirectional__doc__},
1255 {"combining", unicodedata_combining, METH_VARARGS,
1256 unicodedata_combining__doc__},
1257 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1258 unicodedata_mirrored__doc__},
1259 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1260 unicodedata_east_asian_width__doc__},
1261 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1262 unicodedata_decomposition__doc__},
1263 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1264 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1265 {"normalize", unicodedata_normalize, METH_VARARGS,
1266 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001267 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001268};
1269
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001270static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 /* The ob_type field must be initialized in the module init function
1272 * to be portable to Windows without using C++. */
1273 PyVarObject_HEAD_INIT(NULL, 0)
1274 "unicodedata.UCD", /*tp_name*/
1275 sizeof(PreviousDBVersion), /*tp_basicsize*/
1276 0, /*tp_itemsize*/
1277 /* methods */
1278 (destructor)PyObject_Del, /*tp_dealloc*/
1279 0, /*tp_print*/
1280 0, /*tp_getattr*/
1281 0, /*tp_setattr*/
1282 0, /*tp_reserved*/
1283 0, /*tp_repr*/
1284 0, /*tp_as_number*/
1285 0, /*tp_as_sequence*/
1286 0, /*tp_as_mapping*/
1287 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001288 0, /*tp_call*/
1289 0, /*tp_str*/
1290 PyObject_GenericGetAttr,/*tp_getattro*/
1291 0, /*tp_setattro*/
1292 0, /*tp_as_buffer*/
1293 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1294 0, /*tp_doc*/
1295 0, /*tp_traverse*/
1296 0, /*tp_clear*/
1297 0, /*tp_richcompare*/
1298 0, /*tp_weaklistoffset*/
1299 0, /*tp_iter*/
1300 0, /*tp_iternext*/
1301 unicodedata_functions, /*tp_methods*/
1302 DB_members, /*tp_members*/
1303 0, /*tp_getset*/
1304 0, /*tp_base*/
1305 0, /*tp_dict*/
1306 0, /*tp_descr_get*/
1307 0, /*tp_descr_set*/
1308 0, /*tp_dictoffset*/
1309 0, /*tp_init*/
1310 0, /*tp_alloc*/
1311 0, /*tp_new*/
1312 0, /*tp_free*/
1313 0, /*tp_is_gc*/
1314};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001315
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001316PyDoc_STRVAR(unicodedata_docstring,
1317"This module provides access to the Unicode Character Database which\n\
1318defines character properties for all Unicode characters. The data in\n\
1319this database is based on the UnicodeData.txt file version\n\
Ezio Melotti4c5475d2010-03-22 23:16:42 +000013205.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001321\n\
1322The module uses the same names and symbols as defined by the\n\
Ezio Melottid96b2f22010-03-23 00:39:22 +00001323UnicodeData File Format 5.2.0 (see\n\
1324http://www.unicode.org/reports/tr44/tr44-4.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001325
Martin v. Löwis1a214512008-06-11 05:26:20 +00001326
1327static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 PyModuleDef_HEAD_INIT,
1329 "unicodedata",
1330 unicodedata_docstring,
1331 -1,
1332 unicodedata_functions,
1333 NULL,
1334 NULL,
1335 NULL,
1336 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001337};
1338
Mark Hammond62b1ab12002-07-23 06:31:15 +00001339PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001340PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001341{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001342 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001343
Christian Heimes90aa7642007-12-19 02:45:37 +00001344 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001345
Martin v. Löwis1a214512008-06-11 05:26:20 +00001346 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001347 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001348 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001349
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001350 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001351 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001352 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001353
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001354 /* Previous versions */
1355 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1356 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001357 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001358
Fredrik Lundh06d12682001-01-24 07:59:11 +00001359 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001360 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001361 if (v != NULL)
1362 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001363 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001364}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001365
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001367Local variables:
1368c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001369indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001370End:
1371*/