blob: 5032d42b6a906cfb4b965de8ab7962d96d01e708 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.2 data base.

   Data was extracted from the Unicode 5.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* Per-character property record.  Several members are small indices into
   name tables rather than the property values themselves. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Per-character difference against a previous database version.

   The order of the members should be the same as in merge_old_version.
   Readers in this file treat 0xFF in an unsigned char member as "no
   override", and category_changed == 0 as "unassigned in the old
   version". */
typedef struct change_record {
    const unsigned char bidir_changed;      /* old bidirectional index */
    const unsigned char category_changed;   /* old category index */
    const unsigned char decimal_changed;    /* old decimal value */
    const unsigned char mirrored_changed;   /* old mirrored flag */
    const double numeric_changed;           /* old numeric value */
} change_record;
41
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000042/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000046_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000049 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000059/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
/* Look up the change record of code point v through the getrecord
   callback of self, which must point to a PreviousDBVersion.  The self
   argument is parenthesized so that the cast applies to the whole
   argument expression. */
#define get_old_record(self, v) ((((PreviousDBVersion*)(self))->getrecord)(v))
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 {NULL}
72};
73
Thomas Wouters89f507f2006-12-13 04:49:30 +000074/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000075static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000076#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000077
78static PyObject*
79new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
80 Py_UCS4 (*normalization)(Py_UCS4))
81{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 PreviousDBVersion *self;
83 self = PyObject_New(PreviousDBVersion, &UCD_Type);
84 if (self == NULL)
85 return NULL;
86 self->name = name;
87 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090}
91
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000092
93static Py_UCS4 getuchar(PyUnicodeObject *obj)
94{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020095 if (PyUnicode_READY(obj))
96 return (Py_UCS4)-1;
97 if (PyUnicode_GET_LENGTH(obj) == 1) {
98 if (PyUnicode_READY(obj))
99 return (Py_UCS4)-1;
100 return PyUnicode_READ_CHAR(obj, 0);
101 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000102 PyErr_SetString(PyExc_TypeError,
103 "need a single Unicode character as parameter");
104 return (Py_UCS4)-1;
105}
106
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000107/* --- Module API --------------------------------------------------------- */
108
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000109PyDoc_STRVAR(unicodedata_decimal__doc__,
110"decimal(unichr[, default])\n\
111\n\
112Returns the decimal value assigned to the Unicode character unichr\n\
113as integer. If no such value is defined, default is returned, or, if\n\
114not given, ValueError is raised.");
115
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000116static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118{
119 PyUnicodeObject *v;
120 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000121 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000123 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124
Fredrik Lundh06d12682001-01-24 07:59:11 +0000125 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000126 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000127 c = getuchar(v);
128 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000130
Martin v. Löwis1a214512008-06-11 05:26:20 +0000131 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000132 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133 if (old->category_changed == 0) {
134 /* unassigned */
135 have_old = 1;
136 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000138 else if (old->decimal_changed != 0xFF) {
139 have_old = 1;
140 rc = old->decimal_changed;
141 }
142 }
143
144 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000145 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000146 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 if (defobj == NULL) {
148 PyErr_SetString(PyExc_ValueError,
149 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000150 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 }
152 else {
153 Py_INCREF(defobj);
154 return defobj;
155 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000157 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158}
159
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000160PyDoc_STRVAR(unicodedata_digit__doc__,
161"digit(unichr[, default])\n\
162\n\
163Returns the digit value assigned to the Unicode character unichr as\n\
164integer. If no such value is defined, default is returned, or, if\n\
165not given, ValueError is raised.");
166
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000167static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169{
170 PyUnicodeObject *v;
171 PyObject *defobj = NULL;
172 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000173 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174
Fredrik Lundh06d12682001-01-24 07:59:11 +0000175 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000177 c = getuchar(v);
178 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 if (defobj == NULL) {
183 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 }
186 else {
187 Py_INCREF(defobj);
188 return defobj;
189 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000191 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192}
193
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000194PyDoc_STRVAR(unicodedata_numeric__doc__,
195"numeric(unichr[, default])\n\
196\n\
197Returns the numeric value assigned to the Unicode character unichr\n\
198as float. If no such value is defined, default is returned, or, if\n\
199not given, ValueError is raised.");
200
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000202unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203{
204 PyUnicodeObject *v;
205 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000206 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000208 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209
Fredrik Lundh06d12682001-01-24 07:59:11 +0000210 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000212 c = getuchar(v);
213 if (c == (Py_UCS4)-1)
214 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000215
Martin v. Löwis1a214512008-06-11 05:26:20 +0000216 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000217 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218 if (old->category_changed == 0) {
219 /* unassigned */
220 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000223 else if (old->decimal_changed != 0xFF) {
224 have_old = 1;
225 rc = old->decimal_changed;
226 }
227 }
228
229 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 if (defobj == NULL) {
233 PyErr_SetString(PyExc_ValueError, "not a numeric character");
234 return NULL;
235 }
236 else {
237 Py_INCREF(defobj);
238 return defobj;
239 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000240 }
241 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242}
243
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000244PyDoc_STRVAR(unicodedata_category__doc__,
245"category(unichr)\n\
246\n\
247Returns the general category assigned to the Unicode character\n\
248unichr as string.");
249
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252{
253 PyUnicodeObject *v;
254 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000255 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256
257 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 &PyUnicode_Type, &v))
259 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000260 c = getuchar(v);
261 if (c == (Py_UCS4)-1)
262 return NULL;
263 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000264 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000265 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (old->category_changed != 0xFF)
267 index = old->category_changed;
268 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270}
271
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000272PyDoc_STRVAR(unicodedata_bidirectional__doc__,
273"bidirectional(unichr)\n\
274\n\
275Returns the bidirectional category assigned to the Unicode character\n\
276unichr as string. If no such value is defined, an empty string is\n\
277returned.");
278
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000279static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000280unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281{
282 PyUnicodeObject *v;
283 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000284 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285
286 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 &PyUnicode_Type, &v))
288 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000289 c = getuchar(v);
290 if (c == (Py_UCS4)-1)
291 return NULL;
292 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000293 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000294 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000295 if (old->category_changed == 0)
296 index = 0; /* unassigned */
297 else if (old->bidir_changed != 0xFF)
298 index = old->bidir_changed;
299 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000300 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000301}
302
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000303PyDoc_STRVAR(unicodedata_combining__doc__,
304"combining(unichr)\n\
305\n\
306Returns the canonical combining class assigned to the Unicode\n\
307character unichr as integer. Returns 0 if no combining class is\n\
308defined.");
309
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000310static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000311unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312{
313 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000314 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000315 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316
317 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 &PyUnicode_Type, &v))
319 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000320 c = getuchar(v);
321 if (c == (Py_UCS4)-1)
322 return NULL;
323 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000324 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000325 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000326 if (old->category_changed == 0)
327 index = 0; /* unassigned */
328 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000329 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000330}
331
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000332PyDoc_STRVAR(unicodedata_mirrored__doc__,
333"mirrored(unichr)\n\
334\n\
335Returns the mirrored property assigned to the Unicode character\n\
336unichr as integer. Returns 1 if the character has been identified as\n\
337a \"mirrored\" character in bidirectional text, 0 otherwise.");
338
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000339static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000340unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341{
342 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000343 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000344 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000345
346 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000347 &PyUnicode_Type, &v))
348 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000349 c = getuchar(v);
350 if (c == (Py_UCS4)-1)
351 return NULL;
352 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000353 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 if (old->category_changed == 0)
356 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000357 else if (old->mirrored_changed != 0xFF)
358 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000359 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000360 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000361}
362
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000363PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
364"east_asian_width(unichr)\n\
365\n\
366Returns the east asian width assigned to the Unicode character\n\
367unichr as string.");
368
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000369static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000370unicodedata_east_asian_width(PyObject *self, PyObject *args)
371{
372 PyUnicodeObject *v;
373 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000374 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000375
376 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 &PyUnicode_Type, &v))
378 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000379 c = getuchar(v);
380 if (c == (Py_UCS4)-1)
381 return NULL;
382 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000383 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000384 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000385 if (old->category_changed == 0)
386 index = 0; /* unassigned */
387 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000388 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000389}
390
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000391PyDoc_STRVAR(unicodedata_decomposition__doc__,
392"decomposition(unichr)\n\
393\n\
394Returns the character decomposition mapping assigned to the Unicode\n\
395character unichr as string. An empty string is returned in case no\n\
396such mapping is defined.");
397
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000398static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000399unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000400{
401 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000402 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000403 int code, index, count;
404 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000405 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000406 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407
408 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000409 &PyUnicode_Type, &v))
410 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000411 c = getuchar(v);
412 if (c == (Py_UCS4)-1)
413 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000414
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000415 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Martin v. Löwis1a214512008-06-11 05:26:20 +0000417 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000418 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000419 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000420 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 }
422
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000423 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000424 index = 0;
425 else {
426 index = decomp_index1[(code>>DECOMP_SHIFT)];
427 index = decomp_index2[(index<<DECOMP_SHIFT)+
428 (code&((1<<DECOMP_SHIFT)-1))];
429 }
430
Tim Peters69b83b12001-11-30 07:23:05 +0000431 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000432 is prefix code (from*/
433 count = decomp_data[index] >> 8;
434
435 /* XXX: could allocate the PyString up front instead
436 (strlen(prefix) + 5 * count + 1 bytes) */
437
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000438 /* Based on how index is calculated above and decomp_data is generated
439 from Tools/unicode/makeunicodedata.py, it should not be possible
440 to overflow decomp_prefix. */
441 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200442 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000443
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000445 i = strlen(decomp_prefix[prefix_index]);
446 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447
448 while (count-- > 0) {
449 if (i)
450 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000451 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000452 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
453 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454 i += strlen(decomp + i);
455 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000456 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000457}
458
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000459static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000460get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000461{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000462 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000463 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000465 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000466 /* unassigned in old version */
467 *index = 0;
468 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000469 else {
470 *index = decomp_index1[(code>>DECOMP_SHIFT)];
471 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
472 (code&((1<<DECOMP_SHIFT)-1))];
473 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474
Martin v. Löwis677bde22002-11-23 22:08:15 +0000475 /* high byte is number of hex bytes (usually one or two), low byte
476 is prefix code (from*/
477 *count = decomp_data[*index] >> 8;
478 *prefix = decomp_data[*index] & 255;
479
480 (*index)++;
481}
482
/* Constants for the arithmetic Hangul syllable (de)composition used by
   nfd_nfkd and nfc_nfkc. */
#define SBase   0xAC00              /* first syllable code point */
#define LBase   0x1100              /* base of the L component */
#define VBase   0x1161              /* base of the V component */
#define TBase   0x11A7              /* base of the T component */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* V/T combinations per L */
#define SCount  (LCount*NCount)     /* size of the syllable range */
492
493static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000494nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000495{
496 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200497 Py_UCS4 *output;
498 Py_ssize_t i, o, osize;
499 int kind;
500 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000501 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200502 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200508 isize = PyUnicode_GET_LENGTH(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000509 /* Overallocate atmost 10 characters. */
510 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200511 osize = space;
512 output = PyMem_Malloc(space * sizeof(Py_UCS4));
513 if (!output) {
514 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000515 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200516 }
517 i = o = 0;
518 kind = PyUnicode_KIND(input);
519 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000520
Martin v. Löwis22970662011-09-29 13:39:38 +0200521 while (i < isize) {
522 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000523 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200524 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000525 /* Hangul Decomposition adds three characters in
526 a single step, so we need atleast that much room. */
527 if (space < 3) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200528 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 space += 10;
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
531 if (output == NULL) {
532 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000533 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200534 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000535 }
536 /* Hangul Decomposition. */
537 if (SBase <= code && code < (SBase+SCount)) {
538 int SIndex = code - SBase;
539 int L = LBase + SIndex / NCount;
540 int V = VBase + (SIndex % NCount) / TCount;
541 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200542 output[o++] = L;
543 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000544 space -= 2;
545 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200546 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000547 space --;
548 }
549 continue;
550 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000551 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000552 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000553 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
554 if (value != 0) {
555 stack[stackptr++] = value;
556 continue;
557 }
558 }
559
560 /* Other decompositions. */
561 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000562
563 /* Copy character if it is not decomposable, or has a
564 compatibility decomposition, but we do NFD. */
565 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200566 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000567 space--;
568 continue;
569 }
570 /* Copy decomposition onto the stack, in reverse
571 order. */
572 while(count) {
573 code = decomp_data[index + (--count)];
574 stack[stackptr++] = code;
575 }
576 }
577 }
578
Martin v. Löwis22970662011-09-29 13:39:38 +0200579 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
580 output, o);
581 PyMem_Free(output);
582 if (!result)
583 return NULL;
584 /* result is guaranteed to be ready, as it is compact. */
585 kind = PyUnicode_KIND(result);
586 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000587
588 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200589 i = 0;
590 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
591 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
592 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000593 if (prev == 0 || cur == 0 || prev <= cur) {
594 prev = cur;
595 continue;
596 }
597 /* Non-canonical order. Need to switch *i with previous. */
598 o = i - 1;
599 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200600 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
601 PyUnicode_WRITE(kind, data, o+1,
602 PyUnicode_READ(kind, data, o));
603 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000604 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200605 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000606 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200607 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608 if (prev == 0 || prev <= cur)
609 break;
610 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200611 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000612 }
613 return result;
614}
615
616static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200617find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000618{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200619 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200621 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622 if (code < start)
623 return -1;
624 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200625 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000626 return nfc[index].index + delta;
627 }
628 }
629 return -1;
630}
631
632static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000633nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634{
635 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200636 int kind;
637 void *data;
638 Py_UCS4 *output;
639 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200641 Py_UCS4 code;
642 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000643 int cskipped = 0;
644
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000645 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646 if (!result)
647 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200648 /* result will be "ready". */
649 kind = PyUnicode_KIND(result);
650 data = PyUnicode_DATA(result);
651 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652
Martin v. Löwis22970662011-09-29 13:39:38 +0200653 /* We allocate a buffer for the output.
654 If we find that we made no changes, we still return
655 the NFD result. */
656 output = PyMem_Malloc(len * sizeof(Py_UCS4));
657 if (!output) {
658 PyErr_NoMemory();
659 Py_DECREF(result);
660 return 0;
661 }
662 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663
Martin v. Löwis677bde22002-11-23 22:08:15 +0000664 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200665 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666 for (index = 0; index < cskipped; index++) {
667 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000668 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000669 Remove from list. */
670 skipped[index] = skipped[cskipped-1];
671 cskipped--;
672 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000673 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000674 }
675 }
676 /* Hangul Composition. We don't need to check for <LV,T>
677 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200678 code = PyUnicode_READ(kind, data, i);
679 if (LBase <= code && code < (LBase+LCount) &&
680 i + 1 < len &&
681 VBase <= PyUnicode_READ(kind, data, i+1) &&
682 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000683 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200684 LIndex = code - LBase;
685 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000686 code = SBase + (LIndex*VCount+VIndex)*TCount;
687 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200688 if (i < len &&
689 TBase <= PyUnicode_READ(kind, data, i) &&
690 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
691 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 i++;
693 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200694 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000695 continue;
696 }
697
Martin v. Löwis22970662011-09-29 13:39:38 +0200698 /* code is still input[i] here */
699 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000700 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200701 output[o++] = code;
702 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000703 continue;
704 }
705 /* Find next unblocked character. */
706 i1 = i+1;
707 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200708 /* output base character for now; might be updated later. */
709 output[o] = PyUnicode_READ(kind, data, i);
710 while (i1 < len) {
711 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
712 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000713 if (comb) {
714 if (comb1 == 0)
715 break;
716 if (comb >= comb1) {
717 /* Character is blocked. */
718 i1++;
719 continue;
720 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200722 l = find_nfc_index(self, nfc_last, code1);
723 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724 is a starter, we don't need to look further.
725 Otherwise, record the combining class. */
726 if (l == -1) {
727 not_combinable:
728 if (comb1 == 0)
729 break;
730 comb = comb1;
731 i1++;
732 continue;
733 }
734 index = f*TOTAL_LAST + l;
735 index1 = comp_index[index >> COMP_SHIFT];
736 code = comp_data[(index1<<COMP_SHIFT)+
737 (index&((1<<COMP_SHIFT)-1))];
738 if (code == 0)
739 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740
Martin v. Löwis677bde22002-11-23 22:08:15 +0000741 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200742 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000744 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 skipped[cskipped++] = i1;
746 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200747 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000748 if (f == -1)
749 break;
750 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200751 /* Output character was already written.
752 Just advance the indices. */
753 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200755 if (o == len) {
756 /* No changes. Return original string. */
757 PyMem_Free(output);
758 return result;
759 }
760 Py_DECREF(result);
761 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
762 output, o);
763 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000764 return result;
765}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000766
767/* Return 1 if the input is certainly normalized, 0 if it might not be. */
768static int
769is_normalized(PyObject *self, PyObject *input, int nfc, int k)
770{
Martin v. Löwis22970662011-09-29 13:39:38 +0200771 Py_ssize_t i, len;
772 int kind;
773 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000774 unsigned char prev_combining = 0, quickcheck_mask;
775
776 /* An older version of the database is requested, quickchecks must be
777 disabled. */
778 if (self && UCD_Check(self))
779 return 0;
780
781 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
782 as described in http://unicode.org/reports/tr15/#Annex8. */
783 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
784
Martin v. Löwis22970662011-09-29 13:39:38 +0200785 i = 0;
786 kind = PyUnicode_KIND(input);
787 data = PyUnicode_DATA(input);
788 len = PyUnicode_GET_LENGTH(input);
789 while (i < len) {
790 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
791 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000792 unsigned char combining = record->combining;
793 unsigned char quickcheck = record->normalization_quick_check;
794
795 if (quickcheck & quickcheck_mask)
796 return 0; /* this string might need normalization */
797 if (combining && prev_combining > combining)
798 return 0; /* non-canonical sort order, not normalized */
799 prev_combining = combining;
800 }
801 return 1; /* certainly normalized */
802}
803
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000804PyDoc_STRVAR(unicodedata_normalize__doc__,
805"normalize(form, unistr)\n\
806\n\
807Return the normal form 'form' for the Unicode string unistr. Valid\n\
808values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
809
Martin v. Löwis677bde22002-11-23 22:08:15 +0000810static PyObject*
811unicodedata_normalize(PyObject *self, PyObject *args)
812{
813 char *form;
814 PyObject *input;
815
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000816 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000817 &form, &PyUnicode_Type, &input))
818 return NULL;
819
Martin v. Löwis22970662011-09-29 13:39:38 +0200820 if (PyUnicode_READY(input) == -1)
821 return NULL;
822
823 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000824 /* Special case empty input strings, since resizing
825 them later would cause internal errors. */
826 Py_INCREF(input);
827 return input;
828 }
829
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000830 if (strcmp(form, "NFC") == 0) {
831 if (is_normalized(self, input, 1, 0)) {
832 Py_INCREF(input);
833 return input;
834 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000835 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000836 }
837 if (strcmp(form, "NFKC") == 0) {
838 if (is_normalized(self, input, 1, 1)) {
839 Py_INCREF(input);
840 return input;
841 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000842 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000843 }
844 if (strcmp(form, "NFD") == 0) {
845 if (is_normalized(self, input, 0, 0)) {
846 Py_INCREF(input);
847 return input;
848 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000849 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000850 }
851 if (strcmp(form, "NFKD") == 0) {
852 if (is_normalized(self, input, 0, 1)) {
853 Py_INCREF(input);
854 return input;
855 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000856 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000857 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000858 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
859 return NULL;
860}
861
Fredrik Lundh06d12682001-01-24 07:59:11 +0000862/* -------------------------------------------------------------------- */
863/* unicode character name tables */
864
865/* data file generated by Tools/unicode/makeunicodedata.py */
866#include "unicodename_db.h"
867
868/* -------------------------------------------------------------------- */
869/* database code (cut and pasted from the unidb package) */
870
/* Hash the first len bytes of s, case-insensitively, for the name lookup
   table.  Each byte is upper-cased and mixed in as hash*scale + byte;
   whenever bits 24-31 become non-zero they are folded back into the low
   byte and the value is masked to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top != 0)
            hash = (hash ^ ((top>>24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the leading-consonant index, column 1 by the
   vowel index and column 2 by the trailing-consonant index.  The columns
   have different lengths; slots past the end of a column are NULL. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
916
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000917/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000918static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000919is_unified_ideograph(Py_UCS4 code)
920{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000921 return
922 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
923 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
924 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
925 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
926 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000927}
928
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: [start, end).  The argument is parenthesized so that any
 * expression can be passed; note that it is evaluated twice. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
934
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000935static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300936_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
937 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000938{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300939 /* Find the name associated with the given codepoint.
940 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
941 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000942 int offset;
943 int i;
944 int word;
945 unsigned char* w;
946
Martin v. Löwisc3509122006-03-11 12:16:23 +0000947 if (code >= 0x110000)
948 return 0;
949
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300950 /* XXX should we just skip all the codepoints in the PUAs here? */
951 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
952 return 0;
953
Martin v. Löwis1a214512008-06-11 05:26:20 +0000954 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300955 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300956 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300957 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
958 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300959 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000960 if (old->category_changed == 0) {
961 /* unassigned */
962 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000963 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000964 }
965
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000966 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000967 /* Hangul syllable. */
968 int SIndex = code - SBase;
969 int L = SIndex / NCount;
970 int V = (SIndex % NCount) / TCount;
971 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000972
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000973 if (buflen < 27)
974 /* Worst case: HANGUL SYLLABLE <10chars>. */
975 return 0;
976 strcpy(buffer, "HANGUL SYLLABLE ");
977 buffer += 16;
978 strcpy(buffer, hangul_syllables[L][0]);
979 buffer += strlen(hangul_syllables[L][0]);
980 strcpy(buffer, hangul_syllables[V][1]);
981 buffer += strlen(hangul_syllables[V][1]);
982 strcpy(buffer, hangul_syllables[T][2]);
983 buffer += strlen(hangul_syllables[T][2]);
984 *buffer = '\0';
985 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000986 }
987
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000988 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000989 if (buflen < 28)
990 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
991 return 0;
992 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
993 return 1;
994 }
995
Fredrik Lundh06d12682001-01-24 07:59:11 +0000996 /* get offset into phrasebook */
997 offset = phrasebook_offset1[(code>>phrasebook_shift)];
998 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
999 (code&((1<<phrasebook_shift)-1))];
1000 if (!offset)
1001 return 0;
1002
1003 i = 0;
1004
1005 for (;;) {
1006 /* get word index */
1007 word = phrasebook[offset] - phrasebook_short;
1008 if (word >= 0) {
1009 word = (word << 8) + phrasebook[offset+1];
1010 offset += 2;
1011 } else
1012 word = phrasebook[offset++];
1013 if (i) {
1014 if (i > buflen)
1015 return 0; /* buffer overflow */
1016 buffer[i++] = ' ';
1017 }
1018 /* copy word string from lexicon. the last character in the
1019 word has bit 7 set. the last word in a string ends with
1020 0x80 */
1021 w = lexicon + lexicon_offset[word];
1022 while (*w < 128) {
1023 if (i >= buflen)
1024 return 0; /* buffer overflow */
1025 buffer[i++] = *w++;
1026 }
1027 if (i >= buflen)
1028 return 0; /* buffer overflow */
1029 buffer[i++] = *w & 127;
1030 if (*w == 128)
1031 break; /* end of word */
1032 }
1033
1034 return 1;
1035}
1036
1037static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001038_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001039{
1040 /* check if code corresponds to the given name */
1041 int i;
1042 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001043 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001044 return 0;
1045 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001046 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001047 return 0;
1048 }
1049 return buffer[namelen] == '\0';
1050}
1051
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001052static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001053find_syllable(const char *str, int *len, int *pos, int count, int column)
1054{
1055 int i, len1;
1056 *len = -1;
1057 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001058 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001059 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001060 if (len1 <= *len)
1061 continue;
1062 if (strncmp(str, s, len1) == 0) {
1063 *len = len1;
1064 *pos = i;
1065 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001066 }
1067 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001068 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001069 }
1070}
1071
Fredrik Lundh06d12682001-01-24 07:59:11 +00001072static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001073_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001074{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001075 /* check if named sequences are allowed */
1076 if (!with_named_seq && IS_NAMED_SEQ(cp))
1077 return 0;
1078 /* if the codepoint is in the PUA range that we use for aliases,
1079 * convert it to obtain the right codepoint */
1080 if (IS_ALIAS(cp))
1081 *code = name_aliases[cp-aliases_start];
1082 else
1083 *code = cp;
1084 return 1;
1085}
1086
1087static int
1088_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1089 int with_named_seq)
1090{
1091 /* Return the codepoint associated with the given name.
1092 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1093 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1094 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001095 unsigned int h, v;
1096 unsigned int mask = code_size-1;
1097 unsigned int i, incr;
1098
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001099 /* Check for hangul syllables. */
1100 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 int len, L = -1, V = -1, T = -1;
1102 const char *pos = name + 16;
1103 find_syllable(pos, &len, &L, LCount, 0);
1104 pos += len;
1105 find_syllable(pos, &len, &V, VCount, 1);
1106 pos += len;
1107 find_syllable(pos, &len, &T, TCount, 2);
1108 pos += len;
1109 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1110 *code = SBase + (L*VCount+V)*TCount + T;
1111 return 1;
1112 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001113 /* Otherwise, it's an illegal syllable name. */
1114 return 0;
1115 }
1116
1117 /* Check for unified ideographs. */
1118 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1119 /* Four or five hexdigits must follow. */
1120 v = 0;
1121 name += 22;
1122 namelen -= 22;
1123 if (namelen != 4 && namelen != 5)
1124 return 0;
1125 while (namelen--) {
1126 v *= 16;
1127 if (*name >= '0' && *name <= '9')
1128 v += *name - '0';
1129 else if (*name >= 'A' && *name <= 'F')
1130 v += *name - 'A' + 10;
1131 else
1132 return 0;
1133 name++;
1134 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001135 if (!is_unified_ideograph(v))
1136 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001137 *code = v;
1138 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001139 }
1140
Fredrik Lundh06d12682001-01-24 07:59:11 +00001141 /* the following is the same as python's dictionary lookup, with
1142 only minor changes. see the makeunicodedata script for more
1143 details */
1144
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001145 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001146 i = (~h) & mask;
1147 v = code_hash[i];
1148 if (!v)
1149 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001150 if (_cmpname(self, v, name, namelen))
1151 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001152 incr = (h ^ (h >> 3)) & mask;
1153 if (!incr)
1154 incr = mask;
1155 for (;;) {
1156 i = (i + incr) & mask;
1157 v = code_hash[i];
1158 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001159 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001160 if (_cmpname(self, v, name, namelen))
1161 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001162 incr = incr << 1;
1163 if (incr > mask)
1164 incr = incr ^ code_poly;
1165 }
1166}
1167
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001169{
1170 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001171 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001172 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001173};
1174
1175/* -------------------------------------------------------------------- */
1176/* Python bindings */
1177
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001178PyDoc_STRVAR(unicodedata_name__doc__,
1179"name(unichr[, default])\n\
1180Returns the name assigned to the Unicode character unichr as a\n\
1181string. If no name is defined, default is returned, or, if not\n\
1182given, ValueError is raised.");
1183
Fredrik Lundh06d12682001-01-24 07:59:11 +00001184static PyObject *
1185unicodedata_name(PyObject* self, PyObject* args)
1186{
1187 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001188 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001189
1190 PyUnicodeObject* v;
1191 PyObject* defobj = NULL;
1192 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1193 return NULL;
1194
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001195 c = getuchar(v);
1196 if (c == (Py_UCS4)-1)
1197 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001198
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001199 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001200 if (defobj == NULL) {
1201 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001202 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001203 }
1204 else {
1205 Py_INCREF(defobj);
1206 return defobj;
1207 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001208 }
1209
Walter Dörwald4254e762007-06-05 16:04:09 +00001210 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001211}
1212
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001213PyDoc_STRVAR(unicodedata_lookup__doc__,
1214"lookup(name)\n\
1215\n\
1216Look up character by name. If a character with the\n\
1217given name is found, return the corresponding Unicode\n\
1218character. If not found, KeyError is raised.");
1219
Fredrik Lundh06d12682001-01-24 07:59:11 +00001220static PyObject *
1221unicodedata_lookup(PyObject* self, PyObject* args)
1222{
1223 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001224
1225 char* name;
1226 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001227 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001228 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1229 return NULL;
1230
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001231 if (!_getcode(self, name, namelen, &code, 1)) {
1232 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001233 return NULL;
1234 }
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001235 // check if code is in the PUA range that we use for named sequences
1236 // and convert it
1237 if (IS_NAMED_SEQ(code)) {
1238 index = code-named_sequences_start;
1239 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1240 named_sequences[index].seq,
1241 named_sequences[index].seqlen);
1242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001244}
1245
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001246/* XXX Add doc strings. */
1247
1248static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001249 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1250 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1251 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1252 {"category", unicodedata_category, METH_VARARGS,
1253 unicodedata_category__doc__},
1254 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1255 unicodedata_bidirectional__doc__},
1256 {"combining", unicodedata_combining, METH_VARARGS,
1257 unicodedata_combining__doc__},
1258 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1259 unicodedata_mirrored__doc__},
1260 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1261 unicodedata_east_asian_width__doc__},
1262 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1263 unicodedata_decomposition__doc__},
1264 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1265 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1266 {"normalize", unicodedata_normalize, METH_VARARGS,
1267 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001269};
1270
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001271static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 /* The ob_type field must be initialized in the module init function
1273 * to be portable to Windows without using C++. */
1274 PyVarObject_HEAD_INIT(NULL, 0)
1275 "unicodedata.UCD", /*tp_name*/
1276 sizeof(PreviousDBVersion), /*tp_basicsize*/
1277 0, /*tp_itemsize*/
1278 /* methods */
1279 (destructor)PyObject_Del, /*tp_dealloc*/
1280 0, /*tp_print*/
1281 0, /*tp_getattr*/
1282 0, /*tp_setattr*/
1283 0, /*tp_reserved*/
1284 0, /*tp_repr*/
1285 0, /*tp_as_number*/
1286 0, /*tp_as_sequence*/
1287 0, /*tp_as_mapping*/
1288 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001289 0, /*tp_call*/
1290 0, /*tp_str*/
1291 PyObject_GenericGetAttr,/*tp_getattro*/
1292 0, /*tp_setattro*/
1293 0, /*tp_as_buffer*/
1294 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1295 0, /*tp_doc*/
1296 0, /*tp_traverse*/
1297 0, /*tp_clear*/
1298 0, /*tp_richcompare*/
1299 0, /*tp_weaklistoffset*/
1300 0, /*tp_iter*/
1301 0, /*tp_iternext*/
1302 unicodedata_functions, /*tp_methods*/
1303 DB_members, /*tp_members*/
1304 0, /*tp_getset*/
1305 0, /*tp_base*/
1306 0, /*tp_dict*/
1307 0, /*tp_descr_get*/
1308 0, /*tp_descr_set*/
1309 0, /*tp_dictoffset*/
1310 0, /*tp_init*/
1311 0, /*tp_alloc*/
1312 0, /*tp_new*/
1313 0, /*tp_free*/
1314 0, /*tp_is_gc*/
1315};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001316
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001317PyDoc_STRVAR(unicodedata_docstring,
1318"This module provides access to the Unicode Character Database which\n\
1319defines character properties for all Unicode characters. The data in\n\
1320this database is based on the UnicodeData.txt file version\n\
Ezio Melotti4c5475d2010-03-22 23:16:42 +000013215.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001322\n\
1323The module uses the same names and symbols as defined by the\n\
Ezio Melottid96b2f22010-03-23 00:39:22 +00001324UnicodeData File Format 5.2.0 (see\n\
1325http://www.unicode.org/reports/tr44/tr44-4.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001326
Martin v. Löwis1a214512008-06-11 05:26:20 +00001327
1328static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 PyModuleDef_HEAD_INIT,
1330 "unicodedata",
1331 unicodedata_docstring,
1332 -1,
1333 unicodedata_functions,
1334 NULL,
1335 NULL,
1336 NULL,
1337 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001338};
1339
Mark Hammond62b1ab12002-07-23 06:31:15 +00001340PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001341PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001342{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001343 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001344
Christian Heimes90aa7642007-12-19 02:45:37 +00001345 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001346
Martin v. Löwis1a214512008-06-11 05:26:20 +00001347 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001348 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001349 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001350
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001351 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001352 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001353 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001354
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001355 /* Previous versions */
1356 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1357 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001358 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001359
Fredrik Lundh06d12682001-01-24 07:59:11 +00001360 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001361 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001362 if (v != NULL)
1363 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001364 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001365}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001366
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001368Local variables:
1369c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001370indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001371End:
1372*/