blob: ed79165770b4a7930d466667972b46504f96291d [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
20/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000021
/* One row of per-character properties.  Rows live in
   _PyUnicode_Database_Records and are located through _getrecord_ex().
   Field order must not change: the generated data header initializes
   these records positionally. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
33
/* Per-character delta describing how a property differed in a previous
   version of the Unicode database.  In the lookups in this file the
   unsigned char fields are treated as "unchanged" when they hold 0xFF,
   and category_changed == 0 marks a character that was unassigned in the
   old version. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;     /* old bidirectional index */
    const unsigned char category_changed;  /* old category index */
    const unsigned char decimal_changed;   /* old decimal value */
    const unsigned char mirrored_changed;  /* old mirrored flag */
    const double numeric_changed;          /* old numeric value */
} change_record;
42
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000043/* data file generated by Tools/unicode/makeunicodedata.py */
44#include "unicodedata_db.h"
45
46static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000047_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000050 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051 index = 0;
52 else {
53 index = index1[(code>>SHIFT)];
54 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 }
56
57 return &_PyUnicode_Database_Records[index];
58}
59
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000060/* ------------- Previous-version API ------------------------------------- */
61typedef struct previous_version {
62 PyObject_HEAD
63 const char *name;
64 const change_record* (*getrecord)(Py_UCS4);
65 Py_UCS4 (*normalization)(Py_UCS4);
66} PreviousDBVersion;
67
68#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000070static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000071 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000072 {NULL}
73};
74
Thomas Wouters89f507f2006-12-13 04:49:30 +000075/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000076static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000077#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078
79static PyObject*
80new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
81 Py_UCS4 (*normalization)(Py_UCS4))
82{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 PreviousDBVersion *self;
84 self = PyObject_New(PreviousDBVersion, &UCD_Type);
85 if (self == NULL)
86 return NULL;
87 self->name = name;
88 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000091}
92
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000093
94static Py_UCS4 getuchar(PyUnicodeObject *obj)
95{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020096 if (PyUnicode_READY(obj))
97 return (Py_UCS4)-1;
98 if (PyUnicode_GET_LENGTH(obj) == 1) {
99 if (PyUnicode_READY(obj))
100 return (Py_UCS4)-1;
101 return PyUnicode_READ_CHAR(obj, 0);
102 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
106}
107
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000108/* --- Module API --------------------------------------------------------- */
109
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000110PyDoc_STRVAR(unicodedata_decimal__doc__,
111"decimal(unichr[, default])\n\
112\n\
113Returns the decimal value assigned to the Unicode character unichr\n\
114as integer. If no such value is defined, default is returned, or, if\n\
115not given, ValueError is raised.");
116
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000117static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000118unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119{
120 PyUnicodeObject *v;
121 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000122 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000123 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000124 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125
Fredrik Lundh06d12682001-01-24 07:59:11 +0000126 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000127 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000128 c = getuchar(v);
129 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000130 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000131
Martin v. Löwis1a214512008-06-11 05:26:20 +0000132 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000133 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134 if (old->category_changed == 0) {
135 /* unassigned */
136 have_old = 1;
137 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000138 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000139 else if (old->decimal_changed != 0xFF) {
140 have_old = 1;
141 rc = old->decimal_changed;
142 }
143 }
144
145 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000146 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 if (defobj == NULL) {
149 PyErr_SetString(PyExc_ValueError,
150 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 }
153 else {
154 Py_INCREF(defobj);
155 return defobj;
156 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000158 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159}
160
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000161PyDoc_STRVAR(unicodedata_digit__doc__,
162"digit(unichr[, default])\n\
163\n\
164Returns the digit value assigned to the Unicode character unichr as\n\
165integer. If no such value is defined, default is returned, or, if\n\
166not given, ValueError is raised.");
167
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170{
171 PyUnicodeObject *v;
172 PyObject *defobj = NULL;
173 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000174 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175
Fredrik Lundh06d12682001-01-24 07:59:11 +0000176 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000177 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000178 c = getuchar(v);
179 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000181 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000183 if (defobj == NULL) {
184 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 }
187 else {
188 Py_INCREF(defobj);
189 return defobj;
190 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000192 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193}
194
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000195PyDoc_STRVAR(unicodedata_numeric__doc__,
196"numeric(unichr[, default])\n\
197\n\
198Returns the numeric value assigned to the Unicode character unichr\n\
199as float. If no such value is defined, default is returned, or, if\n\
200not given, ValueError is raised.");
201
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000203unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204{
205 PyUnicodeObject *v;
206 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000207 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000209 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210
Fredrik Lundh06d12682001-01-24 07:59:11 +0000211 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000212 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000213 c = getuchar(v);
214 if (c == (Py_UCS4)-1)
215 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000216
Martin v. Löwis1a214512008-06-11 05:26:20 +0000217 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000218 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000219 if (old->category_changed == 0) {
220 /* unassigned */
221 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000222 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000224 else if (old->decimal_changed != 0xFF) {
225 have_old = 1;
226 rc = old->decimal_changed;
227 }
228 }
229
230 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000231 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000233 if (defobj == NULL) {
234 PyErr_SetString(PyExc_ValueError, "not a numeric character");
235 return NULL;
236 }
237 else {
238 Py_INCREF(defobj);
239 return defobj;
240 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241 }
242 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243}
244
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000245PyDoc_STRVAR(unicodedata_category__doc__,
246"category(unichr)\n\
247\n\
248Returns the general category assigned to the Unicode character\n\
249unichr as string.");
250
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000251static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000252unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253{
254 PyUnicodeObject *v;
255 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000256 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000257
258 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 &PyUnicode_Type, &v))
260 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000261 c = getuchar(v);
262 if (c == (Py_UCS4)-1)
263 return NULL;
264 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000265 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000266 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000267 if (old->category_changed != 0xFF)
268 index = old->category_changed;
269 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000270 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271}
272
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000273PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274"bidirectional(unichr)\n\
275\n\
276Returns the bidirectional category assigned to the Unicode character\n\
277unichr as string. If no such value is defined, an empty string is\n\
278returned.");
279
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000281unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282{
283 PyUnicodeObject *v;
284 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000285 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286
287 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 &PyUnicode_Type, &v))
289 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000290 c = getuchar(v);
291 if (c == (Py_UCS4)-1)
292 return NULL;
293 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000294 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000295 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000296 if (old->category_changed == 0)
297 index = 0; /* unassigned */
298 else if (old->bidir_changed != 0xFF)
299 index = old->bidir_changed;
300 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000301 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000302}
303
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000304PyDoc_STRVAR(unicodedata_combining__doc__,
305"combining(unichr)\n\
306\n\
307Returns the canonical combining class assigned to the Unicode\n\
308character unichr as integer. Returns 0 if no combining class is\n\
309defined.");
310
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000311static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000312unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313{
314 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000315 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000316 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317
318 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000319 &PyUnicode_Type, &v))
320 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000321 c = getuchar(v);
322 if (c == (Py_UCS4)-1)
323 return NULL;
324 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000325 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000326 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 if (old->category_changed == 0)
328 index = 0; /* unassigned */
329 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000330 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000331}
332
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000333PyDoc_STRVAR(unicodedata_mirrored__doc__,
334"mirrored(unichr)\n\
335\n\
336Returns the mirrored property assigned to the Unicode character\n\
337unichr as integer. Returns 1 if the character has been identified as\n\
338a \"mirrored\" character in bidirectional text, 0 otherwise.");
339
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000340static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342{
343 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000345 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346
347 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 &PyUnicode_Type, &v))
349 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000350 c = getuchar(v);
351 if (c == (Py_UCS4)-1)
352 return NULL;
353 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000354 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000355 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000358 else if (old->mirrored_changed != 0xFF)
359 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000360 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000361 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000362}
363
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000364PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365"east_asian_width(unichr)\n\
366\n\
367Returns the east asian width assigned to the Unicode character\n\
368unichr as string.");
369
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000370static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371unicodedata_east_asian_width(PyObject *self, PyObject *args)
372{
373 PyUnicodeObject *v;
374 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000375 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000376
377 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 &PyUnicode_Type, &v))
379 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000380 c = getuchar(v);
381 if (c == (Py_UCS4)-1)
382 return NULL;
383 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000384 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000385 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000386 if (old->category_changed == 0)
387 index = 0; /* unassigned */
388 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000389 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000390}
391
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000392PyDoc_STRVAR(unicodedata_decomposition__doc__,
393"decomposition(unichr)\n\
394\n\
395Returns the character decomposition mapping assigned to the Unicode\n\
396character unichr as string. An empty string is returned in case no\n\
397such mapping is defined.");
398
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000399static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000400unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000401{
402 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000403 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000404 int code, index, count;
405 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000406 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000407 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408
409 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000410 &PyUnicode_Type, &v))
411 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000412 c = getuchar(v);
413 if (c == (Py_UCS4)-1)
414 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000416 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417
Martin v. Löwis1a214512008-06-11 05:26:20 +0000418 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000419 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000420 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000421 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000422 }
423
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000424 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000425 index = 0;
426 else {
427 index = decomp_index1[(code>>DECOMP_SHIFT)];
428 index = decomp_index2[(index<<DECOMP_SHIFT)+
429 (code&((1<<DECOMP_SHIFT)-1))];
430 }
431
Tim Peters69b83b12001-11-30 07:23:05 +0000432 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000433 is prefix code (from*/
434 count = decomp_data[index] >> 8;
435
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
438
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200443 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000444
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000446 i = strlen(decomp_prefix[prefix_index]);
447 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448
449 while (count-- > 0) {
450 if (i)
451 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000452 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455 i += strlen(decomp + i);
456 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000457 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000458}
459
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000460static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000461get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000462{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000463 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000466 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000467 /* unassigned in old version */
468 *index = 0;
469 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000470 else {
471 *index = decomp_index1[(code>>DECOMP_SHIFT)];
472 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
473 (code&((1<<DECOMP_SHIFT)-1))];
474 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475
Martin v. Löwis677bde22002-11-23 22:08:15 +0000476 /* high byte is number of hex bytes (usually one or two), low byte
477 is prefix code (from*/
478 *count = decomp_data[*index] >> 8;
479 *prefix = decomp_data[*index] & 255;
480
481 (*index)++;
482}
483
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total precomposed syllables */
493
494static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000495nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000496{
497 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200498 Py_UCS4 *output;
499 Py_ssize_t i, o, osize;
500 int kind;
501 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000502 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200503 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000504 Py_ssize_t space, isize;
505 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000507
Martin v. Löwis677bde22002-11-23 22:08:15 +0000508 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200509 isize = PyUnicode_GET_LENGTH(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000510 /* Overallocate atmost 10 characters. */
511 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200512 osize = space;
513 output = PyMem_Malloc(space * sizeof(Py_UCS4));
514 if (!output) {
515 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000516 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200517 }
518 i = o = 0;
519 kind = PyUnicode_KIND(input);
520 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000521
Martin v. Löwis22970662011-09-29 13:39:38 +0200522 while (i < isize) {
523 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000524 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200525 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 /* Hangul Decomposition adds three characters in
527 a single step, so we need atleast that much room. */
528 if (space < 3) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200529 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000530 space += 10;
Martin v. Löwis22970662011-09-29 13:39:38 +0200531 output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
532 if (output == NULL) {
533 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000534 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200535 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000536 }
537 /* Hangul Decomposition. */
538 if (SBase <= code && code < (SBase+SCount)) {
539 int SIndex = code - SBase;
540 int L = LBase + SIndex / NCount;
541 int V = VBase + (SIndex % NCount) / TCount;
542 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200543 output[o++] = L;
544 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000545 space -= 2;
546 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200547 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000548 space --;
549 }
550 continue;
551 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000552 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000553 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000554 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
555 if (value != 0) {
556 stack[stackptr++] = value;
557 continue;
558 }
559 }
560
561 /* Other decompositions. */
562 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000563
564 /* Copy character if it is not decomposable, or has a
565 compatibility decomposition, but we do NFD. */
566 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200567 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000568 space--;
569 continue;
570 }
571 /* Copy decomposition onto the stack, in reverse
572 order. */
573 while(count) {
574 code = decomp_data[index + (--count)];
575 stack[stackptr++] = code;
576 }
577 }
578 }
579
Martin v. Löwis22970662011-09-29 13:39:38 +0200580 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
581 output, o);
582 PyMem_Free(output);
583 if (!result)
584 return NULL;
585 /* result is guaranteed to be ready, as it is compact. */
586 kind = PyUnicode_KIND(result);
587 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000588
589 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200590 i = 0;
591 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
592 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
593 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000594 if (prev == 0 || cur == 0 || prev <= cur) {
595 prev = cur;
596 continue;
597 }
598 /* Non-canonical order. Need to switch *i with previous. */
599 o = i - 1;
600 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200601 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
602 PyUnicode_WRITE(kind, data, o+1,
603 PyUnicode_READ(kind, data, o));
604 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000605 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200606 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000607 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200608 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000609 if (prev == 0 || prev <= cur)
610 break;
611 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200612 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000613 }
614 return result;
615}
616
617static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200618find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000619{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200620 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000621 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200622 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000623 if (code < start)
624 return -1;
625 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200626 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000627 return nfc[index].index + delta;
628 }
629 }
630 return -1;
631}
632
633static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000634nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000635{
636 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200637 int kind;
638 void *data;
639 Py_UCS4 *output;
640 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000641 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200642 Py_UCS4 code;
643 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000644 int cskipped = 0;
645
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000646 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000647 if (!result)
648 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200649 /* result will be "ready". */
650 kind = PyUnicode_KIND(result);
651 data = PyUnicode_DATA(result);
652 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000653
Martin v. Löwis22970662011-09-29 13:39:38 +0200654 /* We allocate a buffer for the output.
655 If we find that we made no changes, we still return
656 the NFD result. */
657 output = PyMem_Malloc(len * sizeof(Py_UCS4));
658 if (!output) {
659 PyErr_NoMemory();
660 Py_DECREF(result);
661 return 0;
662 }
663 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000664
Martin v. Löwis677bde22002-11-23 22:08:15 +0000665 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200666 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000667 for (index = 0; index < cskipped; index++) {
668 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000669 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000670 Remove from list. */
671 skipped[index] = skipped[cskipped-1];
672 cskipped--;
673 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000674 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000675 }
676 }
677 /* Hangul Composition. We don't need to check for <LV,T>
678 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200679 code = PyUnicode_READ(kind, data, i);
680 if (LBase <= code && code < (LBase+LCount) &&
681 i + 1 < len &&
682 VBase <= PyUnicode_READ(kind, data, i+1) &&
683 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000684 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200685 LIndex = code - LBase;
686 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000687 code = SBase + (LIndex*VCount+VIndex)*TCount;
688 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200689 if (i < len &&
690 TBase <= PyUnicode_READ(kind, data, i) &&
691 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
692 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000693 i++;
694 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200695 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000696 continue;
697 }
698
Martin v. Löwis22970662011-09-29 13:39:38 +0200699 /* code is still input[i] here */
700 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000701 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200702 output[o++] = code;
703 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000704 continue;
705 }
706 /* Find next unblocked character. */
707 i1 = i+1;
708 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200709 /* output base character for now; might be updated later. */
710 output[o] = PyUnicode_READ(kind, data, i);
711 while (i1 < len) {
712 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
713 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000714 if (comb) {
715 if (comb1 == 0)
716 break;
717 if (comb >= comb1) {
718 /* Character is blocked. */
719 i1++;
720 continue;
721 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000722 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200723 l = find_nfc_index(self, nfc_last, code1);
724 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000725 is a starter, we don't need to look further.
726 Otherwise, record the combining class. */
727 if (l == -1) {
728 not_combinable:
729 if (comb1 == 0)
730 break;
731 comb = comb1;
732 i1++;
733 continue;
734 }
735 index = f*TOTAL_LAST + l;
736 index1 = comp_index[index >> COMP_SHIFT];
737 code = comp_data[(index1<<COMP_SHIFT)+
738 (index&((1<<COMP_SHIFT)-1))];
739 if (code == 0)
740 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000741
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200743 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000744 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000745 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000746 skipped[cskipped++] = i1;
747 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200748 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000749 if (f == -1)
750 break;
751 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200752 /* Output character was already written.
753 Just advance the indices. */
754 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000755 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200756 if (o == len) {
757 /* No changes. Return original string. */
758 PyMem_Free(output);
759 return result;
760 }
761 Py_DECREF(result);
762 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
763 output, o);
764 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000765 return result;
766}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000767
768/* Return 1 if the input is certainly normalized, 0 if it might not be. */
769static int
770is_normalized(PyObject *self, PyObject *input, int nfc, int k)
771{
Martin v. Löwis22970662011-09-29 13:39:38 +0200772 Py_ssize_t i, len;
773 int kind;
774 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000775 unsigned char prev_combining = 0, quickcheck_mask;
776
777 /* An older version of the database is requested, quickchecks must be
778 disabled. */
779 if (self && UCD_Check(self))
780 return 0;
781
782 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
783 as described in http://unicode.org/reports/tr15/#Annex8. */
784 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
785
Martin v. Löwis22970662011-09-29 13:39:38 +0200786 i = 0;
787 kind = PyUnicode_KIND(input);
788 data = PyUnicode_DATA(input);
789 len = PyUnicode_GET_LENGTH(input);
790 while (i < len) {
791 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
792 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000793 unsigned char combining = record->combining;
794 unsigned char quickcheck = record->normalization_quick_check;
795
796 if (quickcheck & quickcheck_mask)
797 return 0; /* this string might need normalization */
798 if (combining && prev_combining > combining)
799 return 0; /* non-canonical sort order, not normalized */
800 prev_combining = combining;
801 }
802 return 1; /* certainly normalized */
803}
804
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000805PyDoc_STRVAR(unicodedata_normalize__doc__,
806"normalize(form, unistr)\n\
807\n\
808Return the normal form 'form' for the Unicode string unistr. Valid\n\
809values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
810
Martin v. Löwis677bde22002-11-23 22:08:15 +0000811static PyObject*
812unicodedata_normalize(PyObject *self, PyObject *args)
813{
814 char *form;
815 PyObject *input;
816
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000817 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000818 &form, &PyUnicode_Type, &input))
819 return NULL;
820
Martin v. Löwis22970662011-09-29 13:39:38 +0200821 if (PyUnicode_READY(input) == -1)
822 return NULL;
823
824 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000825 /* Special case empty input strings, since resizing
826 them later would cause internal errors. */
827 Py_INCREF(input);
828 return input;
829 }
830
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000831 if (strcmp(form, "NFC") == 0) {
832 if (is_normalized(self, input, 1, 0)) {
833 Py_INCREF(input);
834 return input;
835 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000836 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000837 }
838 if (strcmp(form, "NFKC") == 0) {
839 if (is_normalized(self, input, 1, 1)) {
840 Py_INCREF(input);
841 return input;
842 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000843 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000844 }
845 if (strcmp(form, "NFD") == 0) {
846 if (is_normalized(self, input, 0, 0)) {
847 Py_INCREF(input);
848 return input;
849 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000850 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000851 }
852 if (strcmp(form, "NFKD") == 0) {
853 if (is_normalized(self, input, 0, 1)) {
854 Py_INCREF(input);
855 return input;
856 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000857 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000858 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000859 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
860 return NULL;
861}
862
Fredrik Lundh06d12682001-01-24 07:59:11 +0000863/* -------------------------------------------------------------------- */
864/* unicode character name tables */
865
866/* data file generated by Tools/unicode/makeunicodedata.py */
867#include "unicodename_db.h"
868
869/* -------------------------------------------------------------------- */
870/* database code (cut and pasted from the unidb package) */
871
/* Case-insensitive hash of the first len bytes of s, folded into 24 bits.
   scale is the multiplier applied before each byte is added. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int pos = 0;
    unsigned long hash = 0;

    while (pos < len) {
        unsigned long high;
        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        /* Fold anything that spilled into bits 24..31 back into the
           low byte and keep only 24 bits. */
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
886
/* Name fragments for Hangul syllables.  Each row holds, per column, the
   fragment for one leading consonant, one vowel and one trailing
   consonant; the columns have different lengths, so unused slots are
   NULL. */
static char *hangul_syllables[][3] = {
    /* column 0   column 1   column 2 */
    { "G",        "A",       ""   },
    { "GG",       "AE",      "G"  },
    { "N",        "YA",      "GG" },
    { "D",        "YAE",     "GS" },
    { "DD",       "EO",      "N"  },
    { "R",        "E",       "NJ" },
    { "M",        "YEO",     "NH" },
    { "B",        "YE",      "D"  },
    { "BB",       "O",       "L"  },
    { "S",        "WA",      "LG" },
    { "SS",       "WAE",     "LM" },
    { "",         "OE",      "LB" },
    { "J",        "YO",      "LS" },
    { "JJ",       "U",       "LT" },
    { "C",        "WEO",     "LP" },
    { "K",        "WE",      "LH" },
    { "T",        "WI",      "M"  },
    { "P",        "YU",      "B"  },
    { "H",        "EU",      "BS" },
    { NULL,       "YI",      "S"  },
    { NULL,       "I",       "SS" },
    { NULL,       NULL,      "NG" },
    { NULL,       NULL,      "J"  },
    { NULL,       NULL,      "C"  },
    { NULL,       NULL,      "K"  },
    { NULL,       NULL,      "T"  },
    { NULL,       NULL,      "P"  },
    { NULL,       NULL,      "H"  }
};
917
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000918/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000919static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000920is_unified_ideograph(Py_UCS4 code)
921{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000922 return
923 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500924 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000925 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
926 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
927 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000928}
929
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that expressions can be passed safely;
 * note that it is evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
935
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000936static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300937_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
938 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000939{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300940 /* Find the name associated with the given codepoint.
941 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
942 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000943 int offset;
944 int i;
945 int word;
946 unsigned char* w;
947
Martin v. Löwisc3509122006-03-11 12:16:23 +0000948 if (code >= 0x110000)
949 return 0;
950
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300951 /* XXX should we just skip all the codepoints in the PUAs here? */
952 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
953 return 0;
954
Martin v. Löwis1a214512008-06-11 05:26:20 +0000955 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300956 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300957 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300958 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
959 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300960 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000961 if (old->category_changed == 0) {
962 /* unassigned */
963 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000964 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000965 }
966
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000967 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000968 /* Hangul syllable. */
969 int SIndex = code - SBase;
970 int L = SIndex / NCount;
971 int V = (SIndex % NCount) / TCount;
972 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000973
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000974 if (buflen < 27)
975 /* Worst case: HANGUL SYLLABLE <10chars>. */
976 return 0;
977 strcpy(buffer, "HANGUL SYLLABLE ");
978 buffer += 16;
979 strcpy(buffer, hangul_syllables[L][0]);
980 buffer += strlen(hangul_syllables[L][0]);
981 strcpy(buffer, hangul_syllables[V][1]);
982 buffer += strlen(hangul_syllables[V][1]);
983 strcpy(buffer, hangul_syllables[T][2]);
984 buffer += strlen(hangul_syllables[T][2]);
985 *buffer = '\0';
986 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000987 }
988
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000989 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000990 if (buflen < 28)
991 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
992 return 0;
993 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
994 return 1;
995 }
996
Fredrik Lundh06d12682001-01-24 07:59:11 +0000997 /* get offset into phrasebook */
998 offset = phrasebook_offset1[(code>>phrasebook_shift)];
999 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1000 (code&((1<<phrasebook_shift)-1))];
1001 if (!offset)
1002 return 0;
1003
1004 i = 0;
1005
1006 for (;;) {
1007 /* get word index */
1008 word = phrasebook[offset] - phrasebook_short;
1009 if (word >= 0) {
1010 word = (word << 8) + phrasebook[offset+1];
1011 offset += 2;
1012 } else
1013 word = phrasebook[offset++];
1014 if (i) {
1015 if (i > buflen)
1016 return 0; /* buffer overflow */
1017 buffer[i++] = ' ';
1018 }
1019 /* copy word string from lexicon. the last character in the
1020 word has bit 7 set. the last word in a string ends with
1021 0x80 */
1022 w = lexicon + lexicon_offset[word];
1023 while (*w < 128) {
1024 if (i >= buflen)
1025 return 0; /* buffer overflow */
1026 buffer[i++] = *w++;
1027 }
1028 if (i >= buflen)
1029 return 0; /* buffer overflow */
1030 buffer[i++] = *w & 127;
1031 if (*w == 128)
1032 break; /* end of word */
1033 }
1034
1035 return 1;
1036}
1037
1038static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001039_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001040{
1041 /* check if code corresponds to the given name */
1042 int i;
1043 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001044 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001045 return 0;
1046 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001047 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001048 return 0;
1049 }
1050 return buffer[namelen] == '\0';
1051}
1052
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001053static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001054find_syllable(const char *str, int *len, int *pos, int count, int column)
1055{
1056 int i, len1;
1057 *len = -1;
1058 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001059 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001060 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 if (len1 <= *len)
1062 continue;
1063 if (strncmp(str, s, len1) == 0) {
1064 *len = len1;
1065 *pos = i;
1066 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001067 }
1068 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001070 }
1071}
1072
Fredrik Lundh06d12682001-01-24 07:59:11 +00001073static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001074_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001075{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001076 /* check if named sequences are allowed */
1077 if (!with_named_seq && IS_NAMED_SEQ(cp))
1078 return 0;
1079 /* if the codepoint is in the PUA range that we use for aliases,
1080 * convert it to obtain the right codepoint */
1081 if (IS_ALIAS(cp))
1082 *code = name_aliases[cp-aliases_start];
1083 else
1084 *code = cp;
1085 return 1;
1086}
1087
1088static int
1089_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1090 int with_named_seq)
1091{
1092 /* Return the codepoint associated with the given name.
1093 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1094 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1095 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001096 unsigned int h, v;
1097 unsigned int mask = code_size-1;
1098 unsigned int i, incr;
1099
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001100 /* Check for hangul syllables. */
1101 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001102 int len, L = -1, V = -1, T = -1;
1103 const char *pos = name + 16;
1104 find_syllable(pos, &len, &L, LCount, 0);
1105 pos += len;
1106 find_syllable(pos, &len, &V, VCount, 1);
1107 pos += len;
1108 find_syllable(pos, &len, &T, TCount, 2);
1109 pos += len;
1110 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1111 *code = SBase + (L*VCount+V)*TCount + T;
1112 return 1;
1113 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001114 /* Otherwise, it's an illegal syllable name. */
1115 return 0;
1116 }
1117
1118 /* Check for unified ideographs. */
1119 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1120 /* Four or five hexdigits must follow. */
1121 v = 0;
1122 name += 22;
1123 namelen -= 22;
1124 if (namelen != 4 && namelen != 5)
1125 return 0;
1126 while (namelen--) {
1127 v *= 16;
1128 if (*name >= '0' && *name <= '9')
1129 v += *name - '0';
1130 else if (*name >= 'A' && *name <= 'F')
1131 v += *name - 'A' + 10;
1132 else
1133 return 0;
1134 name++;
1135 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001136 if (!is_unified_ideograph(v))
1137 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001138 *code = v;
1139 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001140 }
1141
Fredrik Lundh06d12682001-01-24 07:59:11 +00001142 /* the following is the same as python's dictionary lookup, with
1143 only minor changes. see the makeunicodedata script for more
1144 details */
1145
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001146 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001147 i = (~h) & mask;
1148 v = code_hash[i];
1149 if (!v)
1150 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001151 if (_cmpname(self, v, name, namelen))
1152 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001153 incr = (h ^ (h >> 3)) & mask;
1154 if (!incr)
1155 incr = mask;
1156 for (;;) {
1157 i = (i + incr) & mask;
1158 v = code_hash[i];
1159 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001160 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001161 if (_cmpname(self, v, name, namelen))
1162 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001163 incr = incr << 1;
1164 if (incr > mask)
1165 incr = incr ^ code_poly;
1166 }
1167}
1168
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001169static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001170{
1171 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001172 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001173 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001174};
1175
1176/* -------------------------------------------------------------------- */
1177/* Python bindings */
1178
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001179PyDoc_STRVAR(unicodedata_name__doc__,
1180"name(unichr[, default])\n\
1181Returns the name assigned to the Unicode character unichr as a\n\
1182string. If no name is defined, default is returned, or, if not\n\
1183given, ValueError is raised.");
1184
Fredrik Lundh06d12682001-01-24 07:59:11 +00001185static PyObject *
1186unicodedata_name(PyObject* self, PyObject* args)
1187{
1188 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001189 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001190
1191 PyUnicodeObject* v;
1192 PyObject* defobj = NULL;
1193 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1194 return NULL;
1195
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001196 c = getuchar(v);
1197 if (c == (Py_UCS4)-1)
1198 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001199
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001200 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001201 if (defobj == NULL) {
1202 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001203 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001204 }
1205 else {
1206 Py_INCREF(defobj);
1207 return defobj;
1208 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001209 }
1210
Walter Dörwald4254e762007-06-05 16:04:09 +00001211 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001212}
1213
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001214PyDoc_STRVAR(unicodedata_lookup__doc__,
1215"lookup(name)\n\
1216\n\
1217Look up character by name. If a character with the\n\
1218given name is found, return the corresponding Unicode\n\
1219character. If not found, KeyError is raised.");
1220
Fredrik Lundh06d12682001-01-24 07:59:11 +00001221static PyObject *
1222unicodedata_lookup(PyObject* self, PyObject* args)
1223{
1224 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001225
1226 char* name;
1227 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001228 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001229 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1230 return NULL;
1231
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001232 if (!_getcode(self, name, namelen, &code, 1)) {
1233 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001234 return NULL;
1235 }
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001236 // check if code is in the PUA range that we use for named sequences
1237 // and convert it
1238 if (IS_NAMED_SEQ(code)) {
1239 index = code-named_sequences_start;
1240 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1241 named_sequences[index].seq,
1242 named_sequences[index].seqlen);
1243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001245}
1246
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001247/* XXX Add doc strings. */
1248
1249static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001250 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1251 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1252 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1253 {"category", unicodedata_category, METH_VARARGS,
1254 unicodedata_category__doc__},
1255 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1256 unicodedata_bidirectional__doc__},
1257 {"combining", unicodedata_combining, METH_VARARGS,
1258 unicodedata_combining__doc__},
1259 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1260 unicodedata_mirrored__doc__},
1261 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1262 unicodedata_east_asian_width__doc__},
1263 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1264 unicodedata_decomposition__doc__},
1265 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1266 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1267 {"normalize", unicodedata_normalize, METH_VARARGS,
1268 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001269 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001270};
1271
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001272static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 /* The ob_type field must be initialized in the module init function
1274 * to be portable to Windows without using C++. */
1275 PyVarObject_HEAD_INIT(NULL, 0)
1276 "unicodedata.UCD", /*tp_name*/
1277 sizeof(PreviousDBVersion), /*tp_basicsize*/
1278 0, /*tp_itemsize*/
1279 /* methods */
1280 (destructor)PyObject_Del, /*tp_dealloc*/
1281 0, /*tp_print*/
1282 0, /*tp_getattr*/
1283 0, /*tp_setattr*/
1284 0, /*tp_reserved*/
1285 0, /*tp_repr*/
1286 0, /*tp_as_number*/
1287 0, /*tp_as_sequence*/
1288 0, /*tp_as_mapping*/
1289 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001290 0, /*tp_call*/
1291 0, /*tp_str*/
1292 PyObject_GenericGetAttr,/*tp_getattro*/
1293 0, /*tp_setattro*/
1294 0, /*tp_as_buffer*/
1295 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1296 0, /*tp_doc*/
1297 0, /*tp_traverse*/
1298 0, /*tp_clear*/
1299 0, /*tp_richcompare*/
1300 0, /*tp_weaklistoffset*/
1301 0, /*tp_iter*/
1302 0, /*tp_iternext*/
1303 unicodedata_functions, /*tp_methods*/
1304 DB_members, /*tp_members*/
1305 0, /*tp_getset*/
1306 0, /*tp_base*/
1307 0, /*tp_dict*/
1308 0, /*tp_descr_get*/
1309 0, /*tp_descr_set*/
1310 0, /*tp_dictoffset*/
1311 0, /*tp_init*/
1312 0, /*tp_alloc*/
1313 0, /*tp_new*/
1314 0, /*tp_free*/
1315 0, /*tp_is_gc*/
1316};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001317
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001318PyDoc_STRVAR(unicodedata_docstring,
1319"This module provides access to the Unicode Character Database which\n\
1320defines character properties for all Unicode characters. The data in\n\
1321this database is based on the UnicodeData.txt file version\n\
Ezio Melotti98d2c0a2011-11-10 09:36:34 +020013226.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001323\n\
1324The module uses the same names and symbols as defined by the\n\
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02001325UnicodeData File Format 6.0.0 (see\n\
1326http://www.unicode.org/reports/tr44/tr44-6.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001327
Martin v. Löwis1a214512008-06-11 05:26:20 +00001328
1329static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 PyModuleDef_HEAD_INIT,
1331 "unicodedata",
1332 unicodedata_docstring,
1333 -1,
1334 unicodedata_functions,
1335 NULL,
1336 NULL,
1337 NULL,
1338 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001339};
1340
Mark Hammond62b1ab12002-07-23 06:31:15 +00001341PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001342PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001343{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001344 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001345
Christian Heimes90aa7642007-12-19 02:45:37 +00001346 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001347
Martin v. Löwis1a214512008-06-11 05:26:20 +00001348 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001349 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001350 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001351
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001352 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001353 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001354 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001355
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001356 /* Previous versions */
1357 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1358 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001359 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001360
Fredrik Lundh06d12682001-01-24 07:59:11 +00001361 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001362 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001363 if (v != NULL)
1364 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001365 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001366}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001367
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001368/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001369Local variables:
1370c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001371indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001372End:
1373*/