blob: 75c162656ed7dce43c03d5148df8056206739722 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
/* character properties */

/* Property record for one class of code points; _getrecord_ex() returns a
   pointer to one of these out of _PyUnicode_Database_Records. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
33
/* Differences of one code point between the current database and a previous
   Unicode version.  The accessors in this file treat 0xFF in one of the
   unsigned char fields as "no change" and category_changed == 0 as
   "unassigned in the previous version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* previous bidirectional index */
    const unsigned char category_changed;   /* previous category index */
    const unsigned char decimal_changed;    /* previous decimal value */
    const unsigned char mirrored_changed;   /* previous mirrored flag */
    const double numeric_changed;           /* previous numeric value */
} change_record;
42
/* data file generated by Tools/unicode/makeunicodedata.py */
44#include "unicodedata_db.h"
45
46static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000047_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000050 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051 index = 0;
52 else {
53 index = index1[(code>>SHIFT)];
54 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 }
56
57 return &_PyUnicode_Database_Records[index];
58}
59
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000060/* ------------- Previous-version API ------------------------------------- */
61typedef struct previous_version {
62 PyObject_HEAD
63 const char *name;
64 const change_record* (*getrecord)(Py_UCS4);
65 Py_UCS4 (*normalization)(Py_UCS4);
66} PreviousDBVersion;
67
/* Fetch the change_record of code point `v` from the previous-version
   database object `self` by calling its getrecord hook.  `self` is
   parenthesized before the cast so that an expression argument (for example
   pointer arithmetic) is evaluated as a whole before being converted. */
#define get_old_record(self, v) ((((PreviousDBVersion*)(self))->getrecord)(v))
69
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000070static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000071 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000072 {NULL}
73};
74
Thomas Wouters89f507f2006-12-13 04:49:30 +000075/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000076static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000077#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078
79static PyObject*
80new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
81 Py_UCS4 (*normalization)(Py_UCS4))
82{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 PreviousDBVersion *self;
84 self = PyObject_New(PreviousDBVersion, &UCD_Type);
85 if (self == NULL)
86 return NULL;
87 self->name = name;
88 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000091}
92
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000093
94static Py_UCS4 getuchar(PyUnicodeObject *obj)
95{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020096 if (PyUnicode_READY(obj))
97 return (Py_UCS4)-1;
98 if (PyUnicode_GET_LENGTH(obj) == 1) {
99 if (PyUnicode_READY(obj))
100 return (Py_UCS4)-1;
101 return PyUnicode_READ_CHAR(obj, 0);
102 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
106}
107
/* --- Module API --------------------------------------------------------- */
109
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000110PyDoc_STRVAR(unicodedata_decimal__doc__,
111"decimal(unichr[, default])\n\
112\n\
113Returns the decimal value assigned to the Unicode character unichr\n\
114as integer. If no such value is defined, default is returned, or, if\n\
115not given, ValueError is raised.");
116
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000117static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000118unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119{
120 PyUnicodeObject *v;
121 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000122 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000123 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000124 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125
Fredrik Lundh06d12682001-01-24 07:59:11 +0000126 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000127 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000128 c = getuchar(v);
129 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000130 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000131
Martin v. Löwis1a214512008-06-11 05:26:20 +0000132 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000133 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134 if (old->category_changed == 0) {
135 /* unassigned */
136 have_old = 1;
137 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000138 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000139 else if (old->decimal_changed != 0xFF) {
140 have_old = 1;
141 rc = old->decimal_changed;
142 }
143 }
144
145 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000146 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 if (defobj == NULL) {
149 PyErr_SetString(PyExc_ValueError,
150 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 }
153 else {
154 Py_INCREF(defobj);
155 return defobj;
156 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000158 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159}
160
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000161PyDoc_STRVAR(unicodedata_digit__doc__,
162"digit(unichr[, default])\n\
163\n\
164Returns the digit value assigned to the Unicode character unichr as\n\
165integer. If no such value is defined, default is returned, or, if\n\
166not given, ValueError is raised.");
167
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170{
171 PyUnicodeObject *v;
172 PyObject *defobj = NULL;
173 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000174 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175
Fredrik Lundh06d12682001-01-24 07:59:11 +0000176 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000177 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000178 c = getuchar(v);
179 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000181 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000183 if (defobj == NULL) {
184 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 }
187 else {
188 Py_INCREF(defobj);
189 return defobj;
190 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000192 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193}
194
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000195PyDoc_STRVAR(unicodedata_numeric__doc__,
196"numeric(unichr[, default])\n\
197\n\
198Returns the numeric value assigned to the Unicode character unichr\n\
199as float. If no such value is defined, default is returned, or, if\n\
200not given, ValueError is raised.");
201
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000203unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204{
205 PyUnicodeObject *v;
206 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000207 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000209 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210
Fredrik Lundh06d12682001-01-24 07:59:11 +0000211 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000212 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000213 c = getuchar(v);
214 if (c == (Py_UCS4)-1)
215 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000216
Martin v. Löwis1a214512008-06-11 05:26:20 +0000217 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000218 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000219 if (old->category_changed == 0) {
220 /* unassigned */
221 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000222 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000224 else if (old->decimal_changed != 0xFF) {
225 have_old = 1;
226 rc = old->decimal_changed;
227 }
228 }
229
230 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000231 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000233 if (defobj == NULL) {
234 PyErr_SetString(PyExc_ValueError, "not a numeric character");
235 return NULL;
236 }
237 else {
238 Py_INCREF(defobj);
239 return defobj;
240 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241 }
242 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243}
244
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000245PyDoc_STRVAR(unicodedata_category__doc__,
246"category(unichr)\n\
247\n\
248Returns the general category assigned to the Unicode character\n\
249unichr as string.");
250
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000251static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000252unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253{
254 PyUnicodeObject *v;
255 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000256 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000257
258 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 &PyUnicode_Type, &v))
260 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000261 c = getuchar(v);
262 if (c == (Py_UCS4)-1)
263 return NULL;
264 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000265 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000266 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000267 if (old->category_changed != 0xFF)
268 index = old->category_changed;
269 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000270 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271}
272
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000273PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274"bidirectional(unichr)\n\
275\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200276Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000277unichr as string. If no such value is defined, an empty string is\n\
278returned.");
279
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000281unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282{
283 PyUnicodeObject *v;
284 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000285 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286
287 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 &PyUnicode_Type, &v))
289 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000290 c = getuchar(v);
291 if (c == (Py_UCS4)-1)
292 return NULL;
293 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000294 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000295 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000296 if (old->category_changed == 0)
297 index = 0; /* unassigned */
298 else if (old->bidir_changed != 0xFF)
299 index = old->bidir_changed;
300 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000301 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000302}
303
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000304PyDoc_STRVAR(unicodedata_combining__doc__,
305"combining(unichr)\n\
306\n\
307Returns the canonical combining class assigned to the Unicode\n\
308character unichr as integer. Returns 0 if no combining class is\n\
309defined.");
310
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000311static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000312unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313{
314 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000315 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000316 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317
318 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000319 &PyUnicode_Type, &v))
320 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000321 c = getuchar(v);
322 if (c == (Py_UCS4)-1)
323 return NULL;
324 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000325 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000326 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 if (old->category_changed == 0)
328 index = 0; /* unassigned */
329 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000330 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000331}
332
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000333PyDoc_STRVAR(unicodedata_mirrored__doc__,
334"mirrored(unichr)\n\
335\n\
336Returns the mirrored property assigned to the Unicode character\n\
337unichr as integer. Returns 1 if the character has been identified as\n\
338a \"mirrored\" character in bidirectional text, 0 otherwise.");
339
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000340static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342{
343 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000345 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346
347 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 &PyUnicode_Type, &v))
349 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000350 c = getuchar(v);
351 if (c == (Py_UCS4)-1)
352 return NULL;
353 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000354 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000355 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000358 else if (old->mirrored_changed != 0xFF)
359 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000360 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000361 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000362}
363
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000364PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365"east_asian_width(unichr)\n\
366\n\
367Returns the east asian width assigned to the Unicode character\n\
368unichr as string.");
369
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000370static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371unicodedata_east_asian_width(PyObject *self, PyObject *args)
372{
373 PyUnicodeObject *v;
374 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000375 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000376
377 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 &PyUnicode_Type, &v))
379 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000380 c = getuchar(v);
381 if (c == (Py_UCS4)-1)
382 return NULL;
383 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000384 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000385 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000386 if (old->category_changed == 0)
387 index = 0; /* unassigned */
388 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000389 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000390}
391
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000392PyDoc_STRVAR(unicodedata_decomposition__doc__,
393"decomposition(unichr)\n\
394\n\
395Returns the character decomposition mapping assigned to the Unicode\n\
396character unichr as string. An empty string is returned in case no\n\
397such mapping is defined.");
398
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000399static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000400unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000401{
402 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000403 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000404 int code, index, count;
405 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000406 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000407 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408
409 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000410 &PyUnicode_Type, &v))
411 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000412 c = getuchar(v);
413 if (c == (Py_UCS4)-1)
414 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000416 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417
Martin v. Löwis1a214512008-06-11 05:26:20 +0000418 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000419 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000420 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000421 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000422 }
423
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000424 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000425 index = 0;
426 else {
427 index = decomp_index1[(code>>DECOMP_SHIFT)];
428 index = decomp_index2[(index<<DECOMP_SHIFT)+
429 (code&((1<<DECOMP_SHIFT)-1))];
430 }
431
Tim Peters69b83b12001-11-30 07:23:05 +0000432 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000433 is prefix code (from*/
434 count = decomp_data[index] >> 8;
435
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
438
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200443 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000444
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000446 i = strlen(decomp_prefix[prefix_index]);
447 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448
449 while (count-- > 0) {
450 if (i)
451 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000452 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455 i += strlen(decomp + i);
456 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000457 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000458}
459
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000460static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000461get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000462{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000463 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000466 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000467 /* unassigned in old version */
468 *index = 0;
469 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000470 else {
471 *index = decomp_index1[(code>>DECOMP_SHIFT)];
472 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
473 (code&((1<<DECOMP_SHIFT)-1))];
474 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475
Martin v. Löwis677bde22002-11-23 22:08:15 +0000476 /* high byte is number of hex bytes (usually one or two), low byte
477 is prefix code (from*/
478 *count = decomp_data[*index] >> 8;
479 *prefix = decomp_data[*index] & 255;
480
481 (*index)++;
482}
483
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total precomposed syllables */
493
494static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000495nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000496{
497 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200498 Py_UCS4 *output;
499 Py_ssize_t i, o, osize;
500 int kind;
501 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000502 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200503 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000504 Py_ssize_t space, isize;
505 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000507
Martin v. Löwis677bde22002-11-23 22:08:15 +0000508 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200509 isize = PyUnicode_GET_LENGTH(input);
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300510 /* Overallocate at most 10 characters. */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000511 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200512 osize = space;
513 output = PyMem_Malloc(space * sizeof(Py_UCS4));
514 if (!output) {
515 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000516 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200517 }
518 i = o = 0;
519 kind = PyUnicode_KIND(input);
520 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000521
Martin v. Löwis22970662011-09-29 13:39:38 +0200522 while (i < isize) {
523 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000524 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200525 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300527 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000528 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000529 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000531 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000532 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
533 if (new_output == NULL) {
534 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200535 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000536 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200537 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000538 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000539 }
540 /* Hangul Decomposition. */
541 if (SBase <= code && code < (SBase+SCount)) {
542 int SIndex = code - SBase;
543 int L = LBase + SIndex / NCount;
544 int V = VBase + (SIndex % NCount) / TCount;
545 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200546 output[o++] = L;
547 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000548 space -= 2;
549 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200550 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000551 space --;
552 }
553 continue;
554 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000555 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000556 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000557 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
558 if (value != 0) {
559 stack[stackptr++] = value;
560 continue;
561 }
562 }
563
564 /* Other decompositions. */
565 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000566
567 /* Copy character if it is not decomposable, or has a
568 compatibility decomposition, but we do NFD. */
569 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200570 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000571 space--;
572 continue;
573 }
574 /* Copy decomposition onto the stack, in reverse
575 order. */
576 while(count) {
577 code = decomp_data[index + (--count)];
578 stack[stackptr++] = code;
579 }
580 }
581 }
582
Martin v. Löwis22970662011-09-29 13:39:38 +0200583 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
584 output, o);
585 PyMem_Free(output);
586 if (!result)
587 return NULL;
588 /* result is guaranteed to be ready, as it is compact. */
589 kind = PyUnicode_KIND(result);
590 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000591
592 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200593 i = 0;
594 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
595 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
596 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000597 if (prev == 0 || cur == 0 || prev <= cur) {
598 prev = cur;
599 continue;
600 }
601 /* Non-canonical order. Need to switch *i with previous. */
602 o = i - 1;
603 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200604 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
605 PyUnicode_WRITE(kind, data, o+1,
606 PyUnicode_READ(kind, data, o));
607 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200609 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000610 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200611 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000612 if (prev == 0 || prev <= cur)
613 break;
614 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200615 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000616 }
617 return result;
618}
619
620static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200621find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200623 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000624 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200625 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000626 if (code < start)
627 return -1;
628 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200629 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000630 return nfc[index].index + delta;
631 }
632 }
633 return -1;
634}
635
636static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000637nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000638{
639 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200640 int kind;
641 void *data;
642 Py_UCS4 *output;
643 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000644 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200645 Py_UCS4 code;
646 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000647 int cskipped = 0;
648
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000649 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000650 if (!result)
651 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200652 /* result will be "ready". */
653 kind = PyUnicode_KIND(result);
654 data = PyUnicode_DATA(result);
655 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000656
Martin v. Löwis22970662011-09-29 13:39:38 +0200657 /* We allocate a buffer for the output.
658 If we find that we made no changes, we still return
659 the NFD result. */
660 output = PyMem_Malloc(len * sizeof(Py_UCS4));
661 if (!output) {
662 PyErr_NoMemory();
663 Py_DECREF(result);
664 return 0;
665 }
666 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000667
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200669 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000670 for (index = 0; index < cskipped; index++) {
671 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000673 Remove from list. */
674 skipped[index] = skipped[cskipped-1];
675 cskipped--;
676 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000677 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000678 }
679 }
680 /* Hangul Composition. We don't need to check for <LV,T>
681 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200682 code = PyUnicode_READ(kind, data, i);
683 if (LBase <= code && code < (LBase+LCount) &&
684 i + 1 < len &&
685 VBase <= PyUnicode_READ(kind, data, i+1) &&
686 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000687 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200688 LIndex = code - LBase;
689 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000690 code = SBase + (LIndex*VCount+VIndex)*TCount;
691 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200692 if (i < len &&
693 TBase <= PyUnicode_READ(kind, data, i) &&
694 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
695 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000696 i++;
697 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200698 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000699 continue;
700 }
701
Martin v. Löwis22970662011-09-29 13:39:38 +0200702 /* code is still input[i] here */
703 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000704 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200705 output[o++] = code;
706 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000707 continue;
708 }
709 /* Find next unblocked character. */
710 i1 = i+1;
711 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200712 /* output base character for now; might be updated later. */
713 output[o] = PyUnicode_READ(kind, data, i);
714 while (i1 < len) {
715 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
716 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000717 if (comb) {
718 if (comb1 == 0)
719 break;
720 if (comb >= comb1) {
721 /* Character is blocked. */
722 i1++;
723 continue;
724 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000725 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200726 l = find_nfc_index(self, nfc_last, code1);
727 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000728 is a starter, we don't need to look further.
729 Otherwise, record the combining class. */
730 if (l == -1) {
731 not_combinable:
732 if (comb1 == 0)
733 break;
734 comb = comb1;
735 i1++;
736 continue;
737 }
738 index = f*TOTAL_LAST + l;
739 index1 = comp_index[index >> COMP_SHIFT];
740 code = comp_data[(index1<<COMP_SHIFT)+
741 (index&((1<<COMP_SHIFT)-1))];
742 if (code == 0)
743 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000744
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200746 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000748 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000749 skipped[cskipped++] = i1;
750 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200751 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000752 if (f == -1)
753 break;
754 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200755 /* Output character was already written.
756 Just advance the indices. */
757 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000758 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200759 if (o == len) {
760 /* No changes. Return original string. */
761 PyMem_Free(output);
762 return result;
763 }
764 Py_DECREF(result);
765 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
766 output, o);
767 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000768 return result;
769}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000770
771/* Return 1 if the input is certainly normalized, 0 if it might not be. */
772static int
773is_normalized(PyObject *self, PyObject *input, int nfc, int k)
774{
Martin v. Löwis22970662011-09-29 13:39:38 +0200775 Py_ssize_t i, len;
776 int kind;
777 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000778 unsigned char prev_combining = 0, quickcheck_mask;
779
780 /* An older version of the database is requested, quickchecks must be
781 disabled. */
782 if (self && UCD_Check(self))
783 return 0;
784
785 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
786 as described in http://unicode.org/reports/tr15/#Annex8. */
787 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
788
Martin v. Löwis22970662011-09-29 13:39:38 +0200789 i = 0;
790 kind = PyUnicode_KIND(input);
791 data = PyUnicode_DATA(input);
792 len = PyUnicode_GET_LENGTH(input);
793 while (i < len) {
794 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
795 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000796 unsigned char combining = record->combining;
797 unsigned char quickcheck = record->normalization_quick_check;
798
799 if (quickcheck & quickcheck_mask)
800 return 0; /* this string might need normalization */
801 if (combining && prev_combining > combining)
802 return 0; /* non-canonical sort order, not normalized */
803 prev_combining = combining;
804 }
805 return 1; /* certainly normalized */
806}
807
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000808PyDoc_STRVAR(unicodedata_normalize__doc__,
809"normalize(form, unistr)\n\
810\n\
811Return the normal form 'form' for the Unicode string unistr. Valid\n\
812values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
813
Martin v. Löwis677bde22002-11-23 22:08:15 +0000814static PyObject*
815unicodedata_normalize(PyObject *self, PyObject *args)
816{
817 char *form;
818 PyObject *input;
819
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000820 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000821 &form, &PyUnicode_Type, &input))
822 return NULL;
823
Martin v. Löwis22970662011-09-29 13:39:38 +0200824 if (PyUnicode_READY(input) == -1)
825 return NULL;
826
827 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000828 /* Special case empty input strings, since resizing
829 them later would cause internal errors. */
830 Py_INCREF(input);
831 return input;
832 }
833
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000834 if (strcmp(form, "NFC") == 0) {
835 if (is_normalized(self, input, 1, 0)) {
836 Py_INCREF(input);
837 return input;
838 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000839 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000840 }
841 if (strcmp(form, "NFKC") == 0) {
842 if (is_normalized(self, input, 1, 1)) {
843 Py_INCREF(input);
844 return input;
845 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000846 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000847 }
848 if (strcmp(form, "NFD") == 0) {
849 if (is_normalized(self, input, 0, 0)) {
850 Py_INCREF(input);
851 return input;
852 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000853 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000854 }
855 if (strcmp(form, "NFKD") == 0) {
856 if (is_normalized(self, input, 0, 1)) {
857 Py_INCREF(input);
858 return input;
859 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000860 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000861 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000862 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
863 return NULL;
864}
865
Fredrik Lundh06d12682001-01-24 07:59:11 +0000866/* -------------------------------------------------------------------- */
867/* unicode character name tables */
868
869/* data file generated by Tools/unicode/makeunicodedata.py */
870#include "unicodename_db.h"
871
872/* -------------------------------------------------------------------- */
873/* database code (cut and pasted from the unidb package) */
874
/* Case-insensitive multiplicative hash over the first len bytes of s.
   After every byte, any bits that reached positions 24..31 are folded
   back into the low byte and the value is cut to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top)
            hash = (hash ^ ((top>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
889
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the leading-consonant index, column 1 by the
   vowel index and column 2 by the trailing-consonant index.  A NULL
   cell marks a row beyond the end of that column; an empty string is a
   valid fragment that contributes no letters. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
920
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000921/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000922static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000923is_unified_ideograph(Py_UCS4 code)
924{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000925 return
926 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500927 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000928 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
929 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
930 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000931}
932
/* Macros used to determine if the given codepoint is in the PUA range
 * that we are using to store aliases and named sequences.  Both ranges
 * are half-open: [start, end).  The argument is parenthesised so that
 * an expression such as `x & mask` is tested as a whole; note that it
 * is still evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
938
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000939static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300940_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
941 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000942{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300943 /* Find the name associated with the given codepoint.
944 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
945 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000946 int offset;
947 int i;
948 int word;
949 unsigned char* w;
950
Martin v. Löwisc3509122006-03-11 12:16:23 +0000951 if (code >= 0x110000)
952 return 0;
953
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300954 /* XXX should we just skip all the codepoints in the PUAs here? */
955 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
956 return 0;
957
Martin v. Löwis1a214512008-06-11 05:26:20 +0000958 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300959 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300960 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300961 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
962 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300963 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000964 if (old->category_changed == 0) {
965 /* unassigned */
966 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000967 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000968 }
969
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000970 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000971 /* Hangul syllable. */
972 int SIndex = code - SBase;
973 int L = SIndex / NCount;
974 int V = (SIndex % NCount) / TCount;
975 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000976
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000977 if (buflen < 27)
978 /* Worst case: HANGUL SYLLABLE <10chars>. */
979 return 0;
980 strcpy(buffer, "HANGUL SYLLABLE ");
981 buffer += 16;
982 strcpy(buffer, hangul_syllables[L][0]);
983 buffer += strlen(hangul_syllables[L][0]);
984 strcpy(buffer, hangul_syllables[V][1]);
985 buffer += strlen(hangul_syllables[V][1]);
986 strcpy(buffer, hangul_syllables[T][2]);
987 buffer += strlen(hangul_syllables[T][2]);
988 *buffer = '\0';
989 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000990 }
991
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000992 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000993 if (buflen < 28)
994 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
995 return 0;
996 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
997 return 1;
998 }
999
Fredrik Lundh06d12682001-01-24 07:59:11 +00001000 /* get offset into phrasebook */
1001 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1002 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1003 (code&((1<<phrasebook_shift)-1))];
1004 if (!offset)
1005 return 0;
1006
1007 i = 0;
1008
1009 for (;;) {
1010 /* get word index */
1011 word = phrasebook[offset] - phrasebook_short;
1012 if (word >= 0) {
1013 word = (word << 8) + phrasebook[offset+1];
1014 offset += 2;
1015 } else
1016 word = phrasebook[offset++];
1017 if (i) {
1018 if (i > buflen)
1019 return 0; /* buffer overflow */
1020 buffer[i++] = ' ';
1021 }
1022 /* copy word string from lexicon. the last character in the
1023 word has bit 7 set. the last word in a string ends with
1024 0x80 */
1025 w = lexicon + lexicon_offset[word];
1026 while (*w < 128) {
1027 if (i >= buflen)
1028 return 0; /* buffer overflow */
1029 buffer[i++] = *w++;
1030 }
1031 if (i >= buflen)
1032 return 0; /* buffer overflow */
1033 buffer[i++] = *w & 127;
1034 if (*w == 128)
1035 break; /* end of word */
1036 }
1037
1038 return 1;
1039}
1040
1041static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001042_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001043{
1044 /* check if code corresponds to the given name */
1045 int i;
1046 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001047 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001048 return 0;
1049 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001050 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001051 return 0;
1052 }
1053 return buffer[namelen] == '\0';
1054}
1055
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001057find_syllable(const char *str, int *len, int *pos, int count, int column)
1058{
1059 int i, len1;
1060 *len = -1;
1061 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001063 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001064 if (len1 <= *len)
1065 continue;
1066 if (strncmp(str, s, len1) == 0) {
1067 *len = len1;
1068 *pos = i;
1069 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001070 }
1071 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001072 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001073 }
1074}
1075
Fredrik Lundh06d12682001-01-24 07:59:11 +00001076static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001077_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001078{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001079 /* check if named sequences are allowed */
1080 if (!with_named_seq && IS_NAMED_SEQ(cp))
1081 return 0;
1082 /* if the codepoint is in the PUA range that we use for aliases,
1083 * convert it to obtain the right codepoint */
1084 if (IS_ALIAS(cp))
1085 *code = name_aliases[cp-aliases_start];
1086 else
1087 *code = cp;
1088 return 1;
1089}
1090
1091static int
1092_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1093 int with_named_seq)
1094{
1095 /* Return the codepoint associated with the given name.
1096 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1097 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1098 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001099 unsigned int h, v;
1100 unsigned int mask = code_size-1;
1101 unsigned int i, incr;
1102
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001103 /* Check for hangul syllables. */
1104 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001105 int len, L = -1, V = -1, T = -1;
1106 const char *pos = name + 16;
1107 find_syllable(pos, &len, &L, LCount, 0);
1108 pos += len;
1109 find_syllable(pos, &len, &V, VCount, 1);
1110 pos += len;
1111 find_syllable(pos, &len, &T, TCount, 2);
1112 pos += len;
1113 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1114 *code = SBase + (L*VCount+V)*TCount + T;
1115 return 1;
1116 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001117 /* Otherwise, it's an illegal syllable name. */
1118 return 0;
1119 }
1120
1121 /* Check for unified ideographs. */
1122 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1123 /* Four or five hexdigits must follow. */
1124 v = 0;
1125 name += 22;
1126 namelen -= 22;
1127 if (namelen != 4 && namelen != 5)
1128 return 0;
1129 while (namelen--) {
1130 v *= 16;
1131 if (*name >= '0' && *name <= '9')
1132 v += *name - '0';
1133 else if (*name >= 'A' && *name <= 'F')
1134 v += *name - 'A' + 10;
1135 else
1136 return 0;
1137 name++;
1138 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001139 if (!is_unified_ideograph(v))
1140 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001141 *code = v;
1142 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001143 }
1144
Fredrik Lundh06d12682001-01-24 07:59:11 +00001145 /* the following is the same as python's dictionary lookup, with
1146 only minor changes. see the makeunicodedata script for more
1147 details */
1148
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001149 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001150 i = (~h) & mask;
1151 v = code_hash[i];
1152 if (!v)
1153 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001154 if (_cmpname(self, v, name, namelen))
1155 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001156 incr = (h ^ (h >> 3)) & mask;
1157 if (!incr)
1158 incr = mask;
1159 for (;;) {
1160 i = (i + incr) & mask;
1161 v = code_hash[i];
1162 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001163 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001164 if (_cmpname(self, v, name, namelen))
1165 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001166 incr = incr << 1;
1167 if (incr > mask)
1168 incr = incr ^ code_poly;
1169 }
1170}
1171
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001173{
1174 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001175 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001176 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001177};
1178
1179/* -------------------------------------------------------------------- */
1180/* Python bindings */
1181
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001182PyDoc_STRVAR(unicodedata_name__doc__,
1183"name(unichr[, default])\n\
1184Returns the name assigned to the Unicode character unichr as a\n\
1185string. If no name is defined, default is returned, or, if not\n\
1186given, ValueError is raised.");
1187
Fredrik Lundh06d12682001-01-24 07:59:11 +00001188static PyObject *
1189unicodedata_name(PyObject* self, PyObject* args)
1190{
1191 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001192 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001193
1194 PyUnicodeObject* v;
1195 PyObject* defobj = NULL;
1196 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1197 return NULL;
1198
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001199 c = getuchar(v);
1200 if (c == (Py_UCS4)-1)
1201 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001202
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001203 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001204 if (defobj == NULL) {
1205 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001206 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207 }
1208 else {
1209 Py_INCREF(defobj);
1210 return defobj;
1211 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001212 }
1213
Walter Dörwald4254e762007-06-05 16:04:09 +00001214 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001215}
1216
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001217PyDoc_STRVAR(unicodedata_lookup__doc__,
1218"lookup(name)\n\
1219\n\
1220Look up character by name. If a character with the\n\
1221given name is found, return the corresponding Unicode\n\
1222character. If not found, KeyError is raised.");
1223
Fredrik Lundh06d12682001-01-24 07:59:11 +00001224static PyObject *
1225unicodedata_lookup(PyObject* self, PyObject* args)
1226{
1227 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001228
1229 char* name;
1230 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001231 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001232 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1233 return NULL;
1234
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001235 if (!_getcode(self, name, namelen, &code, 1)) {
1236 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001237 return NULL;
1238 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001239 /* check if code is in the PUA range that we use for named sequences
1240 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001241 if (IS_NAMED_SEQ(code)) {
1242 index = code-named_sequences_start;
1243 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1244 named_sequences[index].seq,
1245 named_sequences[index].seqlen);
1246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001248}
1249
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001250/* XXX Add doc strings. */
1251
1252static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001253 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1254 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1255 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1256 {"category", unicodedata_category, METH_VARARGS,
1257 unicodedata_category__doc__},
1258 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1259 unicodedata_bidirectional__doc__},
1260 {"combining", unicodedata_combining, METH_VARARGS,
1261 unicodedata_combining__doc__},
1262 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1263 unicodedata_mirrored__doc__},
1264 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1265 unicodedata_east_asian_width__doc__},
1266 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1267 unicodedata_decomposition__doc__},
1268 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1269 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1270 {"normalize", unicodedata_normalize, METH_VARARGS,
1271 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001273};
1274
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001275static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001276 /* The ob_type field must be initialized in the module init function
1277 * to be portable to Windows without using C++. */
1278 PyVarObject_HEAD_INIT(NULL, 0)
1279 "unicodedata.UCD", /*tp_name*/
1280 sizeof(PreviousDBVersion), /*tp_basicsize*/
1281 0, /*tp_itemsize*/
1282 /* methods */
1283 (destructor)PyObject_Del, /*tp_dealloc*/
1284 0, /*tp_print*/
1285 0, /*tp_getattr*/
1286 0, /*tp_setattr*/
1287 0, /*tp_reserved*/
1288 0, /*tp_repr*/
1289 0, /*tp_as_number*/
1290 0, /*tp_as_sequence*/
1291 0, /*tp_as_mapping*/
1292 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001293 0, /*tp_call*/
1294 0, /*tp_str*/
1295 PyObject_GenericGetAttr,/*tp_getattro*/
1296 0, /*tp_setattro*/
1297 0, /*tp_as_buffer*/
1298 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1299 0, /*tp_doc*/
1300 0, /*tp_traverse*/
1301 0, /*tp_clear*/
1302 0, /*tp_richcompare*/
1303 0, /*tp_weaklistoffset*/
1304 0, /*tp_iter*/
1305 0, /*tp_iternext*/
1306 unicodedata_functions, /*tp_methods*/
1307 DB_members, /*tp_members*/
1308 0, /*tp_getset*/
1309 0, /*tp_base*/
1310 0, /*tp_dict*/
1311 0, /*tp_descr_get*/
1312 0, /*tp_descr_set*/
1313 0, /*tp_dictoffset*/
1314 0, /*tp_init*/
1315 0, /*tp_alloc*/
1316 0, /*tp_new*/
1317 0, /*tp_free*/
1318 0, /*tp_is_gc*/
1319};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001320
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001321PyDoc_STRVAR(unicodedata_docstring,
1322"This module provides access to the Unicode Character Database which\n\
1323defines character properties for all Unicode characters. The data in\n\
1324this database is based on the UnicodeData.txt file version\n\
Ezio Melotti98d2c0a2011-11-10 09:36:34 +020013256.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001326\n\
1327The module uses the same names and symbols as defined by the\n\
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02001328UnicodeData File Format 6.0.0 (see\n\
1329http://www.unicode.org/reports/tr44/tr44-6.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001330
Martin v. Löwis1a214512008-06-11 05:26:20 +00001331
1332static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 PyModuleDef_HEAD_INIT,
1334 "unicodedata",
1335 unicodedata_docstring,
1336 -1,
1337 unicodedata_functions,
1338 NULL,
1339 NULL,
1340 NULL,
1341 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001342};
1343
Mark Hammond62b1ab12002-07-23 06:31:15 +00001344PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001345PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001346{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001347 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001348
Christian Heimes90aa7642007-12-19 02:45:37 +00001349 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001350
Martin v. Löwis1a214512008-06-11 05:26:20 +00001351 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001352 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001353 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001354
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001355 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001356 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001357 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001358
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001359 /* Previous versions */
1360 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1361 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001362 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001363
Fredrik Lundh06d12682001-01-24 07:59:11 +00001364 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001365 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001366 if (v != NULL)
1367 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001368 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001369}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001370
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001372Local variables:
1373c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001374indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001375End:
1376*/