blob: 9fb1191fc593489e00df82488c2bdab2f833ab6a [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
20/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000021
/* One entry of the generated property table.  _getrecord_ex() hands out
   pointers to these records; the string-valued properties are stored as
   small indexes into name tables rather than as strings. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
33
/* Per-code-point delta describing how an older database version differs
   from the current one.  As used by the lookup functions in this file:
     - category_changed == 0 is treated as "unassigned in the old version";
     - 0xFF in bidir_changed, category_changed, decimal_changed or
       mirrored_changed is treated as "same as the current database";
     - any other value replaces the current value.
   NOTE(review): numeric_changed is not read by any function visible in
   this part of the file -- confirm where (or whether) it is consumed. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
42
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000043/* data file generated by Tools/unicode/makeunicodedata.py */
44#include "unicodedata_db.h"
45
46static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000047_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000050 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051 index = 0;
52 else {
53 index = index1[(code>>SHIFT)];
54 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 }
56
57 return &_PyUnicode_Database_Records[index];
58}
59
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000060/* ------------- Previous-version API ------------------------------------- */
/* Python object wrapping an older version of the database.  It carries the
   version string plus two callbacks: one returning the change_record for a
   code point, one returning a normalization override (0 means "none", see
   its use in nfd_nfkd). */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;                           /* exposed as unidata_version */
    const change_record* (*getrecord)(Py_UCS4); /* per-code-point delta */
    Py_UCS4 (*normalization)(Py_UCS4);          /* normalization override */
} PreviousDBVersion;

/* Fetch the change_record for code point v.  self is cast to
   PreviousDBVersion* without any check, so callers must test UCD_Check(self)
   first. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
69
/* Member table for the previous-version type: exposes the name field as the
   read-only string attribute "unidata_version".  The {NULL} entry terminates
   the table. */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};
74
/* forward declaration */
static PyTypeObject UCD_Type;
/* True only when o's type is exactly UCD_Type (pointer comparison, so
   subclasses do not match). */
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078
79static PyObject*
80new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
81 Py_UCS4 (*normalization)(Py_UCS4))
82{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 PreviousDBVersion *self;
84 self = PyObject_New(PreviousDBVersion, &UCD_Type);
85 if (self == NULL)
86 return NULL;
87 self->name = name;
88 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000091}
92
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000093
94static Py_UCS4 getuchar(PyUnicodeObject *obj)
95{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020096 if (PyUnicode_READY(obj))
97 return (Py_UCS4)-1;
98 if (PyUnicode_GET_LENGTH(obj) == 1) {
99 if (PyUnicode_READY(obj))
100 return (Py_UCS4)-1;
101 return PyUnicode_READ_CHAR(obj, 0);
102 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
106}
107
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000108/* --- Module API --------------------------------------------------------- */
109
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000110PyDoc_STRVAR(unicodedata_decimal__doc__,
111"decimal(unichr[, default])\n\
112\n\
113Returns the decimal value assigned to the Unicode character unichr\n\
114as integer. If no such value is defined, default is returned, or, if\n\
115not given, ValueError is raised.");
116
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000117static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000118unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119{
120 PyUnicodeObject *v;
121 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000122 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000123 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000124 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125
Fredrik Lundh06d12682001-01-24 07:59:11 +0000126 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000127 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000128 c = getuchar(v);
129 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000130 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000131
Martin v. Löwis1a214512008-06-11 05:26:20 +0000132 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000133 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134 if (old->category_changed == 0) {
135 /* unassigned */
136 have_old = 1;
137 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000138 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000139 else if (old->decimal_changed != 0xFF) {
140 have_old = 1;
141 rc = old->decimal_changed;
142 }
143 }
144
145 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000146 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 if (defobj == NULL) {
149 PyErr_SetString(PyExc_ValueError,
150 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000151 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 }
153 else {
154 Py_INCREF(defobj);
155 return defobj;
156 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000158 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159}
160
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000161PyDoc_STRVAR(unicodedata_digit__doc__,
162"digit(unichr[, default])\n\
163\n\
164Returns the digit value assigned to the Unicode character unichr as\n\
165integer. If no such value is defined, default is returned, or, if\n\
166not given, ValueError is raised.");
167
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000169unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170{
171 PyUnicodeObject *v;
172 PyObject *defobj = NULL;
173 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000174 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175
Fredrik Lundh06d12682001-01-24 07:59:11 +0000176 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000177 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000178 c = getuchar(v);
179 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000181 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000183 if (defobj == NULL) {
184 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 }
187 else {
188 Py_INCREF(defobj);
189 return defobj;
190 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000192 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193}
194
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000195PyDoc_STRVAR(unicodedata_numeric__doc__,
196"numeric(unichr[, default])\n\
197\n\
198Returns the numeric value assigned to the Unicode character unichr\n\
199as float. If no such value is defined, default is returned, or, if\n\
200not given, ValueError is raised.");
201
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000203unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204{
205 PyUnicodeObject *v;
206 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000207 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000209 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210
Fredrik Lundh06d12682001-01-24 07:59:11 +0000211 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000212 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000213 c = getuchar(v);
214 if (c == (Py_UCS4)-1)
215 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000216
Martin v. Löwis1a214512008-06-11 05:26:20 +0000217 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000218 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000219 if (old->category_changed == 0) {
220 /* unassigned */
221 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000222 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000224 else if (old->decimal_changed != 0xFF) {
225 have_old = 1;
226 rc = old->decimal_changed;
227 }
228 }
229
230 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000231 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000233 if (defobj == NULL) {
234 PyErr_SetString(PyExc_ValueError, "not a numeric character");
235 return NULL;
236 }
237 else {
238 Py_INCREF(defobj);
239 return defobj;
240 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000241 }
242 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243}
244
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000245PyDoc_STRVAR(unicodedata_category__doc__,
246"category(unichr)\n\
247\n\
248Returns the general category assigned to the Unicode character\n\
249unichr as string.");
250
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000251static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000252unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253{
254 PyUnicodeObject *v;
255 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000256 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000257
258 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 &PyUnicode_Type, &v))
260 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000261 c = getuchar(v);
262 if (c == (Py_UCS4)-1)
263 return NULL;
264 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000265 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000266 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000267 if (old->category_changed != 0xFF)
268 index = old->category_changed;
269 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000270 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271}
272
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000273PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274"bidirectional(unichr)\n\
275\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200276Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000277unichr as string. If no such value is defined, an empty string is\n\
278returned.");
279
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000281unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282{
283 PyUnicodeObject *v;
284 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000285 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286
287 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 &PyUnicode_Type, &v))
289 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000290 c = getuchar(v);
291 if (c == (Py_UCS4)-1)
292 return NULL;
293 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000294 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000295 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000296 if (old->category_changed == 0)
297 index = 0; /* unassigned */
298 else if (old->bidir_changed != 0xFF)
299 index = old->bidir_changed;
300 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000301 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000302}
303
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000304PyDoc_STRVAR(unicodedata_combining__doc__,
305"combining(unichr)\n\
306\n\
307Returns the canonical combining class assigned to the Unicode\n\
308character unichr as integer. Returns 0 if no combining class is\n\
309defined.");
310
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000311static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000312unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313{
314 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000315 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000316 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317
318 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000319 &PyUnicode_Type, &v))
320 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000321 c = getuchar(v);
322 if (c == (Py_UCS4)-1)
323 return NULL;
324 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000325 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000326 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 if (old->category_changed == 0)
328 index = 0; /* unassigned */
329 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000330 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000331}
332
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000333PyDoc_STRVAR(unicodedata_mirrored__doc__,
334"mirrored(unichr)\n\
335\n\
336Returns the mirrored property assigned to the Unicode character\n\
337unichr as integer. Returns 1 if the character has been identified as\n\
338a \"mirrored\" character in bidirectional text, 0 otherwise.");
339
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000340static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000341unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342{
343 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000345 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346
347 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 &PyUnicode_Type, &v))
349 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000350 c = getuchar(v);
351 if (c == (Py_UCS4)-1)
352 return NULL;
353 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000354 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000355 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000358 else if (old->mirrored_changed != 0xFF)
359 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000360 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000361 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000362}
363
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000364PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365"east_asian_width(unichr)\n\
366\n\
367Returns the east asian width assigned to the Unicode character\n\
368unichr as string.");
369
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000370static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371unicodedata_east_asian_width(PyObject *self, PyObject *args)
372{
373 PyUnicodeObject *v;
374 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000375 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000376
377 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 &PyUnicode_Type, &v))
379 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000380 c = getuchar(v);
381 if (c == (Py_UCS4)-1)
382 return NULL;
383 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000384 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000385 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000386 if (old->category_changed == 0)
387 index = 0; /* unassigned */
388 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000389 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000390}
391
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000392PyDoc_STRVAR(unicodedata_decomposition__doc__,
393"decomposition(unichr)\n\
394\n\
395Returns the character decomposition mapping assigned to the Unicode\n\
396character unichr as string. An empty string is returned in case no\n\
397such mapping is defined.");
398
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000399static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000400unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000401{
402 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000403 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000404 int code, index, count;
405 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000406 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000407 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408
409 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000410 &PyUnicode_Type, &v))
411 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000412 c = getuchar(v);
413 if (c == (Py_UCS4)-1)
414 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000416 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417
Martin v. Löwis1a214512008-06-11 05:26:20 +0000418 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000419 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000420 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000421 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000422 }
423
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000424 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000425 index = 0;
426 else {
427 index = decomp_index1[(code>>DECOMP_SHIFT)];
428 index = decomp_index2[(index<<DECOMP_SHIFT)+
429 (code&((1<<DECOMP_SHIFT)-1))];
430 }
431
Tim Peters69b83b12001-11-30 07:23:05 +0000432 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000433 is prefix code (from*/
434 count = decomp_data[index] >> 8;
435
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
438
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200443 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000444
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000446 i = strlen(decomp_prefix[prefix_index]);
447 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448
449 while (count-- > 0) {
450 if (i)
451 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000452 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455 i += strlen(decomp + i);
456 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000457 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000458}
459
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000460static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000461get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000462{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000463 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000466 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000467 /* unassigned in old version */
468 *index = 0;
469 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000470 else {
471 *index = decomp_index1[(code>>DECOMP_SHIFT)];
472 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
473 (code&((1<<DECOMP_SHIFT)-1))];
474 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475
Martin v. Löwis677bde22002-11-23 22:08:15 +0000476 /* high byte is number of hex bytes (usually one or two), low byte
477 is prefix code (from*/
478 *count = decomp_data[*index] >> 8;
479 *prefix = decomp_data[*index] & 255;
480
481 (*index)++;
482}
483
/* Constants for the arithmetic Hangul syllable decomposition/composition
   performed in nfd_nfkd() and nfc_nfkc().  A syllable in
   [SBase, SBase+SCount) is split as
       L = LBase + SIndex / NCount
       V = VBase + (SIndex % NCount) / TCount
       T = TBase + SIndex % TCount
   where SIndex = code - SBase, and T == TBase means "no third part". */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
493
/* Decompose input and return a new string with combining marks put into
   canonical order.

   self   may be a previous-version object (then its change records and
          normalization overrides are honoured) or anything else.
   input  string to decompose; its kind/data are read directly, so it is
          expected to be ready already.
   k      zero: only decompositions without a prefix are applied;
          non-zero: prefixed (compatibility) decompositions are applied too.

   Returns a new reference, or NULL with MemoryError set if a buffer
   allocation fails (or NULL if building the result string fails). */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    /* NOTE(review): pushes onto this stack are not bounds-checked; the code
       relies on no decomposition chain needing more than 20 slots --
       confirm against the generated data. */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        /* Skip the extra room rather than overflow Py_ssize_t. */
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    /* osize is the allocated length of output; space is how many of those
       slots are still unused. */
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        /* Seed the work stack with the next input character, then expand
           it until the stack is empty again. */
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    /* The old buffer is still ours on failure; free it. */
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                /* T == TBase means the syllable has no third part. */
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    /* Re-process the replacement instead of code. */
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically. */
    /* Combining classes come from _getrecord_ex(), i.e. the current
       database, whether or not self is a previous-version object.
       NOTE(review): index 0 is read even when result is empty; presumably
       safe because of the string terminator or because callers filter
       empty input -- confirm. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        /* Class 0 on either side, or already non-decreasing: in order. */
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        /* Move the character at i leftwards by adjacent swaps until the
           character before it has class 0 or a class <= cur. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        /* Position i now holds a different character; reload its class. */
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
626
627static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200628find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000629{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200630 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000631 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200632 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000633 if (code < start)
634 return -1;
635 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200636 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000637 return nfc[index].index + delta;
638 }
639 }
640 return -1;
641}
642
643static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000644nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000645{
646 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200647 int kind;
648 void *data;
649 Py_UCS4 *output;
650 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200652 Py_UCS4 code;
653 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 int cskipped = 0;
655
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000656 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000657 if (!result)
658 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200659 /* result will be "ready". */
660 kind = PyUnicode_KIND(result);
661 data = PyUnicode_DATA(result);
662 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000663
Martin v. Löwis22970662011-09-29 13:39:38 +0200664 /* We allocate a buffer for the output.
665 If we find that we made no changes, we still return
666 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500667 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200668 if (!output) {
669 PyErr_NoMemory();
670 Py_DECREF(result);
671 return 0;
672 }
673 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674
Martin v. Löwis677bde22002-11-23 22:08:15 +0000675 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200676 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000677 for (index = 0; index < cskipped; index++) {
678 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000680 Remove from list. */
681 skipped[index] = skipped[cskipped-1];
682 cskipped--;
683 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000684 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000685 }
686 }
687 /* Hangul Composition. We don't need to check for <LV,T>
688 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200689 code = PyUnicode_READ(kind, data, i);
690 if (LBase <= code && code < (LBase+LCount) &&
691 i + 1 < len &&
692 VBase <= PyUnicode_READ(kind, data, i+1) &&
693 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200695 LIndex = code - LBase;
696 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 code = SBase + (LIndex*VCount+VIndex)*TCount;
698 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200699 if (i < len &&
700 TBase <= PyUnicode_READ(kind, data, i) &&
701 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
702 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000703 i++;
704 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200705 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000706 continue;
707 }
708
Martin v. Löwis22970662011-09-29 13:39:38 +0200709 /* code is still input[i] here */
710 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000711 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200712 output[o++] = code;
713 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000714 continue;
715 }
716 /* Find next unblocked character. */
717 i1 = i+1;
718 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200719 /* output base character for now; might be updated later. */
720 output[o] = PyUnicode_READ(kind, data, i);
721 while (i1 < len) {
722 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
723 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000724 if (comb) {
725 if (comb1 == 0)
726 break;
727 if (comb >= comb1) {
728 /* Character is blocked. */
729 i1++;
730 continue;
731 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000732 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200733 l = find_nfc_index(self, nfc_last, code1);
734 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000735 is a starter, we don't need to look further.
736 Otherwise, record the combining class. */
737 if (l == -1) {
738 not_combinable:
739 if (comb1 == 0)
740 break;
741 comb = comb1;
742 i1++;
743 continue;
744 }
745 index = f*TOTAL_LAST + l;
746 index1 = comp_index[index >> COMP_SHIFT];
747 code = comp_data[(index1<<COMP_SHIFT)+
748 (index&((1<<COMP_SHIFT)-1))];
749 if (code == 0)
750 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000751
Martin v. Löwis677bde22002-11-23 22:08:15 +0000752 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200753 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000755 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000756 skipped[cskipped++] = i1;
757 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200758 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000759 if (f == -1)
760 break;
761 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200762 /* Output character was already written.
763 Just advance the indices. */
764 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000765 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200766 if (o == len) {
767 /* No changes. Return original string. */
768 PyMem_Free(output);
769 return result;
770 }
771 Py_DECREF(result);
772 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
773 output, o);
774 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000775 return result;
776}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000777
778/* Return 1 if the input is certainly normalized, 0 if it might not be. */
779static int
780is_normalized(PyObject *self, PyObject *input, int nfc, int k)
781{
Martin v. Löwis22970662011-09-29 13:39:38 +0200782 Py_ssize_t i, len;
783 int kind;
784 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000785 unsigned char prev_combining = 0, quickcheck_mask;
786
787 /* An older version of the database is requested, quickchecks must be
788 disabled. */
789 if (self && UCD_Check(self))
790 return 0;
791
792 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
793 as described in http://unicode.org/reports/tr15/#Annex8. */
794 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
795
Martin v. Löwis22970662011-09-29 13:39:38 +0200796 i = 0;
797 kind = PyUnicode_KIND(input);
798 data = PyUnicode_DATA(input);
799 len = PyUnicode_GET_LENGTH(input);
800 while (i < len) {
801 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
802 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000803 unsigned char combining = record->combining;
804 unsigned char quickcheck = record->normalization_quick_check;
805
806 if (quickcheck & quickcheck_mask)
807 return 0; /* this string might need normalization */
808 if (combining && prev_combining > combining)
809 return 0; /* non-canonical sort order, not normalized */
810 prev_combining = combining;
811 }
812 return 1; /* certainly normalized */
813}
814
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000815PyDoc_STRVAR(unicodedata_normalize__doc__,
816"normalize(form, unistr)\n\
817\n\
818Return the normal form 'form' for the Unicode string unistr. Valid\n\
819values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
820
Martin v. Löwis677bde22002-11-23 22:08:15 +0000821static PyObject*
822unicodedata_normalize(PyObject *self, PyObject *args)
823{
824 char *form;
825 PyObject *input;
826
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000827 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000828 &form, &PyUnicode_Type, &input))
829 return NULL;
830
Martin v. Löwis22970662011-09-29 13:39:38 +0200831 if (PyUnicode_READY(input) == -1)
832 return NULL;
833
834 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000835 /* Special case empty input strings, since resizing
836 them later would cause internal errors. */
837 Py_INCREF(input);
838 return input;
839 }
840
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000841 if (strcmp(form, "NFC") == 0) {
842 if (is_normalized(self, input, 1, 0)) {
843 Py_INCREF(input);
844 return input;
845 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000846 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000847 }
848 if (strcmp(form, "NFKC") == 0) {
849 if (is_normalized(self, input, 1, 1)) {
850 Py_INCREF(input);
851 return input;
852 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000853 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000854 }
855 if (strcmp(form, "NFD") == 0) {
856 if (is_normalized(self, input, 0, 0)) {
857 Py_INCREF(input);
858 return input;
859 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000860 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000861 }
862 if (strcmp(form, "NFKD") == 0) {
863 if (is_normalized(self, input, 0, 1)) {
864 Py_INCREF(input);
865 return input;
866 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000867 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000868 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000869 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
870 return NULL;
871}
872
Fredrik Lundh06d12682001-01-24 07:59:11 +0000873/* -------------------------------------------------------------------- */
874/* unicode character name tables */
875
876/* data file generated by Tools/unicode/makeunicodedata.py */
877#include "unicodename_db.h"
878
879/* -------------------------------------------------------------------- */
880/* database code (cut and pasted from the unidb package) */
881
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and mixed in as hash = hash*scale + byte;
   whenever bits 24..31 become set they are folded back into the low byte
   so that the running value stays within 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        pos++;

        high = hash & 0xff000000;
        if (high != 0) {
            hash ^= (high >> 24) & 0xff;
            hash &= 0x00ffffff;
        }
    }
    return hash;
}
896
/* Short names of the Hangul jamo used to build and parse
   "HANGUL SYLLABLE ..." names.  Row i holds, per column:
     [0] the leading consonant with index i,
     [1] the vowel with index i,
     [2] the trailing consonant with index i ("" = no trailing consonant).
   The columns have different lengths; unused cells are NULL, so callers
   must bound the row index by the count that belongs to the column. */
static char *hangul_syllables[][3] = {
    /*  leading  vowel   trailing */
    {   "G",     "A",    ""   },
    {   "GG",    "AE",   "G"  },
    {   "N",     "YA",   "GG" },
    {   "D",     "YAE",  "GS" },
    {   "DD",    "EO",   "N"  },
    {   "R",     "E",    "NJ" },
    {   "M",     "YEO",  "NH" },
    {   "B",     "YE",   "D"  },
    {   "BB",    "O",    "L"  },
    {   "S",     "WA",   "LG" },
    {   "SS",    "WAE",  "LM" },
    {   "",      "OE",   "LB" },
    {   "J",     "YO",   "LS" },
    {   "JJ",    "U",    "LT" },
    {   "C",     "WEO",  "LP" },
    {   "K",     "WE",   "LH" },
    {   "T",     "WI",   "M"  },
    {   "P",     "YU",   "B"  },
    {   "H",     "EU",   "BS" },
    {   NULL,    "YI",   "S"  },
    {   NULL,    "I",    "SS" },
    {   NULL,    NULL,   "NG" },
    {   NULL,    NULL,   "J"  },
    {   NULL,    NULL,   "C"  },
    {   NULL,    NULL,   "K"  },
    {   NULL,    NULL,   "T"  },
    {   NULL,    NULL,   "P"  },
    {   NULL,    NULL,   "H"  }
};
927
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000928/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000929static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000930is_unified_ideograph(Py_UCS4 code)
931{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000932 return
933 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500934 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000935 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
936 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
937 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000938}
939
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 * Both ranges are half-open: [start, end).
 * The argument is parenthesized so that expressions built from operators
 * with lower precedence than the comparisons (e.g. `x & 0xFF`) are tested
 * as a whole.  Note that cp is still evaluated twice, so it must not have
 * side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
945
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000946static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300947_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
948 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000949{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300950 /* Find the name associated with the given codepoint.
951 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
952 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000953 int offset;
954 int i;
955 int word;
956 unsigned char* w;
957
Martin v. Löwisc3509122006-03-11 12:16:23 +0000958 if (code >= 0x110000)
959 return 0;
960
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300961 /* XXX should we just skip all the codepoints in the PUAs here? */
962 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
963 return 0;
964
Martin v. Löwis1a214512008-06-11 05:26:20 +0000965 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300966 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300967 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300968 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
969 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300970 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000971 if (old->category_changed == 0) {
972 /* unassigned */
973 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000974 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000975 }
976
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000977 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000978 /* Hangul syllable. */
979 int SIndex = code - SBase;
980 int L = SIndex / NCount;
981 int V = (SIndex % NCount) / TCount;
982 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000983
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000984 if (buflen < 27)
985 /* Worst case: HANGUL SYLLABLE <10chars>. */
986 return 0;
987 strcpy(buffer, "HANGUL SYLLABLE ");
988 buffer += 16;
989 strcpy(buffer, hangul_syllables[L][0]);
990 buffer += strlen(hangul_syllables[L][0]);
991 strcpy(buffer, hangul_syllables[V][1]);
992 buffer += strlen(hangul_syllables[V][1]);
993 strcpy(buffer, hangul_syllables[T][2]);
994 buffer += strlen(hangul_syllables[T][2]);
995 *buffer = '\0';
996 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000997 }
998
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000999 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001000 if (buflen < 28)
1001 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1002 return 0;
1003 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1004 return 1;
1005 }
1006
Fredrik Lundh06d12682001-01-24 07:59:11 +00001007 /* get offset into phrasebook */
1008 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1009 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1010 (code&((1<<phrasebook_shift)-1))];
1011 if (!offset)
1012 return 0;
1013
1014 i = 0;
1015
1016 for (;;) {
1017 /* get word index */
1018 word = phrasebook[offset] - phrasebook_short;
1019 if (word >= 0) {
1020 word = (word << 8) + phrasebook[offset+1];
1021 offset += 2;
1022 } else
1023 word = phrasebook[offset++];
1024 if (i) {
1025 if (i > buflen)
1026 return 0; /* buffer overflow */
1027 buffer[i++] = ' ';
1028 }
1029 /* copy word string from lexicon. the last character in the
1030 word has bit 7 set. the last word in a string ends with
1031 0x80 */
1032 w = lexicon + lexicon_offset[word];
1033 while (*w < 128) {
1034 if (i >= buflen)
1035 return 0; /* buffer overflow */
1036 buffer[i++] = *w++;
1037 }
1038 if (i >= buflen)
1039 return 0; /* buffer overflow */
1040 buffer[i++] = *w & 127;
1041 if (*w == 128)
1042 break; /* end of word */
1043 }
1044
1045 return 1;
1046}
1047
1048static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001049_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001050{
1051 /* check if code corresponds to the given name */
1052 int i;
1053 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001054 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001055 return 0;
1056 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001057 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001058 return 0;
1059 }
1060 return buffer[namelen] == '\0';
1061}
1062
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001064find_syllable(const char *str, int *len, int *pos, int count, int column)
1065{
1066 int i, len1;
1067 *len = -1;
1068 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001070 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 if (len1 <= *len)
1072 continue;
1073 if (strncmp(str, s, len1) == 0) {
1074 *len = len1;
1075 *pos = i;
1076 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001077 }
1078 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001079 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001080 }
1081}
1082
Fredrik Lundh06d12682001-01-24 07:59:11 +00001083static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001084_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001085{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001086 /* check if named sequences are allowed */
1087 if (!with_named_seq && IS_NAMED_SEQ(cp))
1088 return 0;
1089 /* if the codepoint is in the PUA range that we use for aliases,
1090 * convert it to obtain the right codepoint */
1091 if (IS_ALIAS(cp))
1092 *code = name_aliases[cp-aliases_start];
1093 else
1094 *code = cp;
1095 return 1;
1096}
1097
1098static int
1099_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1100 int with_named_seq)
1101{
1102 /* Return the codepoint associated with the given name.
1103 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1104 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1105 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001106 unsigned int h, v;
1107 unsigned int mask = code_size-1;
1108 unsigned int i, incr;
1109
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001110 /* Check for hangul syllables. */
1111 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001112 int len, L = -1, V = -1, T = -1;
1113 const char *pos = name + 16;
1114 find_syllable(pos, &len, &L, LCount, 0);
1115 pos += len;
1116 find_syllable(pos, &len, &V, VCount, 1);
1117 pos += len;
1118 find_syllable(pos, &len, &T, TCount, 2);
1119 pos += len;
1120 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1121 *code = SBase + (L*VCount+V)*TCount + T;
1122 return 1;
1123 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001124 /* Otherwise, it's an illegal syllable name. */
1125 return 0;
1126 }
1127
1128 /* Check for unified ideographs. */
1129 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1130 /* Four or five hexdigits must follow. */
1131 v = 0;
1132 name += 22;
1133 namelen -= 22;
1134 if (namelen != 4 && namelen != 5)
1135 return 0;
1136 while (namelen--) {
1137 v *= 16;
1138 if (*name >= '0' && *name <= '9')
1139 v += *name - '0';
1140 else if (*name >= 'A' && *name <= 'F')
1141 v += *name - 'A' + 10;
1142 else
1143 return 0;
1144 name++;
1145 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001146 if (!is_unified_ideograph(v))
1147 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001148 *code = v;
1149 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001150 }
1151
Fredrik Lundh06d12682001-01-24 07:59:11 +00001152 /* the following is the same as python's dictionary lookup, with
1153 only minor changes. see the makeunicodedata script for more
1154 details */
1155
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001156 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001157 i = (~h) & mask;
1158 v = code_hash[i];
1159 if (!v)
1160 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001161 if (_cmpname(self, v, name, namelen))
1162 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001163 incr = (h ^ (h >> 3)) & mask;
1164 if (!incr)
1165 incr = mask;
1166 for (;;) {
1167 i = (i + incr) & mask;
1168 v = code_hash[i];
1169 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001170 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001171 if (_cmpname(self, v, name, namelen))
1172 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001173 incr = incr << 1;
1174 if (incr > mask)
1175 incr = incr ^ code_poly;
1176 }
1177}
1178
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001180{
1181 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001182 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001183 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001184};
1185
1186/* -------------------------------------------------------------------- */
1187/* Python bindings */
1188
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001189PyDoc_STRVAR(unicodedata_name__doc__,
1190"name(unichr[, default])\n\
1191Returns the name assigned to the Unicode character unichr as a\n\
1192string. If no name is defined, default is returned, or, if not\n\
1193given, ValueError is raised.");
1194
Fredrik Lundh06d12682001-01-24 07:59:11 +00001195static PyObject *
1196unicodedata_name(PyObject* self, PyObject* args)
1197{
1198 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001199 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001200
1201 PyUnicodeObject* v;
1202 PyObject* defobj = NULL;
1203 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1204 return NULL;
1205
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001206 c = getuchar(v);
1207 if (c == (Py_UCS4)-1)
1208 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001209
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001210 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001211 if (defobj == NULL) {
1212 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001213 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 }
1215 else {
1216 Py_INCREF(defobj);
1217 return defobj;
1218 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001219 }
1220
Walter Dörwald4254e762007-06-05 16:04:09 +00001221 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001222}
1223
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001224PyDoc_STRVAR(unicodedata_lookup__doc__,
1225"lookup(name)\n\
1226\n\
1227Look up character by name. If a character with the\n\
1228given name is found, return the corresponding Unicode\n\
1229character. If not found, KeyError is raised.");
1230
Fredrik Lundh06d12682001-01-24 07:59:11 +00001231static PyObject *
1232unicodedata_lookup(PyObject* self, PyObject* args)
1233{
1234 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001235
1236 char* name;
1237 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001238 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001239 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1240 return NULL;
1241
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001242 if (!_getcode(self, name, namelen, &code, 1)) {
1243 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001244 return NULL;
1245 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001246 /* check if code is in the PUA range that we use for named sequences
1247 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001248 if (IS_NAMED_SEQ(code)) {
1249 index = code-named_sequences_start;
1250 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1251 named_sequences[index].seq,
1252 named_sequences[index].seqlen);
1253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001255}
1256
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001257/* XXX Add doc strings. */
1258
1259static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001260 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1261 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1262 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1263 {"category", unicodedata_category, METH_VARARGS,
1264 unicodedata_category__doc__},
1265 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1266 unicodedata_bidirectional__doc__},
1267 {"combining", unicodedata_combining, METH_VARARGS,
1268 unicodedata_combining__doc__},
1269 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1270 unicodedata_mirrored__doc__},
1271 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1272 unicodedata_east_asian_width__doc__},
1273 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1274 unicodedata_decomposition__doc__},
1275 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1276 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1277 {"normalize", unicodedata_normalize, METH_VARARGS,
1278 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001280};
1281
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001282static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283 /* The ob_type field must be initialized in the module init function
1284 * to be portable to Windows without using C++. */
1285 PyVarObject_HEAD_INIT(NULL, 0)
1286 "unicodedata.UCD", /*tp_name*/
1287 sizeof(PreviousDBVersion), /*tp_basicsize*/
1288 0, /*tp_itemsize*/
1289 /* methods */
1290 (destructor)PyObject_Del, /*tp_dealloc*/
1291 0, /*tp_print*/
1292 0, /*tp_getattr*/
1293 0, /*tp_setattr*/
1294 0, /*tp_reserved*/
1295 0, /*tp_repr*/
1296 0, /*tp_as_number*/
1297 0, /*tp_as_sequence*/
1298 0, /*tp_as_mapping*/
1299 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001300 0, /*tp_call*/
1301 0, /*tp_str*/
1302 PyObject_GenericGetAttr,/*tp_getattro*/
1303 0, /*tp_setattro*/
1304 0, /*tp_as_buffer*/
1305 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1306 0, /*tp_doc*/
1307 0, /*tp_traverse*/
1308 0, /*tp_clear*/
1309 0, /*tp_richcompare*/
1310 0, /*tp_weaklistoffset*/
1311 0, /*tp_iter*/
1312 0, /*tp_iternext*/
1313 unicodedata_functions, /*tp_methods*/
1314 DB_members, /*tp_members*/
1315 0, /*tp_getset*/
1316 0, /*tp_base*/
1317 0, /*tp_dict*/
1318 0, /*tp_descr_get*/
1319 0, /*tp_descr_set*/
1320 0, /*tp_dictoffset*/
1321 0, /*tp_init*/
1322 0, /*tp_alloc*/
1323 0, /*tp_new*/
1324 0, /*tp_free*/
1325 0, /*tp_is_gc*/
1326};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001327
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001328PyDoc_STRVAR(unicodedata_docstring,
1329"This module provides access to the Unicode Character Database which\n\
1330defines character properties for all Unicode characters. The data in\n\
1331this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001332" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001333\n\
1334The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001335UnicodeData File Format " UNIDATA_VERSION ".");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001336
Martin v. Löwis1a214512008-06-11 05:26:20 +00001337
1338static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001339 PyModuleDef_HEAD_INIT,
1340 "unicodedata",
1341 unicodedata_docstring,
1342 -1,
1343 unicodedata_functions,
1344 NULL,
1345 NULL,
1346 NULL,
1347 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001348};
1349
Mark Hammond62b1ab12002-07-23 06:31:15 +00001350PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001351PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001352{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001353 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001354
Christian Heimes90aa7642007-12-19 02:45:37 +00001355 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001356
Martin v. Löwis1a214512008-06-11 05:26:20 +00001357 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001358 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001359 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001360
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001361 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001362 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001363 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001364
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001365 /* Previous versions */
1366 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1367 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001368 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001369
Fredrik Lundh06d12682001-01-24 07:59:11 +00001370 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001371 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001372 if (v != NULL)
1373 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001374 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001375}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001376
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001377/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001378Local variables:
1379c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001380indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001381End:
1382*/