blob: 3979f65738df9f471f259672f3506836cfe6c44f [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Larry Hastings61272b72014-01-07 12:41:53 -080022/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080023module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080024class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080025[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080026/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080027
Fredrik Lundh06d12682001-01-24 07:59:11 +000028/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000029
/* One row of the current-version property table.  Rows are looked up by
   code point through _getrecord_ex(); several fields are indexes into
   name tables rather than the property values themselves. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
41
/* Differences of one code point between the current database and a
   previous database version.

   The field order must stay the same as in merge_old_version.

   As used by the accessors in this file: for the unsigned char fields
   other than category_changed, 0xFF means "no change recorded" and any
   other value replaces the current one; category_changed == 0 marks the
   code point as unassigned in the previous version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
50
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051/* data file generated by Tools/unicode/makeunicodedata.py */
52#include "unicodedata_db.h"
53
54static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000055_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000056{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000058 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059 index = 0;
60 else {
61 index = index1[(code>>SHIFT)];
62 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
63 }
64
65 return &_PyUnicode_Database_Records[index];
66}
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068/* ------------- Previous-version API ------------------------------------- */
69typedef struct previous_version {
70 PyObject_HEAD
71 const char *name;
72 const change_record* (*getrecord)(Py_UCS4);
73 Py_UCS4 (*normalization)(Py_UCS4);
74} PreviousDBVersion;
75
/* Fetch the change record for code point v from the previous-version
   object self.  self is cast to PreviousDBVersion*; the argument is
   parenthesized so the cast applies to the whole argument expression. */
#define get_old_record(self, v) ((((PreviousDBVersion*)(self))->getrecord)(v))
77
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000079 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080 {NULL}
81};
82
Thomas Wouters89f507f2006-12-13 04:49:30 +000083/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000084static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000085#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000086
87static PyObject*
88new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
89 Py_UCS4 (*normalization)(Py_UCS4))
90{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000091 PreviousDBVersion *self;
92 self = PyObject_New(PreviousDBVersion, &UCD_Type);
93 if (self == NULL)
94 return NULL;
95 self->name = name;
96 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000097 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000098 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000099}
100
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000101
102static Py_UCS4 getuchar(PyUnicodeObject *obj)
103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200104 if (PyUnicode_READY(obj))
105 return (Py_UCS4)-1;
106 if (PyUnicode_GET_LENGTH(obj) == 1) {
107 if (PyUnicode_READY(obj))
108 return (Py_UCS4)-1;
109 return PyUnicode_READ_CHAR(obj, 0);
110 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000111 PyErr_SetString(PyExc_TypeError,
112 "need a single Unicode character as parameter");
113 return (Py_UCS4)-1;
114}
115
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000116/* --- Module API --------------------------------------------------------- */
117
Larry Hastings61272b72014-01-07 12:41:53 -0800118/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -0800119
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800120unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700121
Larry Hastings77561cc2014-01-07 12:13:13 -0800122 unichr: object(type='PyUnicodeObject *', subclass_of='&PyUnicode_Type')
Larry Hastings31826802013-10-19 00:09:25 -0700123 default: object=NULL
124 /
125
126Converts a Unicode character into its equivalent decimal value.
127
128Returns the decimal value assigned to the Unicode character unichr
129as integer. If no such value is defined, default is returned, or, if
130not given, ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800131[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700132
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800133PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -0800134"decimal($self, unichr, default=None, /)\n"
135"--\n"
136"\n"
Larry Hastings31826802013-10-19 00:09:25 -0700137"Converts a Unicode character into its equivalent decimal value.\n"
138"\n"
Larry Hastings31826802013-10-19 00:09:25 -0700139"Returns the decimal value assigned to the Unicode character unichr\n"
140"as integer. If no such value is defined, default is returned, or, if\n"
141"not given, ValueError is raised.");
142
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800143#define UNICODEDATA_UCD_DECIMAL_METHODDEF \
144 {"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__},
Larry Hastings31826802013-10-19 00:09:25 -0700145
146static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800147unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value);
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000148
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000149static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800150unicodedata_UCD_decimal(PreviousDBVersion *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000151{
Larry Hastings31826802013-10-19 00:09:25 -0700152 PyObject *return_value = NULL;
Larry Hastings77561cc2014-01-07 12:13:13 -0800153 PyUnicodeObject *unichr;
Larry Hastings31826802013-10-19 00:09:25 -0700154 PyObject *default_value = NULL;
155
156 if (!PyArg_ParseTuple(args,
157 "O!|O:decimal",
158 &PyUnicode_Type, &unichr, &default_value))
159 goto exit;
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800160 return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value);
Larry Hastings31826802013-10-19 00:09:25 -0700161
162exit:
163 return return_value;
164}
165
166static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800167unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value)
Larry Hastings2623c8c2014-02-08 22:15:29 -0800168/*[clinic end generated code: output=8689669896d293df input=c25c9d2b4de076b1]*/
Larry Hastings31826802013-10-19 00:09:25 -0700169{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000170 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000172 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000173
Larry Hastingsc2047262014-01-25 20:43:29 -0800174 c = getuchar(unichr);
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000175 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000177
Martin v. Löwis1a214512008-06-11 05:26:20 +0000178 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000179 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000180 if (old->category_changed == 0) {
181 /* unassigned */
182 have_old = 1;
183 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000184 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000185 else if (old->decimal_changed != 0xFF) {
186 have_old = 1;
187 rc = old->decimal_changed;
188 }
189 }
190
191 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000192 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700194 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 PyErr_SetString(PyExc_ValueError,
196 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000197 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 }
199 else {
Larry Hastings31826802013-10-19 00:09:25 -0700200 Py_INCREF(default_value);
201 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000202 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000204 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205}
206
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000207PyDoc_STRVAR(unicodedata_digit__doc__,
208"digit(unichr[, default])\n\
209\n\
210Returns the digit value assigned to the Unicode character unichr as\n\
211integer. If no such value is defined, default is returned, or, if\n\
212not given, ValueError is raised.");
213
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000214static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000215unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000216{
217 PyUnicodeObject *v;
218 PyObject *defobj = NULL;
219 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000220 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221
Fredrik Lundh06d12682001-01-24 07:59:11 +0000222 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000223 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000224 c = getuchar(v);
225 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000226 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000227 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000228 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 if (defobj == NULL) {
230 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000231 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 }
233 else {
234 Py_INCREF(defobj);
235 return defobj;
236 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000237 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000238 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000239}
240
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000241PyDoc_STRVAR(unicodedata_numeric__doc__,
242"numeric(unichr[, default])\n\
243\n\
244Returns the numeric value assigned to the Unicode character unichr\n\
245as float. If no such value is defined, default is returned, or, if\n\
246not given, ValueError is raised.");
247
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000248static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000249unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250{
251 PyUnicodeObject *v;
252 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000253 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000255 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256
Fredrik Lundh06d12682001-01-24 07:59:11 +0000257 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000258 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000259 c = getuchar(v);
260 if (c == (Py_UCS4)-1)
261 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000262
Martin v. Löwis1a214512008-06-11 05:26:20 +0000263 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000264 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000265 if (old->category_changed == 0) {
266 /* unassigned */
267 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000268 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000269 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000270 else if (old->decimal_changed != 0xFF) {
271 have_old = 1;
272 rc = old->decimal_changed;
273 }
274 }
275
276 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000277 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000278 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000279 if (defobj == NULL) {
280 PyErr_SetString(PyExc_ValueError, "not a numeric character");
281 return NULL;
282 }
283 else {
284 Py_INCREF(defobj);
285 return defobj;
286 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287 }
288 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000289}
290
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000291PyDoc_STRVAR(unicodedata_category__doc__,
292"category(unichr)\n\
293\n\
294Returns the general category assigned to the Unicode character\n\
295unichr as string.");
296
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000298unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000299{
300 PyUnicodeObject *v;
301 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000302 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303
304 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000305 &PyUnicode_Type, &v))
306 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000307 c = getuchar(v);
308 if (c == (Py_UCS4)-1)
309 return NULL;
310 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000311 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000312 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000313 if (old->category_changed != 0xFF)
314 index = old->category_changed;
315 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000316 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317}
318
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000319PyDoc_STRVAR(unicodedata_bidirectional__doc__,
320"bidirectional(unichr)\n\
321\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200322Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000323unichr as string. If no such value is defined, an empty string is\n\
324returned.");
325
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000327unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000328{
329 PyUnicodeObject *v;
330 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000331 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000332
333 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 &PyUnicode_Type, &v))
335 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000336 c = getuchar(v);
337 if (c == (Py_UCS4)-1)
338 return NULL;
339 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000340 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000341 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000342 if (old->category_changed == 0)
343 index = 0; /* unassigned */
344 else if (old->bidir_changed != 0xFF)
345 index = old->bidir_changed;
346 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000347 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000348}
349
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000350PyDoc_STRVAR(unicodedata_combining__doc__,
351"combining(unichr)\n\
352\n\
353Returns the canonical combining class assigned to the Unicode\n\
354character unichr as integer. Returns 0 if no combining class is\n\
355defined.");
356
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000357static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000358unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000359{
360 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000362 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363
364 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 &PyUnicode_Type, &v))
366 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000367 c = getuchar(v);
368 if (c == (Py_UCS4)-1)
369 return NULL;
370 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000371 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000372 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000373 if (old->category_changed == 0)
374 index = 0; /* unassigned */
375 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000376 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000377}
378
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000379PyDoc_STRVAR(unicodedata_mirrored__doc__,
380"mirrored(unichr)\n\
381\n\
382Returns the mirrored property assigned to the Unicode character\n\
383unichr as integer. Returns 1 if the character has been identified as\n\
384a \"mirrored\" character in bidirectional text, 0 otherwise.");
385
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000386static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000387unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000388{
389 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000390 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000391 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000392
393 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 &PyUnicode_Type, &v))
395 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000396 c = getuchar(v);
397 if (c == (Py_UCS4)-1)
398 return NULL;
399 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000400 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000401 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000402 if (old->category_changed == 0)
403 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000404 else if (old->mirrored_changed != 0xFF)
405 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000406 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000407 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408}
409
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000410PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
411"east_asian_width(unichr)\n\
412\n\
413Returns the east asian width assigned to the Unicode character\n\
414unichr as string.");
415
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000416static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000417unicodedata_east_asian_width(PyObject *self, PyObject *args)
418{
419 PyUnicodeObject *v;
420 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000421 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000422
423 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000424 &PyUnicode_Type, &v))
425 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000426 c = getuchar(v);
427 if (c == (Py_UCS4)-1)
428 return NULL;
429 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000430 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000431 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000432 if (old->category_changed == 0)
433 index = 0; /* unassigned */
434 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000435 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000436}
437
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000438PyDoc_STRVAR(unicodedata_decomposition__doc__,
439"decomposition(unichr)\n\
440\n\
441Returns the character decomposition mapping assigned to the Unicode\n\
442character unichr as string. An empty string is returned in case no\n\
443such mapping is defined.");
444
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000445static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000447{
448 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000449 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000450 int code, index, count;
451 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000452 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000453 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000454
455 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000456 &PyUnicode_Type, &v))
457 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000458 c = getuchar(v);
459 if (c == (Py_UCS4)-1)
460 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000461
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000462 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000463
Martin v. Löwis1a214512008-06-11 05:26:20 +0000464 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000465 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000466 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000467 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000468 }
469
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000470 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000471 index = 0;
472 else {
473 index = decomp_index1[(code>>DECOMP_SHIFT)];
474 index = decomp_index2[(index<<DECOMP_SHIFT)+
475 (code&((1<<DECOMP_SHIFT)-1))];
476 }
477
Tim Peters69b83b12001-11-30 07:23:05 +0000478 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000479 is prefix code (from*/
480 count = decomp_data[index] >> 8;
481
482 /* XXX: could allocate the PyString up front instead
483 (strlen(prefix) + 5 * count + 1 bytes) */
484
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000485 /* Based on how index is calculated above and decomp_data is generated
486 from Tools/unicode/makeunicodedata.py, it should not be possible
487 to overflow decomp_prefix. */
488 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200489 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000490
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000491 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000492 i = strlen(decomp_prefix[prefix_index]);
493 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000494
495 while (count-- > 0) {
496 if (i)
497 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000498 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000499 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
500 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000501 i += strlen(decomp + i);
502 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000503 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000504}
505
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000506static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000507get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000508{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000509 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000510 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000511 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000512 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000513 /* unassigned in old version */
514 *index = 0;
515 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000516 else {
517 *index = decomp_index1[(code>>DECOMP_SHIFT)];
518 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
519 (code&((1<<DECOMP_SHIFT)-1))];
520 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000521
Martin v. Löwis677bde22002-11-23 22:08:15 +0000522 /* high byte is number of hex bytes (usually one or two), low byte
523 is prefix code (from*/
524 *count = decomp_data[*index] >> 8;
525 *prefix = decomp_data[*index] & 255;
526
527 (*index)++;
528}
529
/* Constants for arithmetic Hangul syllable decomposition, as used by
   nfd_nfkd: a code point in [SBase, SBase+SCount) is split into a
   character based at LBase, one based at VBase and, unless it equals
   TBase itself, one based at TBase. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
539
540static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000541nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000542{
543 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200544 Py_UCS4 *output;
545 Py_ssize_t i, o, osize;
546 int kind;
547 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000548 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200549 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000550 Py_ssize_t space, isize;
551 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000552 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000553
Martin v. Löwis677bde22002-11-23 22:08:15 +0000554 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200555 isize = PyUnicode_GET_LENGTH(input);
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300556 /* Overallocate at most 10 characters. */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000557 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200558 osize = space;
559 output = PyMem_Malloc(space * sizeof(Py_UCS4));
560 if (!output) {
561 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000562 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200563 }
564 i = o = 0;
565 kind = PyUnicode_KIND(input);
566 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000567
Martin v. Löwis22970662011-09-29 13:39:38 +0200568 while (i < isize) {
569 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000570 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200571 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000572 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300573 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000574 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000575 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200576 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000577 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000578 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
579 if (new_output == NULL) {
580 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200581 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000582 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200583 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000584 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000585 }
586 /* Hangul Decomposition. */
587 if (SBase <= code && code < (SBase+SCount)) {
588 int SIndex = code - SBase;
589 int L = LBase + SIndex / NCount;
590 int V = VBase + (SIndex % NCount) / TCount;
591 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200592 output[o++] = L;
593 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000594 space -= 2;
595 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200596 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000597 space --;
598 }
599 continue;
600 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000601 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000602 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000603 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
604 if (value != 0) {
605 stack[stackptr++] = value;
606 continue;
607 }
608 }
609
610 /* Other decompositions. */
611 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000612
613 /* Copy character if it is not decomposable, or has a
614 compatibility decomposition, but we do NFD. */
615 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200616 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000617 space--;
618 continue;
619 }
620 /* Copy decomposition onto the stack, in reverse
621 order. */
622 while(count) {
623 code = decomp_data[index + (--count)];
624 stack[stackptr++] = code;
625 }
626 }
627 }
628
Martin v. Löwis22970662011-09-29 13:39:38 +0200629 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
630 output, o);
631 PyMem_Free(output);
632 if (!result)
633 return NULL;
634 /* result is guaranteed to be ready, as it is compact. */
635 kind = PyUnicode_KIND(result);
636 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000637
638 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200639 i = 0;
640 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
641 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
642 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000643 if (prev == 0 || cur == 0 || prev <= cur) {
644 prev = cur;
645 continue;
646 }
647 /* Non-canonical order. Need to switch *i with previous. */
648 o = i - 1;
649 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200650 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
651 PyUnicode_WRITE(kind, data, o+1,
652 PyUnicode_READ(kind, data, o));
653 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200655 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000656 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200657 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000658 if (prev == 0 || prev <= cur)
659 break;
660 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200661 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000662 }
663 return result;
664}
665
666static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200667find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200669 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000670 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200671 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000672 if (code < start)
673 return -1;
674 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200675 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000676 return nfc[index].index + delta;
677 }
678 }
679 return -1;
680}
681
682static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000683nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000684{
685 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200686 int kind;
687 void *data;
688 Py_UCS4 *output;
689 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000690 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200691 Py_UCS4 code;
692 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000693 int cskipped = 0;
694
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000695 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000696 if (!result)
697 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200698 /* result will be "ready". */
699 kind = PyUnicode_KIND(result);
700 data = PyUnicode_DATA(result);
701 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000702
Martin v. Löwis22970662011-09-29 13:39:38 +0200703 /* We allocate a buffer for the output.
704 If we find that we made no changes, we still return
705 the NFD result. */
706 output = PyMem_Malloc(len * sizeof(Py_UCS4));
707 if (!output) {
708 PyErr_NoMemory();
709 Py_DECREF(result);
710 return 0;
711 }
712 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713
Martin v. Löwis677bde22002-11-23 22:08:15 +0000714 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200715 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000716 for (index = 0; index < cskipped; index++) {
717 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000718 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000719 Remove from list. */
720 skipped[index] = skipped[cskipped-1];
721 cskipped--;
722 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000723 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000724 }
725 }
726 /* Hangul Composition. We don't need to check for <LV,T>
727 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200728 code = PyUnicode_READ(kind, data, i);
729 if (LBase <= code && code < (LBase+LCount) &&
730 i + 1 < len &&
731 VBase <= PyUnicode_READ(kind, data, i+1) &&
732 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000733 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200734 LIndex = code - LBase;
735 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000736 code = SBase + (LIndex*VCount+VIndex)*TCount;
737 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200738 if (i < len &&
739 TBase <= PyUnicode_READ(kind, data, i) &&
740 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
741 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 i++;
743 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200744 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 continue;
746 }
747
Martin v. Löwis22970662011-09-29 13:39:38 +0200748 /* code is still input[i] here */
749 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000750 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200751 output[o++] = code;
752 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000753 continue;
754 }
755 /* Find next unblocked character. */
756 i1 = i+1;
757 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200758 /* output base character for now; might be updated later. */
759 output[o] = PyUnicode_READ(kind, data, i);
760 while (i1 < len) {
761 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
762 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000763 if (comb) {
764 if (comb1 == 0)
765 break;
766 if (comb >= comb1) {
767 /* Character is blocked. */
768 i1++;
769 continue;
770 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000771 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200772 l = find_nfc_index(self, nfc_last, code1);
773 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000774 is a starter, we don't need to look further.
775 Otherwise, record the combining class. */
776 if (l == -1) {
777 not_combinable:
778 if (comb1 == 0)
779 break;
780 comb = comb1;
781 i1++;
782 continue;
783 }
784 index = f*TOTAL_LAST + l;
785 index1 = comp_index[index >> COMP_SHIFT];
786 code = comp_data[(index1<<COMP_SHIFT)+
787 (index&((1<<COMP_SHIFT)-1))];
788 if (code == 0)
789 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790
Martin v. Löwis677bde22002-11-23 22:08:15 +0000791 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200792 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000793 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000794 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000795 skipped[cskipped++] = i1;
796 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200797 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000798 if (f == -1)
799 break;
800 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200801 /* Output character was already written.
802 Just advance the indices. */
803 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000804 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200805 if (o == len) {
806 /* No changes. Return original string. */
807 PyMem_Free(output);
808 return result;
809 }
810 Py_DECREF(result);
811 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
812 output, o);
813 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000814 return result;
815}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000816
817/* Return 1 if the input is certainly normalized, 0 if it might not be. */
818static int
819is_normalized(PyObject *self, PyObject *input, int nfc, int k)
820{
Martin v. Löwis22970662011-09-29 13:39:38 +0200821 Py_ssize_t i, len;
822 int kind;
823 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000824 unsigned char prev_combining = 0, quickcheck_mask;
825
826 /* An older version of the database is requested, quickchecks must be
827 disabled. */
828 if (self && UCD_Check(self))
829 return 0;
830
831 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
832 as described in http://unicode.org/reports/tr15/#Annex8. */
833 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
834
Martin v. Löwis22970662011-09-29 13:39:38 +0200835 i = 0;
836 kind = PyUnicode_KIND(input);
837 data = PyUnicode_DATA(input);
838 len = PyUnicode_GET_LENGTH(input);
839 while (i < len) {
840 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
841 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000842 unsigned char combining = record->combining;
843 unsigned char quickcheck = record->normalization_quick_check;
844
845 if (quickcheck & quickcheck_mask)
846 return 0; /* this string might need normalization */
847 if (combining && prev_combining > combining)
848 return 0; /* non-canonical sort order, not normalized */
849 prev_combining = combining;
850 }
851 return 1; /* certainly normalized */
852}
853
/* Docstring attached to the "normalize" entry of the method table. */
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000854PyDoc_STRVAR(unicodedata_normalize__doc__,
855"normalize(form, unistr)\n\
856\n\
857Return the normal form 'form' for the Unicode string unistr. Valid\n\
858values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
859
Martin v. Löwis677bde22002-11-23 22:08:15 +0000860static PyObject*
861unicodedata_normalize(PyObject *self, PyObject *args)
862{
863 char *form;
864 PyObject *input;
865
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000866 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000867 &form, &PyUnicode_Type, &input))
868 return NULL;
869
Martin v. Löwis22970662011-09-29 13:39:38 +0200870 if (PyUnicode_READY(input) == -1)
871 return NULL;
872
873 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000874 /* Special case empty input strings, since resizing
875 them later would cause internal errors. */
876 Py_INCREF(input);
877 return input;
878 }
879
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000880 if (strcmp(form, "NFC") == 0) {
881 if (is_normalized(self, input, 1, 0)) {
882 Py_INCREF(input);
883 return input;
884 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000885 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000886 }
887 if (strcmp(form, "NFKC") == 0) {
888 if (is_normalized(self, input, 1, 1)) {
889 Py_INCREF(input);
890 return input;
891 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000892 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000893 }
894 if (strcmp(form, "NFD") == 0) {
895 if (is_normalized(self, input, 0, 0)) {
896 Py_INCREF(input);
897 return input;
898 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000899 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000900 }
901 if (strcmp(form, "NFKD") == 0) {
902 if (is_normalized(self, input, 0, 1)) {
903 Py_INCREF(input);
904 return input;
905 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000906 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000907 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000908 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
909 return NULL;
910}
911
Fredrik Lundh06d12682001-01-24 07:59:11 +0000912/* -------------------------------------------------------------------- */
913/* unicode character name tables */
914
915/* data file generated by Tools/unicode/makeunicodedata.py */
916#include "unicodename_db.h"
917
918/* -------------------------------------------------------------------- */
919/* database code (cut and pasted from the unidb package) */
920
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and folded in as hash * scale + byte; whenever
   bits 24..31 become non-zero they are XOR-ed back into the low byte and
   the value is reduced to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
935
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.

   Column 0 is indexed by the L part of a syllable, column 1 by the V part
   and column 2 by the T part.  The columns have different lengths; rows
   beyond the end of a shorter column hold NULL. */
static char *hangul_syllables[][3] = {
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
966
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000967/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000968static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000969is_unified_ideograph(Py_UCS4 code)
970{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000971 return
972 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500973 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000974 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
975 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
976 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000977}
978
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * The argument is fully parenthesized so that an expression argument
 * (e.g. a conditional expression) is compared as a whole.  Note that it is
 * still evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
984
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000985static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300986_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
987 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000988{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300989 /* Find the name associated with the given codepoint.
990 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
991 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000992 int offset;
993 int i;
994 int word;
995 unsigned char* w;
996
Martin v. Löwisc3509122006-03-11 12:16:23 +0000997 if (code >= 0x110000)
998 return 0;
999
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001000 /* XXX should we just skip all the codepoints in the PUAs here? */
1001 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1002 return 0;
1003
Martin v. Löwis1a214512008-06-11 05:26:20 +00001004 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001005 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001006 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001007 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1008 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001009 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001010 if (old->category_changed == 0) {
1011 /* unassigned */
1012 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001013 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001014 }
1015
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001016 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001017 /* Hangul syllable. */
1018 int SIndex = code - SBase;
1019 int L = SIndex / NCount;
1020 int V = (SIndex % NCount) / TCount;
1021 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001022
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001023 if (buflen < 27)
1024 /* Worst case: HANGUL SYLLABLE <10chars>. */
1025 return 0;
1026 strcpy(buffer, "HANGUL SYLLABLE ");
1027 buffer += 16;
1028 strcpy(buffer, hangul_syllables[L][0]);
1029 buffer += strlen(hangul_syllables[L][0]);
1030 strcpy(buffer, hangul_syllables[V][1]);
1031 buffer += strlen(hangul_syllables[V][1]);
1032 strcpy(buffer, hangul_syllables[T][2]);
1033 buffer += strlen(hangul_syllables[T][2]);
1034 *buffer = '\0';
1035 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001036 }
1037
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001038 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001039 if (buflen < 28)
1040 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1041 return 0;
1042 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1043 return 1;
1044 }
1045
Fredrik Lundh06d12682001-01-24 07:59:11 +00001046 /* get offset into phrasebook */
1047 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1048 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1049 (code&((1<<phrasebook_shift)-1))];
1050 if (!offset)
1051 return 0;
1052
1053 i = 0;
1054
1055 for (;;) {
1056 /* get word index */
1057 word = phrasebook[offset] - phrasebook_short;
1058 if (word >= 0) {
1059 word = (word << 8) + phrasebook[offset+1];
1060 offset += 2;
1061 } else
1062 word = phrasebook[offset++];
1063 if (i) {
1064 if (i > buflen)
1065 return 0; /* buffer overflow */
1066 buffer[i++] = ' ';
1067 }
1068 /* copy word string from lexicon. the last character in the
1069 word has bit 7 set. the last word in a string ends with
1070 0x80 */
1071 w = lexicon + lexicon_offset[word];
1072 while (*w < 128) {
1073 if (i >= buflen)
1074 return 0; /* buffer overflow */
1075 buffer[i++] = *w++;
1076 }
1077 if (i >= buflen)
1078 return 0; /* buffer overflow */
1079 buffer[i++] = *w & 127;
1080 if (*w == 128)
1081 break; /* end of word */
1082 }
1083
1084 return 1;
1085}
1086
1087static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001088_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001089{
1090 /* check if code corresponds to the given name */
1091 int i;
1092 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001093 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001094 return 0;
1095 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001096 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001097 return 0;
1098 }
1099 return buffer[namelen] == '\0';
1100}
1101
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001102static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001103find_syllable(const char *str, int *len, int *pos, int count, int column)
1104{
1105 int i, len1;
1106 *len = -1;
1107 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001108 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001109 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001110 if (len1 <= *len)
1111 continue;
1112 if (strncmp(str, s, len1) == 0) {
1113 *len = len1;
1114 *pos = i;
1115 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001116 }
1117 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001118 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001119 }
1120}
1121
Fredrik Lundh06d12682001-01-24 07:59:11 +00001122static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001123_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001124{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001125 /* check if named sequences are allowed */
1126 if (!with_named_seq && IS_NAMED_SEQ(cp))
1127 return 0;
1128 /* if the codepoint is in the PUA range that we use for aliases,
1129 * convert it to obtain the right codepoint */
1130 if (IS_ALIAS(cp))
1131 *code = name_aliases[cp-aliases_start];
1132 else
1133 *code = cp;
1134 return 1;
1135}
1136
1137static int
1138_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1139 int with_named_seq)
1140{
1141 /* Return the codepoint associated with the given name.
1142 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1143 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1144 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001145 unsigned int h, v;
1146 unsigned int mask = code_size-1;
1147 unsigned int i, incr;
1148
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001149 /* Check for hangul syllables. */
1150 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001151 int len, L = -1, V = -1, T = -1;
1152 const char *pos = name + 16;
1153 find_syllable(pos, &len, &L, LCount, 0);
1154 pos += len;
1155 find_syllable(pos, &len, &V, VCount, 1);
1156 pos += len;
1157 find_syllable(pos, &len, &T, TCount, 2);
1158 pos += len;
1159 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1160 *code = SBase + (L*VCount+V)*TCount + T;
1161 return 1;
1162 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001163 /* Otherwise, it's an illegal syllable name. */
1164 return 0;
1165 }
1166
1167 /* Check for unified ideographs. */
1168 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1169 /* Four or five hexdigits must follow. */
1170 v = 0;
1171 name += 22;
1172 namelen -= 22;
1173 if (namelen != 4 && namelen != 5)
1174 return 0;
1175 while (namelen--) {
1176 v *= 16;
1177 if (*name >= '0' && *name <= '9')
1178 v += *name - '0';
1179 else if (*name >= 'A' && *name <= 'F')
1180 v += *name - 'A' + 10;
1181 else
1182 return 0;
1183 name++;
1184 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001185 if (!is_unified_ideograph(v))
1186 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001187 *code = v;
1188 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001189 }
1190
Fredrik Lundh06d12682001-01-24 07:59:11 +00001191 /* the following is the same as python's dictionary lookup, with
1192 only minor changes. see the makeunicodedata script for more
1193 details */
1194
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001195 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001196 i = (~h) & mask;
1197 v = code_hash[i];
1198 if (!v)
1199 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001200 if (_cmpname(self, v, name, namelen))
1201 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001202 incr = (h ^ (h >> 3)) & mask;
1203 if (!incr)
1204 incr = mask;
1205 for (;;) {
1206 i = (i + incr) & mask;
1207 v = code_hash[i];
1208 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001209 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001210 if (_cmpname(self, v, name, namelen))
1211 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001212 incr = incr << 1;
1213 if (incr > mask)
1214 incr = incr ^ code_poly;
1215 }
1216}
1217
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001218static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001219{
1220 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001221 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001222 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001223};
1224
1225/* -------------------------------------------------------------------- */
1226/* Python bindings */
1227
/* Docstring attached to the "name" entry of the method table. */
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001228PyDoc_STRVAR(unicodedata_name__doc__,
1229"name(unichr[, default])\n\
1230Returns the name assigned to the Unicode character unichr as a\n\
1231string. If no name is defined, default is returned, or, if not\n\
1232given, ValueError is raised.");
1233
Fredrik Lundh06d12682001-01-24 07:59:11 +00001234static PyObject *
1235unicodedata_name(PyObject* self, PyObject* args)
1236{
1237 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001238 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001239
1240 PyUnicodeObject* v;
1241 PyObject* defobj = NULL;
1242 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1243 return NULL;
1244
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001245 c = getuchar(v);
1246 if (c == (Py_UCS4)-1)
1247 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001248
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001249 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001250 if (defobj == NULL) {
1251 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001252 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 }
1254 else {
1255 Py_INCREF(defobj);
1256 return defobj;
1257 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001258 }
1259
Walter Dörwald4254e762007-06-05 16:04:09 +00001260 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001261}
1262
/* Docstring attached to the "lookup" entry of the method table. */
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001263PyDoc_STRVAR(unicodedata_lookup__doc__,
1264"lookup(name)\n\
1265\n\
1266Look up character by name. If a character with the\n\
1267given name is found, return the corresponding Unicode\n\
1268character. If not found, KeyError is raised.");
1269
Fredrik Lundh06d12682001-01-24 07:59:11 +00001270static PyObject *
1271unicodedata_lookup(PyObject* self, PyObject* args)
1272{
1273 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001274
1275 char* name;
Victor Stinner65a31442014-07-01 16:45:52 +02001276 Py_ssize_t namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001277 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001278 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1279 return NULL;
Victor Stinner65a31442014-07-01 16:45:52 +02001280 if (namelen > INT_MAX) {
1281 PyErr_SetString(PyExc_KeyError, "name too long");
1282 return NULL;
1283 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001284
Victor Stinner65a31442014-07-01 16:45:52 +02001285 if (!_getcode(self, name, (int)namelen, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001286 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001287 return NULL;
1288 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001289 /* check if code is in the PUA range that we use for named sequences
1290 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001291 if (IS_NAMED_SEQ(code)) {
1292 index = code-named_sequences_start;
1293 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1294 named_sequences[index].seq,
1295 named_sequences[index].seqlen);
1296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001298}
1299
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001300/* XXX Add doc strings. */
1301
1302static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001303 UNICODEDATA_UCD_DECIMAL_METHODDEF
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001304 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1305 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1306 {"category", unicodedata_category, METH_VARARGS,
1307 unicodedata_category__doc__},
1308 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1309 unicodedata_bidirectional__doc__},
1310 {"combining", unicodedata_combining, METH_VARARGS,
1311 unicodedata_combining__doc__},
1312 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1313 unicodedata_mirrored__doc__},
1314 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1315 unicodedata_east_asian_width__doc__},
1316 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1317 unicodedata_decomposition__doc__},
1318 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1319 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1320 {"normalize", unicodedata_normalize, METH_VARARGS,
1321 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001323};
1324
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001325static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001326 /* The ob_type field must be initialized in the module init function
1327 * to be portable to Windows without using C++. */
1328 PyVarObject_HEAD_INIT(NULL, 0)
1329 "unicodedata.UCD", /*tp_name*/
1330 sizeof(PreviousDBVersion), /*tp_basicsize*/
1331 0, /*tp_itemsize*/
1332 /* methods */
1333 (destructor)PyObject_Del, /*tp_dealloc*/
1334 0, /*tp_print*/
1335 0, /*tp_getattr*/
1336 0, /*tp_setattr*/
1337 0, /*tp_reserved*/
1338 0, /*tp_repr*/
1339 0, /*tp_as_number*/
1340 0, /*tp_as_sequence*/
1341 0, /*tp_as_mapping*/
1342 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001343 0, /*tp_call*/
1344 0, /*tp_str*/
1345 PyObject_GenericGetAttr,/*tp_getattro*/
1346 0, /*tp_setattro*/
1347 0, /*tp_as_buffer*/
1348 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1349 0, /*tp_doc*/
1350 0, /*tp_traverse*/
1351 0, /*tp_clear*/
1352 0, /*tp_richcompare*/
1353 0, /*tp_weaklistoffset*/
1354 0, /*tp_iter*/
1355 0, /*tp_iternext*/
1356 unicodedata_functions, /*tp_methods*/
1357 DB_members, /*tp_members*/
1358 0, /*tp_getset*/
1359 0, /*tp_base*/
1360 0, /*tp_dict*/
1361 0, /*tp_descr_get*/
1362 0, /*tp_descr_set*/
1363 0, /*tp_dictoffset*/
1364 0, /*tp_init*/
1365 0, /*tp_alloc*/
1366 0, /*tp_new*/
1367 0, /*tp_free*/
1368 0, /*tp_is_gc*/
1369};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001370
/* Module docstring; UNIDATA_VERSION is spliced into the text by string
   literal concatenation. */
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001371PyDoc_STRVAR(unicodedata_docstring,
1372"This module provides access to the Unicode Character Database which\n\
1373defines character properties for all Unicode characters. The data in\n\
1374this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001375" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001376\n\
1377The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001378UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001379
1380static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 PyModuleDef_HEAD_INIT,
1382 "unicodedata",
1383 unicodedata_docstring,
1384 -1,
1385 unicodedata_functions,
1386 NULL,
1387 NULL,
1388 NULL,
1389 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001390};
1391
Mark Hammond62b1ab12002-07-23 06:31:15 +00001392PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001393PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001394{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001395 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001396
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001398
Martin v. Löwis1a214512008-06-11 05:26:20 +00001399 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001400 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001401 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001402
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001403 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001404 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001405 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001406
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001407 /* Previous versions */
1408 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1409 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001410 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001411
Fredrik Lundh06d12682001-01-24 07:59:11 +00001412 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001413 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001414 if (v != NULL)
1415 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001416 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001417}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001418
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001419/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001420Local variables:
1421c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001422indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001423End:
1424*/