blob: 507cef3ba1733068ffb2db27c6478c9a4fd3ce5d [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Larry Hastings61272b72014-01-07 12:41:53 -080022/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080023module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080024class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080025[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080026/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080027
Fredrik Lundh06d12682001-01-24 07:59:11 +000028/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000029
/* Per-code-point property record.  Records are stored in the generated
   _PyUnicode_Database_Records table and looked up by _getrecord_ex(). */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
41
/* Per-code-point difference between the current database and an older
   Unicode version.  As the lookups in this file use it:
     - category_changed == 0 is treated as "unassigned in the old version";
     - 0xFF in bidir_changed, category_changed, decimal_changed or
       mirrored_changed means "no override, use the current value". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
50
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051/* data file generated by Tools/unicode/makeunicodedata.py */
52#include "unicodedata_db.h"
53
54static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000055_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000056{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000058 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059 index = 0;
60 else {
61 index = index1[(code>>SHIFT)];
62 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
63 }
64
65 return &_PyUnicode_Database_Records[index];
66}
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068/* ------------- Previous-version API ------------------------------------- */
/* Python object describing an older Unicode database version.
   name          - version string, exposed to Python as unidata_version.
   getrecord     - returns the change_record for a code point.
   normalization - returns a replacement code point for normalization, or 0
                   when there is none (see its use in nfd_nfkd()). */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
75
/* Fetch the change_record for code point `v` from the previous-version
   object `self`.  `self` is parenthesized so that any pointer expression,
   not just a plain identifier, is cast as a whole. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
77
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000079 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080 {NULL}
81};
82
/* forward declaration: the helpers below need the type object before its
   definition appears */
static PyTypeObject UCD_Type;
/* True when `o` is exactly a UCD_Type instance (compares the type pointer
   only). */
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000086
87static PyObject*
88new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
89 Py_UCS4 (*normalization)(Py_UCS4))
90{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000091 PreviousDBVersion *self;
92 self = PyObject_New(PreviousDBVersion, &UCD_Type);
93 if (self == NULL)
94 return NULL;
95 self->name = name;
96 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000097 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000098 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000099}
100
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000101
102static Py_UCS4 getuchar(PyUnicodeObject *obj)
103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200104 if (PyUnicode_READY(obj))
105 return (Py_UCS4)-1;
106 if (PyUnicode_GET_LENGTH(obj) == 1) {
107 if (PyUnicode_READY(obj))
108 return (Py_UCS4)-1;
109 return PyUnicode_READ_CHAR(obj, 0);
110 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000111 PyErr_SetString(PyExc_TypeError,
112 "need a single Unicode character as parameter");
113 return (Py_UCS4)-1;
114}
115
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000116/* --- Module API --------------------------------------------------------- */
117
Larry Hastings61272b72014-01-07 12:41:53 -0800118/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -0800119
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800120unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700121
Larry Hastings77561cc2014-01-07 12:13:13 -0800122 unichr: object(type='PyUnicodeObject *', subclass_of='&PyUnicode_Type')
Larry Hastings31826802013-10-19 00:09:25 -0700123 default: object=NULL
124 /
125
126Converts a Unicode character into its equivalent decimal value.
127
128Returns the decimal value assigned to the Unicode character unichr
129as integer. If no such value is defined, default is returned, or, if
130not given, ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800131[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700132
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800133PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -0800134"decimal($self, unichr, default=None, /)\n"
135"--\n"
136"\n"
Larry Hastings31826802013-10-19 00:09:25 -0700137"Converts a Unicode character into its equivalent decimal value.\n"
138"\n"
Larry Hastings31826802013-10-19 00:09:25 -0700139"Returns the decimal value assigned to the Unicode character unichr\n"
140"as integer. If no such value is defined, default is returned, or, if\n"
141"not given, ValueError is raised.");
142
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800143#define UNICODEDATA_UCD_DECIMAL_METHODDEF \
144 {"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__},
Larry Hastings31826802013-10-19 00:09:25 -0700145
146static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800147unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value);
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000148
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000149static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800150unicodedata_UCD_decimal(PreviousDBVersion *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000151{
Larry Hastings31826802013-10-19 00:09:25 -0700152 PyObject *return_value = NULL;
Larry Hastings77561cc2014-01-07 12:13:13 -0800153 PyUnicodeObject *unichr;
Larry Hastings31826802013-10-19 00:09:25 -0700154 PyObject *default_value = NULL;
155
156 if (!PyArg_ParseTuple(args,
157 "O!|O:decimal",
158 &PyUnicode_Type, &unichr, &default_value))
159 goto exit;
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800160 return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value);
Larry Hastings31826802013-10-19 00:09:25 -0700161
162exit:
163 return return_value;
164}
165
166static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800167unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value)
Larry Hastings2623c8c2014-02-08 22:15:29 -0800168/*[clinic end generated code: output=8689669896d293df input=c25c9d2b4de076b1]*/
Larry Hastings31826802013-10-19 00:09:25 -0700169{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000170 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000172 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000173
Larry Hastingsc2047262014-01-25 20:43:29 -0800174 c = getuchar(unichr);
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000175 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000177
Martin v. Löwis1a214512008-06-11 05:26:20 +0000178 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000179 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000180 if (old->category_changed == 0) {
181 /* unassigned */
182 have_old = 1;
183 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000184 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000185 else if (old->decimal_changed != 0xFF) {
186 have_old = 1;
187 rc = old->decimal_changed;
188 }
189 }
190
191 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000192 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700194 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 PyErr_SetString(PyExc_ValueError,
196 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000197 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 }
199 else {
Larry Hastings31826802013-10-19 00:09:25 -0700200 Py_INCREF(default_value);
201 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000202 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000204 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205}
206
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000207PyDoc_STRVAR(unicodedata_digit__doc__,
208"digit(unichr[, default])\n\
209\n\
210Returns the digit value assigned to the Unicode character unichr as\n\
211integer. If no such value is defined, default is returned, or, if\n\
212not given, ValueError is raised.");
213
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000214static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000215unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000216{
217 PyUnicodeObject *v;
218 PyObject *defobj = NULL;
219 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000220 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221
Fredrik Lundh06d12682001-01-24 07:59:11 +0000222 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000223 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000224 c = getuchar(v);
225 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000226 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000227 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000228 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 if (defobj == NULL) {
230 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000231 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 }
233 else {
234 Py_INCREF(defobj);
235 return defobj;
236 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000237 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000238 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000239}
240
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000241PyDoc_STRVAR(unicodedata_numeric__doc__,
242"numeric(unichr[, default])\n\
243\n\
244Returns the numeric value assigned to the Unicode character unichr\n\
245as float. If no such value is defined, default is returned, or, if\n\
246not given, ValueError is raised.");
247
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000248static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000249unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250{
251 PyUnicodeObject *v;
252 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000253 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000255 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256
Fredrik Lundh06d12682001-01-24 07:59:11 +0000257 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000258 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000259 c = getuchar(v);
260 if (c == (Py_UCS4)-1)
261 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000262
Martin v. Löwis1a214512008-06-11 05:26:20 +0000263 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000264 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000265 if (old->category_changed == 0) {
266 /* unassigned */
267 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000268 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000269 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000270 else if (old->decimal_changed != 0xFF) {
271 have_old = 1;
272 rc = old->decimal_changed;
273 }
274 }
275
276 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000277 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000278 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000279 if (defobj == NULL) {
280 PyErr_SetString(PyExc_ValueError, "not a numeric character");
281 return NULL;
282 }
283 else {
284 Py_INCREF(defobj);
285 return defobj;
286 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287 }
288 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000289}
290
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000291PyDoc_STRVAR(unicodedata_category__doc__,
292"category(unichr)\n\
293\n\
294Returns the general category assigned to the Unicode character\n\
295unichr as string.");
296
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000298unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000299{
300 PyUnicodeObject *v;
301 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000302 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303
304 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000305 &PyUnicode_Type, &v))
306 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000307 c = getuchar(v);
308 if (c == (Py_UCS4)-1)
309 return NULL;
310 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000311 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000312 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000313 if (old->category_changed != 0xFF)
314 index = old->category_changed;
315 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000316 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317}
318
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000319PyDoc_STRVAR(unicodedata_bidirectional__doc__,
320"bidirectional(unichr)\n\
321\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200322Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000323unichr as string. If no such value is defined, an empty string is\n\
324returned.");
325
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000327unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000328{
329 PyUnicodeObject *v;
330 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000331 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000332
333 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 &PyUnicode_Type, &v))
335 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000336 c = getuchar(v);
337 if (c == (Py_UCS4)-1)
338 return NULL;
339 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000340 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000341 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000342 if (old->category_changed == 0)
343 index = 0; /* unassigned */
344 else if (old->bidir_changed != 0xFF)
345 index = old->bidir_changed;
346 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000347 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000348}
349
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000350PyDoc_STRVAR(unicodedata_combining__doc__,
351"combining(unichr)\n\
352\n\
353Returns the canonical combining class assigned to the Unicode\n\
354character unichr as integer. Returns 0 if no combining class is\n\
355defined.");
356
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000357static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000358unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000359{
360 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000362 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363
364 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 &PyUnicode_Type, &v))
366 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000367 c = getuchar(v);
368 if (c == (Py_UCS4)-1)
369 return NULL;
370 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000371 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000372 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000373 if (old->category_changed == 0)
374 index = 0; /* unassigned */
375 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000376 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000377}
378
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000379PyDoc_STRVAR(unicodedata_mirrored__doc__,
380"mirrored(unichr)\n\
381\n\
382Returns the mirrored property assigned to the Unicode character\n\
383unichr as integer. Returns 1 if the character has been identified as\n\
384a \"mirrored\" character in bidirectional text, 0 otherwise.");
385
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000386static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000387unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000388{
389 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000390 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000391 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000392
393 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000394 &PyUnicode_Type, &v))
395 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000396 c = getuchar(v);
397 if (c == (Py_UCS4)-1)
398 return NULL;
399 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000400 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000401 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000402 if (old->category_changed == 0)
403 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000404 else if (old->mirrored_changed != 0xFF)
405 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000406 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000407 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408}
409
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000410PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
411"east_asian_width(unichr)\n\
412\n\
413Returns the east asian width assigned to the Unicode character\n\
414unichr as string.");
415
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000416static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000417unicodedata_east_asian_width(PyObject *self, PyObject *args)
418{
419 PyUnicodeObject *v;
420 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000421 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000422
423 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000424 &PyUnicode_Type, &v))
425 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000426 c = getuchar(v);
427 if (c == (Py_UCS4)-1)
428 return NULL;
429 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000430 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000431 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000432 if (old->category_changed == 0)
433 index = 0; /* unassigned */
434 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000435 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000436}
437
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000438PyDoc_STRVAR(unicodedata_decomposition__doc__,
439"decomposition(unichr)\n\
440\n\
441Returns the character decomposition mapping assigned to the Unicode\n\
442character unichr as string. An empty string is returned in case no\n\
443such mapping is defined.");
444
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000445static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000447{
448 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000449 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000450 int code, index, count;
451 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000452 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000453 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000454
455 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000456 &PyUnicode_Type, &v))
457 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000458 c = getuchar(v);
459 if (c == (Py_UCS4)-1)
460 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000461
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000462 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000463
Martin v. Löwis1a214512008-06-11 05:26:20 +0000464 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000465 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000466 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000467 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000468 }
469
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000470 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000471 index = 0;
472 else {
473 index = decomp_index1[(code>>DECOMP_SHIFT)];
474 index = decomp_index2[(index<<DECOMP_SHIFT)+
475 (code&((1<<DECOMP_SHIFT)-1))];
476 }
477
Tim Peters69b83b12001-11-30 07:23:05 +0000478 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000479 is prefix code (from*/
480 count = decomp_data[index] >> 8;
481
482 /* XXX: could allocate the PyString up front instead
483 (strlen(prefix) + 5 * count + 1 bytes) */
484
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000485 /* Based on how index is calculated above and decomp_data is generated
486 from Tools/unicode/makeunicodedata.py, it should not be possible
487 to overflow decomp_prefix. */
488 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200489 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000490
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000491 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000492 i = strlen(decomp_prefix[prefix_index]);
493 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000494
495 while (count-- > 0) {
496 if (i)
497 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000498 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000499 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
500 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000501 i += strlen(decomp + i);
502 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000503 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000504}
505
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000506static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000507get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000508{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000509 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000510 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000511 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000512 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000513 /* unassigned in old version */
514 *index = 0;
515 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000516 else {
517 *index = decomp_index1[(code>>DECOMP_SHIFT)];
518 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
519 (code&((1<<DECOMP_SHIFT)-1))];
520 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000521
Martin v. Löwis677bde22002-11-23 22:08:15 +0000522 /* high byte is number of hex bytes (usually one or two), low byte
523 is prefix code (from*/
524 *count = decomp_data[*index] >> 8;
525 *prefix = decomp_data[*index] & 255;
526
527 (*index)++;
528}
529
/* Constants for the arithmetic Hangul decomposition in nfd_nfkd(): a code
   point in [SBase, SBase+SCount) is split into L, V and (optionally) T
   code points offset from LBase, VBase and TBase. */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)
539
/* Decompose `input` and put its combining characters into canonical order.

   self  - when it is a UCD_Type instance, that version's normalization
           overrides and "unassigned" information are applied.
   input - string to decompose; its kind and data are read directly.
   k     - zero: only decompositions without a prefix are applied (NFD);
           nonzero: prefixed (compatibility) decompositions are applied
           too (NFKD).

   Returns a new string, or NULL with MemoryError set when a buffer
   allocation fails (or NULL if creating the result string fails). */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    /* NOTE(review): pushes onto `stack` are not bounds-checked; this
       relies on the generated data never needing more than 20 entries. */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    /* osize is the allocated capacity, space the number of free slots */
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    /* Phase 1: decompose each input character via an explicit work stack */
    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                /* keep the old pointer until the realloc is known to
                   have succeeded, so it can be freed on failure */
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                /* T == TBase means there is no third component */
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                /* nonzero: process the replacement instead of `code` */
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically. */
    /* Phase 2: in-place insertion sort by combining class; a character
       with class 0 is never moved and nothing is moved across it. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            /* swap positions o and o+1, then keep moving left */
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        /* position i now holds a different character; re-read its class */
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
672
673static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200674find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000675{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200676 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000677 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200678 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000679 if (code < start)
680 return -1;
681 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200682 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000683 return nfc[index].index + delta;
684 }
685 }
686 return -1;
687}
688
689static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000690nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000691{
692 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200693 int kind;
694 void *data;
695 Py_UCS4 *output;
696 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200698 Py_UCS4 code;
699 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000700 int cskipped = 0;
701
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000702 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000703 if (!result)
704 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200705 /* result will be "ready". */
706 kind = PyUnicode_KIND(result);
707 data = PyUnicode_DATA(result);
708 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000709
Martin v. Löwis22970662011-09-29 13:39:38 +0200710 /* We allocate a buffer for the output.
711 If we find that we made no changes, we still return
712 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500713 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200714 if (!output) {
715 PyErr_NoMemory();
716 Py_DECREF(result);
717 return 0;
718 }
719 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000720
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200722 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000723 for (index = 0; index < cskipped; index++) {
724 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000725 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000726 Remove from list. */
727 skipped[index] = skipped[cskipped-1];
728 cskipped--;
729 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000730 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000731 }
732 }
733 /* Hangul Composition. We don't need to check for <LV,T>
734 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200735 code = PyUnicode_READ(kind, data, i);
736 if (LBase <= code && code < (LBase+LCount) &&
737 i + 1 < len &&
738 VBase <= PyUnicode_READ(kind, data, i+1) &&
739 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000740 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200741 LIndex = code - LBase;
742 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 code = SBase + (LIndex*VCount+VIndex)*TCount;
744 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200745 if (i < len &&
746 TBase <= PyUnicode_READ(kind, data, i) &&
747 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
748 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000749 i++;
750 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200751 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000752 continue;
753 }
754
Martin v. Löwis22970662011-09-29 13:39:38 +0200755 /* code is still input[i] here */
756 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000757 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200758 output[o++] = code;
759 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000760 continue;
761 }
762 /* Find next unblocked character. */
763 i1 = i+1;
764 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200765 /* output base character for now; might be updated later. */
766 output[o] = PyUnicode_READ(kind, data, i);
767 while (i1 < len) {
768 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
769 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000770 if (comb) {
771 if (comb1 == 0)
772 break;
773 if (comb >= comb1) {
774 /* Character is blocked. */
775 i1++;
776 continue;
777 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000778 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200779 l = find_nfc_index(self, nfc_last, code1);
780 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000781 is a starter, we don't need to look further.
782 Otherwise, record the combining class. */
783 if (l == -1) {
784 not_combinable:
785 if (comb1 == 0)
786 break;
787 comb = comb1;
788 i1++;
789 continue;
790 }
791 index = f*TOTAL_LAST + l;
792 index1 = comp_index[index >> COMP_SHIFT];
793 code = comp_data[(index1<<COMP_SHIFT)+
794 (index&((1<<COMP_SHIFT)-1))];
795 if (code == 0)
796 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797
Martin v. Löwis677bde22002-11-23 22:08:15 +0000798 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200799 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000800 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000801 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000802 skipped[cskipped++] = i1;
803 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200804 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000805 if (f == -1)
806 break;
807 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200808 /* Output character was already written.
809 Just advance the indices. */
810 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000811 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200812 if (o == len) {
813 /* No changes. Return original string. */
814 PyMem_Free(output);
815 return result;
816 }
817 Py_DECREF(result);
818 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
819 output, o);
820 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000821 return result;
822}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000823
824/* Return 1 if the input is certainly normalized, 0 if it might not be. */
825static int
826is_normalized(PyObject *self, PyObject *input, int nfc, int k)
827{
Martin v. Löwis22970662011-09-29 13:39:38 +0200828 Py_ssize_t i, len;
829 int kind;
830 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000831 unsigned char prev_combining = 0, quickcheck_mask;
832
833 /* An older version of the database is requested, quickchecks must be
834 disabled. */
835 if (self && UCD_Check(self))
836 return 0;
837
838 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
839 as described in http://unicode.org/reports/tr15/#Annex8. */
840 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
841
Martin v. Löwis22970662011-09-29 13:39:38 +0200842 i = 0;
843 kind = PyUnicode_KIND(input);
844 data = PyUnicode_DATA(input);
845 len = PyUnicode_GET_LENGTH(input);
846 while (i < len) {
847 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
848 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000849 unsigned char combining = record->combining;
850 unsigned char quickcheck = record->normalization_quick_check;
851
852 if (quickcheck & quickcheck_mask)
853 return 0; /* this string might need normalization */
854 if (combining && prev_combining > combining)
855 return 0; /* non-canonical sort order, not normalized */
856 prev_combining = combining;
857 }
858 return 1; /* certainly normalized */
859}
860
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000861PyDoc_STRVAR(unicodedata_normalize__doc__,
862"normalize(form, unistr)\n\
863\n\
864Return the normal form 'form' for the Unicode string unistr. Valid\n\
865values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
866
Martin v. Löwis677bde22002-11-23 22:08:15 +0000867static PyObject*
868unicodedata_normalize(PyObject *self, PyObject *args)
869{
870 char *form;
871 PyObject *input;
872
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000873 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000874 &form, &PyUnicode_Type, &input))
875 return NULL;
876
Martin v. Löwis22970662011-09-29 13:39:38 +0200877 if (PyUnicode_READY(input) == -1)
878 return NULL;
879
880 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000881 /* Special case empty input strings, since resizing
882 them later would cause internal errors. */
883 Py_INCREF(input);
884 return input;
885 }
886
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000887 if (strcmp(form, "NFC") == 0) {
888 if (is_normalized(self, input, 1, 0)) {
889 Py_INCREF(input);
890 return input;
891 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000892 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000893 }
894 if (strcmp(form, "NFKC") == 0) {
895 if (is_normalized(self, input, 1, 1)) {
896 Py_INCREF(input);
897 return input;
898 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000899 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000900 }
901 if (strcmp(form, "NFD") == 0) {
902 if (is_normalized(self, input, 0, 0)) {
903 Py_INCREF(input);
904 return input;
905 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000906 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000907 }
908 if (strcmp(form, "NFKD") == 0) {
909 if (is_normalized(self, input, 0, 1)) {
910 Py_INCREF(input);
911 return input;
912 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000913 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000914 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000915 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
916 return NULL;
917}
918
Fredrik Lundh06d12682001-01-24 07:59:11 +0000919/* -------------------------------------------------------------------- */
920/* unicode character name tables */
921
922/* data file generated by Tools/unicode/makeunicodedata.py */
923#include "unicodename_db.h"
924
925/* -------------------------------------------------------------------- */
926/* database code (cut and pasted from the unidb package) */
927
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and mixed in as hash*scale + byte; whenever
   bits 24..31 become non-zero they are folded back into the low byte
   and the value is reduced to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        pos++;
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
942
/* Name fragments used to build and parse Hangul syllable names.
   Column 0: leading consonant, column 1: vowel, column 2: trailing
   consonant.  The columns have different lengths; unused slots at the
   end of the shorter columns are NULL. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
973
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000974/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000975static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000976is_unified_ideograph(Py_UCS4 code)
977{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000978 return
979 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500980 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000981 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
982 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
983 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000984}
985
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * Both ranges are half-open: [start, end).  The argument is fully
 * parenthesized so that expression arguments bind as expected, but it is
 * still evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
991
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000992static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300993_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
994 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000995{
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200996 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300997 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
998 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000999 int offset;
1000 int i;
1001 int word;
1002 unsigned char* w;
1003
Martin v. Löwisc3509122006-03-11 12:16:23 +00001004 if (code >= 0x110000)
1005 return 0;
1006
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001007 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001008 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1009 return 0;
1010
Martin v. Löwis1a214512008-06-11 05:26:20 +00001011 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001012 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001013 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001014 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1015 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001016 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001017 if (old->category_changed == 0) {
1018 /* unassigned */
1019 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001020 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001021 }
1022
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001023 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001024 /* Hangul syllable. */
1025 int SIndex = code - SBase;
1026 int L = SIndex / NCount;
1027 int V = (SIndex % NCount) / TCount;
1028 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001029
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 if (buflen < 27)
1031 /* Worst case: HANGUL SYLLABLE <10chars>. */
1032 return 0;
1033 strcpy(buffer, "HANGUL SYLLABLE ");
1034 buffer += 16;
1035 strcpy(buffer, hangul_syllables[L][0]);
1036 buffer += strlen(hangul_syllables[L][0]);
1037 strcpy(buffer, hangul_syllables[V][1]);
1038 buffer += strlen(hangul_syllables[V][1]);
1039 strcpy(buffer, hangul_syllables[T][2]);
1040 buffer += strlen(hangul_syllables[T][2]);
1041 *buffer = '\0';
1042 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001043 }
1044
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001045 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001046 if (buflen < 28)
1047 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1048 return 0;
1049 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1050 return 1;
1051 }
1052
Fredrik Lundh06d12682001-01-24 07:59:11 +00001053 /* get offset into phrasebook */
1054 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1055 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1056 (code&((1<<phrasebook_shift)-1))];
1057 if (!offset)
1058 return 0;
1059
1060 i = 0;
1061
1062 for (;;) {
1063 /* get word index */
1064 word = phrasebook[offset] - phrasebook_short;
1065 if (word >= 0) {
1066 word = (word << 8) + phrasebook[offset+1];
1067 offset += 2;
1068 } else
1069 word = phrasebook[offset++];
1070 if (i) {
1071 if (i > buflen)
1072 return 0; /* buffer overflow */
1073 buffer[i++] = ' ';
1074 }
1075 /* copy word string from lexicon. the last character in the
1076 word has bit 7 set. the last word in a string ends with
1077 0x80 */
1078 w = lexicon + lexicon_offset[word];
1079 while (*w < 128) {
1080 if (i >= buflen)
1081 return 0; /* buffer overflow */
1082 buffer[i++] = *w++;
1083 }
1084 if (i >= buflen)
1085 return 0; /* buffer overflow */
1086 buffer[i++] = *w & 127;
1087 if (*w == 128)
1088 break; /* end of word */
1089 }
1090
1091 return 1;
1092}
1093
1094static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001095_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001096{
1097 /* check if code corresponds to the given name */
1098 int i;
1099 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001100 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001101 return 0;
1102 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001103 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001104 return 0;
1105 }
1106 return buffer[namelen] == '\0';
1107}
1108
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001109static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001110find_syllable(const char *str, int *len, int *pos, int count, int column)
1111{
1112 int i, len1;
1113 *len = -1;
1114 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001115 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001116 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001117 if (len1 <= *len)
1118 continue;
1119 if (strncmp(str, s, len1) == 0) {
1120 *len = len1;
1121 *pos = i;
1122 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001123 }
1124 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001125 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001126 }
1127}
1128
Fredrik Lundh06d12682001-01-24 07:59:11 +00001129static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001130_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001131{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001132 /* check if named sequences are allowed */
1133 if (!with_named_seq && IS_NAMED_SEQ(cp))
1134 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001135 /* if the code point is in the PUA range that we use for aliases,
1136 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001137 if (IS_ALIAS(cp))
1138 *code = name_aliases[cp-aliases_start];
1139 else
1140 *code = cp;
1141 return 1;
1142}
1143
1144static int
1145_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1146 int with_named_seq)
1147{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001148 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001149 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001150 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001151 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001152 unsigned int h, v;
1153 unsigned int mask = code_size-1;
1154 unsigned int i, incr;
1155
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001156 /* Check for hangul syllables. */
1157 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001158 int len, L = -1, V = -1, T = -1;
1159 const char *pos = name + 16;
1160 find_syllable(pos, &len, &L, LCount, 0);
1161 pos += len;
1162 find_syllable(pos, &len, &V, VCount, 1);
1163 pos += len;
1164 find_syllable(pos, &len, &T, TCount, 2);
1165 pos += len;
1166 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1167 *code = SBase + (L*VCount+V)*TCount + T;
1168 return 1;
1169 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001170 /* Otherwise, it's an illegal syllable name. */
1171 return 0;
1172 }
1173
1174 /* Check for unified ideographs. */
1175 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1176 /* Four or five hexdigits must follow. */
1177 v = 0;
1178 name += 22;
1179 namelen -= 22;
1180 if (namelen != 4 && namelen != 5)
1181 return 0;
1182 while (namelen--) {
1183 v *= 16;
1184 if (*name >= '0' && *name <= '9')
1185 v += *name - '0';
1186 else if (*name >= 'A' && *name <= 'F')
1187 v += *name - 'A' + 10;
1188 else
1189 return 0;
1190 name++;
1191 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001192 if (!is_unified_ideograph(v))
1193 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001194 *code = v;
1195 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001196 }
1197
Fredrik Lundh06d12682001-01-24 07:59:11 +00001198 /* the following is the same as python's dictionary lookup, with
1199 only minor changes. see the makeunicodedata script for more
1200 details */
1201
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001202 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001203 i = (~h) & mask;
1204 v = code_hash[i];
1205 if (!v)
1206 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001207 if (_cmpname(self, v, name, namelen))
1208 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001209 incr = (h ^ (h >> 3)) & mask;
1210 if (!incr)
1211 incr = mask;
1212 for (;;) {
1213 i = (i + incr) & mask;
1214 v = code_hash[i];
1215 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001216 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001217 if (_cmpname(self, v, name, namelen))
1218 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001219 incr = incr << 1;
1220 if (incr > mask)
1221 incr = incr ^ code_poly;
1222 }
1223}
1224
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001225static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001226{
1227 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001228 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001229 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001230};
1231
1232/* -------------------------------------------------------------------- */
1233/* Python bindings */
1234
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001235PyDoc_STRVAR(unicodedata_name__doc__,
1236"name(unichr[, default])\n\
1237Returns the name assigned to the Unicode character unichr as a\n\
1238string. If no name is defined, default is returned, or, if not\n\
1239given, ValueError is raised.");
1240
Fredrik Lundh06d12682001-01-24 07:59:11 +00001241static PyObject *
1242unicodedata_name(PyObject* self, PyObject* args)
1243{
1244 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001245 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001246
1247 PyUnicodeObject* v;
1248 PyObject* defobj = NULL;
1249 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1250 return NULL;
1251
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001252 c = getuchar(v);
1253 if (c == (Py_UCS4)-1)
1254 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001255
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001256 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001257 if (defobj == NULL) {
1258 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001259 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001260 }
1261 else {
1262 Py_INCREF(defobj);
1263 return defobj;
1264 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001265 }
1266
Walter Dörwald4254e762007-06-05 16:04:09 +00001267 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001268}
1269
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001270PyDoc_STRVAR(unicodedata_lookup__doc__,
1271"lookup(name)\n\
1272\n\
1273Look up character by name. If a character with the\n\
1274given name is found, return the corresponding Unicode\n\
1275character. If not found, KeyError is raised.");
1276
Fredrik Lundh06d12682001-01-24 07:59:11 +00001277static PyObject *
1278unicodedata_lookup(PyObject* self, PyObject* args)
1279{
1280 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001281
1282 char* name;
Victor Stinner65a31442014-07-01 16:45:52 +02001283 Py_ssize_t namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001284 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001285 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1286 return NULL;
Victor Stinner65a31442014-07-01 16:45:52 +02001287 if (namelen > INT_MAX) {
1288 PyErr_SetString(PyExc_KeyError, "name too long");
1289 return NULL;
1290 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001291
Victor Stinner65a31442014-07-01 16:45:52 +02001292 if (!_getcode(self, name, (int)namelen, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001293 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001294 return NULL;
1295 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001296 /* check if code is in the PUA range that we use for named sequences
1297 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001298 if (IS_NAMED_SEQ(code)) {
1299 index = code-named_sequences_start;
1300 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1301 named_sequences[index].seq,
1302 named_sequences[index].seqlen);
1303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001305}
1306
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001307/* XXX Add doc strings. */
1308
1309static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001310 UNICODEDATA_UCD_DECIMAL_METHODDEF
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001311 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1312 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1313 {"category", unicodedata_category, METH_VARARGS,
1314 unicodedata_category__doc__},
1315 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1316 unicodedata_bidirectional__doc__},
1317 {"combining", unicodedata_combining, METH_VARARGS,
1318 unicodedata_combining__doc__},
1319 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1320 unicodedata_mirrored__doc__},
1321 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1322 unicodedata_east_asian_width__doc__},
1323 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1324 unicodedata_decomposition__doc__},
1325 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1326 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1327 {"normalize", unicodedata_normalize, METH_VARARGS,
1328 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001330};
1331
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001332static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001333 /* The ob_type field must be initialized in the module init function
1334 * to be portable to Windows without using C++. */
1335 PyVarObject_HEAD_INIT(NULL, 0)
1336 "unicodedata.UCD", /*tp_name*/
1337 sizeof(PreviousDBVersion), /*tp_basicsize*/
1338 0, /*tp_itemsize*/
1339 /* methods */
1340 (destructor)PyObject_Del, /*tp_dealloc*/
1341 0, /*tp_print*/
1342 0, /*tp_getattr*/
1343 0, /*tp_setattr*/
1344 0, /*tp_reserved*/
1345 0, /*tp_repr*/
1346 0, /*tp_as_number*/
1347 0, /*tp_as_sequence*/
1348 0, /*tp_as_mapping*/
1349 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001350 0, /*tp_call*/
1351 0, /*tp_str*/
1352 PyObject_GenericGetAttr,/*tp_getattro*/
1353 0, /*tp_setattro*/
1354 0, /*tp_as_buffer*/
1355 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1356 0, /*tp_doc*/
1357 0, /*tp_traverse*/
1358 0, /*tp_clear*/
1359 0, /*tp_richcompare*/
1360 0, /*tp_weaklistoffset*/
1361 0, /*tp_iter*/
1362 0, /*tp_iternext*/
1363 unicodedata_functions, /*tp_methods*/
1364 DB_members, /*tp_members*/
1365 0, /*tp_getset*/
1366 0, /*tp_base*/
1367 0, /*tp_dict*/
1368 0, /*tp_descr_get*/
1369 0, /*tp_descr_set*/
1370 0, /*tp_dictoffset*/
1371 0, /*tp_init*/
1372 0, /*tp_alloc*/
1373 0, /*tp_new*/
1374 0, /*tp_free*/
1375 0, /*tp_is_gc*/
1376};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001377
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001378PyDoc_STRVAR(unicodedata_docstring,
1379"This module provides access to the Unicode Character Database which\n\
1380defines character properties for all Unicode characters. The data in\n\
1381this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001382" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001383\n\
1384The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001385UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001386
1387static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 PyModuleDef_HEAD_INIT,
1389 "unicodedata",
1390 unicodedata_docstring,
1391 -1,
1392 unicodedata_functions,
1393 NULL,
1394 NULL,
1395 NULL,
1396 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001397};
1398
Mark Hammond62b1ab12002-07-23 06:31:15 +00001399PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001400PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001401{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001402 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001403
Christian Heimes90aa7642007-12-19 02:45:37 +00001404 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001405
Martin v. Löwis1a214512008-06-11 05:26:20 +00001406 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001407 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001408 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001409
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001410 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001411 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001412 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001413
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001414 /* Previous versions */
1415 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1416 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001417 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001418
Fredrik Lundh06d12682001-01-24 07:59:11 +00001419 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001420 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001421 if (v != NULL)
1422 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001423 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001424}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001425
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001426/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001427Local variables:
1428c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001429indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001430End:
1431*/