blob: d6f382942684bad41191bdbb73458af81746ced0 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Larry Hastings61272b72014-01-07 12:41:53 -080022/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080023module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080024class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080025[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080026/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080027
Fredrik Lundh06d12682001-01-24 07:59:11 +000028/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000029
/* One row of the character property table.  _getrecord_ex() hands out
   pointers to these records; all fields are read-only. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
41
/* Per-code-point difference between the current database and a previous
   Unicode version.  For the unsigned char fields the lookup functions in
   this file treat 0xFF as "no change recorded"; category_changed == 0 is
   treated as "code point unassigned in the previous version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char category_changed;   /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char decimal_changed;    /* previous decimal value */
    const unsigned char mirrored_changed;   /* previous mirrored flag */
    const double numeric_changed;           /* not read by the code visible
                                               in this part of the file */
} change_record;
50
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051/* data file generated by Tools/unicode/makeunicodedata.py */
52#include "unicodedata_db.h"
53
54static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000055_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000056{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000058 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059 index = 0;
60 else {
61 index = index1[(code>>SHIFT)];
62 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
63 }
64
65 return &_PyUnicode_Database_Records[index];
66}
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068/* ------------- Previous-version API ------------------------------------- */
69typedef struct previous_version {
70 PyObject_HEAD
71 const char *name;
72 const change_record* (*getrecord)(Py_UCS4);
73 Py_UCS4 (*normalization)(Py_UCS4);
74} PreviousDBVersion;
75
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030076#include "clinic/unicodedata.c.h"
77
/* Look up the change record of code point `v` in the previous-version
   object `self`.  `self` must point to a PreviousDBVersion; callers check
   this with UCD_Check() first.  Arguments are parenthesized so that
   arbitrary expressions can be passed safely. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
79
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000082 {NULL}
83};
84
Thomas Wouters89f507f2006-12-13 04:49:30 +000085/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000086static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000087#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088
89static PyObject*
90new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
91 Py_UCS4 (*normalization)(Py_UCS4))
92{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000093 PreviousDBVersion *self;
94 self = PyObject_New(PreviousDBVersion, &UCD_Type);
95 if (self == NULL)
96 return NULL;
97 self->name = name;
98 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000099 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000101}
102
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103
104static Py_UCS4 getuchar(PyUnicodeObject *obj)
105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106 if (PyUnicode_READY(obj))
107 return (Py_UCS4)-1;
108 if (PyUnicode_GET_LENGTH(obj) == 1) {
109 if (PyUnicode_READY(obj))
110 return (Py_UCS4)-1;
111 return PyUnicode_READ_CHAR(obj, 0);
112 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000113 PyErr_SetString(PyExc_TypeError,
114 "need a single Unicode character as parameter");
115 return (Py_UCS4)-1;
116}
117
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118/* --- Module API --------------------------------------------------------- */
119
Larry Hastings61272b72014-01-07 12:41:53 -0800120/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -0800121
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800122unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700123
Larry Hastings77561cc2014-01-07 12:13:13 -0800124 unichr: object(type='PyUnicodeObject *', subclass_of='&PyUnicode_Type')
Larry Hastings31826802013-10-19 00:09:25 -0700125 default: object=NULL
126 /
127
128Converts a Unicode character into its equivalent decimal value.
129
130Returns the decimal value assigned to the Unicode character unichr
131as integer. If no such value is defined, default is returned, or, if
132not given, ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800133[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700134
Larry Hastings31826802013-10-19 00:09:25 -0700135static PyObject *
Larry Hastings89964c42015-04-14 18:07:59 -0400136unicodedata_UCD_decimal_impl(PreviousDBVersion *self,
137 PyUnicodeObject *unichr,
138 PyObject *default_value)
139/*[clinic end generated code: output=bf853108f246ba19 input=c25c9d2b4de076b1]*/
Larry Hastings31826802013-10-19 00:09:25 -0700140{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000141 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000142 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000143 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000144
Larry Hastingsc2047262014-01-25 20:43:29 -0800145 c = getuchar(unichr);
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000146 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000147 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000148
Martin v. Löwis1a214512008-06-11 05:26:20 +0000149 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000150 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000151 if (old->category_changed == 0) {
152 /* unassigned */
153 have_old = 1;
154 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000156 else if (old->decimal_changed != 0xFF) {
157 have_old = 1;
158 rc = old->decimal_changed;
159 }
160 }
161
162 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000163 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000164 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700165 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000166 PyErr_SetString(PyExc_ValueError,
167 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 }
170 else {
Larry Hastings31826802013-10-19 00:09:25 -0700171 Py_INCREF(default_value);
172 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000175 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000176}
177
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000178PyDoc_STRVAR(unicodedata_digit__doc__,
179"digit(unichr[, default])\n\
180\n\
181Returns the digit value assigned to the Unicode character unichr as\n\
182integer. If no such value is defined, default is returned, or, if\n\
183not given, ValueError is raised.");
184
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000186unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000187{
188 PyUnicodeObject *v;
189 PyObject *defobj = NULL;
190 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000191 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192
Fredrik Lundh06d12682001-01-24 07:59:11 +0000193 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000194 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000195 c = getuchar(v);
196 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000197 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000198 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000199 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 if (defobj == NULL) {
201 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000202 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000203 }
204 else {
205 Py_INCREF(defobj);
206 return defobj;
207 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000209 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210}
211
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000212PyDoc_STRVAR(unicodedata_numeric__doc__,
213"numeric(unichr[, default])\n\
214\n\
215Returns the numeric value assigned to the Unicode character unichr\n\
216as float. If no such value is defined, default is returned, or, if\n\
217not given, ValueError is raised.");
218
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000219static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000220unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221{
222 PyUnicodeObject *v;
223 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000224 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000225 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000226 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000227
Fredrik Lundh06d12682001-01-24 07:59:11 +0000228 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000229 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 c = getuchar(v);
231 if (c == (Py_UCS4)-1)
232 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000233
Martin v. Löwis1a214512008-06-11 05:26:20 +0000234 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000235 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000236 if (old->category_changed == 0) {
237 /* unassigned */
238 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000241 else if (old->decimal_changed != 0xFF) {
242 have_old = 1;
243 rc = old->decimal_changed;
244 }
245 }
246
247 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000248 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000249 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 if (defobj == NULL) {
251 PyErr_SetString(PyExc_ValueError, "not a numeric character");
252 return NULL;
253 }
254 else {
255 Py_INCREF(defobj);
256 return defobj;
257 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000258 }
259 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000260}
261
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000262PyDoc_STRVAR(unicodedata_category__doc__,
263"category(unichr)\n\
264\n\
265Returns the general category assigned to the Unicode character\n\
266unichr as string.");
267
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000268static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000269unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270{
271 PyUnicodeObject *v;
272 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000273 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000274
275 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000276 &PyUnicode_Type, &v))
277 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000278 c = getuchar(v);
279 if (c == (Py_UCS4)-1)
280 return NULL;
281 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000282 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000283 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000284 if (old->category_changed != 0xFF)
285 index = old->category_changed;
286 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000287 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000288}
289
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000290PyDoc_STRVAR(unicodedata_bidirectional__doc__,
291"bidirectional(unichr)\n\
292\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200293Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000294unichr as string. If no such value is defined, an empty string is\n\
295returned.");
296
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000298unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000299{
300 PyUnicodeObject *v;
301 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000302 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000303
304 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000305 &PyUnicode_Type, &v))
306 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000307 c = getuchar(v);
308 if (c == (Py_UCS4)-1)
309 return NULL;
310 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000311 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000312 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000313 if (old->category_changed == 0)
314 index = 0; /* unassigned */
315 else if (old->bidir_changed != 0xFF)
316 index = old->bidir_changed;
317 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000318 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000319}
320
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000321PyDoc_STRVAR(unicodedata_combining__doc__,
322"combining(unichr)\n\
323\n\
324Returns the canonical combining class assigned to the Unicode\n\
325character unichr as integer. Returns 0 if no combining class is\n\
326defined.");
327
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000328static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000329unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000330{
331 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000332 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000333 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000334
335 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000336 &PyUnicode_Type, &v))
337 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000338 c = getuchar(v);
339 if (c == (Py_UCS4)-1)
340 return NULL;
341 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000342 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000343 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 if (old->category_changed == 0)
345 index = 0; /* unassigned */
346 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000347 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000348}
349
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000350PyDoc_STRVAR(unicodedata_mirrored__doc__,
351"mirrored(unichr)\n\
352\n\
353Returns the mirrored property assigned to the Unicode character\n\
354unichr as integer. Returns 1 if the character has been identified as\n\
355a \"mirrored\" character in bidirectional text, 0 otherwise.");
356
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000357static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000358unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000359{
360 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000362 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363
364 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 &PyUnicode_Type, &v))
366 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000367 c = getuchar(v);
368 if (c == (Py_UCS4)-1)
369 return NULL;
370 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000371 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000372 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000373 if (old->category_changed == 0)
374 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000375 else if (old->mirrored_changed != 0xFF)
376 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000378 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000379}
380
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000381PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
382"east_asian_width(unichr)\n\
383\n\
384Returns the east asian width assigned to the Unicode character\n\
385unichr as string.");
386
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000387static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000388unicodedata_east_asian_width(PyObject *self, PyObject *args)
389{
390 PyUnicodeObject *v;
391 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000392 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000393
394 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000395 &PyUnicode_Type, &v))
396 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000397 c = getuchar(v);
398 if (c == (Py_UCS4)-1)
399 return NULL;
400 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000401 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000402 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000403 if (old->category_changed == 0)
404 index = 0; /* unassigned */
405 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000406 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000407}
408
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000409PyDoc_STRVAR(unicodedata_decomposition__doc__,
410"decomposition(unichr)\n\
411\n\
412Returns the character decomposition mapping assigned to the Unicode\n\
413character unichr as string. An empty string is returned in case no\n\
414such mapping is defined.");
415
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000416static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000418{
419 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000420 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000421 int code, index, count;
422 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000423 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000424 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000425
426 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 &PyUnicode_Type, &v))
428 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000429 c = getuchar(v);
430 if (c == (Py_UCS4)-1)
431 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000432
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000433 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000434
Martin v. Löwis1a214512008-06-11 05:26:20 +0000435 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000436 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000437 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000438 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000439 }
440
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000441 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000442 index = 0;
443 else {
444 index = decomp_index1[(code>>DECOMP_SHIFT)];
445 index = decomp_index2[(index<<DECOMP_SHIFT)+
446 (code&((1<<DECOMP_SHIFT)-1))];
447 }
448
Tim Peters69b83b12001-11-30 07:23:05 +0000449 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000450 is prefix code (from*/
451 count = decomp_data[index] >> 8;
452
453 /* XXX: could allocate the PyString up front instead
454 (strlen(prefix) + 5 * count + 1 bytes) */
455
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000456 /* Based on how index is calculated above and decomp_data is generated
457 from Tools/unicode/makeunicodedata.py, it should not be possible
458 to overflow decomp_prefix. */
459 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200460 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000461
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000462 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000463 i = strlen(decomp_prefix[prefix_index]);
464 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000465
466 while (count-- > 0) {
467 if (i)
468 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000469 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000470 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
471 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000472 i += strlen(decomp + i);
473 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000474 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000475}
476
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000477static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000478get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000479{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000480 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000481 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000482 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000483 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000484 /* unassigned in old version */
485 *index = 0;
486 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000487 else {
488 *index = decomp_index1[(code>>DECOMP_SHIFT)];
489 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
490 (code&((1<<DECOMP_SHIFT)-1))];
491 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000492
Martin v. Löwis677bde22002-11-23 22:08:15 +0000493 /* high byte is number of hex bytes (usually one or two), low byte
494 is prefix code (from*/
495 *count = decomp_data[*index] >> 8;
496 *prefix = decomp_data[*index] & 255;
497
498 (*index)++;
499}
500
/* Constants for the arithmetic decomposition of Hangul syllables, as used
   by nfd_nfkd(): a syllable at SBase + SIndex splits into a leading
   consonant (L), a vowel (V) and an optional trailing consonant (T). */
#define SBase   0xAC00  /* first precomposed syllable */
#define LBase   0x1100  /* first leading consonant */
#define VBase   0x1161  /* first vowel */
#define TBase   0x11A7  /* one before the first trailing consonant;
                           T == TBase means "no trailing consonant" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of syllables */
510
511static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000512nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000513{
514 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200515 Py_UCS4 *output;
516 Py_ssize_t i, o, osize;
517 int kind;
518 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000519 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200520 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000521 Py_ssize_t space, isize;
522 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000523 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000524
Martin v. Löwis677bde22002-11-23 22:08:15 +0000525 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200526 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500527 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300528 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500529 if (space > 10) {
530 if (space <= PY_SSIZE_T_MAX - 10)
531 space += 10;
532 }
533 else {
534 space *= 2;
535 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200536 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500537 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200538 if (!output) {
539 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000540 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200541 }
542 i = o = 0;
543 kind = PyUnicode_KIND(input);
544 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000545
Martin v. Löwis22970662011-09-29 13:39:38 +0200546 while (i < isize) {
547 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000548 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200549 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000550 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300551 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000552 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000553 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200554 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000555 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000556 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
557 if (new_output == NULL) {
558 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200559 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000560 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200561 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000562 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000563 }
564 /* Hangul Decomposition. */
565 if (SBase <= code && code < (SBase+SCount)) {
566 int SIndex = code - SBase;
567 int L = LBase + SIndex / NCount;
568 int V = VBase + (SIndex % NCount) / TCount;
569 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200570 output[o++] = L;
571 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000572 space -= 2;
573 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200574 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000575 space --;
576 }
577 continue;
578 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000579 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000580 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000581 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
582 if (value != 0) {
583 stack[stackptr++] = value;
584 continue;
585 }
586 }
587
588 /* Other decompositions. */
589 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000590
591 /* Copy character if it is not decomposable, or has a
592 compatibility decomposition, but we do NFD. */
593 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200594 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000595 space--;
596 continue;
597 }
598 /* Copy decomposition onto the stack, in reverse
599 order. */
600 while(count) {
601 code = decomp_data[index + (--count)];
602 stack[stackptr++] = code;
603 }
604 }
605 }
606
Martin v. Löwis22970662011-09-29 13:39:38 +0200607 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
608 output, o);
609 PyMem_Free(output);
610 if (!result)
611 return NULL;
612 /* result is guaranteed to be ready, as it is compact. */
613 kind = PyUnicode_KIND(result);
614 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000615
616 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200617 i = 0;
618 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
619 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
620 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000621 if (prev == 0 || cur == 0 || prev <= cur) {
622 prev = cur;
623 continue;
624 }
625 /* Non-canonical order. Need to switch *i with previous. */
626 o = i - 1;
627 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200628 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
629 PyUnicode_WRITE(kind, data, o+1,
630 PyUnicode_READ(kind, data, o));
631 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000632 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200633 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200635 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000636 if (prev == 0 || prev <= cur)
637 break;
638 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200639 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 }
641 return result;
642}
643
644static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200645find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200647 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000648 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200649 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000650 if (code < start)
651 return -1;
652 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200653 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 return nfc[index].index + delta;
655 }
656 }
657 return -1;
658}
659
660static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000661nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000662{
663 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200664 int kind;
665 void *data;
666 Py_UCS4 *output;
667 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200669 Py_UCS4 code;
670 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000671 int cskipped = 0;
672
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000673 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000674 if (!result)
675 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200676 /* result will be "ready". */
677 kind = PyUnicode_KIND(result);
678 data = PyUnicode_DATA(result);
679 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000680
Martin v. Löwis22970662011-09-29 13:39:38 +0200681 /* We allocate a buffer for the output.
682 If we find that we made no changes, we still return
683 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500684 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200685 if (!output) {
686 PyErr_NoMemory();
687 Py_DECREF(result);
688 return 0;
689 }
690 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200693 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 for (index = 0; index < cskipped; index++) {
695 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 Remove from list. */
698 skipped[index] = skipped[cskipped-1];
699 cskipped--;
700 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000701 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000702 }
703 }
704 /* Hangul Composition. We don't need to check for <LV,T>
705 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200706 code = PyUnicode_READ(kind, data, i);
707 if (LBase <= code && code < (LBase+LCount) &&
708 i + 1 < len &&
709 VBase <= PyUnicode_READ(kind, data, i+1) &&
710 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000711 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200712 LIndex = code - LBase;
713 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000714 code = SBase + (LIndex*VCount+VIndex)*TCount;
715 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200716 if (i < len &&
717 TBase <= PyUnicode_READ(kind, data, i) &&
718 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
719 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000720 i++;
721 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200722 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000723 continue;
724 }
725
Martin v. Löwis22970662011-09-29 13:39:38 +0200726 /* code is still input[i] here */
727 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000728 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200729 output[o++] = code;
730 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000731 continue;
732 }
733 /* Find next unblocked character. */
734 i1 = i+1;
735 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200736 /* output base character for now; might be updated later. */
737 output[o] = PyUnicode_READ(kind, data, i);
738 while (i1 < len) {
739 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
740 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000741 if (comb) {
742 if (comb1 == 0)
743 break;
744 if (comb >= comb1) {
745 /* Character is blocked. */
746 i1++;
747 continue;
748 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000749 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200750 l = find_nfc_index(self, nfc_last, code1);
751 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000752 is a starter, we don't need to look further.
753 Otherwise, record the combining class. */
754 if (l == -1) {
755 not_combinable:
756 if (comb1 == 0)
757 break;
758 comb = comb1;
759 i1++;
760 continue;
761 }
762 index = f*TOTAL_LAST + l;
763 index1 = comp_index[index >> COMP_SHIFT];
764 code = comp_data[(index1<<COMP_SHIFT)+
765 (index&((1<<COMP_SHIFT)-1))];
766 if (code == 0)
767 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000768
Martin v. Löwis677bde22002-11-23 22:08:15 +0000769 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200770 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000771 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000772 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000773 skipped[cskipped++] = i1;
774 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200775 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000776 if (f == -1)
777 break;
778 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200779 /* Output character was already written.
780 Just advance the indices. */
781 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000782 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200783 if (o == len) {
784 /* No changes. Return original string. */
785 PyMem_Free(output);
786 return result;
787 }
788 Py_DECREF(result);
789 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
790 output, o);
791 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000792 return result;
793}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000794
795/* Return 1 if the input is certainly normalized, 0 if it might not be. */
796static int
797is_normalized(PyObject *self, PyObject *input, int nfc, int k)
798{
Martin v. Löwis22970662011-09-29 13:39:38 +0200799 Py_ssize_t i, len;
800 int kind;
801 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000802 unsigned char prev_combining = 0, quickcheck_mask;
803
804 /* An older version of the database is requested, quickchecks must be
805 disabled. */
806 if (self && UCD_Check(self))
807 return 0;
808
809 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
810 as described in http://unicode.org/reports/tr15/#Annex8. */
811 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
812
Martin v. Löwis22970662011-09-29 13:39:38 +0200813 i = 0;
814 kind = PyUnicode_KIND(input);
815 data = PyUnicode_DATA(input);
816 len = PyUnicode_GET_LENGTH(input);
817 while (i < len) {
818 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
819 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000820 unsigned char combining = record->combining;
821 unsigned char quickcheck = record->normalization_quick_check;
822
823 if (quickcheck & quickcheck_mask)
824 return 0; /* this string might need normalization */
825 if (combining && prev_combining > combining)
826 return 0; /* non-canonical sort order, not normalized */
827 prev_combining = combining;
828 }
829 return 1; /* certainly normalized */
830}
831
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000832PyDoc_STRVAR(unicodedata_normalize__doc__,
833"normalize(form, unistr)\n\
834\n\
835Return the normal form 'form' for the Unicode string unistr. Valid\n\
836values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
837
Martin v. Löwis677bde22002-11-23 22:08:15 +0000838static PyObject*
839unicodedata_normalize(PyObject *self, PyObject *args)
840{
841 char *form;
842 PyObject *input;
843
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000844 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000845 &form, &PyUnicode_Type, &input))
846 return NULL;
847
Martin v. Löwis22970662011-09-29 13:39:38 +0200848 if (PyUnicode_READY(input) == -1)
849 return NULL;
850
851 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000852 /* Special case empty input strings, since resizing
853 them later would cause internal errors. */
854 Py_INCREF(input);
855 return input;
856 }
857
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000858 if (strcmp(form, "NFC") == 0) {
859 if (is_normalized(self, input, 1, 0)) {
860 Py_INCREF(input);
861 return input;
862 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000863 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000864 }
865 if (strcmp(form, "NFKC") == 0) {
866 if (is_normalized(self, input, 1, 1)) {
867 Py_INCREF(input);
868 return input;
869 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000870 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000871 }
872 if (strcmp(form, "NFD") == 0) {
873 if (is_normalized(self, input, 0, 0)) {
874 Py_INCREF(input);
875 return input;
876 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000877 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000878 }
879 if (strcmp(form, "NFKD") == 0) {
880 if (is_normalized(self, input, 0, 1)) {
881 Py_INCREF(input);
882 return input;
883 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000884 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000885 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000886 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
887 return NULL;
888}
889
Fredrik Lundh06d12682001-01-24 07:59:11 +0000890/* -------------------------------------------------------------------- */
891/* unicode character name tables */
892
893/* data file generated by Tools/unicode/makeunicodedata.py */
894#include "unicodename_db.h"
895
896/* -------------------------------------------------------------------- */
897/* database code (cut and pasted from the unidb package) */
898
static unsigned long
_gethash(const char *s, int len, int scale)
{
    /* Case-insensitive hash of the first len bytes of s: each byte is
       upper-cased before being mixed in, and the running value is folded
       back to 24 bits whenever bits 24..31 become set. */
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        pos++;
        high = hash & 0xff000000;
        if (high)
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
913
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Row i holds, per column: [0] leading consonant i, [1] vowel i,
   [2] trailing consonant i.  The columns have different lengths; slots
   past the end of a column are NULL. */
static char *hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
944
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000945/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000946static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000947is_unified_ideograph(Py_UCS4 code)
948{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000949 return
950 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500951 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000952 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
953 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
954 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000955}
956
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that expression arguments are compared
 * as a whole; note that it is still evaluated twice, so it must not have
 * side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
962
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000963static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300964_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
965 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000966{
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200967 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300968 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
969 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000970 int offset;
971 int i;
972 int word;
973 unsigned char* w;
974
Martin v. Löwisc3509122006-03-11 12:16:23 +0000975 if (code >= 0x110000)
976 return 0;
977
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200978 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300979 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
980 return 0;
981
Martin v. Löwis1a214512008-06-11 05:26:20 +0000982 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300983 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300984 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300985 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
986 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300987 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000988 if (old->category_changed == 0) {
989 /* unassigned */
990 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000991 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000992 }
993
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000994 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000995 /* Hangul syllable. */
996 int SIndex = code - SBase;
997 int L = SIndex / NCount;
998 int V = (SIndex % NCount) / TCount;
999 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001000
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001001 if (buflen < 27)
1002 /* Worst case: HANGUL SYLLABLE <10chars>. */
1003 return 0;
1004 strcpy(buffer, "HANGUL SYLLABLE ");
1005 buffer += 16;
1006 strcpy(buffer, hangul_syllables[L][0]);
1007 buffer += strlen(hangul_syllables[L][0]);
1008 strcpy(buffer, hangul_syllables[V][1]);
1009 buffer += strlen(hangul_syllables[V][1]);
1010 strcpy(buffer, hangul_syllables[T][2]);
1011 buffer += strlen(hangul_syllables[T][2]);
1012 *buffer = '\0';
1013 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001014 }
1015
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001016 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001017 if (buflen < 28)
1018 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1019 return 0;
1020 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1021 return 1;
1022 }
1023
Fredrik Lundh06d12682001-01-24 07:59:11 +00001024 /* get offset into phrasebook */
1025 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1026 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1027 (code&((1<<phrasebook_shift)-1))];
1028 if (!offset)
1029 return 0;
1030
1031 i = 0;
1032
1033 for (;;) {
1034 /* get word index */
1035 word = phrasebook[offset] - phrasebook_short;
1036 if (word >= 0) {
1037 word = (word << 8) + phrasebook[offset+1];
1038 offset += 2;
1039 } else
1040 word = phrasebook[offset++];
1041 if (i) {
1042 if (i > buflen)
1043 return 0; /* buffer overflow */
1044 buffer[i++] = ' ';
1045 }
1046 /* copy word string from lexicon. the last character in the
1047 word has bit 7 set. the last word in a string ends with
1048 0x80 */
1049 w = lexicon + lexicon_offset[word];
1050 while (*w < 128) {
1051 if (i >= buflen)
1052 return 0; /* buffer overflow */
1053 buffer[i++] = *w++;
1054 }
1055 if (i >= buflen)
1056 return 0; /* buffer overflow */
1057 buffer[i++] = *w & 127;
1058 if (*w == 128)
1059 break; /* end of word */
1060 }
1061
1062 return 1;
1063}
1064
1065static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001066_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001067{
1068 /* check if code corresponds to the given name */
1069 int i;
1070 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001071 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001072 return 0;
1073 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001074 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001075 return 0;
1076 }
1077 return buffer[namelen] == '\0';
1078}
1079
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001080static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001081find_syllable(const char *str, int *len, int *pos, int count, int column)
1082{
1083 int i, len1;
1084 *len = -1;
1085 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001086 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001087 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001088 if (len1 <= *len)
1089 continue;
1090 if (strncmp(str, s, len1) == 0) {
1091 *len = len1;
1092 *pos = i;
1093 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001094 }
1095 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001096 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001097 }
1098}
1099
Fredrik Lundh06d12682001-01-24 07:59:11 +00001100static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001101_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001102{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001103 /* check if named sequences are allowed */
1104 if (!with_named_seq && IS_NAMED_SEQ(cp))
1105 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001106 /* if the code point is in the PUA range that we use for aliases,
1107 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001108 if (IS_ALIAS(cp))
1109 *code = name_aliases[cp-aliases_start];
1110 else
1111 *code = cp;
1112 return 1;
1113}
1114
1115static int
1116_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1117 int with_named_seq)
1118{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001119 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001120 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001121 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001122 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001123 unsigned int h, v;
1124 unsigned int mask = code_size-1;
1125 unsigned int i, incr;
1126
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001127 /* Check for hangul syllables. */
1128 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001129 int len, L = -1, V = -1, T = -1;
1130 const char *pos = name + 16;
1131 find_syllable(pos, &len, &L, LCount, 0);
1132 pos += len;
1133 find_syllable(pos, &len, &V, VCount, 1);
1134 pos += len;
1135 find_syllable(pos, &len, &T, TCount, 2);
1136 pos += len;
1137 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1138 *code = SBase + (L*VCount+V)*TCount + T;
1139 return 1;
1140 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001141 /* Otherwise, it's an illegal syllable name. */
1142 return 0;
1143 }
1144
1145 /* Check for unified ideographs. */
1146 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1147 /* Four or five hexdigits must follow. */
1148 v = 0;
1149 name += 22;
1150 namelen -= 22;
1151 if (namelen != 4 && namelen != 5)
1152 return 0;
1153 while (namelen--) {
1154 v *= 16;
1155 if (*name >= '0' && *name <= '9')
1156 v += *name - '0';
1157 else if (*name >= 'A' && *name <= 'F')
1158 v += *name - 'A' + 10;
1159 else
1160 return 0;
1161 name++;
1162 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001163 if (!is_unified_ideograph(v))
1164 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001165 *code = v;
1166 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001167 }
1168
Fredrik Lundh06d12682001-01-24 07:59:11 +00001169 /* the following is the same as python's dictionary lookup, with
1170 only minor changes. see the makeunicodedata script for more
1171 details */
1172
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001173 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001174 i = (~h) & mask;
1175 v = code_hash[i];
1176 if (!v)
1177 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001178 if (_cmpname(self, v, name, namelen))
1179 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001180 incr = (h ^ (h >> 3)) & mask;
1181 if (!incr)
1182 incr = mask;
1183 for (;;) {
1184 i = (i + incr) & mask;
1185 v = code_hash[i];
1186 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001187 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001188 if (_cmpname(self, v, name, namelen))
1189 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001190 incr = incr << 1;
1191 if (incr > mask)
1192 incr = incr ^ code_poly;
1193 }
1194}
1195
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001196static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001197{
1198 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001199 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001200 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001201};
1202
1203/* -------------------------------------------------------------------- */
1204/* Python bindings */
1205
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001206PyDoc_STRVAR(unicodedata_name__doc__,
1207"name(unichr[, default])\n\
1208Returns the name assigned to the Unicode character unichr as a\n\
1209string. If no name is defined, default is returned, or, if not\n\
1210given, ValueError is raised.");
1211
Fredrik Lundh06d12682001-01-24 07:59:11 +00001212static PyObject *
1213unicodedata_name(PyObject* self, PyObject* args)
1214{
1215 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001216 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001217
1218 PyUnicodeObject* v;
1219 PyObject* defobj = NULL;
1220 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1221 return NULL;
1222
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001223 c = getuchar(v);
1224 if (c == (Py_UCS4)-1)
1225 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001226
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001227 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001228 if (defobj == NULL) {
1229 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001230 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001231 }
1232 else {
1233 Py_INCREF(defobj);
1234 return defobj;
1235 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001236 }
1237
Walter Dörwald4254e762007-06-05 16:04:09 +00001238 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001239}
1240
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001241PyDoc_STRVAR(unicodedata_lookup__doc__,
1242"lookup(name)\n\
1243\n\
1244Look up character by name. If a character with the\n\
1245given name is found, return the corresponding Unicode\n\
1246character. If not found, KeyError is raised.");
1247
Fredrik Lundh06d12682001-01-24 07:59:11 +00001248static PyObject *
1249unicodedata_lookup(PyObject* self, PyObject* args)
1250{
1251 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001252
1253 char* name;
Victor Stinner65a31442014-07-01 16:45:52 +02001254 Py_ssize_t namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001255 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001256 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1257 return NULL;
Victor Stinner65a31442014-07-01 16:45:52 +02001258 if (namelen > INT_MAX) {
1259 PyErr_SetString(PyExc_KeyError, "name too long");
1260 return NULL;
1261 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001262
Victor Stinner65a31442014-07-01 16:45:52 +02001263 if (!_getcode(self, name, (int)namelen, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001264 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001265 return NULL;
1266 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001267 /* check if code is in the PUA range that we use for named sequences
1268 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001269 if (IS_NAMED_SEQ(code)) {
1270 index = code-named_sequences_start;
1271 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1272 named_sequences[index].seq,
1273 named_sequences[index].seqlen);
1274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001276}
1277
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001278/* XXX Add doc strings. */
1279
1280static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001281 UNICODEDATA_UCD_DECIMAL_METHODDEF
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001282 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1283 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1284 {"category", unicodedata_category, METH_VARARGS,
1285 unicodedata_category__doc__},
1286 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1287 unicodedata_bidirectional__doc__},
1288 {"combining", unicodedata_combining, METH_VARARGS,
1289 unicodedata_combining__doc__},
1290 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1291 unicodedata_mirrored__doc__},
1292 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1293 unicodedata_east_asian_width__doc__},
1294 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1295 unicodedata_decomposition__doc__},
1296 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1297 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1298 {"normalize", unicodedata_normalize, METH_VARARGS,
1299 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001301};
1302
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001303static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 /* The ob_type field must be initialized in the module init function
1305 * to be portable to Windows without using C++. */
1306 PyVarObject_HEAD_INIT(NULL, 0)
1307 "unicodedata.UCD", /*tp_name*/
1308 sizeof(PreviousDBVersion), /*tp_basicsize*/
1309 0, /*tp_itemsize*/
1310 /* methods */
1311 (destructor)PyObject_Del, /*tp_dealloc*/
1312 0, /*tp_print*/
1313 0, /*tp_getattr*/
1314 0, /*tp_setattr*/
1315 0, /*tp_reserved*/
1316 0, /*tp_repr*/
1317 0, /*tp_as_number*/
1318 0, /*tp_as_sequence*/
1319 0, /*tp_as_mapping*/
1320 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001321 0, /*tp_call*/
1322 0, /*tp_str*/
1323 PyObject_GenericGetAttr,/*tp_getattro*/
1324 0, /*tp_setattro*/
1325 0, /*tp_as_buffer*/
1326 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1327 0, /*tp_doc*/
1328 0, /*tp_traverse*/
1329 0, /*tp_clear*/
1330 0, /*tp_richcompare*/
1331 0, /*tp_weaklistoffset*/
1332 0, /*tp_iter*/
1333 0, /*tp_iternext*/
1334 unicodedata_functions, /*tp_methods*/
1335 DB_members, /*tp_members*/
1336 0, /*tp_getset*/
1337 0, /*tp_base*/
1338 0, /*tp_dict*/
1339 0, /*tp_descr_get*/
1340 0, /*tp_descr_set*/
1341 0, /*tp_dictoffset*/
1342 0, /*tp_init*/
1343 0, /*tp_alloc*/
1344 0, /*tp_new*/
1345 0, /*tp_free*/
1346 0, /*tp_is_gc*/
1347};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001348
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001349PyDoc_STRVAR(unicodedata_docstring,
1350"This module provides access to the Unicode Character Database which\n\
1351defines character properties for all Unicode characters. The data in\n\
1352this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001353" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001354\n\
1355The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001356UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001357
1358static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 PyModuleDef_HEAD_INIT,
1360 "unicodedata",
1361 unicodedata_docstring,
1362 -1,
1363 unicodedata_functions,
1364 NULL,
1365 NULL,
1366 NULL,
1367 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001368};
1369
Mark Hammond62b1ab12002-07-23 06:31:15 +00001370PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001371PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001372{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001373 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001374
Christian Heimes90aa7642007-12-19 02:45:37 +00001375 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001376
Martin v. Löwis1a214512008-06-11 05:26:20 +00001377 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001378 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001379 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001380
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001381 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001382 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001383 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001384
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001385 /* Previous versions */
1386 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1387 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001388 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001389
Fredrik Lundh06d12682001-01-24 07:59:11 +00001390 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001391 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001392 if (v != NULL)
1393 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001394 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001395}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001396
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001397/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001398Local variables:
1399c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001400indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001401End:
1402*/