/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Larry Hastings61272b72014-01-07 12:41:53 -080022/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080023module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080024class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080025[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080026/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080027
/* character properties */

/* Per-character property record.  Instances live in the generated
   _PyUnicode_Database_Records table and are looked up by _getrecord_ex().
   The field order must not change: the generated table initializes the
   records positionally. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
41
/* Difference between the current database and a previous Unicode version
   for one code point.

   In the byte-sized fields the value 0xFF is tested by the accessors in
   this file as "unchanged, use the current value".  category_changed == 0
   is treated by the accessors as "code point unassigned in the old
   version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
50
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051/* data file generated by Tools/unicode/makeunicodedata.py */
52#include "unicodedata_db.h"
53
54static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000055_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000056{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000058 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059 index = 0;
60 else {
61 index = index1[(code>>SHIFT)];
62 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
63 }
64
65 return &_PyUnicode_Database_Records[index];
66}
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068/* ------------- Previous-version API ------------------------------------- */
69typedef struct previous_version {
70 PyObject_HEAD
71 const char *name;
72 const change_record* (*getrecord)(Py_UCS4);
73 Py_UCS4 (*normalization)(Py_UCS4);
74} PreviousDBVersion;
75
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030076#include "clinic/unicodedata.c.h"
77
/* Fetch the change record for code point `v` from the PreviousDBVersion
   object `self`.  The caller must have verified with UCD_Check() that
   `self` really is a PreviousDBVersion. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
79
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000082 {NULL}
83};
84
Thomas Wouters89f507f2006-12-13 04:49:30 +000085/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000086static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000087#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088
89static PyObject*
90new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
91 Py_UCS4 (*normalization)(Py_UCS4))
92{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000093 PreviousDBVersion *self;
94 self = PyObject_New(PreviousDBVersion, &UCD_Type);
95 if (self == NULL)
96 return NULL;
97 self->name = name;
98 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000099 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000101}
102
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103
104static Py_UCS4 getuchar(PyUnicodeObject *obj)
105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106 if (PyUnicode_READY(obj))
107 return (Py_UCS4)-1;
108 if (PyUnicode_GET_LENGTH(obj) == 1) {
109 if (PyUnicode_READY(obj))
110 return (Py_UCS4)-1;
111 return PyUnicode_READ_CHAR(obj, 0);
112 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000113 PyErr_SetString(PyExc_TypeError,
114 "need a single Unicode character as parameter");
115 return (Py_UCS4)-1;
116}
117
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118/* --- Module API --------------------------------------------------------- */
119
Larry Hastings61272b72014-01-07 12:41:53 -0800120/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -0800121
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800122unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700123
Larry Hastings77561cc2014-01-07 12:13:13 -0800124 unichr: object(type='PyUnicodeObject *', subclass_of='&PyUnicode_Type')
Larry Hastings31826802013-10-19 00:09:25 -0700125 default: object=NULL
126 /
127
128Converts a Unicode character into its equivalent decimal value.
129
130Returns the decimal value assigned to the Unicode character unichr
131as integer. If no such value is defined, default is returned, or, if
132not given, ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800133[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700134
Larry Hastings31826802013-10-19 00:09:25 -0700135static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800136unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300137/*[clinic end generated code: output=d285215533b58b28 input=c25c9d2b4de076b1]*/
Larry Hastings31826802013-10-19 00:09:25 -0700138{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000139 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000140 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000141 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000142
Larry Hastingsc2047262014-01-25 20:43:29 -0800143 c = getuchar(unichr);
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000144 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000145 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000146
Martin v. Löwis1a214512008-06-11 05:26:20 +0000147 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000148 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000149 if (old->category_changed == 0) {
150 /* unassigned */
151 have_old = 1;
152 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000153 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000154 else if (old->decimal_changed != 0xFF) {
155 have_old = 1;
156 rc = old->decimal_changed;
157 }
158 }
159
160 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000161 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000162 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700163 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000164 PyErr_SetString(PyExc_ValueError,
165 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000166 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 }
168 else {
Larry Hastings31826802013-10-19 00:09:25 -0700169 Py_INCREF(default_value);
170 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000172 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000173 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174}
175
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000176PyDoc_STRVAR(unicodedata_digit__doc__,
177"digit(unichr[, default])\n\
178\n\
179Returns the digit value assigned to the Unicode character unichr as\n\
180integer. If no such value is defined, default is returned, or, if\n\
181not given, ValueError is raised.");
182
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000183static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185{
186 PyUnicodeObject *v;
187 PyObject *defobj = NULL;
188 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000189 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190
Fredrik Lundh06d12682001-01-24 07:59:11 +0000191 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000192 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000193 c = getuchar(v);
194 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000195 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000196 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000197 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 if (defobj == NULL) {
199 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000200 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000201 }
202 else {
203 Py_INCREF(defobj);
204 return defobj;
205 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000206 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000207 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208}
209
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000210PyDoc_STRVAR(unicodedata_numeric__doc__,
211"numeric(unichr[, default])\n\
212\n\
213Returns the numeric value assigned to the Unicode character unichr\n\
214as float. If no such value is defined, default is returned, or, if\n\
215not given, ValueError is raised.");
216
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000217static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000218unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000219{
220 PyUnicodeObject *v;
221 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000222 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000223 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000224 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000225
Fredrik Lundh06d12682001-01-24 07:59:11 +0000226 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000227 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000228 c = getuchar(v);
229 if (c == (Py_UCS4)-1)
230 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000231
Martin v. Löwis1a214512008-06-11 05:26:20 +0000232 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000233 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000234 if (old->category_changed == 0) {
235 /* unassigned */
236 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000239 else if (old->decimal_changed != 0xFF) {
240 have_old = 1;
241 rc = old->decimal_changed;
242 }
243 }
244
245 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000246 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000247 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 if (defobj == NULL) {
249 PyErr_SetString(PyExc_ValueError, "not a numeric character");
250 return NULL;
251 }
252 else {
253 Py_INCREF(defobj);
254 return defobj;
255 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256 }
257 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000258}
259
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000260PyDoc_STRVAR(unicodedata_category__doc__,
261"category(unichr)\n\
262\n\
263Returns the general category assigned to the Unicode character\n\
264unichr as string.");
265
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000266static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000267unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000268{
269 PyUnicodeObject *v;
270 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000271 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000272
273 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 &PyUnicode_Type, &v))
275 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000276 c = getuchar(v);
277 if (c == (Py_UCS4)-1)
278 return NULL;
279 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000280 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000281 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000282 if (old->category_changed != 0xFF)
283 index = old->category_changed;
284 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000285 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286}
287
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000288PyDoc_STRVAR(unicodedata_bidirectional__doc__,
289"bidirectional(unichr)\n\
290\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200291Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000292unichr as string. If no such value is defined, an empty string is\n\
293returned.");
294
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000295static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000296unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297{
298 PyUnicodeObject *v;
299 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000300 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000301
302 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000303 &PyUnicode_Type, &v))
304 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000305 c = getuchar(v);
306 if (c == (Py_UCS4)-1)
307 return NULL;
308 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000309 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000310 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000311 if (old->category_changed == 0)
312 index = 0; /* unassigned */
313 else if (old->bidir_changed != 0xFF)
314 index = old->bidir_changed;
315 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000316 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000317}
318
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000319PyDoc_STRVAR(unicodedata_combining__doc__,
320"combining(unichr)\n\
321\n\
322Returns the canonical combining class assigned to the Unicode\n\
323character unichr as integer. Returns 0 if no combining class is\n\
324defined.");
325
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000327unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000328{
329 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000330 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000331 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000332
333 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000334 &PyUnicode_Type, &v))
335 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000336 c = getuchar(v);
337 if (c == (Py_UCS4)-1)
338 return NULL;
339 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000340 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000341 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000342 if (old->category_changed == 0)
343 index = 0; /* unassigned */
344 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000345 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346}
347
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000348PyDoc_STRVAR(unicodedata_mirrored__doc__,
349"mirrored(unichr)\n\
350\n\
351Returns the mirrored property assigned to the Unicode character\n\
352unichr as integer. Returns 1 if the character has been identified as\n\
353a \"mirrored\" character in bidirectional text, 0 otherwise.");
354
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000355static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000356unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000357{
358 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000359 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000360 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000361
362 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000363 &PyUnicode_Type, &v))
364 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000365 c = getuchar(v);
366 if (c == (Py_UCS4)-1)
367 return NULL;
368 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000369 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000370 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000371 if (old->category_changed == 0)
372 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000373 else if (old->mirrored_changed != 0xFF)
374 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000375 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000376 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000377}
378
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000379PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
380"east_asian_width(unichr)\n\
381\n\
382Returns the east asian width assigned to the Unicode character\n\
383unichr as string.");
384
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000385static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000386unicodedata_east_asian_width(PyObject *self, PyObject *args)
387{
388 PyUnicodeObject *v;
389 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000390 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000391
392 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000393 &PyUnicode_Type, &v))
394 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000395 c = getuchar(v);
396 if (c == (Py_UCS4)-1)
397 return NULL;
398 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000399 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000400 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000401 if (old->category_changed == 0)
402 index = 0; /* unassigned */
403 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000404 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000405}
406
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000407PyDoc_STRVAR(unicodedata_decomposition__doc__,
408"decomposition(unichr)\n\
409\n\
410Returns the character decomposition mapping assigned to the Unicode\n\
411character unichr as string. An empty string is returned in case no\n\
412such mapping is defined.");
413
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000414static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000416{
417 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000418 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000419 int code, index, count;
420 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000421 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000422 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000423
424 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000425 &PyUnicode_Type, &v))
426 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000427 c = getuchar(v);
428 if (c == (Py_UCS4)-1)
429 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000430
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000431 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000432
Martin v. Löwis1a214512008-06-11 05:26:20 +0000433 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000434 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000435 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000436 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000437 }
438
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000439 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000440 index = 0;
441 else {
442 index = decomp_index1[(code>>DECOMP_SHIFT)];
443 index = decomp_index2[(index<<DECOMP_SHIFT)+
444 (code&((1<<DECOMP_SHIFT)-1))];
445 }
446
Tim Peters69b83b12001-11-30 07:23:05 +0000447 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448 is prefix code (from*/
449 count = decomp_data[index] >> 8;
450
451 /* XXX: could allocate the PyString up front instead
452 (strlen(prefix) + 5 * count + 1 bytes) */
453
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000454 /* Based on how index is calculated above and decomp_data is generated
455 from Tools/unicode/makeunicodedata.py, it should not be possible
456 to overflow decomp_prefix. */
457 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200458 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000459
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000460 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000461 i = strlen(decomp_prefix[prefix_index]);
462 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000463
464 while (count-- > 0) {
465 if (i)
466 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000467 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000468 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
469 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000470 i += strlen(decomp + i);
471 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000472 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000473}
474
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000475static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000476get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000477{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000478 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000479 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000480 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000481 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000482 /* unassigned in old version */
483 *index = 0;
484 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000485 else {
486 *index = decomp_index1[(code>>DECOMP_SHIFT)];
487 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
488 (code&((1<<DECOMP_SHIFT)-1))];
489 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000490
Martin v. Löwis677bde22002-11-23 22:08:15 +0000491 /* high byte is number of hex bytes (usually one or two), low byte
492 is prefix code (from*/
493 *count = decomp_data[*index] >> 8;
494 *prefix = decomp_data[*index] & 255;
495
496 (*index)++;
497}
498
/* Constants for the arithmetic decomposition of Hangul syllables used by
   nfd_nfkd(): base code points of the syllable (S), leading consonant (L),
   vowel (V) and trailing consonant (T) ranges, and the sizes of those
   ranges.  A syllable index splits into L, V and T parts via NCount and
   TCount. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
508
509static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000510nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000511{
512 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200513 Py_UCS4 *output;
514 Py_ssize_t i, o, osize;
515 int kind;
516 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000517 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200518 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000519 Py_ssize_t space, isize;
520 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000521 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522
Martin v. Löwis677bde22002-11-23 22:08:15 +0000523 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200524 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500525 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300526 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500527 if (space > 10) {
528 if (space <= PY_SSIZE_T_MAX - 10)
529 space += 10;
530 }
531 else {
532 space *= 2;
533 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200534 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500535 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200536 if (!output) {
537 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000538 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200539 }
540 i = o = 0;
541 kind = PyUnicode_KIND(input);
542 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000543
Martin v. Löwis22970662011-09-29 13:39:38 +0200544 while (i < isize) {
545 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200547 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000548 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300549 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000550 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000551 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200552 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000553 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000554 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
555 if (new_output == NULL) {
556 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200557 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000558 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200559 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000560 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000561 }
562 /* Hangul Decomposition. */
563 if (SBase <= code && code < (SBase+SCount)) {
564 int SIndex = code - SBase;
565 int L = LBase + SIndex / NCount;
566 int V = VBase + (SIndex % NCount) / TCount;
567 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200568 output[o++] = L;
569 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000570 space -= 2;
571 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200572 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000573 space --;
574 }
575 continue;
576 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000577 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000578 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000579 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
580 if (value != 0) {
581 stack[stackptr++] = value;
582 continue;
583 }
584 }
585
586 /* Other decompositions. */
587 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000588
589 /* Copy character if it is not decomposable, or has a
590 compatibility decomposition, but we do NFD. */
591 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200592 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000593 space--;
594 continue;
595 }
596 /* Copy decomposition onto the stack, in reverse
597 order. */
598 while(count) {
599 code = decomp_data[index + (--count)];
600 stack[stackptr++] = code;
601 }
602 }
603 }
604
Martin v. Löwis22970662011-09-29 13:39:38 +0200605 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
606 output, o);
607 PyMem_Free(output);
608 if (!result)
609 return NULL;
610 /* result is guaranteed to be ready, as it is compact. */
611 kind = PyUnicode_KIND(result);
612 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000613
614 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200615 i = 0;
616 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
617 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
618 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000619 if (prev == 0 || cur == 0 || prev <= cur) {
620 prev = cur;
621 continue;
622 }
623 /* Non-canonical order. Need to switch *i with previous. */
624 o = i - 1;
625 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200626 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
627 PyUnicode_WRITE(kind, data, o+1,
628 PyUnicode_READ(kind, data, o));
629 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000630 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200631 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000632 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200633 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634 if (prev == 0 || prev <= cur)
635 break;
636 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200637 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000638 }
639 return result;
640}
641
642static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200643find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000644{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200645 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200647 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000648 if (code < start)
649 return -1;
650 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200651 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652 return nfc[index].index + delta;
653 }
654 }
655 return -1;
656}
657
658static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000659nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000660{
661 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200662 int kind;
663 void *data;
664 Py_UCS4 *output;
665 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200667 Py_UCS4 code;
668 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000669 int cskipped = 0;
670
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000671 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000672 if (!result)
673 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200674 /* result will be "ready". */
675 kind = PyUnicode_KIND(result);
676 data = PyUnicode_DATA(result);
677 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000678
Martin v. Löwis22970662011-09-29 13:39:38 +0200679 /* We allocate a buffer for the output.
680 If we find that we made no changes, we still return
681 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500682 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200683 if (!output) {
684 PyErr_NoMemory();
685 Py_DECREF(result);
686 return 0;
687 }
688 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000689
Martin v. Löwis677bde22002-11-23 22:08:15 +0000690 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200691 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 for (index = 0; index < cskipped; index++) {
693 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000695 Remove from list. */
696 skipped[index] = skipped[cskipped-1];
697 cskipped--;
698 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000699 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000700 }
701 }
702 /* Hangul Composition. We don't need to check for <LV,T>
703 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200704 code = PyUnicode_READ(kind, data, i);
705 if (LBase <= code && code < (LBase+LCount) &&
706 i + 1 < len &&
707 VBase <= PyUnicode_READ(kind, data, i+1) &&
708 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000709 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200710 LIndex = code - LBase;
711 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000712 code = SBase + (LIndex*VCount+VIndex)*TCount;
713 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200714 if (i < len &&
715 TBase <= PyUnicode_READ(kind, data, i) &&
716 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
717 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000718 i++;
719 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200720 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 continue;
722 }
723
Martin v. Löwis22970662011-09-29 13:39:38 +0200724 /* code is still input[i] here */
725 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000726 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200727 output[o++] = code;
728 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000729 continue;
730 }
731 /* Find next unblocked character. */
732 i1 = i+1;
733 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200734 /* output base character for now; might be updated later. */
735 output[o] = PyUnicode_READ(kind, data, i);
736 while (i1 < len) {
737 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
738 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000739 if (comb) {
740 if (comb1 == 0)
741 break;
742 if (comb >= comb1) {
743 /* Character is blocked. */
744 i1++;
745 continue;
746 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200748 l = find_nfc_index(self, nfc_last, code1);
749 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000750 is a starter, we don't need to look further.
751 Otherwise, record the combining class. */
752 if (l == -1) {
753 not_combinable:
754 if (comb1 == 0)
755 break;
756 comb = comb1;
757 i1++;
758 continue;
759 }
760 index = f*TOTAL_LAST + l;
761 index1 = comp_index[index >> COMP_SHIFT];
762 code = comp_data[(index1<<COMP_SHIFT)+
763 (index&((1<<COMP_SHIFT)-1))];
764 if (code == 0)
765 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766
Martin v. Löwis677bde22002-11-23 22:08:15 +0000767 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200768 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000769 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000770 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000771 skipped[cskipped++] = i1;
772 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200773 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000774 if (f == -1)
775 break;
776 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200777 /* Output character was already written.
778 Just advance the indices. */
779 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000780 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200781 if (o == len) {
782 /* No changes. Return original string. */
783 PyMem_Free(output);
784 return result;
785 }
786 Py_DECREF(result);
787 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
788 output, o);
789 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000790 return result;
791}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000792
793/* Return 1 if the input is certainly normalized, 0 if it might not be. */
794static int
795is_normalized(PyObject *self, PyObject *input, int nfc, int k)
796{
Martin v. Löwis22970662011-09-29 13:39:38 +0200797 Py_ssize_t i, len;
798 int kind;
799 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000800 unsigned char prev_combining = 0, quickcheck_mask;
801
802 /* An older version of the database is requested, quickchecks must be
803 disabled. */
804 if (self && UCD_Check(self))
805 return 0;
806
807 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
808 as described in http://unicode.org/reports/tr15/#Annex8. */
809 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
810
Martin v. Löwis22970662011-09-29 13:39:38 +0200811 i = 0;
812 kind = PyUnicode_KIND(input);
813 data = PyUnicode_DATA(input);
814 len = PyUnicode_GET_LENGTH(input);
815 while (i < len) {
816 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
817 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000818 unsigned char combining = record->combining;
819 unsigned char quickcheck = record->normalization_quick_check;
820
821 if (quickcheck & quickcheck_mask)
822 return 0; /* this string might need normalization */
823 if (combining && prev_combining > combining)
824 return 0; /* non-canonical sort order, not normalized */
825 prev_combining = combining;
826 }
827 return 1; /* certainly normalized */
828}
829
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000830PyDoc_STRVAR(unicodedata_normalize__doc__,
831"normalize(form, unistr)\n\
832\n\
833Return the normal form 'form' for the Unicode string unistr. Valid\n\
834values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
835
Martin v. Löwis677bde22002-11-23 22:08:15 +0000836static PyObject*
837unicodedata_normalize(PyObject *self, PyObject *args)
838{
839 char *form;
840 PyObject *input;
841
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000842 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000843 &form, &PyUnicode_Type, &input))
844 return NULL;
845
Martin v. Löwis22970662011-09-29 13:39:38 +0200846 if (PyUnicode_READY(input) == -1)
847 return NULL;
848
849 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000850 /* Special case empty input strings, since resizing
851 them later would cause internal errors. */
852 Py_INCREF(input);
853 return input;
854 }
855
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000856 if (strcmp(form, "NFC") == 0) {
857 if (is_normalized(self, input, 1, 0)) {
858 Py_INCREF(input);
859 return input;
860 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000861 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000862 }
863 if (strcmp(form, "NFKC") == 0) {
864 if (is_normalized(self, input, 1, 1)) {
865 Py_INCREF(input);
866 return input;
867 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000868 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000869 }
870 if (strcmp(form, "NFD") == 0) {
871 if (is_normalized(self, input, 0, 0)) {
872 Py_INCREF(input);
873 return input;
874 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000875 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000876 }
877 if (strcmp(form, "NFKD") == 0) {
878 if (is_normalized(self, input, 0, 1)) {
879 Py_INCREF(input);
880 return input;
881 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000882 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000883 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000884 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
885 return NULL;
886}
887
Fredrik Lundh06d12682001-01-24 07:59:11 +0000888/* -------------------------------------------------------------------- */
889/* unicode character name tables */
890
891/* data file generated by Tools/unicode/makeunicodedata.py */
892#include "unicodename_db.h"
893
894/* -------------------------------------------------------------------- */
895/* database code (cut and pasted from the unidb package) */
896
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and folded in as hash * scale + byte.
   Whenever bits 24..31 become non-zero they are XOR-ed back into the low
   byte and the value is reduced to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        pos++;
        high = hash & 0xff000000;
        if (high)
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
911
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Row i holds the fragment for index i of each jamo class:
     column 0 - leading consonant (L)
     column 1 - vowel (V)
     column 2 - trailing consonant (T); row 0 is empty (no trailing)
   Columns with fewer entries are padded with NULL. */
static char *hangul_syllables[][3] = {
    /*  L       V       T    */
    { "G",    "A",    ""   },
    { "GG",   "AE",   "G"  },
    { "N",    "YA",   "GG" },
    { "D",    "YAE",  "GS" },
    { "DD",   "EO",   "N"  },
    { "R",    "E",    "NJ" },
    { "M",    "YEO",  "NH" },
    { "B",    "YE",   "D"  },
    { "BB",   "O",    "L"  },
    { "S",    "WA",   "LG" },
    { "SS",   "WAE",  "LM" },
    { "",     "OE",   "LB" },
    { "J",    "YO",   "LS" },
    { "JJ",   "U",    "LT" },
    { "C",    "WEO",  "LP" },
    { "K",    "WE",   "LH" },
    { "T",    "WI",   "M"  },
    { "P",    "YU",   "B"  },
    { "H",    "EU",   "BS" },
    { NULL,   "YI",   "S"  },
    { NULL,   "I",    "SS" },
    { NULL,   NULL,   "NG" },
    { NULL,   NULL,   "J"  },
    { NULL,   NULL,   "C"  },
    { NULL,   NULL,   "K"  },
    { NULL,   NULL,   "T"  },
    { NULL,   NULL,   "P"  },
    { NULL,   NULL,   "H"  }
};
942
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000943/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000944static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000945is_unified_ideograph(Py_UCS4 code)
946{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000947 return
948 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500949 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000950 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
951 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
952 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000953}
954
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * Both ranges are half-open: [start, end).  The argument is parenthesized
 * so that an expression can be passed safely; note that it is evaluated
 * twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
960
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000961static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300962_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
963 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000964{
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200965 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300966 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
967 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000968 int offset;
969 int i;
970 int word;
971 unsigned char* w;
972
Martin v. Löwisc3509122006-03-11 12:16:23 +0000973 if (code >= 0x110000)
974 return 0;
975
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200976 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300977 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
978 return 0;
979
Martin v. Löwis1a214512008-06-11 05:26:20 +0000980 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300981 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300982 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300983 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
984 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300985 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000986 if (old->category_changed == 0) {
987 /* unassigned */
988 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000989 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000990 }
991
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000992 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000993 /* Hangul syllable. */
994 int SIndex = code - SBase;
995 int L = SIndex / NCount;
996 int V = (SIndex % NCount) / TCount;
997 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000998
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000999 if (buflen < 27)
1000 /* Worst case: HANGUL SYLLABLE <10chars>. */
1001 return 0;
1002 strcpy(buffer, "HANGUL SYLLABLE ");
1003 buffer += 16;
1004 strcpy(buffer, hangul_syllables[L][0]);
1005 buffer += strlen(hangul_syllables[L][0]);
1006 strcpy(buffer, hangul_syllables[V][1]);
1007 buffer += strlen(hangul_syllables[V][1]);
1008 strcpy(buffer, hangul_syllables[T][2]);
1009 buffer += strlen(hangul_syllables[T][2]);
1010 *buffer = '\0';
1011 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001012 }
1013
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001014 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001015 if (buflen < 28)
1016 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1017 return 0;
1018 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1019 return 1;
1020 }
1021
Fredrik Lundh06d12682001-01-24 07:59:11 +00001022 /* get offset into phrasebook */
1023 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1024 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1025 (code&((1<<phrasebook_shift)-1))];
1026 if (!offset)
1027 return 0;
1028
1029 i = 0;
1030
1031 for (;;) {
1032 /* get word index */
1033 word = phrasebook[offset] - phrasebook_short;
1034 if (word >= 0) {
1035 word = (word << 8) + phrasebook[offset+1];
1036 offset += 2;
1037 } else
1038 word = phrasebook[offset++];
1039 if (i) {
1040 if (i > buflen)
1041 return 0; /* buffer overflow */
1042 buffer[i++] = ' ';
1043 }
1044 /* copy word string from lexicon. the last character in the
1045 word has bit 7 set. the last word in a string ends with
1046 0x80 */
1047 w = lexicon + lexicon_offset[word];
1048 while (*w < 128) {
1049 if (i >= buflen)
1050 return 0; /* buffer overflow */
1051 buffer[i++] = *w++;
1052 }
1053 if (i >= buflen)
1054 return 0; /* buffer overflow */
1055 buffer[i++] = *w & 127;
1056 if (*w == 128)
1057 break; /* end of word */
1058 }
1059
1060 return 1;
1061}
1062
1063static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001064_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001065{
1066 /* check if code corresponds to the given name */
1067 int i;
1068 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001069 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001070 return 0;
1071 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001072 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001073 return 0;
1074 }
1075 return buffer[namelen] == '\0';
1076}
1077
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001079find_syllable(const char *str, int *len, int *pos, int count, int column)
1080{
1081 int i, len1;
1082 *len = -1;
1083 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001084 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001085 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001086 if (len1 <= *len)
1087 continue;
1088 if (strncmp(str, s, len1) == 0) {
1089 *len = len1;
1090 *pos = i;
1091 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001092 }
1093 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001095 }
1096}
1097
Fredrik Lundh06d12682001-01-24 07:59:11 +00001098static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001099_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001100{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001101 /* check if named sequences are allowed */
1102 if (!with_named_seq && IS_NAMED_SEQ(cp))
1103 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001104 /* if the code point is in the PUA range that we use for aliases,
1105 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001106 if (IS_ALIAS(cp))
1107 *code = name_aliases[cp-aliases_start];
1108 else
1109 *code = cp;
1110 return 1;
1111}
1112
1113static int
1114_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1115 int with_named_seq)
1116{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001117 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001118 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001119 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001120 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001121 unsigned int h, v;
1122 unsigned int mask = code_size-1;
1123 unsigned int i, incr;
1124
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001125 /* Check for hangul syllables. */
1126 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001127 int len, L = -1, V = -1, T = -1;
1128 const char *pos = name + 16;
1129 find_syllable(pos, &len, &L, LCount, 0);
1130 pos += len;
1131 find_syllable(pos, &len, &V, VCount, 1);
1132 pos += len;
1133 find_syllable(pos, &len, &T, TCount, 2);
1134 pos += len;
1135 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1136 *code = SBase + (L*VCount+V)*TCount + T;
1137 return 1;
1138 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001139 /* Otherwise, it's an illegal syllable name. */
1140 return 0;
1141 }
1142
1143 /* Check for unified ideographs. */
1144 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1145 /* Four or five hexdigits must follow. */
1146 v = 0;
1147 name += 22;
1148 namelen -= 22;
1149 if (namelen != 4 && namelen != 5)
1150 return 0;
1151 while (namelen--) {
1152 v *= 16;
1153 if (*name >= '0' && *name <= '9')
1154 v += *name - '0';
1155 else if (*name >= 'A' && *name <= 'F')
1156 v += *name - 'A' + 10;
1157 else
1158 return 0;
1159 name++;
1160 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001161 if (!is_unified_ideograph(v))
1162 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001163 *code = v;
1164 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001165 }
1166
Fredrik Lundh06d12682001-01-24 07:59:11 +00001167 /* the following is the same as python's dictionary lookup, with
1168 only minor changes. see the makeunicodedata script for more
1169 details */
1170
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001171 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001172 i = (~h) & mask;
1173 v = code_hash[i];
1174 if (!v)
1175 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001176 if (_cmpname(self, v, name, namelen))
1177 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001178 incr = (h ^ (h >> 3)) & mask;
1179 if (!incr)
1180 incr = mask;
1181 for (;;) {
1182 i = (i + incr) & mask;
1183 v = code_hash[i];
1184 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001185 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001186 if (_cmpname(self, v, name, namelen))
1187 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001188 incr = incr << 1;
1189 if (incr > mask)
1190 incr = incr ^ code_poly;
1191 }
1192}
1193
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001194static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001195{
1196 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001197 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001198 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001199};
1200
1201/* -------------------------------------------------------------------- */
1202/* Python bindings */
1203
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001204PyDoc_STRVAR(unicodedata_name__doc__,
1205"name(unichr[, default])\n\
1206Returns the name assigned to the Unicode character unichr as a\n\
1207string. If no name is defined, default is returned, or, if not\n\
1208given, ValueError is raised.");
1209
Fredrik Lundh06d12682001-01-24 07:59:11 +00001210static PyObject *
1211unicodedata_name(PyObject* self, PyObject* args)
1212{
1213 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001214 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001215
1216 PyUnicodeObject* v;
1217 PyObject* defobj = NULL;
1218 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1219 return NULL;
1220
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001221 c = getuchar(v);
1222 if (c == (Py_UCS4)-1)
1223 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001224
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001225 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 if (defobj == NULL) {
1227 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001228 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001229 }
1230 else {
1231 Py_INCREF(defobj);
1232 return defobj;
1233 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001234 }
1235
Walter Dörwald4254e762007-06-05 16:04:09 +00001236 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001237}
1238
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001239PyDoc_STRVAR(unicodedata_lookup__doc__,
1240"lookup(name)\n\
1241\n\
1242Look up character by name. If a character with the\n\
1243given name is found, return the corresponding Unicode\n\
1244character. If not found, KeyError is raised.");
1245
Fredrik Lundh06d12682001-01-24 07:59:11 +00001246static PyObject *
1247unicodedata_lookup(PyObject* self, PyObject* args)
1248{
1249 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001250
1251 char* name;
Victor Stinner65a31442014-07-01 16:45:52 +02001252 Py_ssize_t namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001253 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001254 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1255 return NULL;
Victor Stinner65a31442014-07-01 16:45:52 +02001256 if (namelen > INT_MAX) {
1257 PyErr_SetString(PyExc_KeyError, "name too long");
1258 return NULL;
1259 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001260
Victor Stinner65a31442014-07-01 16:45:52 +02001261 if (!_getcode(self, name, (int)namelen, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001262 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001263 return NULL;
1264 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001265 /* check if code is in the PUA range that we use for named sequences
1266 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001267 if (IS_NAMED_SEQ(code)) {
1268 index = code-named_sequences_start;
1269 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1270 named_sequences[index].seq,
1271 named_sequences[index].seqlen);
1272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001274}
1275
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001276/* XXX Add doc strings. */
1277
1278static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001279 UNICODEDATA_UCD_DECIMAL_METHODDEF
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001280 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1281 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1282 {"category", unicodedata_category, METH_VARARGS,
1283 unicodedata_category__doc__},
1284 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1285 unicodedata_bidirectional__doc__},
1286 {"combining", unicodedata_combining, METH_VARARGS,
1287 unicodedata_combining__doc__},
1288 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1289 unicodedata_mirrored__doc__},
1290 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1291 unicodedata_east_asian_width__doc__},
1292 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1293 unicodedata_decomposition__doc__},
1294 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1295 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1296 {"normalize", unicodedata_normalize, METH_VARARGS,
1297 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001299};
1300
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001301static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 /* The ob_type field must be initialized in the module init function
1303 * to be portable to Windows without using C++. */
1304 PyVarObject_HEAD_INIT(NULL, 0)
1305 "unicodedata.UCD", /*tp_name*/
1306 sizeof(PreviousDBVersion), /*tp_basicsize*/
1307 0, /*tp_itemsize*/
1308 /* methods */
1309 (destructor)PyObject_Del, /*tp_dealloc*/
1310 0, /*tp_print*/
1311 0, /*tp_getattr*/
1312 0, /*tp_setattr*/
1313 0, /*tp_reserved*/
1314 0, /*tp_repr*/
1315 0, /*tp_as_number*/
1316 0, /*tp_as_sequence*/
1317 0, /*tp_as_mapping*/
1318 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001319 0, /*tp_call*/
1320 0, /*tp_str*/
1321 PyObject_GenericGetAttr,/*tp_getattro*/
1322 0, /*tp_setattro*/
1323 0, /*tp_as_buffer*/
1324 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1325 0, /*tp_doc*/
1326 0, /*tp_traverse*/
1327 0, /*tp_clear*/
1328 0, /*tp_richcompare*/
1329 0, /*tp_weaklistoffset*/
1330 0, /*tp_iter*/
1331 0, /*tp_iternext*/
1332 unicodedata_functions, /*tp_methods*/
1333 DB_members, /*tp_members*/
1334 0, /*tp_getset*/
1335 0, /*tp_base*/
1336 0, /*tp_dict*/
1337 0, /*tp_descr_get*/
1338 0, /*tp_descr_set*/
1339 0, /*tp_dictoffset*/
1340 0, /*tp_init*/
1341 0, /*tp_alloc*/
1342 0, /*tp_new*/
1343 0, /*tp_free*/
1344 0, /*tp_is_gc*/
1345};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001346
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001347PyDoc_STRVAR(unicodedata_docstring,
1348"This module provides access to the Unicode Character Database which\n\
1349defines character properties for all Unicode characters. The data in\n\
1350this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001351" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001352\n\
1353The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001354UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001355
1356static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 PyModuleDef_HEAD_INIT,
1358 "unicodedata",
1359 unicodedata_docstring,
1360 -1,
1361 unicodedata_functions,
1362 NULL,
1363 NULL,
1364 NULL,
1365 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001366};
1367
Mark Hammond62b1ab12002-07-23 06:31:15 +00001368PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001369PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001370{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001371 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001372
Christian Heimes90aa7642007-12-19 02:45:37 +00001373 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001374
Martin v. Löwis1a214512008-06-11 05:26:20 +00001375 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001376 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001377 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001378
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001379 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001380 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001381 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001382
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001383 /* Previous versions */
1384 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1385 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001386 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001387
Fredrik Lundh06d12682001-01-24 07:59:11 +00001388 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001389 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001390 if (v != NULL)
1391 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001392 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001393}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001394
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001395/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001396Local variables:
1397c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001398indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001399End:
1400*/