blob: 3fd665b203ed43da5e27a897a2a9313d836e1269 [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Larry Hastings61272b72014-01-07 12:41:53 -080022/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080023module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080024class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080025[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080026/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080027
Fredrik Lundh06d12682001-01-24 07:59:11 +000028/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000029
/* One entry of the per-code-point property table.  Entries live in
   _PyUnicode_Database_Records and are located through _getrecord_ex(). */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
41
/* Per-code-point differences between the current database and a previous
   Unicode version.  The accessors in this file treat 0xFF in a *_changed
   byte field as "no override, use the current value", and treat
   category_changed == 0 as "code point unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
51
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000052/* data file generated by Tools/unicode/makeunicodedata.py */
53#include "unicodedata_db.h"
54
55static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000056_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000058 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000059 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000060 index = 0;
61 else {
62 index = index1[(code>>SHIFT)];
63 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
64 }
65
66 return &_PyUnicode_Database_Records[index];
67}
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069/* ------------- Previous-version API ------------------------------------- */
/* Python object representing an older version of the Unicode database. */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;                           /* exposed as unidata_version */
    const change_record* (*getrecord)(Py_UCS4); /* per-code-point change lookup */
    Py_UCS4 (*normalization)(Py_UCS4);          /* normalization override; callers
                                                   treat 0 as "no override" */
} PreviousDBVersion;
76
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030077#include "clinic/unicodedata.c.h"
78
/* Fetch the change_record of code point v from a PreviousDBVersion object.
   self is cast without checking; callers guard with UCD_Check() first. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

/* Read-only attributes exposed on UCD objects. */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;
/* True when o is exactly a UCD_Type instance (exact type match only). */
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089
90static PyObject*
91new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
92 Py_UCS4 (*normalization)(Py_UCS4))
93{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000094 PreviousDBVersion *self;
95 self = PyObject_New(PreviousDBVersion, &UCD_Type);
96 if (self == NULL)
97 return NULL;
98 self->name = name;
99 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000100 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000101 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000102}
103
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000104
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000105/* --- Module API --------------------------------------------------------- */
106
Larry Hastings61272b72014-01-07 12:41:53 -0800107/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800108unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700109
Serhiy Storchaka63596412015-04-17 21:18:49 +0300110 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700111 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700112 default: object=NULL
113 /
114
115Converts a Unicode character into its equivalent decimal value.
116
Serhiy Storchaka63596412015-04-17 21:18:49 +0300117Returns the decimal value assigned to the character chr as integer.
118If no such value is defined, default is returned, or, if not given,
119ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800120[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700121
Larry Hastings31826802013-10-19 00:09:25 -0700122static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300123unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400124 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700125/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700126{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000127 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000128 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300129 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000130
Martin v. Löwis1a214512008-06-11 05:26:20 +0000131 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000132 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133 if (old->category_changed == 0) {
134 /* unassigned */
135 have_old = 1;
136 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000138 else if (old->decimal_changed != 0xFF) {
139 have_old = 1;
140 rc = old->decimal_changed;
141 }
142 }
143
144 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000145 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000146 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700147 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyErr_SetString(PyExc_ValueError,
149 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000150 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 }
152 else {
Larry Hastings31826802013-10-19 00:09:25 -0700153 Py_INCREF(default_value);
154 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000157 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158}
159
Serhiy Storchaka63596412015-04-17 21:18:49 +0300160/*[clinic input]
161unicodedata.UCD.digit
162
163 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700164 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300165 default: object=NULL
166 /
167
168Converts a Unicode character into its equivalent digit value.
169
170Returns the digit value assigned to the character chr as integer.
171If no such value is defined, default is returned, or, if not given,
172ValueError is raised.
173[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000174
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300176unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700177/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000178{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000179 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300180 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000181 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300183 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000184 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 }
187 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300188 Py_INCREF(default_value);
189 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000192 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193}
194
Serhiy Storchaka63596412015-04-17 21:18:49 +0300195/*[clinic input]
196unicodedata.UCD.numeric
197
198 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700199 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300200 default: object=NULL
201 /
202
203Converts a Unicode character into its equivalent numeric value.
204
205Returns the numeric value assigned to the character chr as float.
206If no such value is defined, default is returned, or, if not given,
207ValueError is raised.
208[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000209
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300211unicodedata_UCD_numeric_impl(PyObject *self, int chr,
212 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700213/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000214{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000215 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000216 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300217 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218
Martin v. Löwis1a214512008-06-11 05:26:20 +0000219 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000220 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000221 if (old->category_changed == 0) {
222 /* unassigned */
223 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000225 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000226 else if (old->decimal_changed != 0xFF) {
227 have_old = 1;
228 rc = old->decimal_changed;
229 }
230 }
231
232 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000233 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300235 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000236 PyErr_SetString(PyExc_ValueError, "not a numeric character");
237 return NULL;
238 }
239 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300240 Py_INCREF(default_value);
241 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000242 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243 }
244 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000245}
246
Serhiy Storchaka63596412015-04-17 21:18:49 +0300247/*[clinic input]
248unicodedata.UCD.category
249
250 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700251 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300252 /
253
254Returns the general category assigned to the character chr as string.
255[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000256
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000257static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300258unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700259/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000260{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000261 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300262 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000263 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000264 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000265 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (old->category_changed != 0xFF)
267 index = old->category_changed;
268 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270}
271
Serhiy Storchaka63596412015-04-17 21:18:49 +0300272/*[clinic input]
273unicodedata.UCD.bidirectional
274
275 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700276 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300277 /
278
279Returns the bidirectional class assigned to the character chr as string.
280
281If no such value is defined, an empty string is returned.
282[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000283
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000284static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300285unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700286/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000288 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300289 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000290 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000291 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000292 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000293 if (old->category_changed == 0)
294 index = 0; /* unassigned */
295 else if (old->bidir_changed != 0xFF)
296 index = old->bidir_changed;
297 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000298 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000299}
300
Serhiy Storchaka63596412015-04-17 21:18:49 +0300301/*[clinic input]
302unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000303
Serhiy Storchaka63596412015-04-17 21:18:49 +0300304 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700305 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300306 /
307
308Returns the canonical combining class assigned to the character chr as integer.
309
310Returns 0 if no combining class is defined.
311[clinic start generated code]*/
312
313static int
314unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700315/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000317 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300318 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000319 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000320 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000321 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000322 if (old->category_changed == 0)
323 index = 0; /* unassigned */
324 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300325 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326}
327
Serhiy Storchaka63596412015-04-17 21:18:49 +0300328/*[clinic input]
329unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000330
Serhiy Storchaka63596412015-04-17 21:18:49 +0300331 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700332 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300333 /
334
335Returns the mirrored property assigned to the character chr as integer.
336
337Returns 1 if the character has been identified as a "mirrored"
338character in bidirectional text, 0 otherwise.
339[clinic start generated code]*/
340
341static int
342unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700343/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000344{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000345 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300346 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000347 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000348 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000349 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000350 if (old->category_changed == 0)
351 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000352 else if (old->mirrored_changed != 0xFF)
353 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000354 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300355 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000356}
357
Serhiy Storchaka63596412015-04-17 21:18:49 +0300358/*[clinic input]
359unicodedata.UCD.east_asian_width
360
361 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700362 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300363 /
364
365Returns the east asian width assigned to the character chr as string.
366[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000367
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000368static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300369unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700370/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000372 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300373 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000374 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000375 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000376 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377 if (old->category_changed == 0)
378 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700379 else if (old->east_asian_width_changed != 0xFF)
380 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000381 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000382 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000383}
384
Serhiy Storchaka63596412015-04-17 21:18:49 +0300385/*[clinic input]
386unicodedata.UCD.decomposition
387
388 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700389 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300390 /
391
392Returns the character decomposition mapping assigned to the character chr as string.
393
394An empty string is returned in case no such mapping is defined.
395[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000396
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000397static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300398unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700399/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000400{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000401 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000402 int code, index, count;
403 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000404 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300405 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000406
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000407 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000408
Martin v. Löwis1a214512008-06-11 05:26:20 +0000409 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000410 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000411 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000412 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000413 }
414
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000415 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416 index = 0;
417 else {
418 index = decomp_index1[(code>>DECOMP_SHIFT)];
419 index = decomp_index2[(index<<DECOMP_SHIFT)+
420 (code&((1<<DECOMP_SHIFT)-1))];
421 }
422
Tim Peters69b83b12001-11-30 07:23:05 +0000423 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000424 is prefix code (from*/
425 count = decomp_data[index] >> 8;
426
427 /* XXX: could allocate the PyString up front instead
428 (strlen(prefix) + 5 * count + 1 bytes) */
429
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000430 /* Based on how index is calculated above and decomp_data is generated
431 from Tools/unicode/makeunicodedata.py, it should not be possible
432 to overflow decomp_prefix. */
433 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200434 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000435
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000436 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000437 i = strlen(decomp_prefix[prefix_index]);
438 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000439
440 while (count-- > 0) {
441 if (i)
442 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000443 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000444 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
445 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446 i += strlen(decomp + i);
447 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000448 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000449}
450
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000451static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000452get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000453{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000454 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000455 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000456 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000457 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000458 /* unassigned in old version */
459 *index = 0;
460 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000461 else {
462 *index = decomp_index1[(code>>DECOMP_SHIFT)];
463 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
464 (code&((1<<DECOMP_SHIFT)-1))];
465 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000466
Martin v. Löwis677bde22002-11-23 22:08:15 +0000467 /* high byte is number of hex bytes (usually one or two), low byte
468 is prefix code (from*/
469 *count = decomp_data[*index] >> 8;
470 *prefix = decomp_data[*index] & 255;
471
472 (*index)++;
473}
474
/* Constants for algorithmic Hangul syllable decomposition, as used by
   nfd_nfkd(): a syllable in [SBase, SBase+SCount) is split into a leading
   (LBase), vowel (VBase) and optional trailing (TBase) part. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
484
485static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000486nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000487{
488 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200489 Py_UCS4 *output;
490 Py_ssize_t i, o, osize;
491 int kind;
492 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000493 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200494 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000495 Py_ssize_t space, isize;
496 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000497 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498
Martin v. Löwis677bde22002-11-23 22:08:15 +0000499 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200500 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500501 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300502 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500503 if (space > 10) {
504 if (space <= PY_SSIZE_T_MAX - 10)
505 space += 10;
506 }
507 else {
508 space *= 2;
509 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200510 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500511 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200512 if (!output) {
513 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000514 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200515 }
516 i = o = 0;
517 kind = PyUnicode_KIND(input);
518 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000519
Martin v. Löwis22970662011-09-29 13:39:38 +0200520 while (i < isize) {
521 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000522 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200523 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000524 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300525 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000527 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200528 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000530 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
531 if (new_output == NULL) {
532 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200533 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000534 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200535 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000536 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000537 }
538 /* Hangul Decomposition. */
539 if (SBase <= code && code < (SBase+SCount)) {
540 int SIndex = code - SBase;
541 int L = LBase + SIndex / NCount;
542 int V = VBase + (SIndex % NCount) / TCount;
543 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200544 output[o++] = L;
545 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 space -= 2;
547 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200548 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000549 space --;
550 }
551 continue;
552 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000553 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000554 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000555 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
556 if (value != 0) {
557 stack[stackptr++] = value;
558 continue;
559 }
560 }
561
562 /* Other decompositions. */
563 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000564
565 /* Copy character if it is not decomposable, or has a
566 compatibility decomposition, but we do NFD. */
567 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200568 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000569 space--;
570 continue;
571 }
572 /* Copy decomposition onto the stack, in reverse
573 order. */
574 while(count) {
575 code = decomp_data[index + (--count)];
576 stack[stackptr++] = code;
577 }
578 }
579 }
580
Martin v. Löwis22970662011-09-29 13:39:38 +0200581 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
582 output, o);
583 PyMem_Free(output);
584 if (!result)
585 return NULL;
586 /* result is guaranteed to be ready, as it is compact. */
587 kind = PyUnicode_KIND(result);
588 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000589
590 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200591 i = 0;
592 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
593 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
594 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000595 if (prev == 0 || cur == 0 || prev <= cur) {
596 prev = cur;
597 continue;
598 }
599 /* Non-canonical order. Need to switch *i with previous. */
600 o = i - 1;
601 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200602 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
603 PyUnicode_WRITE(kind, data, o+1,
604 PyUnicode_READ(kind, data, o));
605 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000606 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200607 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200609 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000610 if (prev == 0 || prev <= cur)
611 break;
612 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200613 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000614 }
615 return result;
616}
617
618static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200619find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200621 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200623 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000624 if (code < start)
625 return -1;
626 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200627 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000628 return nfc[index].index + delta;
629 }
630 }
631 return -1;
632}
633
634static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000635nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000636{
637 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200638 int kind;
639 void *data;
640 Py_UCS4 *output;
641 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000642 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200643 Py_UCS4 code;
644 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000645 int cskipped = 0;
646
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000647 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000648 if (!result)
649 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200650 /* result will be "ready". */
651 kind = PyUnicode_KIND(result);
652 data = PyUnicode_DATA(result);
653 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654
Martin v. Löwis22970662011-09-29 13:39:38 +0200655 /* We allocate a buffer for the output.
656 If we find that we made no changes, we still return
657 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500658 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200659 if (!output) {
660 PyErr_NoMemory();
661 Py_DECREF(result);
662 return 0;
663 }
664 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000665
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200667 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 for (index = 0; index < cskipped; index++) {
669 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000671 Remove from list. */
672 skipped[index] = skipped[cskipped-1];
673 cskipped--;
674 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000675 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000676 }
677 }
678 /* Hangul Composition. We don't need to check for <LV,T>
679 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200680 code = PyUnicode_READ(kind, data, i);
681 if (LBase <= code && code < (LBase+LCount) &&
682 i + 1 < len &&
683 VBase <= PyUnicode_READ(kind, data, i+1) &&
684 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000685 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200686 LIndex = code - LBase;
687 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000688 code = SBase + (LIndex*VCount+VIndex)*TCount;
689 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200690 if (i < len &&
691 TBase <= PyUnicode_READ(kind, data, i) &&
692 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
693 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 i++;
695 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200696 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 continue;
698 }
699
Martin v. Löwis22970662011-09-29 13:39:38 +0200700 /* code is still input[i] here */
701 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000702 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200703 output[o++] = code;
704 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000705 continue;
706 }
707 /* Find next unblocked character. */
708 i1 = i+1;
709 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200710 /* output base character for now; might be updated later. */
711 output[o] = PyUnicode_READ(kind, data, i);
712 while (i1 < len) {
713 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
714 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000715 if (comb) {
716 if (comb1 == 0)
717 break;
718 if (comb >= comb1) {
719 /* Character is blocked. */
720 i1++;
721 continue;
722 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000723 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200724 l = find_nfc_index(self, nfc_last, code1);
725 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000726 is a starter, we don't need to look further.
727 Otherwise, record the combining class. */
728 if (l == -1) {
729 not_combinable:
730 if (comb1 == 0)
731 break;
732 comb = comb1;
733 i1++;
734 continue;
735 }
736 index = f*TOTAL_LAST + l;
737 index1 = comp_index[index >> COMP_SHIFT];
738 code = comp_data[(index1<<COMP_SHIFT)+
739 (index&((1<<COMP_SHIFT)-1))];
740 if (code == 0)
741 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200744 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000746 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 skipped[cskipped++] = i1;
748 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200749 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000750 if (f == -1)
751 break;
752 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200753 /* Output character was already written.
754 Just advance the indices. */
755 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000756 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200757 if (o == len) {
758 /* No changes. Return original string. */
759 PyMem_Free(output);
760 return result;
761 }
762 Py_DECREF(result);
763 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
764 output, o);
765 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000766 return result;
767}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000768
769/* Return 1 if the input is certainly normalized, 0 if it might not be. */
770static int
771is_normalized(PyObject *self, PyObject *input, int nfc, int k)
772{
Martin v. Löwis22970662011-09-29 13:39:38 +0200773 Py_ssize_t i, len;
774 int kind;
775 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000776 unsigned char prev_combining = 0, quickcheck_mask;
777
778 /* An older version of the database is requested, quickchecks must be
779 disabled. */
780 if (self && UCD_Check(self))
781 return 0;
782
783 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
784 as described in http://unicode.org/reports/tr15/#Annex8. */
785 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
786
Martin v. Löwis22970662011-09-29 13:39:38 +0200787 i = 0;
788 kind = PyUnicode_KIND(input);
789 data = PyUnicode_DATA(input);
790 len = PyUnicode_GET_LENGTH(input);
791 while (i < len) {
792 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
793 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000794 unsigned char combining = record->combining;
795 unsigned char quickcheck = record->normalization_quick_check;
796
797 if (quickcheck & quickcheck_mask)
798 return 0; /* this string might need normalization */
799 if (combining && prev_combining > combining)
800 return 0; /* non-canonical sort order, not normalized */
801 prev_combining = combining;
802 }
803 return 1; /* certainly normalized */
804}
805
Serhiy Storchaka63596412015-04-17 21:18:49 +0300806/*[clinic input]
807unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000808
Serhiy Storchaka63596412015-04-17 21:18:49 +0300809 self: self
810 form: str
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300811 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300812 /
813
814Return the normal form 'form' for the Unicode string unistr.
815
816Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
817[clinic start generated code]*/
818
819static PyObject *
820unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
821 PyObject *input)
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300822/*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000823{
Martin v. Löwis22970662011-09-29 13:39:38 +0200824 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000825 /* Special case empty input strings, since resizing
826 them later would cause internal errors. */
827 Py_INCREF(input);
828 return input;
829 }
830
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000831 if (strcmp(form, "NFC") == 0) {
832 if (is_normalized(self, input, 1, 0)) {
833 Py_INCREF(input);
834 return input;
835 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000836 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000837 }
838 if (strcmp(form, "NFKC") == 0) {
839 if (is_normalized(self, input, 1, 1)) {
840 Py_INCREF(input);
841 return input;
842 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000843 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000844 }
845 if (strcmp(form, "NFD") == 0) {
846 if (is_normalized(self, input, 0, 0)) {
847 Py_INCREF(input);
848 return input;
849 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000850 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000851 }
852 if (strcmp(form, "NFKD") == 0) {
853 if (is_normalized(self, input, 0, 1)) {
854 Py_INCREF(input);
855 return input;
856 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000857 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000858 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000859 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
860 return NULL;
861}
862
Fredrik Lundh06d12682001-01-24 07:59:11 +0000863/* -------------------------------------------------------------------- */
864/* unicode character name tables */
865
866/* data file generated by Tools/unicode/makeunicodedata.py */
867#include "unicodename_db.h"
868
869/* -------------------------------------------------------------------- */
870/* database code (cut and pasted from the unidb package) */
871
/* Hash the first `len` bytes of `s`, case-insensitively: every byte is
   upper-cased with Py_TOUPPER before being mixed in.

   `scale` is the multiplier applied to the running hash at each step.
   Whenever any of bits 24..31 become set they are folded back into the low
   byte and the value is reduced to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
        pos++;
    }
    return hash;
}
886
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.

   Row i holds up to three independent fragments: column 0 is indexed by the
   L value, column 1 by the V value and column 2 by the T value.  The columns
   have different lengths; slots past the end of a column are NULL.  An empty
   string is a valid fragment (it contributes nothing to the name). */
static const char * const hangul_syllables[][3] = {
    /*  [0]    [1]     [2]  */
    { "G",    "A",    ""   },
    { "GG",   "AE",   "G"  },
    { "N",    "YA",   "GG" },
    { "D",    "YAE",  "GS" },
    { "DD",   "EO",   "N"  },
    { "R",    "E",    "NJ" },
    { "M",    "YEO",  "NH" },
    { "B",    "YE",   "D"  },
    { "BB",   "O",    "L"  },
    { "S",    "WA",   "LG" },
    { "SS",   "WAE",  "LM" },
    { "",     "OE",   "LB" },
    { "J",    "YO",   "LS" },
    { "JJ",   "U",    "LT" },
    { "C",    "WEO",  "LP" },
    { "K",    "WE",   "LH" },
    { "T",    "WI",   "M"  },
    { "P",    "YU",   "B"  },
    { "H",    "EU",   "BS" },
    { NULL,   "YI",   "S"  },
    { NULL,   "I",    "SS" },
    { NULL,   NULL,   "NG" },
    { NULL,   NULL,   "J"  },
    { NULL,   NULL,   "C"  },
    { NULL,   NULL,   "K"  },
    { NULL,   NULL,   "T"  },
    { NULL,   NULL,   "P"  },
    { NULL,   NULL,   "H"  }
};
917
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000918/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000919static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000920is_unified_ideograph(Py_UCS4 code)
921{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000922 return
923 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson279a9622017-06-22 22:31:08 -0700924 (0x4E00 <= code && code <= 0x9FEA) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000925 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
926 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -0500927 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -0700928 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
929 (0x2CEB0 <= code && code <= 0x2EBEF); /* CJK Ideograph Extension F */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000930}
931
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * The argument is parenthesized so that an arbitrary expression can be
 * passed.  It is evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
937
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000938static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300939_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
940 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000941{
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200942 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300943 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
944 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000945 int offset;
946 int i;
947 int word;
948 unsigned char* w;
949
Martin v. Löwisc3509122006-03-11 12:16:23 +0000950 if (code >= 0x110000)
951 return 0;
952
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200953 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300954 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
955 return 0;
956
Martin v. Löwis1a214512008-06-11 05:26:20 +0000957 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300958 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300959 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300960 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
961 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300962 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000963 if (old->category_changed == 0) {
964 /* unassigned */
965 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000966 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000967 }
968
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000969 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000970 /* Hangul syllable. */
971 int SIndex = code - SBase;
972 int L = SIndex / NCount;
973 int V = (SIndex % NCount) / TCount;
974 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000975
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000976 if (buflen < 27)
977 /* Worst case: HANGUL SYLLABLE <10chars>. */
978 return 0;
979 strcpy(buffer, "HANGUL SYLLABLE ");
980 buffer += 16;
981 strcpy(buffer, hangul_syllables[L][0]);
982 buffer += strlen(hangul_syllables[L][0]);
983 strcpy(buffer, hangul_syllables[V][1]);
984 buffer += strlen(hangul_syllables[V][1]);
985 strcpy(buffer, hangul_syllables[T][2]);
986 buffer += strlen(hangul_syllables[T][2]);
987 *buffer = '\0';
988 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000989 }
990
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000991 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000992 if (buflen < 28)
993 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
994 return 0;
995 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
996 return 1;
997 }
998
Fredrik Lundh06d12682001-01-24 07:59:11 +0000999 /* get offset into phrasebook */
1000 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1001 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1002 (code&((1<<phrasebook_shift)-1))];
1003 if (!offset)
1004 return 0;
1005
1006 i = 0;
1007
1008 for (;;) {
1009 /* get word index */
1010 word = phrasebook[offset] - phrasebook_short;
1011 if (word >= 0) {
1012 word = (word << 8) + phrasebook[offset+1];
1013 offset += 2;
1014 } else
1015 word = phrasebook[offset++];
1016 if (i) {
1017 if (i > buflen)
1018 return 0; /* buffer overflow */
1019 buffer[i++] = ' ';
1020 }
1021 /* copy word string from lexicon. the last character in the
1022 word has bit 7 set. the last word in a string ends with
1023 0x80 */
1024 w = lexicon + lexicon_offset[word];
1025 while (*w < 128) {
1026 if (i >= buflen)
1027 return 0; /* buffer overflow */
1028 buffer[i++] = *w++;
1029 }
1030 if (i >= buflen)
1031 return 0; /* buffer overflow */
1032 buffer[i++] = *w & 127;
1033 if (*w == 128)
1034 break; /* end of word */
1035 }
1036
1037 return 1;
1038}
1039
1040static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001041_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001042{
1043 /* check if code corresponds to the given name */
1044 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001045 char buffer[NAME_MAXLEN+1];
1046 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001047 return 0;
1048 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001049 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001050 return 0;
1051 }
1052 return buffer[namelen] == '\0';
1053}
1054
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001056find_syllable(const char *str, int *len, int *pos, int count, int column)
1057{
1058 int i, len1;
1059 *len = -1;
1060 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001061 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001062 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063 if (len1 <= *len)
1064 continue;
1065 if (strncmp(str, s, len1) == 0) {
1066 *len = len1;
1067 *pos = i;
1068 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001069 }
1070 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001071 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001072 }
1073}
1074
Fredrik Lundh06d12682001-01-24 07:59:11 +00001075static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001076_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001077{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001078 /* check if named sequences are allowed */
1079 if (!with_named_seq && IS_NAMED_SEQ(cp))
1080 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001081 /* if the code point is in the PUA range that we use for aliases,
1082 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001083 if (IS_ALIAS(cp))
1084 *code = name_aliases[cp-aliases_start];
1085 else
1086 *code = cp;
1087 return 1;
1088}
1089
1090static int
1091_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1092 int with_named_seq)
1093{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001094 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001095 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001096 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001097 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001098 unsigned int h, v;
1099 unsigned int mask = code_size-1;
1100 unsigned int i, incr;
1101
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001102 /* Check for hangul syllables. */
1103 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001104 int len, L = -1, V = -1, T = -1;
1105 const char *pos = name + 16;
1106 find_syllable(pos, &len, &L, LCount, 0);
1107 pos += len;
1108 find_syllable(pos, &len, &V, VCount, 1);
1109 pos += len;
1110 find_syllable(pos, &len, &T, TCount, 2);
1111 pos += len;
1112 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1113 *code = SBase + (L*VCount+V)*TCount + T;
1114 return 1;
1115 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001116 /* Otherwise, it's an illegal syllable name. */
1117 return 0;
1118 }
1119
1120 /* Check for unified ideographs. */
1121 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1122 /* Four or five hexdigits must follow. */
1123 v = 0;
1124 name += 22;
1125 namelen -= 22;
1126 if (namelen != 4 && namelen != 5)
1127 return 0;
1128 while (namelen--) {
1129 v *= 16;
1130 if (*name >= '0' && *name <= '9')
1131 v += *name - '0';
1132 else if (*name >= 'A' && *name <= 'F')
1133 v += *name - 'A' + 10;
1134 else
1135 return 0;
1136 name++;
1137 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001138 if (!is_unified_ideograph(v))
1139 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001140 *code = v;
1141 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001142 }
1143
Fredrik Lundh06d12682001-01-24 07:59:11 +00001144 /* the following is the same as python's dictionary lookup, with
1145 only minor changes. see the makeunicodedata script for more
1146 details */
1147
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001148 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001149 i = (~h) & mask;
1150 v = code_hash[i];
1151 if (!v)
1152 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001153 if (_cmpname(self, v, name, namelen))
1154 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001155 incr = (h ^ (h >> 3)) & mask;
1156 if (!incr)
1157 incr = mask;
1158 for (;;) {
1159 i = (i + incr) & mask;
1160 v = code_hash[i];
1161 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001162 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001163 if (_cmpname(self, v, name, namelen))
1164 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001165 incr = incr << 1;
1166 if (incr > mask)
1167 incr = incr ^ code_poly;
1168 }
1169}
1170
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001171static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001172{
1173 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001174 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001175 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001176};
1177
1178/* -------------------------------------------------------------------- */
1179/* Python bindings */
1180
Serhiy Storchaka63596412015-04-17 21:18:49 +03001181/*[clinic input]
1182unicodedata.UCD.name
1183
1184 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001185 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001186 default: object=NULL
1187 /
1188
1189Returns the name assigned to the character chr as a string.
1190
1191If no name is defined, default is returned, or, if not given,
1192ValueError is raised.
1193[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001194
Fredrik Lundh06d12682001-01-24 07:59:11 +00001195static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001196unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001197/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001198{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001199 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001200 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001201
Christian Heimes2f366ca2016-09-23 20:20:27 +02001202 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001203 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001204 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001205 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 }
1207 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001208 Py_INCREF(default_value);
1209 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001210 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001211 }
1212
Walter Dörwald4254e762007-06-05 16:04:09 +00001213 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001214}
1215
Serhiy Storchaka63596412015-04-17 21:18:49 +03001216/*[clinic input]
1217unicodedata.UCD.lookup
1218
1219 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001220 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001221 /
1222
1223Look up character by name.
1224
1225If a character with the given name is found, return the
1226corresponding character. If not found, KeyError is raised.
1227[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001228
Fredrik Lundh06d12682001-01-24 07:59:11 +00001229static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001230unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1231 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001232/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001233{
1234 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001235 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001236 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001237 PyErr_SetString(PyExc_KeyError, "name too long");
1238 return NULL;
1239 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001240
Serhiy Storchaka63596412015-04-17 21:18:49 +03001241 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001242 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001243 return NULL;
1244 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001245 /* check if code is in the PUA range that we use for named sequences
1246 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001247 if (IS_NAMED_SEQ(code)) {
1248 index = code-named_sequences_start;
1249 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1250 named_sequences[index].seq,
1251 named_sequences[index].seqlen);
1252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001254}
1255
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001256/* XXX Add doc strings. */
1257
1258static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001259 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001260 UNICODEDATA_UCD_DIGIT_METHODDEF
1261 UNICODEDATA_UCD_NUMERIC_METHODDEF
1262 UNICODEDATA_UCD_CATEGORY_METHODDEF
1263 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1264 UNICODEDATA_UCD_COMBINING_METHODDEF
1265 UNICODEDATA_UCD_MIRRORED_METHODDEF
1266 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1267 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1268 UNICODEDATA_UCD_NAME_METHODDEF
1269 UNICODEDATA_UCD_LOOKUP_METHODDEF
1270 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001272};
1273
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001274static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 /* The ob_type field must be initialized in the module init function
1276 * to be portable to Windows without using C++. */
1277 PyVarObject_HEAD_INIT(NULL, 0)
1278 "unicodedata.UCD", /*tp_name*/
1279 sizeof(PreviousDBVersion), /*tp_basicsize*/
1280 0, /*tp_itemsize*/
1281 /* methods */
1282 (destructor)PyObject_Del, /*tp_dealloc*/
1283 0, /*tp_print*/
1284 0, /*tp_getattr*/
1285 0, /*tp_setattr*/
1286 0, /*tp_reserved*/
1287 0, /*tp_repr*/
1288 0, /*tp_as_number*/
1289 0, /*tp_as_sequence*/
1290 0, /*tp_as_mapping*/
1291 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001292 0, /*tp_call*/
1293 0, /*tp_str*/
1294 PyObject_GenericGetAttr,/*tp_getattro*/
1295 0, /*tp_setattro*/
1296 0, /*tp_as_buffer*/
1297 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1298 0, /*tp_doc*/
1299 0, /*tp_traverse*/
1300 0, /*tp_clear*/
1301 0, /*tp_richcompare*/
1302 0, /*tp_weaklistoffset*/
1303 0, /*tp_iter*/
1304 0, /*tp_iternext*/
1305 unicodedata_functions, /*tp_methods*/
1306 DB_members, /*tp_members*/
1307 0, /*tp_getset*/
1308 0, /*tp_base*/
1309 0, /*tp_dict*/
1310 0, /*tp_descr_get*/
1311 0, /*tp_descr_set*/
1312 0, /*tp_dictoffset*/
1313 0, /*tp_init*/
1314 0, /*tp_alloc*/
1315 0, /*tp_new*/
1316 0, /*tp_free*/
1317 0, /*tp_is_gc*/
1318};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001319
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001320PyDoc_STRVAR(unicodedata_docstring,
1321"This module provides access to the Unicode Character Database which\n\
1322defines character properties for all Unicode characters. The data in\n\
1323this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001324" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001325\n\
1326The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001327UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001328
1329static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 PyModuleDef_HEAD_INIT,
1331 "unicodedata",
1332 unicodedata_docstring,
1333 -1,
1334 unicodedata_functions,
1335 NULL,
1336 NULL,
1337 NULL,
1338 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001339};
1340
Mark Hammond62b1ab12002-07-23 06:31:15 +00001341PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001342PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001343{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001344 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001345
Christian Heimes90aa7642007-12-19 02:45:37 +00001346 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001347
Martin v. Löwis1a214512008-06-11 05:26:20 +00001348 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001349 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001350 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001351
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001352 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001353 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001354 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001355
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001356 /* Previous versions */
1357 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1358 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001359 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001360
Fredrik Lundh06d12682001-01-24 07:59:11 +00001361 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001362 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001363 if (v != NULL)
1364 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001365 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001366}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001367
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001368/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001369Local variables:
1370c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001371indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001372End:
1373*/