blob: 9e71e018e21590230334c6012876109a16ac74ba [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Larry Hastings61272b72014-01-07 12:41:53 -080022/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080023module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080024class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080025[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080026/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080027
Fredrik Lundh06d12682001-01-24 07:59:11 +000028/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000029
/* Property record for one class of characters; _getrecord_ex() returns a
   pointer to one of these out of _PyUnicode_Database_Records. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
41
/* Differences of one code point in a previous database version.
   The accessors in this file treat 0xFF in an unsigned char field as
   "no change" and category_changed == 0 as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double        numeric_changed;
} change_record;
51
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000052/* data file generated by Tools/unicode/makeunicodedata.py */
53#include "unicodedata_db.h"
54
55static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000056_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000058 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000059 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000060 index = 0;
61 else {
62 index = index1[(code>>SHIFT)];
63 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
64 }
65
66 return &_PyUnicode_Database_Records[index];
67}
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069/* ------------- Previous-version API ------------------------------------- */
70typedef struct previous_version {
71 PyObject_HEAD
72 const char *name;
73 const change_record* (*getrecord)(Py_UCS4);
74 Py_UCS4 (*normalization)(Py_UCS4);
75} PreviousDBVersion;
76
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030077#include "clinic/unicodedata.c.h"
78
/* Fetch the change record of code point v from the PreviousDBVersion object
   self.  Both arguments are parenthesized so that any expression can be
   passed without the cast binding to only part of it. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
80
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000081static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000083 {NULL}
84};
85
Thomas Wouters89f507f2006-12-13 04:49:30 +000086/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000087static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000088#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089
90static PyObject*
91new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
92 Py_UCS4 (*normalization)(Py_UCS4))
93{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000094 PreviousDBVersion *self;
95 self = PyObject_New(PreviousDBVersion, &UCD_Type);
96 if (self == NULL)
97 return NULL;
98 self->name = name;
99 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000100 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000101 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000102}
103
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000104
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000105/* --- Module API --------------------------------------------------------- */
106
Larry Hastings61272b72014-01-07 12:41:53 -0800107/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800108unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700109
Serhiy Storchaka63596412015-04-17 21:18:49 +0300110 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700111 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700112 default: object=NULL
113 /
114
115Converts a Unicode character into its equivalent decimal value.
116
Serhiy Storchaka63596412015-04-17 21:18:49 +0300117Returns the decimal value assigned to the character chr as integer.
118If no such value is defined, default is returned, or, if not given,
119ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800120[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700121
Larry Hastings31826802013-10-19 00:09:25 -0700122static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300123unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400124 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700125/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700126{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000127 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000128 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300129 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000130
Martin v. Löwis1a214512008-06-11 05:26:20 +0000131 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000132 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133 if (old->category_changed == 0) {
134 /* unassigned */
135 have_old = 1;
136 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000138 else if (old->decimal_changed != 0xFF) {
139 have_old = 1;
140 rc = old->decimal_changed;
141 }
142 }
143
144 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000145 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000146 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700147 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyErr_SetString(PyExc_ValueError,
149 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000150 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 }
152 else {
Larry Hastings31826802013-10-19 00:09:25 -0700153 Py_INCREF(default_value);
154 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000157 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158}
159
Serhiy Storchaka63596412015-04-17 21:18:49 +0300160/*[clinic input]
161unicodedata.UCD.digit
162
163 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700164 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300165 default: object=NULL
166 /
167
168Converts a Unicode character into its equivalent digit value.
169
170Returns the digit value assigned to the character chr as integer.
171If no such value is defined, default is returned, or, if not given,
172ValueError is raised.
173[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000174
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000175static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300176unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700177/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000178{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000179 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300180 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000181 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300183 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000184 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000185 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 }
187 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300188 Py_INCREF(default_value);
189 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000192 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193}
194
Serhiy Storchaka63596412015-04-17 21:18:49 +0300195/*[clinic input]
196unicodedata.UCD.numeric
197
198 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700199 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300200 default: object=NULL
201 /
202
203Converts a Unicode character into its equivalent numeric value.
204
205Returns the numeric value assigned to the character chr as float.
206If no such value is defined, default is returned, or, if not given,
207ValueError is raised.
208[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000209
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300211unicodedata_UCD_numeric_impl(PyObject *self, int chr,
212 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700213/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000214{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000215 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000216 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300217 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218
Martin v. Löwis1a214512008-06-11 05:26:20 +0000219 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000220 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000221 if (old->category_changed == 0) {
222 /* unassigned */
223 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000225 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000226 else if (old->decimal_changed != 0xFF) {
227 have_old = 1;
228 rc = old->decimal_changed;
229 }
230 }
231
232 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000233 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300235 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000236 PyErr_SetString(PyExc_ValueError, "not a numeric character");
237 return NULL;
238 }
239 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300240 Py_INCREF(default_value);
241 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000242 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243 }
244 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000245}
246
Serhiy Storchaka63596412015-04-17 21:18:49 +0300247/*[clinic input]
248unicodedata.UCD.category
249
250 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700251 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300252 /
253
254Returns the general category assigned to the character chr as string.
255[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000256
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000257static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300258unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700259/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000260{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000261 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300262 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000263 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000264 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000265 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (old->category_changed != 0xFF)
267 index = old->category_changed;
268 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270}
271
Serhiy Storchaka63596412015-04-17 21:18:49 +0300272/*[clinic input]
273unicodedata.UCD.bidirectional
274
275 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700276 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300277 /
278
279Returns the bidirectional class assigned to the character chr as string.
280
281If no such value is defined, an empty string is returned.
282[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000283
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000284static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300285unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700286/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000288 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300289 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000290 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000291 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000292 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000293 if (old->category_changed == 0)
294 index = 0; /* unassigned */
295 else if (old->bidir_changed != 0xFF)
296 index = old->bidir_changed;
297 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000298 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000299}
300
Serhiy Storchaka63596412015-04-17 21:18:49 +0300301/*[clinic input]
302unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000303
Serhiy Storchaka63596412015-04-17 21:18:49 +0300304 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700305 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300306 /
307
308Returns the canonical combining class assigned to the character chr as integer.
309
310Returns 0 if no combining class is defined.
311[clinic start generated code]*/
312
313static int
314unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700315/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000317 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300318 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000319 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000320 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000321 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000322 if (old->category_changed == 0)
323 index = 0; /* unassigned */
324 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300325 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326}
327
Serhiy Storchaka63596412015-04-17 21:18:49 +0300328/*[clinic input]
329unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000330
Serhiy Storchaka63596412015-04-17 21:18:49 +0300331 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700332 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300333 /
334
335Returns the mirrored property assigned to the character chr as integer.
336
337Returns 1 if the character has been identified as a "mirrored"
338character in bidirectional text, 0 otherwise.
339[clinic start generated code]*/
340
341static int
342unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700343/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000344{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000345 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300346 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000347 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000348 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000349 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000350 if (old->category_changed == 0)
351 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000352 else if (old->mirrored_changed != 0xFF)
353 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000354 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300355 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000356}
357
Serhiy Storchaka63596412015-04-17 21:18:49 +0300358/*[clinic input]
359unicodedata.UCD.east_asian_width
360
361 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700362 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300363 /
364
365Returns the east asian width assigned to the character chr as string.
366[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000367
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000368static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300369unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700370/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000372 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300373 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000374 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000375 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000376 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377 if (old->category_changed == 0)
378 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700379 else if (old->east_asian_width_changed != 0xFF)
380 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000381 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000382 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000383}
384
Serhiy Storchaka63596412015-04-17 21:18:49 +0300385/*[clinic input]
386unicodedata.UCD.decomposition
387
388 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700389 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300390 /
391
392Returns the character decomposition mapping assigned to the character chr as string.
393
394An empty string is returned in case no such mapping is defined.
395[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000396
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000397static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300398unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700399/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000400{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000401 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000402 int code, index, count;
403 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000404 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300405 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000406
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000407 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000408
Martin v. Löwis1a214512008-06-11 05:26:20 +0000409 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000410 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000411 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000412 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000413 }
414
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000415 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416 index = 0;
417 else {
418 index = decomp_index1[(code>>DECOMP_SHIFT)];
419 index = decomp_index2[(index<<DECOMP_SHIFT)+
420 (code&((1<<DECOMP_SHIFT)-1))];
421 }
422
Tim Peters69b83b12001-11-30 07:23:05 +0000423 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000424 is prefix code (from*/
425 count = decomp_data[index] >> 8;
426
427 /* XXX: could allocate the PyString up front instead
428 (strlen(prefix) + 5 * count + 1 bytes) */
429
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000430 /* Based on how index is calculated above and decomp_data is generated
431 from Tools/unicode/makeunicodedata.py, it should not be possible
432 to overflow decomp_prefix. */
433 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200434 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000435
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000436 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000437 i = strlen(decomp_prefix[prefix_index]);
438 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000439
440 while (count-- > 0) {
441 if (i)
442 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000443 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000444 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
445 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446 i += strlen(decomp + i);
447 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000448 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000449}
450
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000451static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000452get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000453{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000454 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000455 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000456 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000457 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000458 /* unassigned in old version */
459 *index = 0;
460 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000461 else {
462 *index = decomp_index1[(code>>DECOMP_SHIFT)];
463 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
464 (code&((1<<DECOMP_SHIFT)-1))];
465 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000466
Martin v. Löwis677bde22002-11-23 22:08:15 +0000467 /* high byte is number of hex bytes (usually one or two), low byte
468 is prefix code (from*/
469 *count = decomp_data[*index] >> 8;
470 *prefix = decomp_data[*index] & 255;
471
472 (*index)++;
473}
474
/* Constants for the arithmetic Hangul syllable decomposition in nfd_nfkd:
   a syllable in [SBase, SBase+SCount) splits into an L, a V and an optional
   T part, each computed as an offset from its base. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
484
485static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000486nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000487{
488 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200489 Py_UCS4 *output;
490 Py_ssize_t i, o, osize;
491 int kind;
492 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000493 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200494 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000495 Py_ssize_t space, isize;
496 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000497 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498
Martin v. Löwis677bde22002-11-23 22:08:15 +0000499 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200500 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500501 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300502 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500503 if (space > 10) {
504 if (space <= PY_SSIZE_T_MAX - 10)
505 space += 10;
506 }
507 else {
508 space *= 2;
509 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200510 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500511 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200512 if (!output) {
513 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000514 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200515 }
516 i = o = 0;
517 kind = PyUnicode_KIND(input);
518 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000519
Martin v. Löwis22970662011-09-29 13:39:38 +0200520 while (i < isize) {
521 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000522 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200523 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000524 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300525 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000527 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200528 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000530 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
531 if (new_output == NULL) {
532 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200533 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000534 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200535 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000536 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000537 }
538 /* Hangul Decomposition. */
539 if (SBase <= code && code < (SBase+SCount)) {
540 int SIndex = code - SBase;
541 int L = LBase + SIndex / NCount;
542 int V = VBase + (SIndex % NCount) / TCount;
543 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200544 output[o++] = L;
545 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 space -= 2;
547 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200548 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000549 space --;
550 }
551 continue;
552 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000553 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000554 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000555 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
556 if (value != 0) {
557 stack[stackptr++] = value;
558 continue;
559 }
560 }
561
562 /* Other decompositions. */
563 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000564
565 /* Copy character if it is not decomposable, or has a
566 compatibility decomposition, but we do NFD. */
567 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200568 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000569 space--;
570 continue;
571 }
572 /* Copy decomposition onto the stack, in reverse
573 order. */
574 while(count) {
575 code = decomp_data[index + (--count)];
576 stack[stackptr++] = code;
577 }
578 }
579 }
580
Martin v. Löwis22970662011-09-29 13:39:38 +0200581 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
582 output, o);
583 PyMem_Free(output);
584 if (!result)
585 return NULL;
586 /* result is guaranteed to be ready, as it is compact. */
587 kind = PyUnicode_KIND(result);
588 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000589
590 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200591 i = 0;
592 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
593 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
594 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000595 if (prev == 0 || cur == 0 || prev <= cur) {
596 prev = cur;
597 continue;
598 }
599 /* Non-canonical order. Need to switch *i with previous. */
600 o = i - 1;
601 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200602 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
603 PyUnicode_WRITE(kind, data, o+1,
604 PyUnicode_READ(kind, data, o));
605 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000606 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200607 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200609 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000610 if (prev == 0 || prev <= cur)
611 break;
612 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200613 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000614 }
615 return result;
616}
617
618static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200619find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200621 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200623 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000624 if (code < start)
625 return -1;
626 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200627 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000628 return nfc[index].index + delta;
629 }
630 }
631 return -1;
632}
633
634static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000635nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000636{
637 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200638 int kind;
639 void *data;
640 Py_UCS4 *output;
641 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000642 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200643 Py_UCS4 code;
644 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000645 int cskipped = 0;
646
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000647 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000648 if (!result)
649 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200650 /* result will be "ready". */
651 kind = PyUnicode_KIND(result);
652 data = PyUnicode_DATA(result);
653 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654
Martin v. Löwis22970662011-09-29 13:39:38 +0200655 /* We allocate a buffer for the output.
656 If we find that we made no changes, we still return
657 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500658 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200659 if (!output) {
660 PyErr_NoMemory();
661 Py_DECREF(result);
662 return 0;
663 }
664 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000665
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200667 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 for (index = 0; index < cskipped; index++) {
669 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000671 Remove from list. */
672 skipped[index] = skipped[cskipped-1];
673 cskipped--;
674 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000675 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000676 }
677 }
678 /* Hangul Composition. We don't need to check for <LV,T>
679 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200680 code = PyUnicode_READ(kind, data, i);
681 if (LBase <= code && code < (LBase+LCount) &&
682 i + 1 < len &&
683 VBase <= PyUnicode_READ(kind, data, i+1) &&
684 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000685 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200686 LIndex = code - LBase;
687 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000688 code = SBase + (LIndex*VCount+VIndex)*TCount;
689 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200690 if (i < len &&
691 TBase <= PyUnicode_READ(kind, data, i) &&
692 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
693 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 i++;
695 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200696 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 continue;
698 }
699
Martin v. Löwis22970662011-09-29 13:39:38 +0200700 /* code is still input[i] here */
701 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000702 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200703 output[o++] = code;
704 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000705 continue;
706 }
707 /* Find next unblocked character. */
708 i1 = i+1;
709 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200710 /* output base character for now; might be updated later. */
711 output[o] = PyUnicode_READ(kind, data, i);
712 while (i1 < len) {
713 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
714 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000715 if (comb) {
716 if (comb1 == 0)
717 break;
718 if (comb >= comb1) {
719 /* Character is blocked. */
720 i1++;
721 continue;
722 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000723 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200724 l = find_nfc_index(self, nfc_last, code1);
725 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000726 is a starter, we don't need to look further.
727 Otherwise, record the combining class. */
728 if (l == -1) {
729 not_combinable:
730 if (comb1 == 0)
731 break;
732 comb = comb1;
733 i1++;
734 continue;
735 }
736 index = f*TOTAL_LAST + l;
737 index1 = comp_index[index >> COMP_SHIFT];
738 code = comp_data[(index1<<COMP_SHIFT)+
739 (index&((1<<COMP_SHIFT)-1))];
740 if (code == 0)
741 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200744 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000746 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 skipped[cskipped++] = i1;
748 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200749 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000750 if (f == -1)
751 break;
752 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200753 /* Output character was already written.
754 Just advance the indices. */
755 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000756 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200757 if (o == len) {
758 /* No changes. Return original string. */
759 PyMem_Free(output);
760 return result;
761 }
762 Py_DECREF(result);
763 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
764 output, o);
765 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000766 return result;
767}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000768
769/* Return 1 if the input is certainly normalized, 0 if it might not be. */
770static int
771is_normalized(PyObject *self, PyObject *input, int nfc, int k)
772{
Martin v. Löwis22970662011-09-29 13:39:38 +0200773 Py_ssize_t i, len;
774 int kind;
775 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000776 unsigned char prev_combining = 0, quickcheck_mask;
777
778 /* An older version of the database is requested, quickchecks must be
779 disabled. */
780 if (self && UCD_Check(self))
781 return 0;
782
783 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
784 as described in http://unicode.org/reports/tr15/#Annex8. */
785 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
786
Martin v. Löwis22970662011-09-29 13:39:38 +0200787 i = 0;
788 kind = PyUnicode_KIND(input);
789 data = PyUnicode_DATA(input);
790 len = PyUnicode_GET_LENGTH(input);
791 while (i < len) {
792 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
793 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000794 unsigned char combining = record->combining;
795 unsigned char quickcheck = record->normalization_quick_check;
796
797 if (quickcheck & quickcheck_mask)
798 return 0; /* this string might need normalization */
799 if (combining && prev_combining > combining)
800 return 0; /* non-canonical sort order, not normalized */
801 prev_combining = combining;
802 }
803 return 1; /* certainly normalized */
804}
805
Serhiy Storchaka63596412015-04-17 21:18:49 +0300806/*[clinic input]
807unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000808
Serhiy Storchaka63596412015-04-17 21:18:49 +0300809 self: self
810 form: str
811 unistr as input: object(subclass_of='&PyUnicode_Type')
812 /
813
814Return the normal form 'form' for the Unicode string unistr.
815
816Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
817[clinic start generated code]*/
818
819static PyObject *
820unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
821 PyObject *input)
822/*[clinic end generated code: output=62d1f8870027efdc input=cd092e631cf11883]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000823{
Martin v. Löwis22970662011-09-29 13:39:38 +0200824 if (PyUnicode_READY(input) == -1)
825 return NULL;
826
827 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000828 /* Special case empty input strings, since resizing
829 them later would cause internal errors. */
830 Py_INCREF(input);
831 return input;
832 }
833
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000834 if (strcmp(form, "NFC") == 0) {
835 if (is_normalized(self, input, 1, 0)) {
836 Py_INCREF(input);
837 return input;
838 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000839 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000840 }
841 if (strcmp(form, "NFKC") == 0) {
842 if (is_normalized(self, input, 1, 1)) {
843 Py_INCREF(input);
844 return input;
845 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000846 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000847 }
848 if (strcmp(form, "NFD") == 0) {
849 if (is_normalized(self, input, 0, 0)) {
850 Py_INCREF(input);
851 return input;
852 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000853 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000854 }
855 if (strcmp(form, "NFKD") == 0) {
856 if (is_normalized(self, input, 0, 1)) {
857 Py_INCREF(input);
858 return input;
859 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000860 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000861 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000862 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
863 return NULL;
864}
865
Fredrik Lundh06d12682001-01-24 07:59:11 +0000866/* -------------------------------------------------------------------- */
867/* unicode character name tables */
868
869/* data file generated by Tools/unicode/makeunicodedata.py */
870#include "unicodename_db.h"
871
872/* -------------------------------------------------------------------- */
873/* database code (cut and pasted from the unidb package) */
874
/* Hash the first len characters of s, ignoring case.  After every step the
   top byte of the running value is folded back into the low 24 bits, so the
   result always fits in 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top != 0)
            hash = (hash ^ ((top>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
889
/* Name fragments used to build and parse Hangul syllable names.
   Column 0 is read for the leading part, column 1 for the vowel and
   column 2 for the trailing part; a NULL entry marks a row that does not
   exist in that column. */
static const char * const hangul_syllables[][3] = {
    /* leading  vowel   trailing */
    { "G",      "A",    ""   },
    { "GG",     "AE",   "G"  },
    { "N",      "YA",   "GG" },
    { "D",      "YAE",  "GS" },
    { "DD",     "EO",   "N"  },
    { "R",      "E",    "NJ" },
    { "M",      "YEO",  "NH" },
    { "B",      "YE",   "D"  },
    { "BB",     "O",    "L"  },
    { "S",      "WA",   "LG" },
    { "SS",     "WAE",  "LM" },
    { "",       "OE",   "LB" },
    { "J",      "YO",   "LS" },
    { "JJ",     "U",    "LT" },
    { "C",      "WEO",  "LP" },
    { "K",      "WE",   "LH" },
    { "T",      "WI",   "M"  },
    { "P",      "YU",   "B"  },
    { "H",      "EU",   "BS" },
    { NULL,     "YI",   "S"  },
    { NULL,     "I",    "SS" },
    { NULL,     NULL,   "NG" },
    { NULL,     NULL,   "J"  },
    { NULL,     NULL,   "C"  },
    { NULL,     NULL,   "K"  },
    { NULL,     NULL,   "T"  },
    { NULL,     NULL,   "P"  },
    { NULL,     NULL,   "H"  }
};
920
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000921/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000922static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000923is_unified_ideograph(Py_UCS4 code)
924{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000925 return
926 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson48013832015-06-27 15:45:56 -0500927 (0x4E00 <= code && code <= 0x9FD5) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000928 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
929 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -0500930 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
931 (0x2B820 <= code && code <= 0x2CEA1); /* CJK Ideograph Extension E */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000932}
933
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that any expression can be passed.  It is
 * still evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
939
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000940static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300941_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
942 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000943{
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200944 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300945 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
946 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000947 int offset;
948 int i;
949 int word;
950 unsigned char* w;
951
Martin v. Löwisc3509122006-03-11 12:16:23 +0000952 if (code >= 0x110000)
953 return 0;
954
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200955 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300956 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
957 return 0;
958
Martin v. Löwis1a214512008-06-11 05:26:20 +0000959 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300960 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300961 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300962 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
963 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300964 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000965 if (old->category_changed == 0) {
966 /* unassigned */
967 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000968 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000969 }
970
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000971 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000972 /* Hangul syllable. */
973 int SIndex = code - SBase;
974 int L = SIndex / NCount;
975 int V = (SIndex % NCount) / TCount;
976 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000977
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000978 if (buflen < 27)
979 /* Worst case: HANGUL SYLLABLE <10chars>. */
980 return 0;
981 strcpy(buffer, "HANGUL SYLLABLE ");
982 buffer += 16;
983 strcpy(buffer, hangul_syllables[L][0]);
984 buffer += strlen(hangul_syllables[L][0]);
985 strcpy(buffer, hangul_syllables[V][1]);
986 buffer += strlen(hangul_syllables[V][1]);
987 strcpy(buffer, hangul_syllables[T][2]);
988 buffer += strlen(hangul_syllables[T][2]);
989 *buffer = '\0';
990 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000991 }
992
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000993 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000994 if (buflen < 28)
995 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
996 return 0;
997 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
998 return 1;
999 }
1000
Fredrik Lundh06d12682001-01-24 07:59:11 +00001001 /* get offset into phrasebook */
1002 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1003 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1004 (code&((1<<phrasebook_shift)-1))];
1005 if (!offset)
1006 return 0;
1007
1008 i = 0;
1009
1010 for (;;) {
1011 /* get word index */
1012 word = phrasebook[offset] - phrasebook_short;
1013 if (word >= 0) {
1014 word = (word << 8) + phrasebook[offset+1];
1015 offset += 2;
1016 } else
1017 word = phrasebook[offset++];
1018 if (i) {
1019 if (i > buflen)
1020 return 0; /* buffer overflow */
1021 buffer[i++] = ' ';
1022 }
1023 /* copy word string from lexicon. the last character in the
1024 word has bit 7 set. the last word in a string ends with
1025 0x80 */
1026 w = lexicon + lexicon_offset[word];
1027 while (*w < 128) {
1028 if (i >= buflen)
1029 return 0; /* buffer overflow */
1030 buffer[i++] = *w++;
1031 }
1032 if (i >= buflen)
1033 return 0; /* buffer overflow */
1034 buffer[i++] = *w & 127;
1035 if (*w == 128)
1036 break; /* end of word */
1037 }
1038
1039 return 1;
1040}
1041
1042static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001043_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001044{
1045 /* check if code corresponds to the given name */
1046 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001047 char buffer[NAME_MAXLEN+1];
1048 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001049 return 0;
1050 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001051 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001052 return 0;
1053 }
1054 return buffer[namelen] == '\0';
1055}
1056
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001057static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001058find_syllable(const char *str, int *len, int *pos, int count, int column)
1059{
1060 int i, len1;
1061 *len = -1;
1062 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001063 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001064 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001065 if (len1 <= *len)
1066 continue;
1067 if (strncmp(str, s, len1) == 0) {
1068 *len = len1;
1069 *pos = i;
1070 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001071 }
1072 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001073 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001074 }
1075}
1076
Fredrik Lundh06d12682001-01-24 07:59:11 +00001077static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001078_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001079{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001080 /* check if named sequences are allowed */
1081 if (!with_named_seq && IS_NAMED_SEQ(cp))
1082 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001083 /* if the code point is in the PUA range that we use for aliases,
1084 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001085 if (IS_ALIAS(cp))
1086 *code = name_aliases[cp-aliases_start];
1087 else
1088 *code = cp;
1089 return 1;
1090}
1091
1092static int
1093_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1094 int with_named_seq)
1095{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001096 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001097 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001098 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001099 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001100 unsigned int h, v;
1101 unsigned int mask = code_size-1;
1102 unsigned int i, incr;
1103
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001104 /* Check for hangul syllables. */
1105 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 int len, L = -1, V = -1, T = -1;
1107 const char *pos = name + 16;
1108 find_syllable(pos, &len, &L, LCount, 0);
1109 pos += len;
1110 find_syllable(pos, &len, &V, VCount, 1);
1111 pos += len;
1112 find_syllable(pos, &len, &T, TCount, 2);
1113 pos += len;
1114 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1115 *code = SBase + (L*VCount+V)*TCount + T;
1116 return 1;
1117 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001118 /* Otherwise, it's an illegal syllable name. */
1119 return 0;
1120 }
1121
1122 /* Check for unified ideographs. */
1123 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1124 /* Four or five hexdigits must follow. */
1125 v = 0;
1126 name += 22;
1127 namelen -= 22;
1128 if (namelen != 4 && namelen != 5)
1129 return 0;
1130 while (namelen--) {
1131 v *= 16;
1132 if (*name >= '0' && *name <= '9')
1133 v += *name - '0';
1134 else if (*name >= 'A' && *name <= 'F')
1135 v += *name - 'A' + 10;
1136 else
1137 return 0;
1138 name++;
1139 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001140 if (!is_unified_ideograph(v))
1141 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001142 *code = v;
1143 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001144 }
1145
Fredrik Lundh06d12682001-01-24 07:59:11 +00001146 /* the following is the same as python's dictionary lookup, with
1147 only minor changes. see the makeunicodedata script for more
1148 details */
1149
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001150 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001151 i = (~h) & mask;
1152 v = code_hash[i];
1153 if (!v)
1154 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001155 if (_cmpname(self, v, name, namelen))
1156 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001157 incr = (h ^ (h >> 3)) & mask;
1158 if (!incr)
1159 incr = mask;
1160 for (;;) {
1161 i = (i + incr) & mask;
1162 v = code_hash[i];
1163 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001164 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001165 if (_cmpname(self, v, name, namelen))
1166 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001167 incr = incr << 1;
1168 if (incr > mask)
1169 incr = incr ^ code_poly;
1170 }
1171}
1172
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001174{
1175 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001176 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001177 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001178};
1179
1180/* -------------------------------------------------------------------- */
1181/* Python bindings */
1182
Serhiy Storchaka63596412015-04-17 21:18:49 +03001183/*[clinic input]
1184unicodedata.UCD.name
1185
1186 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001187 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001188 default: object=NULL
1189 /
1190
1191Returns the name assigned to the character chr as a string.
1192
1193If no name is defined, default is returned, or, if not given,
1194ValueError is raised.
1195[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001196
Fredrik Lundh06d12682001-01-24 07:59:11 +00001197static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001198unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001199/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001200{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001201 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001202 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001203
Christian Heimes2f366ca2016-09-23 20:20:27 +02001204 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001205 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001207 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 }
1209 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001210 Py_INCREF(default_value);
1211 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001212 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001213 }
1214
Walter Dörwald4254e762007-06-05 16:04:09 +00001215 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001216}
1217
Serhiy Storchaka63596412015-04-17 21:18:49 +03001218/*[clinic input]
1219unicodedata.UCD.lookup
1220
1221 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001222 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001223 /
1224
1225Look up character by name.
1226
1227If a character with the given name is found, return the
1228corresponding character. If not found, KeyError is raised.
1229[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001230
Fredrik Lundh06d12682001-01-24 07:59:11 +00001231static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001232unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1233 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001234/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001235{
1236 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001237 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001238 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001239 PyErr_SetString(PyExc_KeyError, "name too long");
1240 return NULL;
1241 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001242
Serhiy Storchaka63596412015-04-17 21:18:49 +03001243 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001244 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001245 return NULL;
1246 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001247 /* check if code is in the PUA range that we use for named sequences
1248 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001249 if (IS_NAMED_SEQ(code)) {
1250 index = code-named_sequences_start;
1251 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1252 named_sequences[index].seq,
1253 named_sequences[index].seqlen);
1254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001256}
1257
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001258/* XXX Add doc strings. */
1259
1260static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001261 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001262 UNICODEDATA_UCD_DIGIT_METHODDEF
1263 UNICODEDATA_UCD_NUMERIC_METHODDEF
1264 UNICODEDATA_UCD_CATEGORY_METHODDEF
1265 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1266 UNICODEDATA_UCD_COMBINING_METHODDEF
1267 UNICODEDATA_UCD_MIRRORED_METHODDEF
1268 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1269 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1270 UNICODEDATA_UCD_NAME_METHODDEF
1271 UNICODEDATA_UCD_LOOKUP_METHODDEF
1272 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001274};
1275
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001276static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001277 /* The ob_type field must be initialized in the module init function
1278 * to be portable to Windows without using C++. */
1279 PyVarObject_HEAD_INIT(NULL, 0)
1280 "unicodedata.UCD", /*tp_name*/
1281 sizeof(PreviousDBVersion), /*tp_basicsize*/
1282 0, /*tp_itemsize*/
1283 /* methods */
1284 (destructor)PyObject_Del, /*tp_dealloc*/
1285 0, /*tp_print*/
1286 0, /*tp_getattr*/
1287 0, /*tp_setattr*/
1288 0, /*tp_reserved*/
1289 0, /*tp_repr*/
1290 0, /*tp_as_number*/
1291 0, /*tp_as_sequence*/
1292 0, /*tp_as_mapping*/
1293 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001294 0, /*tp_call*/
1295 0, /*tp_str*/
1296 PyObject_GenericGetAttr,/*tp_getattro*/
1297 0, /*tp_setattro*/
1298 0, /*tp_as_buffer*/
1299 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1300 0, /*tp_doc*/
1301 0, /*tp_traverse*/
1302 0, /*tp_clear*/
1303 0, /*tp_richcompare*/
1304 0, /*tp_weaklistoffset*/
1305 0, /*tp_iter*/
1306 0, /*tp_iternext*/
1307 unicodedata_functions, /*tp_methods*/
1308 DB_members, /*tp_members*/
1309 0, /*tp_getset*/
1310 0, /*tp_base*/
1311 0, /*tp_dict*/
1312 0, /*tp_descr_get*/
1313 0, /*tp_descr_set*/
1314 0, /*tp_dictoffset*/
1315 0, /*tp_init*/
1316 0, /*tp_alloc*/
1317 0, /*tp_new*/
1318 0, /*tp_free*/
1319 0, /*tp_is_gc*/
1320};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001321
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001322PyDoc_STRVAR(unicodedata_docstring,
1323"This module provides access to the Unicode Character Database which\n\
1324defines character properties for all Unicode characters. The data in\n\
1325this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001326" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001327\n\
1328The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001329UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001330
1331static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 PyModuleDef_HEAD_INIT,
1333 "unicodedata",
1334 unicodedata_docstring,
1335 -1,
1336 unicodedata_functions,
1337 NULL,
1338 NULL,
1339 NULL,
1340 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001341};
1342
Mark Hammond62b1ab12002-07-23 06:31:15 +00001343PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001344PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001345{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001346 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001347
Christian Heimes90aa7642007-12-19 02:45:37 +00001348 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001349
Martin v. Löwis1a214512008-06-11 05:26:20 +00001350 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001351 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001352 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001353
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001354 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001355 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001356 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001357
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001358 /* Previous versions */
1359 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1360 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001361 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001362
Fredrik Lundh06d12682001-01-24 07:59:11 +00001363 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001364 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001365 if (v != NULL)
1366 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001367 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001368}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001369
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001370/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001371Local variables:
1372c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001373indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001374End:
1375*/