blob: 9ceab1b3db4f78818a320ee619f5f5ef4ea59d9f [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Max Bélanger2810dd72018-11-04 15:58:24 -080022_Py_IDENTIFIER(NFC);
23_Py_IDENTIFIER(NFD);
24_Py_IDENTIFIER(NFKC);
25_Py_IDENTIFIER(NFKD);
26
Larry Hastings61272b72014-01-07 12:41:53 -080027/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080028module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080029class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080030[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080031/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080032
Fredrik Lundh06d12682001-01-24 07:59:11 +000033/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000034
/* One row of per-character properties.  _getrecord_ex() returns pointers
   into the _PyUnicode_Database_Records table, whose elements have this
   type.  NOTE(review): the table lives in the generated unicodedata_db.h,
   so the field order presumably has to match the generator -- confirm
   before reordering. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
46
/* Difference between the current database and a previous Unicode version
   for one code point.  In the accessors of this file, 0xFF in one of the
   unsigned char fields means "same as the current database", and
   category_changed == 0 is treated as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
56
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057/* data file generated by Tools/unicode/makeunicodedata.py */
58#include "unicodedata_db.h"
59
60static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000061_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000062{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000063 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000064 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000065 index = 0;
66 else {
67 index = index1[(code>>SHIFT)];
68 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
69 }
70
71 return &_PyUnicode_Database_Records[index];
72}
73
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000074/* ------------- Previous-version API ------------------------------------- */
75typedef struct previous_version {
76 PyObject_HEAD
77 const char *name;
78 const change_record* (*getrecord)(Py_UCS4);
79 Py_UCS4 (*normalization)(Py_UCS4);
80} PreviousDBVersion;
81
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030082#include "clinic/unicodedata.c.h"
83
/* Look up the change record for code point `v` in the previous-version
   object `self`.  The caller must already know that `self` is a
   PreviousDBVersion (see UCD_Check). */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
85
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000086static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000087 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 {NULL}
89};
90
Thomas Wouters89f507f2006-12-13 04:49:30 +000091/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000092static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000093#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000094
95static PyObject*
96new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
97 Py_UCS4 (*normalization)(Py_UCS4))
98{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000099 PreviousDBVersion *self;
100 self = PyObject_New(PreviousDBVersion, &UCD_Type);
101 if (self == NULL)
102 return NULL;
103 self->name = name;
104 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000105 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000106 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000107}
108
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000109
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000110/* --- Module API --------------------------------------------------------- */
111
Larry Hastings61272b72014-01-07 12:41:53 -0800112/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800113unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700114
Serhiy Storchaka63596412015-04-17 21:18:49 +0300115 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700116 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700117 default: object=NULL
118 /
119
120Converts a Unicode character into its equivalent decimal value.
121
Serhiy Storchaka63596412015-04-17 21:18:49 +0300122Returns the decimal value assigned to the character chr as integer.
123If no such value is defined, default is returned, or, if not given,
124ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800125[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700126
Larry Hastings31826802013-10-19 00:09:25 -0700127static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300128unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400129 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700130/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700131{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000132 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000133 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300134 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000135
Martin v. Löwis1a214512008-06-11 05:26:20 +0000136 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000137 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000138 if (old->category_changed == 0) {
139 /* unassigned */
140 have_old = 1;
141 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000142 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000143 else if (old->decimal_changed != 0xFF) {
144 have_old = 1;
145 rc = old->decimal_changed;
146 }
147 }
148
149 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000150 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000151 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700152 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000153 PyErr_SetString(PyExc_ValueError,
154 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000155 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000156 }
157 else {
Larry Hastings31826802013-10-19 00:09:25 -0700158 Py_INCREF(default_value);
159 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000160 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000161 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000162 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000163}
164
Serhiy Storchaka63596412015-04-17 21:18:49 +0300165/*[clinic input]
166unicodedata.UCD.digit
167
168 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700169 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300170 default: object=NULL
171 /
172
173Converts a Unicode character into its equivalent digit value.
174
175Returns the digit value assigned to the character chr as integer.
176If no such value is defined, default is returned, or, if not given,
177ValueError is raised.
178[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000179
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000180static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300181unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700182/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000183{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000184 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300185 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000186 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000187 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300188 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000189 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000190 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 }
192 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300193 Py_INCREF(default_value);
194 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000196 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000197 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000198}
199
Serhiy Storchaka63596412015-04-17 21:18:49 +0300200/*[clinic input]
201unicodedata.UCD.numeric
202
203 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700204 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300205 default: object=NULL
206 /
207
208Converts a Unicode character into its equivalent numeric value.
209
210Returns the numeric value assigned to the character chr as float.
211If no such value is defined, default is returned, or, if not given,
212ValueError is raised.
213[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000214
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000215static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300216unicodedata_UCD_numeric_impl(PyObject *self, int chr,
217 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700218/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000219{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000220 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300222 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000223
Martin v. Löwis1a214512008-06-11 05:26:20 +0000224 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000225 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000226 if (old->category_changed == 0) {
227 /* unassigned */
228 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000229 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000230 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000231 else if (old->decimal_changed != 0xFF) {
232 have_old = 1;
233 rc = old->decimal_changed;
234 }
235 }
236
237 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000238 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300240 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 PyErr_SetString(PyExc_ValueError, "not a numeric character");
242 return NULL;
243 }
244 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300245 Py_INCREF(default_value);
246 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000248 }
249 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250}
251
Serhiy Storchaka63596412015-04-17 21:18:49 +0300252/*[clinic input]
253unicodedata.UCD.category
254
255 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700256 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300257 /
258
259Returns the general category assigned to the character chr as string.
260[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000261
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000262static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300263unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700264/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000265{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000266 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300267 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000268 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000269 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000270 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000271 if (old->category_changed != 0xFF)
272 index = old->category_changed;
273 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000274 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000275}
276
Serhiy Storchaka63596412015-04-17 21:18:49 +0300277/*[clinic input]
278unicodedata.UCD.bidirectional
279
280 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700281 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300282 /
283
284Returns the bidirectional class assigned to the character chr as string.
285
286If no such value is defined, an empty string is returned.
287[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000288
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000289static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300290unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700291/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000292{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000293 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300294 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000295 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000296 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000297 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000298 if (old->category_changed == 0)
299 index = 0; /* unassigned */
300 else if (old->bidir_changed != 0xFF)
301 index = old->bidir_changed;
302 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000303 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000304}
305
Serhiy Storchaka63596412015-04-17 21:18:49 +0300306/*[clinic input]
307unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000308
Serhiy Storchaka63596412015-04-17 21:18:49 +0300309 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700310 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300311 /
312
313Returns the canonical combining class assigned to the character chr as integer.
314
315Returns 0 if no combining class is defined.
316[clinic start generated code]*/
317
318static int
319unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700320/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000321{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000322 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300323 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000324 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000325 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000326 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 if (old->category_changed == 0)
328 index = 0; /* unassigned */
329 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300330 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000331}
332
Serhiy Storchaka63596412015-04-17 21:18:49 +0300333/*[clinic input]
334unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000335
Serhiy Storchaka63596412015-04-17 21:18:49 +0300336 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700337 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300338 /
339
340Returns the mirrored property assigned to the character chr as integer.
341
342Returns 1 if the character has been identified as a "mirrored"
343character in bidirectional text, 0 otherwise.
344[clinic start generated code]*/
345
346static int
347unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700348/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000349{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000350 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300351 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000352 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000353 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 if (old->category_changed == 0)
356 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000357 else if (old->mirrored_changed != 0xFF)
358 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000359 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300360 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000361}
362
Serhiy Storchaka63596412015-04-17 21:18:49 +0300363/*[clinic input]
364unicodedata.UCD.east_asian_width
365
366 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700367 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300368 /
369
370Returns the east asian width assigned to the character chr as string.
371[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000372
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000373static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300374unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700375/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000376{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000377 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300378 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000379 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000380 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000381 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000382 if (old->category_changed == 0)
383 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700384 else if (old->east_asian_width_changed != 0xFF)
385 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000386 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000387 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000388}
389
Serhiy Storchaka63596412015-04-17 21:18:49 +0300390/*[clinic input]
391unicodedata.UCD.decomposition
392
393 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700394 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300395 /
396
397Returns the character decomposition mapping assigned to the character chr as string.
398
399An empty string is returned in case no such mapping is defined.
400[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000401
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000402static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300403unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700404/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000405{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000406 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000407 int code, index, count;
408 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000409 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300410 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000411
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000412 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000413
Martin v. Löwis1a214512008-06-11 05:26:20 +0000414 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000415 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000416 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000417 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000418 }
419
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000420 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000421 index = 0;
422 else {
423 index = decomp_index1[(code>>DECOMP_SHIFT)];
424 index = decomp_index2[(index<<DECOMP_SHIFT)+
425 (code&((1<<DECOMP_SHIFT)-1))];
426 }
427
Tim Peters69b83b12001-11-30 07:23:05 +0000428 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000429 is prefix code (from*/
430 count = decomp_data[index] >> 8;
431
432 /* XXX: could allocate the PyString up front instead
433 (strlen(prefix) + 5 * count + 1 bytes) */
434
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000435 /* Based on how index is calculated above and decomp_data is generated
436 from Tools/unicode/makeunicodedata.py, it should not be possible
437 to overflow decomp_prefix. */
438 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200439 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000440
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000441 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000442 i = strlen(decomp_prefix[prefix_index]);
443 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444
445 while (count-- > 0) {
446 if (i)
447 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000448 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000449 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
450 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000451 i += strlen(decomp + i);
452 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000453 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000454}
455
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000456static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000457get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000458{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000459 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000460 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000461 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000462 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000463 /* unassigned in old version */
464 *index = 0;
465 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000466 else {
467 *index = decomp_index1[(code>>DECOMP_SHIFT)];
468 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
469 (code&((1<<DECOMP_SHIFT)-1))];
470 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000471
Martin v. Löwis677bde22002-11-23 22:08:15 +0000472 /* high byte is number of hex bytes (usually one or two), low byte
473 is prefix code (from*/
474 *count = decomp_data[*index] >> 8;
475 *prefix = decomp_data[*index] & 255;
476
477 (*index)++;
478}
479
/* Constants for arithmetic Hangul syllable decomposition (used by
   nfd_nfkd): base code points of the syllable, leading consonant, vowel
   and trailing consonant ranges, and the size of each range. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
489
490static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000491nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000492{
493 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200494 Py_UCS4 *output;
495 Py_ssize_t i, o, osize;
496 int kind;
497 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000498 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200499 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000500 Py_ssize_t space, isize;
501 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000502 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503
Martin v. Löwis677bde22002-11-23 22:08:15 +0000504 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200505 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500506 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300507 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500508 if (space > 10) {
509 if (space <= PY_SSIZE_T_MAX - 10)
510 space += 10;
511 }
512 else {
513 space *= 2;
514 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200515 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500516 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200517 if (!output) {
518 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000519 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200520 }
521 i = o = 0;
522 kind = PyUnicode_KIND(input);
523 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000524
Martin v. Löwis22970662011-09-29 13:39:38 +0200525 while (i < isize) {
526 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000527 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200528 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300530 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000531 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000532 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200533 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000534 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000535 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
536 if (new_output == NULL) {
537 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200538 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000539 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200540 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000541 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000542 }
543 /* Hangul Decomposition. */
544 if (SBase <= code && code < (SBase+SCount)) {
545 int SIndex = code - SBase;
546 int L = LBase + SIndex / NCount;
547 int V = VBase + (SIndex % NCount) / TCount;
548 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200549 output[o++] = L;
550 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000551 space -= 2;
552 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200553 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000554 space --;
555 }
556 continue;
557 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000558 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000559 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000560 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
561 if (value != 0) {
562 stack[stackptr++] = value;
563 continue;
564 }
565 }
566
567 /* Other decompositions. */
568 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000569
570 /* Copy character if it is not decomposable, or has a
571 compatibility decomposition, but we do NFD. */
572 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200573 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000574 space--;
575 continue;
576 }
577 /* Copy decomposition onto the stack, in reverse
578 order. */
579 while(count) {
580 code = decomp_data[index + (--count)];
581 stack[stackptr++] = code;
582 }
583 }
584 }
585
Martin v. Löwis22970662011-09-29 13:39:38 +0200586 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
587 output, o);
588 PyMem_Free(output);
589 if (!result)
590 return NULL;
591 /* result is guaranteed to be ready, as it is compact. */
592 kind = PyUnicode_KIND(result);
593 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000594
595 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200596 i = 0;
597 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
598 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
599 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000600 if (prev == 0 || cur == 0 || prev <= cur) {
601 prev = cur;
602 continue;
603 }
604 /* Non-canonical order. Need to switch *i with previous. */
605 o = i - 1;
606 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200607 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
608 PyUnicode_WRITE(kind, data, o+1,
609 PyUnicode_READ(kind, data, o));
610 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000611 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200612 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000613 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200614 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000615 if (prev == 0 || prev <= cur)
616 break;
617 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200618 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000619 }
620 return result;
621}
622
623static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200624find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000625{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200626 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000627 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200628 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000629 if (code < start)
630 return -1;
631 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200632 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000633 return nfc[index].index + delta;
634 }
635 }
636 return -1;
637}
638
/* Compute the composed normal form of `input`: the string is first
   decomposed with nfd_nfkd(self, input, k), and the decomposed result is
   then recomposed (Hangul syllables arithmetically, everything else
   through the nfc_first / nfc_last / comp_data tables).

   `k` is passed through unchanged to nfd_nfkd().

   Returns a new reference.  If composition changed nothing, the
   decomposed string itself is returned.  Returns NULL if decomposition,
   the buffer allocation or the final string construction fails. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    /* i: read position in the decomposed string; i1: look-ahead position;
       o: write position in output; len: length of the decomposed string. */
    Py_ssize_t i, i1, o, len;
    /* f / l: composition-table indices of the current first character and
       of the candidate following character; comb: combining class of the
       most recent following character that could not be combined. */
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions ahead of i whose characters have already been merged into
       an earlier character and must not be emitted again. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
        /* Drop position i if it was consumed by an earlier composition.
           The goto restarts the outer while loop; a plain `continue`
           would only continue this for loop. */
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            /* The comparison with TBase is strict: TBase itself is not
               accepted as a trailing consonant here. */
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(self, nfc_first, code);
        if (f == -1) {
            /* Not the first half of any composition: copy it through. */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                /* An uncombined mark lies between the base and i1. */
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* Two-level lookup of the (f, l) pair; a zero entry means the
               pair has no composition. */
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            /* NOTE(review): the capacity of skipped[] is enforced only by
               this assert; confirm that the composition data can never
               leave 20 positions pending at once. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* The composite may itself start a further composition. */
            f = find_nfc_index(self, nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000777
Max Bélanger2810dd72018-11-04 15:58:24 -0800778typedef enum {YES, NO, MAYBE} NormalMode;
779
780/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
781static NormalMode
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000782is_normalized(PyObject *self, PyObject *input, int nfc, int k)
783{
Martin v. Löwis22970662011-09-29 13:39:38 +0200784 Py_ssize_t i, len;
785 int kind;
786 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000787 unsigned char prev_combining = 0, quickcheck_mask;
788
789 /* An older version of the database is requested, quickchecks must be
790 disabled. */
791 if (self && UCD_Check(self))
Max Bélanger2810dd72018-11-04 15:58:24 -0800792 return NO;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000793
794 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
795 as described in http://unicode.org/reports/tr15/#Annex8. */
796 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
797
Martin v. Löwis22970662011-09-29 13:39:38 +0200798 i = 0;
799 kind = PyUnicode_KIND(input);
800 data = PyUnicode_DATA(input);
801 len = PyUnicode_GET_LENGTH(input);
802 while (i < len) {
803 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
804 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000805 unsigned char combining = record->combining;
806 unsigned char quickcheck = record->normalization_quick_check;
807
808 if (quickcheck & quickcheck_mask)
Max Bélanger2810dd72018-11-04 15:58:24 -0800809 return MAYBE; /* this string might need normalization */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000810 if (combining && prev_combining > combining)
Max Bélanger2810dd72018-11-04 15:58:24 -0800811 return NO; /* non-canonical sort order, not normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000812 prev_combining = combining;
813 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800814 return YES; /* certainly normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000815}
816
Serhiy Storchaka63596412015-04-17 21:18:49 +0300817/*[clinic input]
Max Bélanger2810dd72018-11-04 15:58:24 -0800818unicodedata.UCD.is_normalized
819
820 self: self
821 form: unicode
822 unistr as input: unicode
823 /
824
825Return whether the Unicode string unistr is in the normal form 'form'.
826
827Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
828[clinic start generated code]*/
829
830static PyObject *
831unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
832 PyObject *input)
833/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
834{
835 if (PyUnicode_READY(input) == -1) {
836 return NULL;
837 }
838
839 if (PyUnicode_GET_LENGTH(input) == 0) {
840 /* special case empty input strings. */
841 Py_RETURN_TRUE;
842 }
843
844 PyObject *result;
845 int nfc = 0;
846 int k = 0;
847 NormalMode m;
848
849 PyObject *cmp;
850 int match = 0;
851
852 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
853 nfc = 1;
854 }
855 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
856 nfc = 1;
857 k = 1;
858 }
859 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
860 /* matches default values for `nfc` and `k` */
861 }
862 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
863 k = 1;
864 }
865 else {
866 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
867 return NULL;
868 }
869
870 m = is_normalized(self, input, nfc, k);
871
872 if (m == MAYBE) {
873 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
874 if (cmp == NULL) {
875 return NULL;
876 }
877 match = PyUnicode_Compare(input, cmp);
878 Py_DECREF(cmp);
879 result = (match == 0) ? Py_True : Py_False;
880 }
881 else {
882 result = (m == YES) ? Py_True : Py_False;
883 }
884
885 Py_INCREF(result);
886 return result;
887}
888
889
890/*[clinic input]
Serhiy Storchaka63596412015-04-17 21:18:49 +0300891unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000892
Serhiy Storchaka63596412015-04-17 21:18:49 +0300893 self: self
Max Bélanger2810dd72018-11-04 15:58:24 -0800894 form: unicode
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300895 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300896 /
897
898Return the normal form 'form' for the Unicode string unistr.
899
900Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
901[clinic start generated code]*/
902
903static PyObject *
Max Bélanger2810dd72018-11-04 15:58:24 -0800904unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
Serhiy Storchaka63596412015-04-17 21:18:49 +0300905 PyObject *input)
Max Bélanger2810dd72018-11-04 15:58:24 -0800906/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000907{
Martin v. Löwis22970662011-09-29 13:39:38 +0200908 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000909 /* Special case empty input strings, since resizing
910 them later would cause internal errors. */
911 Py_INCREF(input);
912 return input;
913 }
914
Max Bélanger2810dd72018-11-04 15:58:24 -0800915 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
916 if (is_normalized(self, input, 1, 0) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000917 Py_INCREF(input);
918 return input;
919 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000920 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000921 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800922 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
923 if (is_normalized(self, input, 1, 1) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000924 Py_INCREF(input);
925 return input;
926 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000927 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000928 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800929 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
930 if (is_normalized(self, input, 0, 0) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000931 Py_INCREF(input);
932 return input;
933 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000934 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000935 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800936 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
937 if (is_normalized(self, input, 0, 1) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000938 Py_INCREF(input);
939 return input;
940 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000941 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000942 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000943 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
944 return NULL;
945}
946
Fredrik Lundh06d12682001-01-24 07:59:11 +0000947/* -------------------------------------------------------------------- */
948/* unicode character name tables */
949
950/* data file generated by Tools/unicode/makeunicodedata.py */
951#include "unicodename_db.h"
952
953/* -------------------------------------------------------------------- */
954/* database code (cut and pasted from the unidb package) */
955
/* Hash the first `len` bytes of `s`, ignoring case (each byte is
   upper-cased before it is mixed in).  `scale` is the multiplier applied
   to the running hash at every step.  Whenever bits 24..31 become set
   they are folded back into the low byte, and the value is cut down to
   24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
        pos++;
    }
    return hash;
}
970
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the leading consonant, column 1 by the vowel and
   column 2 by the trailing consonant.  The columns have different
   lengths; slots past the end of a column are NULL. */
static const char * const hangul_syllables[][3] = {
    /* leading  vowel   trailing */
    { "G",      "A",    ""   },
    { "GG",     "AE",   "G"  },
    { "N",      "YA",   "GG" },
    { "D",      "YAE",  "GS" },
    { "DD",     "EO",   "N"  },
    { "R",      "E",    "NJ" },
    { "M",      "YEO",  "NH" },
    { "B",      "YE",   "D"  },
    { "BB",     "O",    "L"  },
    { "S",      "WA",   "LG" },
    { "SS",     "WAE",  "LM" },
    { "",       "OE",   "LB" },
    { "J",      "YO",   "LS" },
    { "JJ",     "U",    "LT" },
    { "C",      "WEO",  "LP" },
    { "K",      "WE",   "LH" },
    { "T",      "WI",   "M"  },
    { "P",      "YU",   "B"  },
    { "H",      "EU",   "BS" },
    { NULL,     "YI",   "S"  },
    { NULL,     "I",    "SS" },
    { NULL,     NULL,   "NG" },
    { NULL,     NULL,   "J"  },
    { NULL,     NULL,   "C"  },
    { NULL,     NULL,   "K"  },
    { NULL,     NULL,   "T"  },
    { NULL,     NULL,   "P"  },
    { NULL,     NULL,   "H"  }
};
1001
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001002/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001003static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001004is_unified_ideograph(Py_UCS4 code)
1005{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001006 return
1007 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson7c69c1c2018-06-06 20:14:28 -07001008 (0x4E00 <= code && code <= 0x9FEF) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001009 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
1010 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -05001011 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -07001012 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1013 (0x2CEB0 <= code && code <= 0x2EBEF); /* CJK Ideograph Extension F */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001014}
1015
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  The argument is
 * parenthesized so that an arbitrary expression can be passed; note that it
 * is evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1021
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001022static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001023_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1024 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001025{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001026 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001027 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1028 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001029 int offset;
1030 int i;
1031 int word;
1032 unsigned char* w;
1033
Martin v. Löwisc3509122006-03-11 12:16:23 +00001034 if (code >= 0x110000)
1035 return 0;
1036
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001037 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001038 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1039 return 0;
1040
Martin v. Löwis1a214512008-06-11 05:26:20 +00001041 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001042 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001043 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001044 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1045 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001046 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001047 if (old->category_changed == 0) {
1048 /* unassigned */
1049 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001050 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001051 }
1052
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001053 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001054 /* Hangul syllable. */
1055 int SIndex = code - SBase;
1056 int L = SIndex / NCount;
1057 int V = (SIndex % NCount) / TCount;
1058 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001059
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001060 if (buflen < 27)
1061 /* Worst case: HANGUL SYLLABLE <10chars>. */
1062 return 0;
1063 strcpy(buffer, "HANGUL SYLLABLE ");
1064 buffer += 16;
1065 strcpy(buffer, hangul_syllables[L][0]);
1066 buffer += strlen(hangul_syllables[L][0]);
1067 strcpy(buffer, hangul_syllables[V][1]);
1068 buffer += strlen(hangul_syllables[V][1]);
1069 strcpy(buffer, hangul_syllables[T][2]);
1070 buffer += strlen(hangul_syllables[T][2]);
1071 *buffer = '\0';
1072 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001073 }
1074
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001075 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001076 if (buflen < 28)
1077 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1078 return 0;
1079 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1080 return 1;
1081 }
1082
Fredrik Lundh06d12682001-01-24 07:59:11 +00001083 /* get offset into phrasebook */
1084 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1085 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1086 (code&((1<<phrasebook_shift)-1))];
1087 if (!offset)
1088 return 0;
1089
1090 i = 0;
1091
1092 for (;;) {
1093 /* get word index */
1094 word = phrasebook[offset] - phrasebook_short;
1095 if (word >= 0) {
1096 word = (word << 8) + phrasebook[offset+1];
1097 offset += 2;
1098 } else
1099 word = phrasebook[offset++];
1100 if (i) {
1101 if (i > buflen)
1102 return 0; /* buffer overflow */
1103 buffer[i++] = ' ';
1104 }
1105 /* copy word string from lexicon. the last character in the
1106 word has bit 7 set. the last word in a string ends with
1107 0x80 */
1108 w = lexicon + lexicon_offset[word];
1109 while (*w < 128) {
1110 if (i >= buflen)
1111 return 0; /* buffer overflow */
1112 buffer[i++] = *w++;
1113 }
1114 if (i >= buflen)
1115 return 0; /* buffer overflow */
1116 buffer[i++] = *w & 127;
1117 if (*w == 128)
1118 break; /* end of word */
1119 }
1120
1121 return 1;
1122}
1123
1124static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001125_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001126{
1127 /* check if code corresponds to the given name */
1128 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001129 char buffer[NAME_MAXLEN+1];
1130 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001131 return 0;
1132 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001133 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001134 return 0;
1135 }
1136 return buffer[namelen] == '\0';
1137}
1138
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001139static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001140find_syllable(const char *str, int *len, int *pos, int count, int column)
1141{
1142 int i, len1;
1143 *len = -1;
1144 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001145 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001146 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 if (len1 <= *len)
1148 continue;
1149 if (strncmp(str, s, len1) == 0) {
1150 *len = len1;
1151 *pos = i;
1152 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001153 }
1154 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001155 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001156 }
1157}
1158
Fredrik Lundh06d12682001-01-24 07:59:11 +00001159static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001160_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001161{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001162 /* check if named sequences are allowed */
1163 if (!with_named_seq && IS_NAMED_SEQ(cp))
1164 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001165 /* if the code point is in the PUA range that we use for aliases,
1166 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001167 if (IS_ALIAS(cp))
1168 *code = name_aliases[cp-aliases_start];
1169 else
1170 *code = cp;
1171 return 1;
1172}
1173
1174static int
1175_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1176 int with_named_seq)
1177{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001178 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001179 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001180 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001181 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001182 unsigned int h, v;
1183 unsigned int mask = code_size-1;
1184 unsigned int i, incr;
1185
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001186 /* Check for hangul syllables. */
1187 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001188 int len, L = -1, V = -1, T = -1;
1189 const char *pos = name + 16;
1190 find_syllable(pos, &len, &L, LCount, 0);
1191 pos += len;
1192 find_syllable(pos, &len, &V, VCount, 1);
1193 pos += len;
1194 find_syllable(pos, &len, &T, TCount, 2);
1195 pos += len;
1196 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1197 *code = SBase + (L*VCount+V)*TCount + T;
1198 return 1;
1199 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001200 /* Otherwise, it's an illegal syllable name. */
1201 return 0;
1202 }
1203
1204 /* Check for unified ideographs. */
1205 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1206 /* Four or five hexdigits must follow. */
1207 v = 0;
1208 name += 22;
1209 namelen -= 22;
1210 if (namelen != 4 && namelen != 5)
1211 return 0;
1212 while (namelen--) {
1213 v *= 16;
1214 if (*name >= '0' && *name <= '9')
1215 v += *name - '0';
1216 else if (*name >= 'A' && *name <= 'F')
1217 v += *name - 'A' + 10;
1218 else
1219 return 0;
1220 name++;
1221 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001222 if (!is_unified_ideograph(v))
1223 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001224 *code = v;
1225 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001226 }
1227
Fredrik Lundh06d12682001-01-24 07:59:11 +00001228 /* the following is the same as python's dictionary lookup, with
1229 only minor changes. see the makeunicodedata script for more
1230 details */
1231
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001232 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001233 i = (~h) & mask;
1234 v = code_hash[i];
1235 if (!v)
1236 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001237 if (_cmpname(self, v, name, namelen))
1238 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001239 incr = (h ^ (h >> 3)) & mask;
1240 if (!incr)
1241 incr = mask;
1242 for (;;) {
1243 i = (i + incr) & mask;
1244 v = code_hash[i];
1245 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001246 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001247 if (_cmpname(self, v, name, namelen))
1248 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001249 incr = incr << 1;
1250 if (incr > mask)
1251 incr = incr ^ code_poly;
1252 }
1253}
1254
/* Table of the name-lookup entry points.  It is initialized positionally
   with the size of the structure itself, followed by the _getucname and
   _getcode functions of this file.
   NOTE(review): presumably this table is what the module publishes so
   that other code can resolve character names; the code that exports it
   is not in this part of the file -- confirm. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1261
1262/* -------------------------------------------------------------------- */
1263/* Python bindings */
1264
Serhiy Storchaka63596412015-04-17 21:18:49 +03001265/*[clinic input]
1266unicodedata.UCD.name
1267
1268 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001269 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001270 default: object=NULL
1271 /
1272
1273Returns the name assigned to the character chr as a string.
1274
1275If no name is defined, default is returned, or, if not given,
1276ValueError is raised.
1277[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001278
Fredrik Lundh06d12682001-01-24 07:59:11 +00001279static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001280unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001281/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001282{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001283 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001284 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001285
Christian Heimes2f366ca2016-09-23 20:20:27 +02001286 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001287 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001289 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290 }
1291 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001292 Py_INCREF(default_value);
1293 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001295 }
1296
Walter Dörwald4254e762007-06-05 16:04:09 +00001297 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001298}
1299
Serhiy Storchaka63596412015-04-17 21:18:49 +03001300/*[clinic input]
1301unicodedata.UCD.lookup
1302
1303 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001304 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001305 /
1306
1307Look up character by name.
1308
1309If a character with the given name is found, return the
1310corresponding character. If not found, KeyError is raised.
1311[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001312
Fredrik Lundh06d12682001-01-24 07:59:11 +00001313static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001314unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1315 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001316/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001317{
1318 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001319 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001320 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001321 PyErr_SetString(PyExc_KeyError, "name too long");
1322 return NULL;
1323 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001324
Serhiy Storchaka63596412015-04-17 21:18:49 +03001325 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001326 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001327 return NULL;
1328 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001329 /* check if code is in the PUA range that we use for named sequences
1330 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001331 if (IS_NAMED_SEQ(code)) {
1332 index = code-named_sequences_start;
1333 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1334 named_sequences[index].seq,
1335 named_sequences[index].seqlen);
1336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001338}
1339
/* XXX Add doc strings. */

/* Method table.  It is also installed as tp_methods of UCD_Type below.
   Each *_METHODDEF macro supplies the table entry for one function; there
   is no comma between them, so every macro must expand to an entry that
   ends with its own comma.
   NOTE(review): the macros are presumably generated by Argument Clinic in
   a header that is not visible in this part of the file -- confirm. */
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}                /* sentinel */
};
1358
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001359static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360 /* The ob_type field must be initialized in the module init function
1361 * to be portable to Windows without using C++. */
1362 PyVarObject_HEAD_INIT(NULL, 0)
1363 "unicodedata.UCD", /*tp_name*/
1364 sizeof(PreviousDBVersion), /*tp_basicsize*/
1365 0, /*tp_itemsize*/
1366 /* methods */
1367 (destructor)PyObject_Del, /*tp_dealloc*/
1368 0, /*tp_print*/
1369 0, /*tp_getattr*/
1370 0, /*tp_setattr*/
1371 0, /*tp_reserved*/
1372 0, /*tp_repr*/
1373 0, /*tp_as_number*/
1374 0, /*tp_as_sequence*/
1375 0, /*tp_as_mapping*/
1376 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001377 0, /*tp_call*/
1378 0, /*tp_str*/
1379 PyObject_GenericGetAttr,/*tp_getattro*/
1380 0, /*tp_setattro*/
1381 0, /*tp_as_buffer*/
1382 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1383 0, /*tp_doc*/
1384 0, /*tp_traverse*/
1385 0, /*tp_clear*/
1386 0, /*tp_richcompare*/
1387 0, /*tp_weaklistoffset*/
1388 0, /*tp_iter*/
1389 0, /*tp_iternext*/
1390 unicodedata_functions, /*tp_methods*/
1391 DB_members, /*tp_members*/
1392 0, /*tp_getset*/
1393 0, /*tp_base*/
1394 0, /*tp_dict*/
1395 0, /*tp_descr_get*/
1396 0, /*tp_descr_set*/
1397 0, /*tp_dictoffset*/
1398 0, /*tp_init*/
1399 0, /*tp_alloc*/
1400 0, /*tp_new*/
1401 0, /*tp_free*/
1402 0, /*tp_is_gc*/
1403};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001404
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001405PyDoc_STRVAR(unicodedata_docstring,
1406"This module provides access to the Unicode Character Database which\n\
1407defines character properties for all Unicode characters. The data in\n\
1408this database is based on the UnicodeData.txt file version\n\
luzpaza5293b42017-11-05 07:37:50 -06001409" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001410\n\
1411The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001412UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001413
1414static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001415 PyModuleDef_HEAD_INIT,
1416 "unicodedata",
1417 unicodedata_docstring,
1418 -1,
1419 unicodedata_functions,
1420 NULL,
1421 NULL,
1422 NULL,
1423 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001424};
1425
Mark Hammond62b1ab12002-07-23 06:31:15 +00001426PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001427PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001428{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001429 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001430
Christian Heimes90aa7642007-12-19 02:45:37 +00001431 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001432
Martin v. Löwis1a214512008-06-11 05:26:20 +00001433 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001434 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001435 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001436
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001437 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001438 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001439 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001440
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001441 /* Previous versions */
1442 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1443 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001444 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001445
Fredrik Lundh06d12682001-01-24 07:59:11 +00001446 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001447 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001448 if (v != NULL)
1449 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001450 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001451}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001452
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001453/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001454Local variables:
1455c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001456indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001457End:
1458*/