blob: ce97bdf89f0acb1172be5246779601659972bac7 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Greg Price2f094132019-09-03 19:45:44 -070022#include <stdbool.h>
23
Max Bélanger2810dd72018-11-04 15:58:24 -080024_Py_IDENTIFIER(NFC);
25_Py_IDENTIFIER(NFD);
26_Py_IDENTIFIER(NFKC);
27_Py_IDENTIFIER(NFKD);
28
Larry Hastings61272b72014-01-07 12:41:53 -080029/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080030module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080031class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080032[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080033/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080034
Fredrik Lundh06d12682001-01-24 07:59:11 +000035/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000036
/* One entry of the per-character property table returned by
   _getrecord_ex().  Every field is a single byte. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
48
/* Difference between the current database and a previous Unicode version
   for one code point.  In the byte-sized fields the accessors in this file
   treat 0xFF as "unchanged"; category_changed == 0 is treated as
   "unassigned in the old version".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
58
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059/* data file generated by Tools/unicode/makeunicodedata.py */
60#include "unicodedata_db.h"
61
62static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000063_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000065 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000066 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000067 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74}
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076/* ------------- Previous-version API ------------------------------------- */
77typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82} PreviousDBVersion;
83
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030084#include "clinic/unicodedata.c.h"
85
/* Fetch the change record of code point `v` through the getrecord callback
   of `self`, which must be a PreviousDBVersion object. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion*)self)->getrecord)(v))
87
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090 {NULL}
91};
92
Thomas Wouters89f507f2006-12-13 04:49:30 +000093/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000094static PyTypeObject UCD_Type;
Dong-hee Na1b55b652020-02-17 19:09:15 +090095#define UCD_Check(o) Py_IS_TYPE(o, &UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000096
97static PyObject*
98new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
99 Py_UCS4 (*normalization)(Py_UCS4))
100{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000101 PreviousDBVersion *self;
102 self = PyObject_New(PreviousDBVersion, &UCD_Type);
103 if (self == NULL)
104 return NULL;
105 self->name = name;
106 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000107 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000108 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000109}
110
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000111
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000112/* --- Module API --------------------------------------------------------- */
113
Larry Hastings61272b72014-01-07 12:41:53 -0800114/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800115unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700116
Serhiy Storchaka63596412015-04-17 21:18:49 +0300117 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700118 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700119 default: object=NULL
120 /
121
122Converts a Unicode character into its equivalent decimal value.
123
Serhiy Storchaka63596412015-04-17 21:18:49 +0300124Returns the decimal value assigned to the character chr as integer.
125If no such value is defined, default is returned, or, if not given,
126ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800127[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700128
Larry Hastings31826802013-10-19 00:09:25 -0700129static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300130unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400131 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700132/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700133{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000135 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300136 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000137
Martin v. Löwis1a214512008-06-11 05:26:20 +0000138 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000139 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000140 if (old->category_changed == 0) {
141 /* unassigned */
142 have_old = 1;
143 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000144 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000145 else if (old->decimal_changed != 0xFF) {
146 have_old = 1;
147 rc = old->decimal_changed;
148 }
149 }
150
151 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000152 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000153 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700154 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 PyErr_SetString(PyExc_ValueError,
156 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000157 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 }
159 else {
Larry Hastings31826802013-10-19 00:09:25 -0700160 Py_INCREF(default_value);
161 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000162 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000163 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000164 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000165}
166
Serhiy Storchaka63596412015-04-17 21:18:49 +0300167/*[clinic input]
168unicodedata.UCD.digit
169
170 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700171 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300172 default: object=NULL
173 /
174
175Converts a Unicode character into its equivalent digit value.
176
177Returns the digit value assigned to the character chr as integer.
178If no such value is defined, default is returned, or, if not given,
179ValueError is raised.
180[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000181
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300183unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700184/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000186 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300187 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000188 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300190 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000192 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193 }
194 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300195 Py_INCREF(default_value);
196 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000197 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000198 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000199 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000200}
201
Serhiy Storchaka63596412015-04-17 21:18:49 +0300202/*[clinic input]
203unicodedata.UCD.numeric
204
205 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700206 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300207 default: object=NULL
208 /
209
210Converts a Unicode character into its equivalent numeric value.
211
212Returns the numeric value assigned to the character chr as float.
213If no such value is defined, default is returned, or, if not given,
214ValueError is raised.
215[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000216
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000217static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300218unicodedata_UCD_numeric_impl(PyObject *self, int chr,
219 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700220/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000222 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000223 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300224 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225
Martin v. Löwis1a214512008-06-11 05:26:20 +0000226 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000227 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000228 if (old->category_changed == 0) {
229 /* unassigned */
230 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000233 else if (old->decimal_changed != 0xFF) {
234 have_old = 1;
235 rc = old->decimal_changed;
236 }
237 }
238
239 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000240 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000241 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300242 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 PyErr_SetString(PyExc_ValueError, "not a numeric character");
244 return NULL;
245 }
246 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300247 Py_INCREF(default_value);
248 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000249 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250 }
251 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252}
253
Serhiy Storchaka63596412015-04-17 21:18:49 +0300254/*[clinic input]
255unicodedata.UCD.category
256
257 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700258 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300259 /
260
261Returns the general category assigned to the character chr as string.
262[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000263
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000264static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300265unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700266/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000267{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000268 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300269 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000270 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000271 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000272 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000273 if (old->category_changed != 0xFF)
274 index = old->category_changed;
275 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000276 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000277}
278
Serhiy Storchaka63596412015-04-17 21:18:49 +0300279/*[clinic input]
280unicodedata.UCD.bidirectional
281
282 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700283 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300284 /
285
286Returns the bidirectional class assigned to the character chr as string.
287
288If no such value is defined, an empty string is returned.
289[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000290
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000291static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300292unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700293/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000295 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300296 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000297 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000298 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000299 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000300 if (old->category_changed == 0)
301 index = 0; /* unassigned */
302 else if (old->bidir_changed != 0xFF)
303 index = old->bidir_changed;
304 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000305 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000306}
307
Serhiy Storchaka63596412015-04-17 21:18:49 +0300308/*[clinic input]
309unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000310
Serhiy Storchaka63596412015-04-17 21:18:49 +0300311 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700312 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300313 /
314
315Returns the canonical combining class assigned to the character chr as integer.
316
317Returns 0 if no combining class is defined.
318[clinic start generated code]*/
319
320static int
321unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700322/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000323{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000324 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300325 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000326 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000327 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000328 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000329 if (old->category_changed == 0)
330 index = 0; /* unassigned */
331 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300332 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000333}
334
Serhiy Storchaka63596412015-04-17 21:18:49 +0300335/*[clinic input]
336unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000337
Serhiy Storchaka63596412015-04-17 21:18:49 +0300338 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700339 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300340 /
341
342Returns the mirrored property assigned to the character chr as integer.
343
344Returns 1 if the character has been identified as a "mirrored"
345character in bidirectional text, 0 otherwise.
346[clinic start generated code]*/
347
348static int
349unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700350/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000351{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000352 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300353 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000355 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000356 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300362 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363}
364
Serhiy Storchaka63596412015-04-17 21:18:49 +0300365/*[clinic input]
366unicodedata.UCD.east_asian_width
367
368 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700369 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300370 /
371
372Returns the east asian width assigned to the character chr as string.
373[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000374
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000375static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300376unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700377/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000378{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000379 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300380 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000381 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000382 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000383 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000384 if (old->category_changed == 0)
385 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700386 else if (old->east_asian_width_changed != 0xFF)
387 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000388 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000389 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000390}
391
Serhiy Storchaka63596412015-04-17 21:18:49 +0300392/*[clinic input]
393unicodedata.UCD.decomposition
394
395 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700396 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300397 /
398
399Returns the character decomposition mapping assigned to the character chr as string.
400
401An empty string is returned in case no such mapping is defined.
402[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000403
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000404static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300405unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700406/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000408 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000409 int code, index, count;
410 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000411 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300412 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000413
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000414 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415
Martin v. Löwis1a214512008-06-11 05:26:20 +0000416 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000417 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000418 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000419 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000420 }
421
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000422 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000423 index = 0;
424 else {
425 index = decomp_index1[(code>>DECOMP_SHIFT)];
426 index = decomp_index2[(index<<DECOMP_SHIFT)+
427 (code&((1<<DECOMP_SHIFT)-1))];
428 }
429
Tim Peters69b83b12001-11-30 07:23:05 +0000430 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000431 is prefix code (from*/
432 count = decomp_data[index] >> 8;
433
434 /* XXX: could allocate the PyString up front instead
435 (strlen(prefix) + 5 * count + 1 bytes) */
436
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000437 /* Based on how index is calculated above and decomp_data is generated
438 from Tools/unicode/makeunicodedata.py, it should not be possible
439 to overflow decomp_prefix. */
440 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200441 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000442
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000443 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000444 i = strlen(decomp_prefix[prefix_index]);
445 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446
447 while (count-- > 0) {
448 if (i)
449 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000450 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000451 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
452 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000453 i += strlen(decomp + i);
454 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000455 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000456}
457
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000458static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000459get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000460{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000461 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000462 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000464 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000465 /* unassigned in old version */
466 *index = 0;
467 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000468 else {
469 *index = decomp_index1[(code>>DECOMP_SHIFT)];
470 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
471 (code&((1<<DECOMP_SHIFT)-1))];
472 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473
Martin v. Löwis677bde22002-11-23 22:08:15 +0000474 /* high byte is number of hex bytes (usually one or two), low byte
475 is prefix code (from*/
476 *count = decomp_data[*index] >> 8;
477 *prefix = decomp_data[*index] & 255;
478
479 (*index)++;
480}
481
/* Constants for the arithmetic Hangul syllable decomposition. */
#define SBase 0xAC00    /* first precomposed syllable */
#define LBase 0x1100    /* base added to the L index */
#define VBase 0x1161    /* base added to the V index */
#define TBase 0x11A7    /* base added to the T index; T == TBase means none */
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount*TCount)  /* syllables per L value */
#define SCount (LCount*NCount)  /* total number of syllables */
491
492static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000493nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000494{
495 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200496 Py_UCS4 *output;
497 Py_ssize_t i, o, osize;
498 int kind;
499 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000500 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200501 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 Py_ssize_t space, isize;
503 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000504 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000505
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200507 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500508 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300509 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500510 if (space > 10) {
511 if (space <= PY_SSIZE_T_MAX - 10)
512 space += 10;
513 }
514 else {
515 space *= 2;
516 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200517 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500518 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200519 if (!output) {
520 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000521 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200522 }
523 i = o = 0;
524 kind = PyUnicode_KIND(input);
525 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000526
Martin v. Löwis22970662011-09-29 13:39:38 +0200527 while (i < isize) {
528 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000529 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000531 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300532 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000533 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000534 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200535 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000536 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000537 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
538 if (new_output == NULL) {
539 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200540 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000541 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200542 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000543 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000544 }
545 /* Hangul Decomposition. */
546 if (SBase <= code && code < (SBase+SCount)) {
547 int SIndex = code - SBase;
548 int L = LBase + SIndex / NCount;
549 int V = VBase + (SIndex % NCount) / TCount;
550 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200551 output[o++] = L;
552 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000553 space -= 2;
554 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200555 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000556 space --;
557 }
558 continue;
559 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000560 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000561 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000562 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
563 if (value != 0) {
564 stack[stackptr++] = value;
565 continue;
566 }
567 }
568
569 /* Other decompositions. */
570 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000571
572 /* Copy character if it is not decomposable, or has a
573 compatibility decomposition, but we do NFD. */
574 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200575 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000576 space--;
577 continue;
578 }
579 /* Copy decomposition onto the stack, in reverse
580 order. */
581 while(count) {
582 code = decomp_data[index + (--count)];
583 stack[stackptr++] = code;
584 }
585 }
586 }
587
Martin v. Löwis22970662011-09-29 13:39:38 +0200588 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
589 output, o);
590 PyMem_Free(output);
591 if (!result)
592 return NULL;
593 /* result is guaranteed to be ready, as it is compact. */
594 kind = PyUnicode_KIND(result);
595 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000596
597 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200598 i = 0;
599 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
600 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
601 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000602 if (prev == 0 || cur == 0 || prev <= cur) {
603 prev = cur;
604 continue;
605 }
606 /* Non-canonical order. Need to switch *i with previous. */
607 o = i - 1;
608 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200609 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
610 PyUnicode_WRITE(kind, data, o+1,
611 PyUnicode_READ(kind, data, o));
612 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000613 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200614 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000615 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200616 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000617 if (prev == 0 || prev <= cur)
618 break;
619 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200620 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000621 }
622 return result;
623}
624
625static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200626find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000627{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200628 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000629 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200630 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000631 if (code < start)
632 return -1;
633 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200634 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000635 return nfc[index].index + delta;
636 }
637 }
638 return -1;
639}
640
641static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000642nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000643{
644 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200645 int kind;
646 void *data;
647 Py_UCS4 *output;
648 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000649 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200650 Py_UCS4 code;
651 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652 int cskipped = 0;
653
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000654 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000655 if (!result)
656 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200657 /* result will be "ready". */
658 kind = PyUnicode_KIND(result);
659 data = PyUnicode_DATA(result);
660 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000661
Martin v. Löwis22970662011-09-29 13:39:38 +0200662 /* We allocate a buffer for the output.
663 If we find that we made no changes, we still return
664 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500665 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200666 if (!output) {
667 PyErr_NoMemory();
668 Py_DECREF(result);
669 return 0;
670 }
671 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672
Martin v. Löwis677bde22002-11-23 22:08:15 +0000673 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200674 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000675 for (index = 0; index < cskipped; index++) {
676 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000677 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000678 Remove from list. */
679 skipped[index] = skipped[cskipped-1];
680 cskipped--;
681 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000682 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000683 }
684 }
685 /* Hangul Composition. We don't need to check for <LV,T>
686 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200687 code = PyUnicode_READ(kind, data, i);
688 if (LBase <= code && code < (LBase+LCount) &&
689 i + 1 < len &&
690 VBase <= PyUnicode_READ(kind, data, i+1) &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900691 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
692 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
693 and V character is a modern vowel (0x1161 ~ 0x1175). */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200695 LIndex = code - LBase;
696 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 code = SBase + (LIndex*VCount+VIndex)*TCount;
698 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200699 if (i < len &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900700 TBase < PyUnicode_READ(kind, data, i) &&
701 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
702 /* check T character is a modern trailing consonant
703 (0x11A8 ~ 0x11C2). */
Martin v. Löwis22970662011-09-29 13:39:38 +0200704 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000705 i++;
706 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200707 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000708 continue;
709 }
710
Martin v. Löwis22970662011-09-29 13:39:38 +0200711 /* code is still input[i] here */
712 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000713 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200714 output[o++] = code;
715 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000716 continue;
717 }
718 /* Find next unblocked character. */
719 i1 = i+1;
720 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200721 /* output base character for now; might be updated later. */
722 output[o] = PyUnicode_READ(kind, data, i);
723 while (i1 < len) {
724 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
725 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000726 if (comb) {
727 if (comb1 == 0)
728 break;
729 if (comb >= comb1) {
730 /* Character is blocked. */
731 i1++;
732 continue;
733 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000734 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200735 l = find_nfc_index(self, nfc_last, code1);
736 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000737 is a starter, we don't need to look further.
738 Otherwise, record the combining class. */
739 if (l == -1) {
740 not_combinable:
741 if (comb1 == 0)
742 break;
743 comb = comb1;
744 i1++;
745 continue;
746 }
747 index = f*TOTAL_LAST + l;
748 index1 = comp_index[index >> COMP_SHIFT];
749 code = comp_data[(index1<<COMP_SHIFT)+
750 (index&((1<<COMP_SHIFT)-1))];
751 if (code == 0)
752 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000753
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200755 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000756 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000757 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000758 skipped[cskipped++] = i1;
759 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200760 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000761 if (f == -1)
762 break;
763 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200764 /* Output character was already written.
765 Just advance the indices. */
766 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000767 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200768 if (o == len) {
769 /* No changes. Return original string. */
770 PyMem_Free(output);
771 return result;
772 }
773 Py_DECREF(result);
774 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
775 output, o);
776 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000777 return result;
778}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000779
/* Outcome of a normalization quickcheck.
   The numeric values need to match the logic in makeunicodedata.py,
   which constructs the quickcheck data. */
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
Max Bélanger2810dd72018-11-04 15:58:24 -0800783
Greg Price2f094132019-09-03 19:45:44 -0700784/* Run the Unicode normalization "quickcheck" algorithm.
785 *
786 * Return YES or NO if quickcheck determines the input is certainly
787 * normalized or certainly not, and MAYBE if quickcheck is unable to
788 * tell.
789 *
790 * If `yes_only` is true, then return MAYBE as soon as we determine
791 * the answer is not YES.
792 *
793 * For background and details on the algorithm, see UAX #15:
794 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
795 */
796static QuickcheckResult
797is_normalized_quickcheck(PyObject *self, PyObject *input,
Greg Price7669cb82019-09-09 02:16:31 -0700798 bool nfc, bool k, bool yes_only)
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000799{
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000800 /* An older version of the database is requested, quickchecks must be
801 disabled. */
802 if (self && UCD_Check(self))
Max Bélanger2810dd72018-11-04 15:58:24 -0800803 return NO;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000804
Greg Price2f094132019-09-03 19:45:44 -0700805 Py_ssize_t i, len;
806 int kind;
807 void *data;
808 unsigned char prev_combining = 0;
809
810 /* The two quickcheck bits at this shift have type QuickcheckResult. */
811 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
812
813 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000814
Martin v. Löwis22970662011-09-29 13:39:38 +0200815 i = 0;
816 kind = PyUnicode_KIND(input);
817 data = PyUnicode_DATA(input);
818 len = PyUnicode_GET_LENGTH(input);
819 while (i < len) {
820 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
821 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000822
Greg Price2f094132019-09-03 19:45:44 -0700823 unsigned char combining = record->combining;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000824 if (combining && prev_combining > combining)
Max Bélanger2810dd72018-11-04 15:58:24 -0800825 return NO; /* non-canonical sort order, not normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000826 prev_combining = combining;
Greg Price2f094132019-09-03 19:45:44 -0700827
828 unsigned char quickcheck_whole = record->normalization_quick_check;
829 if (yes_only) {
830 if (quickcheck_whole & (3 << quickcheck_shift))
831 return MAYBE;
832 } else {
833 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
834 case NO:
835 return NO;
836 case MAYBE:
837 result = MAYBE; /* this string might need normalization */
838 }
839 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000840 }
Greg Price2f094132019-09-03 19:45:44 -0700841 return result;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000842}
843
Serhiy Storchaka63596412015-04-17 21:18:49 +0300844/*[clinic input]
Max Bélanger2810dd72018-11-04 15:58:24 -0800845unicodedata.UCD.is_normalized
846
847 self: self
848 form: unicode
849 unistr as input: unicode
850 /
851
852Return whether the Unicode string unistr is in the normal form 'form'.
853
854Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
855[clinic start generated code]*/
856
857static PyObject *
858unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
859 PyObject *input)
860/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
861{
862 if (PyUnicode_READY(input) == -1) {
863 return NULL;
864 }
865
866 if (PyUnicode_GET_LENGTH(input) == 0) {
867 /* special case empty input strings. */
868 Py_RETURN_TRUE;
869 }
870
871 PyObject *result;
Greg Price7669cb82019-09-09 02:16:31 -0700872 bool nfc = false;
873 bool k = false;
Greg Price2f094132019-09-03 19:45:44 -0700874 QuickcheckResult m;
Max Bélanger2810dd72018-11-04 15:58:24 -0800875
876 PyObject *cmp;
877 int match = 0;
878
879 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700880 nfc = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800881 }
882 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700883 nfc = true;
884 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800885 }
886 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
887 /* matches default values for `nfc` and `k` */
888 }
889 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700890 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800891 }
892 else {
893 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
894 return NULL;
895 }
896
Greg Price2f094132019-09-03 19:45:44 -0700897 m = is_normalized_quickcheck(self, input, nfc, k, false);
Max Bélanger2810dd72018-11-04 15:58:24 -0800898
899 if (m == MAYBE) {
900 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
901 if (cmp == NULL) {
902 return NULL;
903 }
904 match = PyUnicode_Compare(input, cmp);
905 Py_DECREF(cmp);
906 result = (match == 0) ? Py_True : Py_False;
907 }
908 else {
909 result = (m == YES) ? Py_True : Py_False;
910 }
911
912 Py_INCREF(result);
913 return result;
914}
915
916
917/*[clinic input]
Serhiy Storchaka63596412015-04-17 21:18:49 +0300918unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000919
Serhiy Storchaka63596412015-04-17 21:18:49 +0300920 self: self
Max Bélanger2810dd72018-11-04 15:58:24 -0800921 form: unicode
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300922 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300923 /
924
925Return the normal form 'form' for the Unicode string unistr.
926
927Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
928[clinic start generated code]*/
929
930static PyObject *
Max Bélanger2810dd72018-11-04 15:58:24 -0800931unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
Serhiy Storchaka63596412015-04-17 21:18:49 +0300932 PyObject *input)
Max Bélanger2810dd72018-11-04 15:58:24 -0800933/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000934{
Martin v. Löwis22970662011-09-29 13:39:38 +0200935 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000936 /* Special case empty input strings, since resizing
937 them later would cause internal errors. */
938 Py_INCREF(input);
939 return input;
940 }
941
Max Bélanger2810dd72018-11-04 15:58:24 -0800942 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700943 if (is_normalized_quickcheck(self, input, true, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000944 Py_INCREF(input);
945 return input;
946 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000947 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000948 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800949 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700950 if (is_normalized_quickcheck(self, input, true, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000951 Py_INCREF(input);
952 return input;
953 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000954 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000955 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800956 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700957 if (is_normalized_quickcheck(self, input, false, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000958 Py_INCREF(input);
959 return input;
960 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000961 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000962 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800963 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700964 if (is_normalized_quickcheck(self, input, false, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000965 Py_INCREF(input);
966 return input;
967 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000968 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000969 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000970 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
971 return NULL;
972}
973
Fredrik Lundh06d12682001-01-24 07:59:11 +0000974/* -------------------------------------------------------------------- */
975/* unicode character name tables */
976
977/* data file generated by Tools/unicode/makeunicodedata.py */
978#include "unicodename_db.h"
979
980/* -------------------------------------------------------------------- */
981/* database code (cut and pasted from the unidb package) */
982
/* Hash the first `len` bytes of `s`, upper-casing each byte first so
 * that the result does not depend on letter case.
 *
 * Each step multiplies the running hash by `scale` and adds the byte;
 * whenever bits 24-31 become non-zero they are folded back into the low
 * byte and the hash is cut down to 24 bits.
 */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
997
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the leading consonant, column 1 by the vowel and
   column 2 by the trailing consonant.  The columns have different
   lengths; cells past the end of a column are NULL. */
static const char * const hangul_syllables[][3] = {
    /*  leading  vowel   trailing */
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
1028
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001029/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001030static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001031is_unified_ideograph(Py_UCS4 code)
1032{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001033 return
1034 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson7c69c1c2018-06-06 20:14:28 -07001035 (0x4E00 <= code && code <= 0x9FEF) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001036 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
1037 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -05001038 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -07001039 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1040 (0x2CEB0 <= code && code <= 0x2EBEF); /* CJK Ideograph Extension F */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001041}
1042
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * Both ranges are half-open: the start is included, the end is not.
 * The argument is parenthesized so that an expression such as `x & mask`
 * is compared as a whole; note that it is still evaluated twice. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1048
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001049static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001050_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1051 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001052{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001053 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001054 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1055 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001056 int offset;
1057 int i;
1058 int word;
Inada Naoki6fec9052019-04-17 08:40:34 +09001059 const unsigned char* w;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001060
Martin v. Löwisc3509122006-03-11 12:16:23 +00001061 if (code >= 0x110000)
1062 return 0;
1063
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001064 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001065 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1066 return 0;
1067
Martin v. Löwis1a214512008-06-11 05:26:20 +00001068 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001069 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001070 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001071 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1072 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001073 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001074 if (old->category_changed == 0) {
1075 /* unassigned */
1076 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001077 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001078 }
1079
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001080 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001081 /* Hangul syllable. */
1082 int SIndex = code - SBase;
1083 int L = SIndex / NCount;
1084 int V = (SIndex % NCount) / TCount;
1085 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001086
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001087 if (buflen < 27)
1088 /* Worst case: HANGUL SYLLABLE <10chars>. */
1089 return 0;
1090 strcpy(buffer, "HANGUL SYLLABLE ");
1091 buffer += 16;
1092 strcpy(buffer, hangul_syllables[L][0]);
1093 buffer += strlen(hangul_syllables[L][0]);
1094 strcpy(buffer, hangul_syllables[V][1]);
1095 buffer += strlen(hangul_syllables[V][1]);
1096 strcpy(buffer, hangul_syllables[T][2]);
1097 buffer += strlen(hangul_syllables[T][2]);
1098 *buffer = '\0';
1099 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001100 }
1101
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001102 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001103 if (buflen < 28)
1104 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1105 return 0;
1106 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1107 return 1;
1108 }
1109
Fredrik Lundh06d12682001-01-24 07:59:11 +00001110 /* get offset into phrasebook */
1111 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1112 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1113 (code&((1<<phrasebook_shift)-1))];
1114 if (!offset)
1115 return 0;
1116
1117 i = 0;
1118
1119 for (;;) {
1120 /* get word index */
1121 word = phrasebook[offset] - phrasebook_short;
1122 if (word >= 0) {
1123 word = (word << 8) + phrasebook[offset+1];
1124 offset += 2;
1125 } else
1126 word = phrasebook[offset++];
1127 if (i) {
1128 if (i > buflen)
1129 return 0; /* buffer overflow */
1130 buffer[i++] = ' ';
1131 }
1132 /* copy word string from lexicon. the last character in the
1133 word has bit 7 set. the last word in a string ends with
1134 0x80 */
1135 w = lexicon + lexicon_offset[word];
1136 while (*w < 128) {
1137 if (i >= buflen)
1138 return 0; /* buffer overflow */
1139 buffer[i++] = *w++;
1140 }
1141 if (i >= buflen)
1142 return 0; /* buffer overflow */
1143 buffer[i++] = *w & 127;
1144 if (*w == 128)
1145 break; /* end of word */
1146 }
1147
1148 return 1;
1149}
1150
1151static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001152_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001153{
1154 /* check if code corresponds to the given name */
1155 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001156 char buffer[NAME_MAXLEN+1];
1157 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001158 return 0;
1159 for (i = 0; i < namelen; i++) {
Jordon Xu2ec70102019-09-11 00:04:08 +08001160 if (Py_TOUPPER(name[i]) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001161 return 0;
1162 }
1163 return buffer[namelen] == '\0';
1164}
1165
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001166static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001167find_syllable(const char *str, int *len, int *pos, int count, int column)
1168{
1169 int i, len1;
1170 *len = -1;
1171 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001172 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001173 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001174 if (len1 <= *len)
1175 continue;
1176 if (strncmp(str, s, len1) == 0) {
1177 *len = len1;
1178 *pos = i;
1179 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001180 }
1181 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001182 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001183 }
1184}
1185
Fredrik Lundh06d12682001-01-24 07:59:11 +00001186static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001187_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001188{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001189 /* check if named sequences are allowed */
1190 if (!with_named_seq && IS_NAMED_SEQ(cp))
1191 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001192 /* if the code point is in the PUA range that we use for aliases,
1193 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001194 if (IS_ALIAS(cp))
1195 *code = name_aliases[cp-aliases_start];
1196 else
1197 *code = cp;
1198 return 1;
1199}
1200
1201static int
1202_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1203 int with_named_seq)
1204{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001205 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001206 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001207 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001208 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001209 unsigned int h, v;
1210 unsigned int mask = code_size-1;
1211 unsigned int i, incr;
1212
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001213 /* Check for hangul syllables. */
1214 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 int len, L = -1, V = -1, T = -1;
1216 const char *pos = name + 16;
1217 find_syllable(pos, &len, &L, LCount, 0);
1218 pos += len;
1219 find_syllable(pos, &len, &V, VCount, 1);
1220 pos += len;
1221 find_syllable(pos, &len, &T, TCount, 2);
1222 pos += len;
1223 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1224 *code = SBase + (L*VCount+V)*TCount + T;
1225 return 1;
1226 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001227 /* Otherwise, it's an illegal syllable name. */
1228 return 0;
1229 }
1230
1231 /* Check for unified ideographs. */
1232 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1233 /* Four or five hexdigits must follow. */
1234 v = 0;
1235 name += 22;
1236 namelen -= 22;
1237 if (namelen != 4 && namelen != 5)
1238 return 0;
1239 while (namelen--) {
1240 v *= 16;
1241 if (*name >= '0' && *name <= '9')
1242 v += *name - '0';
1243 else if (*name >= 'A' && *name <= 'F')
1244 v += *name - 'A' + 10;
1245 else
1246 return 0;
1247 name++;
1248 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001249 if (!is_unified_ideograph(v))
1250 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001251 *code = v;
1252 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001253 }
1254
Fredrik Lundh06d12682001-01-24 07:59:11 +00001255 /* the following is the same as python's dictionary lookup, with
1256 only minor changes. see the makeunicodedata script for more
1257 details */
1258
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001259 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001260 i = (~h) & mask;
1261 v = code_hash[i];
1262 if (!v)
1263 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001264 if (_cmpname(self, v, name, namelen))
1265 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001266 incr = (h ^ (h >> 3)) & mask;
1267 if (!incr)
1268 incr = mask;
1269 for (;;) {
1270 i = (i + incr) & mask;
1271 v = code_hash[i];
1272 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001273 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001274 if (_cmpname(self, v, name, namelen))
1275 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001276 incr = incr << 1;
1277 if (incr > mask)
1278 incr = incr ^ code_poly;
1279 }
1280}
1281
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001283{
1284 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001285 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001286 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001287};
1288
1289/* -------------------------------------------------------------------- */
1290/* Python bindings */
1291
Serhiy Storchaka63596412015-04-17 21:18:49 +03001292/*[clinic input]
1293unicodedata.UCD.name
1294
1295 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001296 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001297 default: object=NULL
1298 /
1299
1300Returns the name assigned to the character chr as a string.
1301
1302If no name is defined, default is returned, or, if not given,
1303ValueError is raised.
1304[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001305
Fredrik Lundh06d12682001-01-24 07:59:11 +00001306static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001307unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001308/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001309{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001310 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001311 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001312
Christian Heimes2f366ca2016-09-23 20:20:27 +02001313 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001314 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001316 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 }
1318 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001319 Py_INCREF(default_value);
1320 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001321 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001322 }
1323
Walter Dörwald4254e762007-06-05 16:04:09 +00001324 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001325}
1326
Serhiy Storchaka63596412015-04-17 21:18:49 +03001327/*[clinic input]
1328unicodedata.UCD.lookup
1329
1330 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001331 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001332 /
1333
1334Look up character by name.
1335
1336If a character with the given name is found, return the
1337corresponding character. If not found, KeyError is raised.
1338[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001339
Fredrik Lundh06d12682001-01-24 07:59:11 +00001340static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001341unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1342 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001343/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001344{
1345 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001346 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001347 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001348 PyErr_SetString(PyExc_KeyError, "name too long");
1349 return NULL;
1350 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001351
Serhiy Storchaka63596412015-04-17 21:18:49 +03001352 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001353 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001354 return NULL;
1355 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001356 /* check if code is in the PUA range that we use for named sequences
1357 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001358 if (IS_NAMED_SEQ(code)) {
1359 index = code-named_sequences_start;
1360 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1361 named_sequences[index].seq,
1362 named_sequences[index].seqlen);
1363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001365}
1366
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001367/* XXX Add doc strings. */
1368
1369static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001370 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001371 UNICODEDATA_UCD_DIGIT_METHODDEF
1372 UNICODEDATA_UCD_NUMERIC_METHODDEF
1373 UNICODEDATA_UCD_CATEGORY_METHODDEF
1374 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1375 UNICODEDATA_UCD_COMBINING_METHODDEF
1376 UNICODEDATA_UCD_MIRRORED_METHODDEF
1377 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1378 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1379 UNICODEDATA_UCD_NAME_METHODDEF
1380 UNICODEDATA_UCD_LOOKUP_METHODDEF
Max Bélanger2810dd72018-11-04 15:58:24 -08001381 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001382 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001383 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001384};
1385
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001386static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 /* The ob_type field must be initialized in the module init function
1388 * to be portable to Windows without using C++. */
1389 PyVarObject_HEAD_INIT(NULL, 0)
1390 "unicodedata.UCD", /*tp_name*/
1391 sizeof(PreviousDBVersion), /*tp_basicsize*/
1392 0, /*tp_itemsize*/
1393 /* methods */
1394 (destructor)PyObject_Del, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02001395 0, /*tp_vectorcall_offset*/
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001396 0, /*tp_getattr*/
1397 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02001398 0, /*tp_as_async*/
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001399 0, /*tp_repr*/
1400 0, /*tp_as_number*/
1401 0, /*tp_as_sequence*/
1402 0, /*tp_as_mapping*/
1403 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001404 0, /*tp_call*/
1405 0, /*tp_str*/
1406 PyObject_GenericGetAttr,/*tp_getattro*/
1407 0, /*tp_setattro*/
1408 0, /*tp_as_buffer*/
1409 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1410 0, /*tp_doc*/
1411 0, /*tp_traverse*/
1412 0, /*tp_clear*/
1413 0, /*tp_richcompare*/
1414 0, /*tp_weaklistoffset*/
1415 0, /*tp_iter*/
1416 0, /*tp_iternext*/
1417 unicodedata_functions, /*tp_methods*/
1418 DB_members, /*tp_members*/
1419 0, /*tp_getset*/
1420 0, /*tp_base*/
1421 0, /*tp_dict*/
1422 0, /*tp_descr_get*/
1423 0, /*tp_descr_set*/
1424 0, /*tp_dictoffset*/
1425 0, /*tp_init*/
1426 0, /*tp_alloc*/
1427 0, /*tp_new*/
1428 0, /*tp_free*/
1429 0, /*tp_is_gc*/
1430};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001431
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001432PyDoc_STRVAR(unicodedata_docstring,
1433"This module provides access to the Unicode Character Database which\n\
1434defines character properties for all Unicode characters. The data in\n\
1435this database is based on the UnicodeData.txt file version\n\
luzpaza5293b42017-11-05 07:37:50 -06001436" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001437\n\
1438The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001439UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001440
1441static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001442 PyModuleDef_HEAD_INIT,
1443 "unicodedata",
1444 unicodedata_docstring,
1445 -1,
1446 unicodedata_functions,
1447 NULL,
1448 NULL,
1449 NULL,
1450 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001451};
1452
Mark Hammond62b1ab12002-07-23 06:31:15 +00001453PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001454PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001455{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001456 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001457
Victor Stinnerd2ec81a2020-02-07 09:17:07 +01001458 Py_SET_TYPE(&UCD_Type, &PyType_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001459
Martin v. Löwis1a214512008-06-11 05:26:20 +00001460 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001461 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001462 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001463
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001464 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001465 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001466 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001467
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001468 /* Previous versions */
1469 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1470 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001471 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001472
Fredrik Lundh06d12682001-01-24 07:59:11 +00001473 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001474 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001475 if (v != NULL)
1476 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001477 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001478}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001479
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001480/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001481Local variables:
1482c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001483indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001484End:
1485*/