blob: 8e11cfc4dafa92338470b685c08c9eed7e3dca68 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Victor Stinner4a21e572020-04-15 02:35:41 +020020#include "structmember.h" // PyMemberDef
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Greg Price2f094132019-09-03 19:45:44 -070022#include <stdbool.h>
23
/* Static identifier objects for the four normalization form names.
   NOTE(review): presumably consumed by the normalize()/is_normalized()
   code later in the file -- not visible from this section. */
_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
_Py_IDENTIFIER(NFKD);
28
Larry Hastings61272b72014-01-07 12:41:53 -080029/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080030module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080031class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080032[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080033/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080034
Fredrik Lundh06d12682001-01-24 07:59:11 +000035/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000036
/* One row of per-character properties; rows are looked up through
   _getrecord_ex().  The field order must not change: the rows are
   emitted positionally by Tools/unicode/makeunicodedata.py. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
48
/* Per-character differences between the current database and an older
   Unicode version.  For the unsigned char fields the callers in this file
   treat 0xFF as "no change"; category_changed == 0 is treated as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    /* not read by the code visible in this part of the file */
    const double numeric_changed;
} change_record;
58
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059/* data file generated by Tools/unicode/makeunicodedata.py */
60#include "unicodedata_db.h"
61
62static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000063_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000065 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000066 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000067 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74}
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076/* ------------- Previous-version API ------------------------------------- */
77typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82} PreviousDBVersion;
83
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030084#include "clinic/unicodedata.c.h"
85
/* Look up the change record of code point `v` in the old database `self`
   (which must point to a PreviousDBVersion).  Both arguments are
   parenthesized so that compound expressions expand correctly. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)((v)))
87
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090 {NULL}
91};
92
/* forward declaration: the type object is referenced by the functions
   below before its definition */
static PyTypeObject UCD_Type;
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -050095
// Check if self is an instance of ucd_type.
// Return 0 if self is NULL (when the PyCapsule C API is used).
// Arguments are parenthesized so compound expressions expand correctly.
#define UCD_Check(self, ucd_type) \
    ((self) != NULL && Py_IS_TYPE((self), (ucd_type)))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000099
100static PyObject*
101new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
102 Py_UCS4 (*normalization)(Py_UCS4))
103{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000104 PreviousDBVersion *self;
105 self = PyObject_New(PreviousDBVersion, &UCD_Type);
106 if (self == NULL)
107 return NULL;
108 self->name = name;
109 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000110 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000111 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000112}
113
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000114
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000115/* --- Module API --------------------------------------------------------- */
116
Larry Hastings61272b72014-01-07 12:41:53 -0800117/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800118unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700119
Serhiy Storchaka63596412015-04-17 21:18:49 +0300120 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700121 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700122 default: object=NULL
123 /
124
125Converts a Unicode character into its equivalent decimal value.
126
Serhiy Storchaka63596412015-04-17 21:18:49 +0300127Returns the decimal value assigned to the character chr as integer.
128If no such value is defined, default is returned, or, if not given,
129ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800130[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700131
Larry Hastings31826802013-10-19 00:09:25 -0700132static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300133unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400134 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700135/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700136{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000137 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000138 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300139 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000140
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500141 if (UCD_Check(self, &UCD_Type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000142 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000143 if (old->category_changed == 0) {
144 /* unassigned */
145 have_old = 1;
146 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000148 else if (old->decimal_changed != 0xFF) {
149 have_old = 1;
150 rc = old->decimal_changed;
151 }
152 }
153
154 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000155 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700157 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 PyErr_SetString(PyExc_ValueError,
159 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000160 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 }
162 else {
Larry Hastings31826802013-10-19 00:09:25 -0700163 Py_INCREF(default_value);
164 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000166 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000167 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168}
169
Serhiy Storchaka63596412015-04-17 21:18:49 +0300170/*[clinic input]
171unicodedata.UCD.digit
172
173 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700174 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300175 default: object=NULL
176 /
177
178Converts a Unicode character into its equivalent digit value.
179
180Returns the digit value assigned to the character chr as integer.
181If no such value is defined, default is returned, or, if not given,
182ValueError is raised.
183[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000184
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300186unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700187/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000188{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300190 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000191 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300193 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000195 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 }
197 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300198 Py_INCREF(default_value);
199 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000202 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203}
204
Serhiy Storchaka63596412015-04-17 21:18:49 +0300205/*[clinic input]
206unicodedata.UCD.numeric
207
208 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700209 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300210 default: object=NULL
211 /
212
213Converts a Unicode character into its equivalent numeric value.
214
215Returns the numeric value assigned to the character chr as float.
216If no such value is defined, default is returned, or, if not given,
217ValueError is raised.
218[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000219
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000220static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300221unicodedata_UCD_numeric_impl(PyObject *self, int chr,
222 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700223/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000224{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000226 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300227 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000228
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500229 if (UCD_Check(self, &UCD_Type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000231 if (old->category_changed == 0) {
232 /* unassigned */
233 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000236 else if (old->decimal_changed != 0xFF) {
237 have_old = 1;
238 rc = old->decimal_changed;
239 }
240 }
241
242 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000243 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000244 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300245 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000246 PyErr_SetString(PyExc_ValueError, "not a numeric character");
247 return NULL;
248 }
249 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300250 Py_INCREF(default_value);
251 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253 }
254 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000255}
256
Serhiy Storchaka63596412015-04-17 21:18:49 +0300257/*[clinic input]
258unicodedata.UCD.category
259
260 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700261 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300262 /
263
264Returns the general category assigned to the character chr as string.
265[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000266
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000267static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300268unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700269/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300272 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000273 index = (int) _getrecord_ex(c)->category;
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500274 if (UCD_Check(self, &UCD_Type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000275 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000276 if (old->category_changed != 0xFF)
277 index = old->category_changed;
278 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000279 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280}
281
Serhiy Storchaka63596412015-04-17 21:18:49 +0300282/*[clinic input]
283unicodedata.UCD.bidirectional
284
285 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700286 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300287 /
288
289Returns the bidirectional class assigned to the character chr as string.
290
291If no such value is defined, an empty string is returned.
292[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000293
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300295unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700296/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000298 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300299 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000300 index = (int) _getrecord_ex(c)->bidirectional;
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500301 if (UCD_Check(self, &UCD_Type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000302 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000303 if (old->category_changed == 0)
304 index = 0; /* unassigned */
305 else if (old->bidir_changed != 0xFF)
306 index = old->bidir_changed;
307 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000308 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000309}
310
Serhiy Storchaka63596412015-04-17 21:18:49 +0300311/*[clinic input]
312unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000313
Serhiy Storchaka63596412015-04-17 21:18:49 +0300314 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700315 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300316 /
317
318Returns the canonical combining class assigned to the character chr as integer.
319
320Returns 0 if no combining class is defined.
321[clinic start generated code]*/
322
323static int
324unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700325/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300328 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000329 index = (int) _getrecord_ex(c)->combining;
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500330 if (UCD_Check(self, &UCD_Type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000331 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000332 if (old->category_changed == 0)
333 index = 0; /* unassigned */
334 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300335 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000336}
337
Serhiy Storchaka63596412015-04-17 21:18:49 +0300338/*[clinic input]
339unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000340
Serhiy Storchaka63596412015-04-17 21:18:49 +0300341 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700342 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300343 /
344
345Returns the mirrored property assigned to the character chr as integer.
346
347Returns 1 if the character has been identified as a "mirrored"
348character in bidirectional text, 0 otherwise.
349[clinic start generated code]*/
350
351static int
352unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700353/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000354{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300356 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000357 index = (int) _getrecord_ex(c)->mirrored;
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500358 if (UCD_Check(self, &UCD_Type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000359 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000360 if (old->category_changed == 0)
361 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000362 else if (old->mirrored_changed != 0xFF)
363 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000364 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300365 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000366}
367
Serhiy Storchaka63596412015-04-17 21:18:49 +0300368/*[clinic input]
369unicodedata.UCD.east_asian_width
370
371 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700372 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300373 /
374
375Returns the east asian width assigned to the character chr as string.
376[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000377
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000378static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300379unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700380/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000381{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000382 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300383 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000384 index = (int) _getrecord_ex(c)->east_asian_width;
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500385 if (UCD_Check(self, &UCD_Type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000386 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000387 if (old->category_changed == 0)
388 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700389 else if (old->east_asian_width_changed != 0xFF)
390 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000391 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000392 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000393}
394
Serhiy Storchaka63596412015-04-17 21:18:49 +0300395/*[clinic input]
396unicodedata.UCD.decomposition
397
398 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700399 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300400 /
401
402Returns the character decomposition mapping assigned to the character chr as string.
403
404An empty string is returned in case no such mapping is defined.
405[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000406
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000407static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300408unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700409/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000410{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000411 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000412 int code, index, count;
413 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000414 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300415 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000417 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000418
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500419 if (UCD_Check(self, &UCD_Type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000420 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000422 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000423 }
424
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000425 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000426 index = 0;
427 else {
428 index = decomp_index1[(code>>DECOMP_SHIFT)];
429 index = decomp_index2[(index<<DECOMP_SHIFT)+
430 (code&((1<<DECOMP_SHIFT)-1))];
431 }
432
Tim Peters69b83b12001-11-30 07:23:05 +0000433 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000434 is prefix code (from*/
435 count = decomp_data[index] >> 8;
436
437 /* XXX: could allocate the PyString up front instead
438 (strlen(prefix) + 5 * count + 1 bytes) */
439
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000440 /* Based on how index is calculated above and decomp_data is generated
441 from Tools/unicode/makeunicodedata.py, it should not be possible
442 to overflow decomp_prefix. */
443 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200444 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000445
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000447 i = strlen(decomp_prefix[prefix_index]);
448 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000449
450 while (count-- > 0) {
451 if (i)
452 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000453 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000454 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
455 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000456 i += strlen(decomp + i);
457 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000458 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000459}
460
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000461static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000462get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000463{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000464 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000465 *index = 0;
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500466 } else if (UCD_Check(self, &UCD_Type) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000467 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000468 /* unassigned in old version */
469 *index = 0;
470 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000471 else {
472 *index = decomp_index1[(code>>DECOMP_SHIFT)];
473 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
474 (code&((1<<DECOMP_SHIFT)-1))];
475 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000476
Martin v. Löwis677bde22002-11-23 22:08:15 +0000477 /* high byte is number of hex bytes (usually one or two), low byte
478 is prefix code (from*/
479 *count = decomp_data[*index] >> 8;
480 *prefix = decomp_data[*index] & 255;
481
482 (*index)++;
483}
484
/* Constants for the arithmetic Hangul syllable decomposition used by
   nfd_nfkd(): the first code point of each range and the number of
   code points per component. */
#define SBase   0xAC00              /* first syllable */
#define LBase   0x1100              /* first L component */
#define VBase   0x1161              /* first V component */
#define TBase   0x11A7              /* T base; T == TBase means "no T" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per L value */
#define SCount  (LCount*NCount)     /* total number of syllables */
494
495static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000496nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000497{
498 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200499 Py_UCS4 *output;
500 Py_ssize_t i, o, osize;
501 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300502 const void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000503 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200504 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000505 Py_ssize_t space, isize;
506 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000508
Martin v. Löwis677bde22002-11-23 22:08:15 +0000509 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200510 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500511 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300512 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500513 if (space > 10) {
514 if (space <= PY_SSIZE_T_MAX - 10)
515 space += 10;
516 }
517 else {
518 space *= 2;
519 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200520 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500521 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200522 if (!output) {
523 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000524 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200525 }
526 i = o = 0;
527 kind = PyUnicode_KIND(input);
528 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000529
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 while (i < isize) {
531 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000532 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200533 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000534 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300535 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000536 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000537 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200538 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000539 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000540 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
541 if (new_output == NULL) {
542 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200543 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000544 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200545 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000546 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000547 }
548 /* Hangul Decomposition. */
549 if (SBase <= code && code < (SBase+SCount)) {
550 int SIndex = code - SBase;
551 int L = LBase + SIndex / NCount;
552 int V = VBase + (SIndex % NCount) / TCount;
553 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200554 output[o++] = L;
555 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000556 space -= 2;
557 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200558 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000559 space --;
560 }
561 continue;
562 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000563 /* normalization changes */
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500564 if (UCD_Check(self, &UCD_Type)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000565 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
566 if (value != 0) {
567 stack[stackptr++] = value;
568 continue;
569 }
570 }
571
572 /* Other decompositions. */
573 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000574
575 /* Copy character if it is not decomposable, or has a
576 compatibility decomposition, but we do NFD. */
577 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200578 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000579 space--;
580 continue;
581 }
582 /* Copy decomposition onto the stack, in reverse
583 order. */
584 while(count) {
585 code = decomp_data[index + (--count)];
586 stack[stackptr++] = code;
587 }
588 }
589 }
590
Martin v. Löwis22970662011-09-29 13:39:38 +0200591 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
592 output, o);
593 PyMem_Free(output);
594 if (!result)
595 return NULL;
596 /* result is guaranteed to be ready, as it is compact. */
597 kind = PyUnicode_KIND(result);
598 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000599
600 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200601 i = 0;
602 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
603 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
604 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000605 if (prev == 0 || cur == 0 || prev <= cur) {
606 prev = cur;
607 continue;
608 }
609 /* Non-canonical order. Need to switch *i with previous. */
610 o = i - 1;
611 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200612 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
613 PyUnicode_WRITE(kind, data, o+1,
614 PyUnicode_READ(kind, data, o));
615 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000616 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200617 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000618 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200619 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620 if (prev == 0 || prev <= cur)
621 break;
622 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200623 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000624 }
625 return result;
626}
627
628static int
Andy Lester982307b2020-03-17 11:38:12 -0500629find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000630{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200631 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000632 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200633 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634 if (code < start)
635 return -1;
636 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200637 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000638 return nfc[index].index + delta;
639 }
640 }
641 return -1;
642}
643
644static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000645nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646{
647 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200648 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300649 const void *data;
Martin v. Löwis22970662011-09-29 13:39:38 +0200650 Py_UCS4 *output;
651 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200653 Py_UCS4 code;
654 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000655 int cskipped = 0;
656
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000657 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000658 if (!result)
659 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200660 /* result will be "ready". */
661 kind = PyUnicode_KIND(result);
662 data = PyUnicode_DATA(result);
663 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000664
Martin v. Löwis22970662011-09-29 13:39:38 +0200665 /* We allocate a buffer for the output.
666 If we find that we made no changes, we still return
667 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500668 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200669 if (!output) {
670 PyErr_NoMemory();
671 Py_DECREF(result);
672 return 0;
673 }
674 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000675
Martin v. Löwis677bde22002-11-23 22:08:15 +0000676 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200677 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000678 for (index = 0; index < cskipped; index++) {
679 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000680 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000681 Remove from list. */
682 skipped[index] = skipped[cskipped-1];
683 cskipped--;
684 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000685 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000686 }
687 }
688 /* Hangul Composition. We don't need to check for <LV,T>
689 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200690 code = PyUnicode_READ(kind, data, i);
691 if (LBase <= code && code < (LBase+LCount) &&
692 i + 1 < len &&
693 VBase <= PyUnicode_READ(kind, data, i+1) &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900694 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
695 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
696 and V character is a modern vowel (0x1161 ~ 0x1175). */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200698 LIndex = code - LBase;
699 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000700 code = SBase + (LIndex*VCount+VIndex)*TCount;
701 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200702 if (i < len &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900703 TBase < PyUnicode_READ(kind, data, i) &&
704 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
705 /* check T character is a modern trailing consonant
706 (0x11A8 ~ 0x11C2). */
Martin v. Löwis22970662011-09-29 13:39:38 +0200707 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000708 i++;
709 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200710 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000711 continue;
712 }
713
Martin v. Löwis22970662011-09-29 13:39:38 +0200714 /* code is still input[i] here */
Andy Lester982307b2020-03-17 11:38:12 -0500715 f = find_nfc_index(nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000716 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200717 output[o++] = code;
718 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000719 continue;
720 }
721 /* Find next unblocked character. */
722 i1 = i+1;
723 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200724 /* output base character for now; might be updated later. */
725 output[o] = PyUnicode_READ(kind, data, i);
726 while (i1 < len) {
727 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
728 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000729 if (comb) {
730 if (comb1 == 0)
731 break;
732 if (comb >= comb1) {
733 /* Character is blocked. */
734 i1++;
735 continue;
736 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000737 }
Andy Lester982307b2020-03-17 11:38:12 -0500738 l = find_nfc_index(nfc_last, code1);
Martin v. Löwis22970662011-09-29 13:39:38 +0200739 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000740 is a starter, we don't need to look further.
741 Otherwise, record the combining class. */
742 if (l == -1) {
743 not_combinable:
744 if (comb1 == 0)
745 break;
746 comb = comb1;
747 i1++;
748 continue;
749 }
750 index = f*TOTAL_LAST + l;
751 index1 = comp_index[index >> COMP_SHIFT];
752 code = comp_data[(index1<<COMP_SHIFT)+
753 (index&((1<<COMP_SHIFT)-1))];
754 if (code == 0)
755 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756
Martin v. Löwis677bde22002-11-23 22:08:15 +0000757 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200758 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000759 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000760 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000761 skipped[cskipped++] = i1;
762 i1++;
Andy Lester982307b2020-03-17 11:38:12 -0500763 f = find_nfc_index(nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000764 if (f == -1)
765 break;
766 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200767 /* Output character was already written.
768 Just advance the indices. */
769 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000770 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200771 if (o == len) {
772 /* No changes. Return original string. */
773 PyMem_Free(output);
774 return result;
775 }
776 Py_DECREF(result);
777 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
778 output, o);
779 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000780 return result;
781}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000782
/* Result of a normalization quick check.
   The numeric values have to stay in sync with makeunicodedata.py,
   which builds the quickcheck data. */
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
Max Bélanger2810dd72018-11-04 15:58:24 -0800786
Greg Price2f094132019-09-03 19:45:44 -0700787/* Run the Unicode normalization "quickcheck" algorithm.
788 *
789 * Return YES or NO if quickcheck determines the input is certainly
790 * normalized or certainly not, and MAYBE if quickcheck is unable to
791 * tell.
792 *
793 * If `yes_only` is true, then return MAYBE as soon as we determine
794 * the answer is not YES.
795 *
796 * For background and details on the algorithm, see UAX #15:
797 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
798 */
799static QuickcheckResult
800is_normalized_quickcheck(PyObject *self, PyObject *input,
Greg Price7669cb82019-09-09 02:16:31 -0700801 bool nfc, bool k, bool yes_only)
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000802{
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000803 /* An older version of the database is requested, quickchecks must be
804 disabled. */
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -0500805 if (UCD_Check(self, &UCD_Type))
Max Bélanger2810dd72018-11-04 15:58:24 -0800806 return NO;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000807
Greg Price2f094132019-09-03 19:45:44 -0700808 Py_ssize_t i, len;
809 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300810 const void *data;
Greg Price2f094132019-09-03 19:45:44 -0700811 unsigned char prev_combining = 0;
812
813 /* The two quickcheck bits at this shift have type QuickcheckResult. */
814 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
815
816 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000817
Martin v. Löwis22970662011-09-29 13:39:38 +0200818 i = 0;
819 kind = PyUnicode_KIND(input);
820 data = PyUnicode_DATA(input);
821 len = PyUnicode_GET_LENGTH(input);
822 while (i < len) {
823 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
824 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000825
Greg Price2f094132019-09-03 19:45:44 -0700826 unsigned char combining = record->combining;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000827 if (combining && prev_combining > combining)
Max Bélanger2810dd72018-11-04 15:58:24 -0800828 return NO; /* non-canonical sort order, not normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000829 prev_combining = combining;
Greg Price2f094132019-09-03 19:45:44 -0700830
831 unsigned char quickcheck_whole = record->normalization_quick_check;
832 if (yes_only) {
833 if (quickcheck_whole & (3 << quickcheck_shift))
834 return MAYBE;
835 } else {
836 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
837 case NO:
838 return NO;
839 case MAYBE:
840 result = MAYBE; /* this string might need normalization */
841 }
842 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000843 }
Greg Price2f094132019-09-03 19:45:44 -0700844 return result;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000845}
846
Serhiy Storchaka63596412015-04-17 21:18:49 +0300847/*[clinic input]
Max Bélanger2810dd72018-11-04 15:58:24 -0800848unicodedata.UCD.is_normalized
849
850 self: self
851 form: unicode
852 unistr as input: unicode
853 /
854
855Return whether the Unicode string unistr is in the normal form 'form'.
856
857Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
858[clinic start generated code]*/
859
860static PyObject *
861unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
862 PyObject *input)
863/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
864{
865 if (PyUnicode_READY(input) == -1) {
866 return NULL;
867 }
868
869 if (PyUnicode_GET_LENGTH(input) == 0) {
870 /* special case empty input strings. */
871 Py_RETURN_TRUE;
872 }
873
874 PyObject *result;
Greg Price7669cb82019-09-09 02:16:31 -0700875 bool nfc = false;
876 bool k = false;
Greg Price2f094132019-09-03 19:45:44 -0700877 QuickcheckResult m;
Max Bélanger2810dd72018-11-04 15:58:24 -0800878
879 PyObject *cmp;
880 int match = 0;
881
882 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700883 nfc = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800884 }
885 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700886 nfc = true;
887 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800888 }
889 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
890 /* matches default values for `nfc` and `k` */
891 }
892 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700893 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800894 }
895 else {
896 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
897 return NULL;
898 }
899
Greg Price2f094132019-09-03 19:45:44 -0700900 m = is_normalized_quickcheck(self, input, nfc, k, false);
Max Bélanger2810dd72018-11-04 15:58:24 -0800901
902 if (m == MAYBE) {
903 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
904 if (cmp == NULL) {
905 return NULL;
906 }
907 match = PyUnicode_Compare(input, cmp);
908 Py_DECREF(cmp);
909 result = (match == 0) ? Py_True : Py_False;
910 }
911 else {
912 result = (m == YES) ? Py_True : Py_False;
913 }
914
915 Py_INCREF(result);
916 return result;
917}
918
919
920/*[clinic input]
Serhiy Storchaka63596412015-04-17 21:18:49 +0300921unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000922
Serhiy Storchaka63596412015-04-17 21:18:49 +0300923 self: self
Max Bélanger2810dd72018-11-04 15:58:24 -0800924 form: unicode
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300925 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300926 /
927
928Return the normal form 'form' for the Unicode string unistr.
929
930Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
931[clinic start generated code]*/
932
933static PyObject *
Max Bélanger2810dd72018-11-04 15:58:24 -0800934unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
Serhiy Storchaka63596412015-04-17 21:18:49 +0300935 PyObject *input)
Max Bélanger2810dd72018-11-04 15:58:24 -0800936/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000937{
Martin v. Löwis22970662011-09-29 13:39:38 +0200938 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000939 /* Special case empty input strings, since resizing
940 them later would cause internal errors. */
941 Py_INCREF(input);
942 return input;
943 }
944
Max Bélanger2810dd72018-11-04 15:58:24 -0800945 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700946 if (is_normalized_quickcheck(self, input, true, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000947 Py_INCREF(input);
948 return input;
949 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000950 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000951 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800952 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700953 if (is_normalized_quickcheck(self, input, true, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000954 Py_INCREF(input);
955 return input;
956 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000957 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000958 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800959 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700960 if (is_normalized_quickcheck(self, input, false, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000961 Py_INCREF(input);
962 return input;
963 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000964 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000965 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800966 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700967 if (is_normalized_quickcheck(self, input, false, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000968 Py_INCREF(input);
969 return input;
970 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000971 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000972 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000973 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
974 return NULL;
975}
976
Fredrik Lundh06d12682001-01-24 07:59:11 +0000977/* -------------------------------------------------------------------- */
978/* unicode character name tables */
979
980/* data file generated by Tools/unicode/makeunicodedata.py */
981#include "unicodename_db.h"
982
983/* -------------------------------------------------------------------- */
984/* database code (cut and pasted from the unidb package) */
985
/* Hash the first `len` characters of `s`, ignoring case.

   Every character is upper-cased and mixed in as h*scale + ch.  Whenever
   bits 24..31 of the running value are set, that byte is XORed into the
   low bits and the value is cut back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long h = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long top;

        h = (h * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        top = h & 0xff000000;
        if (top != 0) {
            h = (h ^ ((top>>24) & 0xff)) & 0x00ffffff;
        }
        pos++;
    }
    return h;
}
1000
/* Short names of the Hangul jamo used to build and parse
   "HANGUL SYLLABLE ..." names.  Column 0 is indexed by the leading
   consonant (L), column 1 by the vowel (V) and column 2 by the trailing
   consonant (T).  The columns have different lengths; rows past the end
   of a column hold NULL. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
1031
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001032/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001033static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001034is_unified_ideograph(Py_UCS4 code)
1035{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001036 return
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001037 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1038 (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */
1039 (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001040 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -05001041 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -07001042 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001043 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1044 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001045}
1046
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that passing an expression (for
 * example `v & mask`) is compared as a whole; it is evaluated twice, so
 * it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1052
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001053static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001054_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1055 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001056{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001057 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001058 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1059 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001060 int offset;
1061 int i;
1062 int word;
Inada Naoki6fec9052019-04-17 08:40:34 +09001063 const unsigned char* w;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001064
Martin v. Löwisc3509122006-03-11 12:16:23 +00001065 if (code >= 0x110000)
1066 return 0;
1067
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001068 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001069 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1070 return 0;
1071
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -05001072 if (UCD_Check(self, &UCD_Type)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001073 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001074 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001075 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1076 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001077 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001078 if (old->category_changed == 0) {
1079 /* unassigned */
1080 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001081 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001082 }
1083
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001084 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 /* Hangul syllable. */
1086 int SIndex = code - SBase;
1087 int L = SIndex / NCount;
1088 int V = (SIndex % NCount) / TCount;
1089 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001090
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 if (buflen < 27)
1092 /* Worst case: HANGUL SYLLABLE <10chars>. */
1093 return 0;
1094 strcpy(buffer, "HANGUL SYLLABLE ");
1095 buffer += 16;
1096 strcpy(buffer, hangul_syllables[L][0]);
1097 buffer += strlen(hangul_syllables[L][0]);
1098 strcpy(buffer, hangul_syllables[V][1]);
1099 buffer += strlen(hangul_syllables[V][1]);
1100 strcpy(buffer, hangul_syllables[T][2]);
1101 buffer += strlen(hangul_syllables[T][2]);
1102 *buffer = '\0';
1103 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001104 }
1105
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001106 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001107 if (buflen < 28)
1108 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1109 return 0;
1110 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1111 return 1;
1112 }
1113
Fredrik Lundh06d12682001-01-24 07:59:11 +00001114 /* get offset into phrasebook */
1115 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1116 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1117 (code&((1<<phrasebook_shift)-1))];
1118 if (!offset)
1119 return 0;
1120
1121 i = 0;
1122
1123 for (;;) {
1124 /* get word index */
1125 word = phrasebook[offset] - phrasebook_short;
1126 if (word >= 0) {
1127 word = (word << 8) + phrasebook[offset+1];
1128 offset += 2;
1129 } else
1130 word = phrasebook[offset++];
1131 if (i) {
1132 if (i > buflen)
1133 return 0; /* buffer overflow */
1134 buffer[i++] = ' ';
1135 }
1136 /* copy word string from lexicon. the last character in the
1137 word has bit 7 set. the last word in a string ends with
1138 0x80 */
1139 w = lexicon + lexicon_offset[word];
1140 while (*w < 128) {
1141 if (i >= buflen)
1142 return 0; /* buffer overflow */
1143 buffer[i++] = *w++;
1144 }
1145 if (i >= buflen)
1146 return 0; /* buffer overflow */
1147 buffer[i++] = *w & 127;
1148 if (*w == 128)
1149 break; /* end of word */
1150 }
1151
1152 return 1;
1153}
1154
1155static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001156_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001157{
1158 /* check if code corresponds to the given name */
1159 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001160 char buffer[NAME_MAXLEN+1];
1161 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001162 return 0;
1163 for (i = 0; i < namelen; i++) {
Jordon Xu2ec70102019-09-11 00:04:08 +08001164 if (Py_TOUPPER(name[i]) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001165 return 0;
1166 }
1167 return buffer[namelen] == '\0';
1168}
1169
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001171find_syllable(const char *str, int *len, int *pos, int count, int column)
1172{
1173 int i, len1;
1174 *len = -1;
1175 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001176 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001177 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001178 if (len1 <= *len)
1179 continue;
1180 if (strncmp(str, s, len1) == 0) {
1181 *len = len1;
1182 *pos = i;
1183 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001184 }
1185 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001187 }
1188}
1189
Fredrik Lundh06d12682001-01-24 07:59:11 +00001190static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001191_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001192{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001193 /* check if named sequences are allowed */
1194 if (!with_named_seq && IS_NAMED_SEQ(cp))
1195 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001196 /* if the code point is in the PUA range that we use for aliases,
1197 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001198 if (IS_ALIAS(cp))
1199 *code = name_aliases[cp-aliases_start];
1200 else
1201 *code = cp;
1202 return 1;
1203}
1204
1205static int
1206_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1207 int with_named_seq)
1208{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001209 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001210 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001211 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001212 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001213 unsigned int h, v;
1214 unsigned int mask = code_size-1;
1215 unsigned int i, incr;
1216
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001217 /* Check for hangul syllables. */
1218 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 int len, L = -1, V = -1, T = -1;
1220 const char *pos = name + 16;
1221 find_syllable(pos, &len, &L, LCount, 0);
1222 pos += len;
1223 find_syllable(pos, &len, &V, VCount, 1);
1224 pos += len;
1225 find_syllable(pos, &len, &T, TCount, 2);
1226 pos += len;
1227 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1228 *code = SBase + (L*VCount+V)*TCount + T;
1229 return 1;
1230 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001231 /* Otherwise, it's an illegal syllable name. */
1232 return 0;
1233 }
1234
1235 /* Check for unified ideographs. */
1236 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1237 /* Four or five hexdigits must follow. */
1238 v = 0;
1239 name += 22;
1240 namelen -= 22;
1241 if (namelen != 4 && namelen != 5)
1242 return 0;
1243 while (namelen--) {
1244 v *= 16;
1245 if (*name >= '0' && *name <= '9')
1246 v += *name - '0';
1247 else if (*name >= 'A' && *name <= 'F')
1248 v += *name - 'A' + 10;
1249 else
1250 return 0;
1251 name++;
1252 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001253 if (!is_unified_ideograph(v))
1254 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001255 *code = v;
1256 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001257 }
1258
Fredrik Lundh06d12682001-01-24 07:59:11 +00001259 /* the following is the same as python's dictionary lookup, with
1260 only minor changes. see the makeunicodedata script for more
1261 details */
1262
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001263 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001264 i = (~h) & mask;
1265 v = code_hash[i];
1266 if (!v)
1267 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001268 if (_cmpname(self, v, name, namelen))
1269 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001270 incr = (h ^ (h >> 3)) & mask;
1271 if (!incr)
1272 incr = mask;
1273 for (;;) {
1274 i = (i + incr) & mask;
1275 v = code_hash[i];
1276 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001277 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001278 if (_cmpname(self, v, name, namelen))
1279 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001280 incr = incr << 1;
1281 if (incr > mask)
1282 incr = incr ^ code_poly;
1283 }
1284}
1285
/* Table that bundles the name lookup functions of this file: the size
   of the structure, followed by _getucname (code point to name) and
   _getcode (name to code point). */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1292
1293/* -------------------------------------------------------------------- */
1294/* Python bindings */
1295
Serhiy Storchaka63596412015-04-17 21:18:49 +03001296/*[clinic input]
1297unicodedata.UCD.name
1298
1299 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001300 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001301 default: object=NULL
1302 /
1303
1304Returns the name assigned to the character chr as a string.
1305
1306If no name is defined, default is returned, or, if not given,
1307ValueError is raised.
1308[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001309
Fredrik Lundh06d12682001-01-24 07:59:11 +00001310static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001311unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001312/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001313{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001314 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001315 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001316
Christian Heimes2f366ca2016-09-23 20:20:27 +02001317 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001318 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001320 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001321 }
1322 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001323 Py_INCREF(default_value);
1324 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001325 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001326 }
1327
Walter Dörwald4254e762007-06-05 16:04:09 +00001328 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001329}
1330
Serhiy Storchaka63596412015-04-17 21:18:49 +03001331/*[clinic input]
1332unicodedata.UCD.lookup
1333
1334 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001335 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001336 /
1337
1338Look up character by name.
1339
1340If a character with the given name is found, return the
1341corresponding character. If not found, KeyError is raised.
1342[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001343
Fredrik Lundh06d12682001-01-24 07:59:11 +00001344static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001345unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1346 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001347/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001348{
1349 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001350 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001351 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001352 PyErr_SetString(PyExc_KeyError, "name too long");
1353 return NULL;
1354 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001355
Serhiy Storchaka63596412015-04-17 21:18:49 +03001356 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001357 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001358 return NULL;
1359 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001360 /* check if code is in the PUA range that we use for named sequences
1361 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001362 if (IS_NAMED_SEQ(code)) {
1363 index = code-named_sequences_start;
1364 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1365 named_sequences[index].seq,
1366 named_sequences[index].seqlen);
1367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001369}
1370
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001371/* XXX Add doc strings. */
1372
1373static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001374 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001375 UNICODEDATA_UCD_DIGIT_METHODDEF
1376 UNICODEDATA_UCD_NUMERIC_METHODDEF
1377 UNICODEDATA_UCD_CATEGORY_METHODDEF
1378 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1379 UNICODEDATA_UCD_COMBINING_METHODDEF
1380 UNICODEDATA_UCD_MIRRORED_METHODDEF
1381 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1382 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1383 UNICODEDATA_UCD_NAME_METHODDEF
1384 UNICODEDATA_UCD_LOOKUP_METHODDEF
Max Bélanger2810dd72018-11-04 15:58:24 -08001385 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001386 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001387 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001388};
1389
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001390static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391 /* The ob_type field must be initialized in the module init function
1392 * to be portable to Windows without using C++. */
1393 PyVarObject_HEAD_INIT(NULL, 0)
1394 "unicodedata.UCD", /*tp_name*/
1395 sizeof(PreviousDBVersion), /*tp_basicsize*/
1396 0, /*tp_itemsize*/
1397 /* methods */
1398 (destructor)PyObject_Del, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02001399 0, /*tp_vectorcall_offset*/
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 0, /*tp_getattr*/
1401 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02001402 0, /*tp_as_async*/
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001403 0, /*tp_repr*/
1404 0, /*tp_as_number*/
1405 0, /*tp_as_sequence*/
1406 0, /*tp_as_mapping*/
1407 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001408 0, /*tp_call*/
1409 0, /*tp_str*/
1410 PyObject_GenericGetAttr,/*tp_getattro*/
1411 0, /*tp_setattro*/
1412 0, /*tp_as_buffer*/
1413 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1414 0, /*tp_doc*/
1415 0, /*tp_traverse*/
1416 0, /*tp_clear*/
1417 0, /*tp_richcompare*/
1418 0, /*tp_weaklistoffset*/
1419 0, /*tp_iter*/
1420 0, /*tp_iternext*/
1421 unicodedata_functions, /*tp_methods*/
1422 DB_members, /*tp_members*/
1423 0, /*tp_getset*/
1424 0, /*tp_base*/
1425 0, /*tp_dict*/
1426 0, /*tp_descr_get*/
1427 0, /*tp_descr_set*/
1428 0, /*tp_dictoffset*/
1429 0, /*tp_init*/
1430 0, /*tp_alloc*/
1431 0, /*tp_new*/
1432 0, /*tp_free*/
1433 0, /*tp_is_gc*/
1434};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001435
/* Module docstring, installed through the PyModuleDef below.
   UNIDATA_VERSION is spliced into the text by adjacent string-literal
   concatenation, so it must expand to a string literal. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001444
1445static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001446 PyModuleDef_HEAD_INIT,
1447 "unicodedata",
1448 unicodedata_docstring,
1449 -1,
1450 unicodedata_functions,
1451 NULL,
1452 NULL,
1453 NULL,
1454 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001455};
1456
Mark Hammond62b1ab12002-07-23 06:31:15 +00001457PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001458PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001459{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001460 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001461
Victor Stinnerd2ec81a2020-02-07 09:17:07 +01001462 Py_SET_TYPE(&UCD_Type, &PyType_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001463
Martin v. Löwis1a214512008-06-11 05:26:20 +00001464 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001465 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001466 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001467
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001468 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001469 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001470 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001471
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001472 /* Previous versions */
1473 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1474 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001475 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001476
Fredrik Lundh06d12682001-01-24 07:59:11 +00001477 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001478 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001479 if (v != NULL)
1480 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001481 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001482}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001483
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001484/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001485Local variables:
1486c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001487indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001488End:
1489*/