blob: f87eb608798919216540a6e0a0ca9349e7e1a5f5 [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010019#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
Victor Stinner4a21e572020-04-15 02:35:41 +020020#include "structmember.h" // PyMemberDef
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Greg Price2f094132019-09-03 19:45:44 -070022#include <stdbool.h>
23
Max Bélanger2810dd72018-11-04 15:58:24 -080024_Py_IDENTIFIER(NFC);
25_Py_IDENTIFIER(NFD);
26_Py_IDENTIFIER(NFKC);
27_Py_IDENTIFIER(NFKD);
28
Larry Hastings61272b72014-01-07 12:41:53 -080029/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080030module unicodedata
Victor Stinnerc8c42002020-10-26 23:19:22 +010031class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
Larry Hastings61272b72014-01-07 12:41:53 -080032[clinic start generated code]*/
Victor Stinnerc8c42002020-10-26 23:19:22 +010033/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080034
Fredrik Lundh06d12682001-01-24 07:59:11 +000035/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000036
/* One row of the generated property table; rows are selected by
   _getrecord_ex().  Field order must match the generated initializers. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
48
/* Per-character differences between the current database and a previous
   Unicode version.  In the byte-sized fields the lookups in this file
   treat 0xFF as "same as the current database". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */

    /* old bidirectional class index, or 0xFF */
    const unsigned char bidir_changed;
    /* old category index, or 0xFF; 0 is treated as "unassigned" */
    const unsigned char category_changed;
    /* old decimal value, or 0xFF */
    const unsigned char decimal_changed;
    /* old mirrored flag, or 0xFF */
    const unsigned char mirrored_changed;
    /* old east asian width index, or 0xFF */
    const unsigned char east_asian_width_changed;
    /* NOTE(review): not read by any lookup visible in this part of the
       file -- confirm where (or whether) it is consumed. */
    const double numeric_changed;
} change_record;
58
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059/* data file generated by Tools/unicode/makeunicodedata.py */
60#include "unicodedata_db.h"
61
62static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000063_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000065 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000066 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000067 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74}
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076/* ------------- Previous-version API ------------------------------------- */
77typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82} PreviousDBVersion;
83
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030084#include "clinic/unicodedata.c.h"
85
/* Fetch the change record of code point `v` from the previous-version
   object `self`.  `self` is parenthesized before the cast so that an
   argument such as `base + n` is converted as a whole. */
#define get_old_record(self, v) ((((PreviousDBVersion*)(self))->getrecord)(v))
87
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090 {NULL}
91};
92
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
// The argument is parenthesized so that any pointer expression is accepted.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000098
99static PyObject*
Victor Stinner920cb642020-10-26 19:19:36 +0100100new_previous_version(PyTypeObject *ucd_type,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200101 const char*name, const change_record* (*getrecord)(Py_UCS4),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000102 Py_UCS4 (*normalization)(Py_UCS4))
103{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200104 PreviousDBVersion *self;
Miss Islington (bot)0bf05002021-05-27 08:26:15 -0700105 self = PyObject_GC_New(PreviousDBVersion, ucd_type);
Victor Stinnere6b8c522020-10-15 16:22:19 +0200106 if (self == NULL)
107 return NULL;
108 self->name = name;
109 self->getrecord = getrecord;
110 self->normalization = normalization;
Miss Islington (bot)0bf05002021-05-27 08:26:15 -0700111 PyObject_GC_Track(self);
Victor Stinnere6b8c522020-10-15 16:22:19 +0200112 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000113}
114
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000115
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000116/* --- Module API --------------------------------------------------------- */
117
Larry Hastings61272b72014-01-07 12:41:53 -0800118/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800119unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700120
Serhiy Storchaka63596412015-04-17 21:18:49 +0300121 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700122 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700123 default: object=NULL
124 /
125
126Converts a Unicode character into its equivalent decimal value.
127
Serhiy Storchaka63596412015-04-17 21:18:49 +0300128Returns the decimal value assigned to the character chr as integer.
129If no such value is defined, default is returned, or, if not given,
130ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800131[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700132
Larry Hastings31826802013-10-19 00:09:25 -0700133static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300134unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400135 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700136/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700137{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000138 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000139 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300140 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000141
Victor Stinner920cb642020-10-26 19:19:36 +0100142 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000143 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000144 if (old->category_changed == 0) {
145 /* unassigned */
146 have_old = 1;
147 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000149 else if (old->decimal_changed != 0xFF) {
150 have_old = 1;
151 rc = old->decimal_changed;
152 }
153 }
154
155 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000156 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700158 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 PyErr_SetString(PyExc_ValueError,
160 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000161 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000162 }
163 else {
Larry Hastings31826802013-10-19 00:09:25 -0700164 Py_INCREF(default_value);
165 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000166 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000167 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000168 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169}
170
Serhiy Storchaka63596412015-04-17 21:18:49 +0300171/*[clinic input]
172unicodedata.UCD.digit
173
174 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700175 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300176 default: object=NULL
177 /
178
179Converts a Unicode character into its equivalent digit value.
180
181Returns the digit value assigned to the character chr as integer.
182If no such value is defined, default is returned, or, if not given,
183ValueError is raised.
184[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000185
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000186static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300187unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700188/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300191 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000192 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300194 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000196 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000197 }
198 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300199 Py_INCREF(default_value);
200 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000201 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000203 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204}
205
Serhiy Storchaka63596412015-04-17 21:18:49 +0300206/*[clinic input]
207unicodedata.UCD.numeric
208
209 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700210 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300211 default: object=NULL
212 /
213
214Converts a Unicode character into its equivalent numeric value.
215
216Returns the numeric value assigned to the character chr as float.
217If no such value is defined, default is returned, or, if not given,
218ValueError is raised.
219[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000220
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300222unicodedata_UCD_numeric_impl(PyObject *self, int chr,
223 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700224/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000225{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000226 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000227 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300228 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000229
Victor Stinner920cb642020-10-26 19:19:36 +0100230 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000231 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000232 if (old->category_changed == 0) {
233 /* unassigned */
234 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000236 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000237 else if (old->decimal_changed != 0xFF) {
238 have_old = 1;
239 rc = old->decimal_changed;
240 }
241 }
242
243 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000244 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300246 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 PyErr_SetString(PyExc_ValueError, "not a numeric character");
248 return NULL;
249 }
250 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300251 Py_INCREF(default_value);
252 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000253 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254 }
255 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256}
257
Serhiy Storchaka63596412015-04-17 21:18:49 +0300258/*[clinic input]
259unicodedata.UCD.category
260
261 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700262 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300263 /
264
265Returns the general category assigned to the character chr as string.
266[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000267
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000268static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300269unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700270/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000272 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300273 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000274 index = (int) _getrecord_ex(c)->category;
Victor Stinner920cb642020-10-26 19:19:36 +0100275 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000276 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000277 if (old->category_changed != 0xFF)
278 index = old->category_changed;
279 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000280 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281}
282
Serhiy Storchaka63596412015-04-17 21:18:49 +0300283/*[clinic input]
284unicodedata.UCD.bidirectional
285
286 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700287 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300288 /
289
290Returns the bidirectional class assigned to the character chr as string.
291
292If no such value is defined, an empty string is returned.
293[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000294
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000295static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300296unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700297/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000298{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000299 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300300 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000301 index = (int) _getrecord_ex(c)->bidirectional;
Victor Stinner920cb642020-10-26 19:19:36 +0100302 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000303 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000304 if (old->category_changed == 0)
305 index = 0; /* unassigned */
306 else if (old->bidir_changed != 0xFF)
307 index = old->bidir_changed;
308 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000309 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000310}
311
Serhiy Storchaka63596412015-04-17 21:18:49 +0300312/*[clinic input]
313unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000314
Serhiy Storchaka63596412015-04-17 21:18:49 +0300315 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700316 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300317 /
318
319Returns the canonical combining class assigned to the character chr as integer.
320
321Returns 0 if no combining class is defined.
322[clinic start generated code]*/
323
324static int
325unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700326/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000327{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000328 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300329 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000330 index = (int) _getrecord_ex(c)->combining;
Victor Stinner920cb642020-10-26 19:19:36 +0100331 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000332 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000333 if (old->category_changed == 0)
334 index = 0; /* unassigned */
335 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300336 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000337}
338
Serhiy Storchaka63596412015-04-17 21:18:49 +0300339/*[clinic input]
340unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000341
Serhiy Storchaka63596412015-04-17 21:18:49 +0300342 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700343 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300344 /
345
346Returns the mirrored property assigned to the character chr as integer.
347
348Returns 1 if the character has been identified as a "mirrored"
349character in bidirectional text, 0 otherwise.
350[clinic start generated code]*/
351
352static int
353unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700354/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000355{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000356 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300357 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000358 index = (int) _getrecord_ex(c)->mirrored;
Victor Stinner920cb642020-10-26 19:19:36 +0100359 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000360 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 if (old->category_changed == 0)
362 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000363 else if (old->mirrored_changed != 0xFF)
364 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000365 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300366 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000367}
368
Serhiy Storchaka63596412015-04-17 21:18:49 +0300369/*[clinic input]
370unicodedata.UCD.east_asian_width
371
372 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700373 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300374 /
375
376Returns the east asian width assigned to the character chr as string.
377[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000378
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000379static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300380unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700381/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000382{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000383 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300384 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000385 index = (int) _getrecord_ex(c)->east_asian_width;
Victor Stinner920cb642020-10-26 19:19:36 +0100386 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000387 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000388 if (old->category_changed == 0)
389 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700390 else if (old->east_asian_width_changed != 0xFF)
391 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000392 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000393 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000394}
395
Serhiy Storchaka63596412015-04-17 21:18:49 +0300396/*[clinic input]
397unicodedata.UCD.decomposition
398
399 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700400 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300401 /
402
403Returns the character decomposition mapping assigned to the character chr as string.
404
405An empty string is returned in case no such mapping is defined.
406[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000407
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000408static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300409unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700410/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000411{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000412 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000413 int code, index, count;
414 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000415 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300416 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000418 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000419
Victor Stinner920cb642020-10-26 19:19:36 +0100420 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000421 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000422 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000423 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000424 }
425
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000426 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000427 index = 0;
428 else {
429 index = decomp_index1[(code>>DECOMP_SHIFT)];
430 index = decomp_index2[(index<<DECOMP_SHIFT)+
431 (code&((1<<DECOMP_SHIFT)-1))];
432 }
433
Tim Peters69b83b12001-11-30 07:23:05 +0000434 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000435 is prefix code (from*/
436 count = decomp_data[index] >> 8;
437
438 /* XXX: could allocate the PyString up front instead
439 (strlen(prefix) + 5 * count + 1 bytes) */
440
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000441 /* Based on how index is calculated above and decomp_data is generated
442 from Tools/unicode/makeunicodedata.py, it should not be possible
443 to overflow decomp_prefix. */
444 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200445 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000446
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000448 i = strlen(decomp_prefix[prefix_index]);
449 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000450
451 while (count-- > 0) {
452 if (i)
453 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000454 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000455 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
456 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000457 i += strlen(decomp + i);
458 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000459 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000460}
461
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000462static void
Victor Stinner920cb642020-10-26 19:19:36 +0100463get_decomp_record(PyObject *self, Py_UCS4 code,
464 int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000465{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000466 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000467 *index = 0;
Victor Stinner920cb642020-10-26 19:19:36 +0100468 }
469 else if (UCD_Check(self)
470 && get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000471 /* unassigned in old version */
472 *index = 0;
473 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000474 else {
475 *index = decomp_index1[(code>>DECOMP_SHIFT)];
476 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
477 (code&((1<<DECOMP_SHIFT)-1))];
478 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479
Martin v. Löwis677bde22002-11-23 22:08:15 +0000480 /* high byte is number of hex bytes (usually one or two), low byte
481 is prefix code (from*/
482 *count = decomp_data[*index] >> 8;
483 *prefix = decomp_data[*index] & 255;
484
485 (*index)++;
486}
487
/* Constants for the arithmetic Hangul syllable decomposition. */
#define SBase   0xAC00              /* first syllable */
#define LBase   0x1100              /* base of the first component (L) */
#define VBase   0x1161              /* base of the second component (V) */
#define TBase   0x11A7              /* base of the third component (T) */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per L value */
#define SCount  (LCount*NCount)     /* total number of syllables */
497
498static PyObject*
Victor Stinner920cb642020-10-26 19:19:36 +0100499nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000500{
501 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200502 Py_UCS4 *output;
503 Py_ssize_t i, o, osize;
504 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300505 const void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200507 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000508 Py_ssize_t space, isize;
509 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000510 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000511
Martin v. Löwis677bde22002-11-23 22:08:15 +0000512 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200513 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500514 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300515 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500516 if (space > 10) {
517 if (space <= PY_SSIZE_T_MAX - 10)
518 space += 10;
519 }
520 else {
521 space *= 2;
522 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200523 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500524 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200525 if (!output) {
526 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000527 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200528 }
529 i = o = 0;
530 kind = PyUnicode_KIND(input);
531 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000532
Martin v. Löwis22970662011-09-29 13:39:38 +0200533 while (i < isize) {
534 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000535 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200536 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000537 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300538 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000539 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000540 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200541 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000542 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000543 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
544 if (new_output == NULL) {
545 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200546 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000547 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200548 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000549 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000550 }
551 /* Hangul Decomposition. */
552 if (SBase <= code && code < (SBase+SCount)) {
553 int SIndex = code - SBase;
554 int L = LBase + SIndex / NCount;
555 int V = VBase + (SIndex % NCount) / TCount;
556 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200557 output[o++] = L;
558 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000559 space -= 2;
560 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200561 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000562 space --;
563 }
564 continue;
565 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000566 /* normalization changes */
Victor Stinner920cb642020-10-26 19:19:36 +0100567 if (UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000568 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
569 if (value != 0) {
570 stack[stackptr++] = value;
571 continue;
572 }
573 }
574
575 /* Other decompositions. */
Victor Stinner920cb642020-10-26 19:19:36 +0100576 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000577
578 /* Copy character if it is not decomposable, or has a
579 compatibility decomposition, but we do NFD. */
580 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200581 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000582 space--;
583 continue;
584 }
585 /* Copy decomposition onto the stack, in reverse
586 order. */
587 while(count) {
588 code = decomp_data[index + (--count)];
589 stack[stackptr++] = code;
590 }
591 }
592 }
593
Martin v. Löwis22970662011-09-29 13:39:38 +0200594 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
595 output, o);
596 PyMem_Free(output);
597 if (!result)
598 return NULL;
599 /* result is guaranteed to be ready, as it is compact. */
600 kind = PyUnicode_KIND(result);
601 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000602
603 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200604 i = 0;
605 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
606 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
607 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608 if (prev == 0 || cur == 0 || prev <= cur) {
609 prev = cur;
610 continue;
611 }
612 /* Non-canonical order. Need to switch *i with previous. */
613 o = i - 1;
614 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200615 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
616 PyUnicode_WRITE(kind, data, o+1,
617 PyUnicode_READ(kind, data, o));
618 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000619 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200620 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000621 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200622 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000623 if (prev == 0 || prev <= cur)
624 break;
625 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200626 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000627 }
628 return result;
629}
630
631static int
Andy Lester982307b2020-03-17 11:38:12 -0500632find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000633{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200634 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000635 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200636 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000637 if (code < start)
638 return -1;
639 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200640 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000641 return nfc[index].index + delta;
642 }
643 }
644 return -1;
645}
646
/* Build the composed form of `input`.
 *
 * The string is first decomposed by nfd_nfkd(self, input, k); `k` is
 * forwarded unchanged (callers in this file pass 0 for NFC and 1 for NFKC).
 * The decomposed string is then scanned left to right and characters are
 * recombined into a temporary UCS4 buffer.
 *
 * Returns a new reference: either the decomposed string itself (when the
 * composition pass produced as many characters as it consumed) or a new
 * string built from the buffer.  Returns NULL if decomposition or an
 * allocation fails.
 */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    const void *data;
    Py_UCS4 *output;
    /* i: read position of the current base character; i1: look-ahead
       position; o: write position in output; len: decomposed length. */
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions (in the decomposed string) of characters that were already
       merged into an earlier base character and must not be emitted. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
        /* Drop position i if it was consumed by an earlier composition. */
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            /* Note the strict '<': TBase itself is not accepted as a
               trailing consonant here. */
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(nfc_first, code);
        if (f == -1) {
            /* Not listed in nfc_first: copy it through unchanged. */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        /* comb: combining class of the last character between i and i1
           that was looked at but not merged (0 while there is none). */
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* Two-level lookup of the (f, l) pair in the composition table;
               a 0 entry sends control to not_combinable. */
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            /* NOTE(review): the capacity of skipped[] is only enforced by
               this assert, which is compiled out in release builds. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* The composed character may itself combine with what follows. */
            f = find_nfc_index(nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return the decomposed string produced above
           (this is `result`, not `input`). */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000785
/* Outcome of the normalization quickcheck.
 * The numeric values need to match the logic in makeunicodedata.py,
 * which constructs the quickcheck data. */
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
Max Bélanger2810dd72018-11-04 15:58:24 -0800789
Greg Price2f094132019-09-03 19:45:44 -0700790/* Run the Unicode normalization "quickcheck" algorithm.
791 *
792 * Return YES or NO if quickcheck determines the input is certainly
793 * normalized or certainly not, and MAYBE if quickcheck is unable to
794 * tell.
795 *
796 * If `yes_only` is true, then return MAYBE as soon as we determine
797 * the answer is not YES.
798 *
799 * For background and details on the algorithm, see UAX #15:
800 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
801 */
802static QuickcheckResult
Victor Stinner920cb642020-10-26 19:19:36 +0100803is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
804 bool yes_only)
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000805{
Victor Stinner920cb642020-10-26 19:19:36 +0100806 /* UCD 3.2.0 is requested, quickchecks must be disabled. */
807 if (UCD_Check(self)) {
Max Bélanger2810dd72018-11-04 15:58:24 -0800808 return NO;
Victor Stinner920cb642020-10-26 19:19:36 +0100809 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000810
Greg Price2f094132019-09-03 19:45:44 -0700811 Py_ssize_t i, len;
812 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300813 const void *data;
Greg Price2f094132019-09-03 19:45:44 -0700814 unsigned char prev_combining = 0;
815
816 /* The two quickcheck bits at this shift have type QuickcheckResult. */
817 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
818
819 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000820
Martin v. Löwis22970662011-09-29 13:39:38 +0200821 i = 0;
822 kind = PyUnicode_KIND(input);
823 data = PyUnicode_DATA(input);
824 len = PyUnicode_GET_LENGTH(input);
825 while (i < len) {
826 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
827 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000828
Greg Price2f094132019-09-03 19:45:44 -0700829 unsigned char combining = record->combining;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000830 if (combining && prev_combining > combining)
Max Bélanger2810dd72018-11-04 15:58:24 -0800831 return NO; /* non-canonical sort order, not normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000832 prev_combining = combining;
Greg Price2f094132019-09-03 19:45:44 -0700833
834 unsigned char quickcheck_whole = record->normalization_quick_check;
835 if (yes_only) {
836 if (quickcheck_whole & (3 << quickcheck_shift))
837 return MAYBE;
838 } else {
839 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
840 case NO:
841 return NO;
842 case MAYBE:
843 result = MAYBE; /* this string might need normalization */
844 }
845 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000846 }
Greg Price2f094132019-09-03 19:45:44 -0700847 return result;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000848}
849
Serhiy Storchaka63596412015-04-17 21:18:49 +0300850/*[clinic input]
Max Bélanger2810dd72018-11-04 15:58:24 -0800851unicodedata.UCD.is_normalized
852
853 self: self
854 form: unicode
855 unistr as input: unicode
856 /
857
858Return whether the Unicode string unistr is in the normal form 'form'.
859
860Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
861[clinic start generated code]*/
862
863static PyObject *
864unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
865 PyObject *input)
866/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
867{
868 if (PyUnicode_READY(input) == -1) {
869 return NULL;
870 }
871
872 if (PyUnicode_GET_LENGTH(input) == 0) {
873 /* special case empty input strings. */
874 Py_RETURN_TRUE;
875 }
876
877 PyObject *result;
Greg Price7669cb82019-09-09 02:16:31 -0700878 bool nfc = false;
879 bool k = false;
Greg Price2f094132019-09-03 19:45:44 -0700880 QuickcheckResult m;
Max Bélanger2810dd72018-11-04 15:58:24 -0800881
882 PyObject *cmp;
883 int match = 0;
884
885 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700886 nfc = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800887 }
888 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700889 nfc = true;
890 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800891 }
892 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
893 /* matches default values for `nfc` and `k` */
894 }
895 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700896 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800897 }
898 else {
899 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
900 return NULL;
901 }
902
Victor Stinner920cb642020-10-26 19:19:36 +0100903 m = is_normalized_quickcheck(self, input, nfc, k, false);
Max Bélanger2810dd72018-11-04 15:58:24 -0800904
905 if (m == MAYBE) {
Victor Stinner920cb642020-10-26 19:19:36 +0100906 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
Max Bélanger2810dd72018-11-04 15:58:24 -0800907 if (cmp == NULL) {
908 return NULL;
909 }
910 match = PyUnicode_Compare(input, cmp);
911 Py_DECREF(cmp);
912 result = (match == 0) ? Py_True : Py_False;
913 }
914 else {
915 result = (m == YES) ? Py_True : Py_False;
916 }
917
918 Py_INCREF(result);
919 return result;
920}
921
922
923/*[clinic input]
Serhiy Storchaka63596412015-04-17 21:18:49 +0300924unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000925
Serhiy Storchaka63596412015-04-17 21:18:49 +0300926 self: self
Max Bélanger2810dd72018-11-04 15:58:24 -0800927 form: unicode
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300928 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300929 /
930
931Return the normal form 'form' for the Unicode string unistr.
932
933Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
934[clinic start generated code]*/
935
936static PyObject *
Max Bélanger2810dd72018-11-04 15:58:24 -0800937unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
Serhiy Storchaka63596412015-04-17 21:18:49 +0300938 PyObject *input)
Max Bélanger2810dd72018-11-04 15:58:24 -0800939/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000940{
Martin v. Löwis22970662011-09-29 13:39:38 +0200941 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000942 /* Special case empty input strings, since resizing
943 them later would cause internal errors. */
944 Py_INCREF(input);
945 return input;
946 }
947
Max Bélanger2810dd72018-11-04 15:58:24 -0800948 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100949 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200950 true, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000951 Py_INCREF(input);
952 return input;
953 }
Victor Stinner920cb642020-10-26 19:19:36 +0100954 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000955 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800956 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100957 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200958 true, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000959 Py_INCREF(input);
960 return input;
961 }
Victor Stinner920cb642020-10-26 19:19:36 +0100962 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000963 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800964 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100965 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200966 false, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000967 Py_INCREF(input);
968 return input;
969 }
Victor Stinner920cb642020-10-26 19:19:36 +0100970 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000971 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800972 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100973 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200974 false, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000975 Py_INCREF(input);
976 return input;
977 }
Victor Stinner920cb642020-10-26 19:19:36 +0100978 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000979 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000980 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
981 return NULL;
982}
983
Fredrik Lundh06d12682001-01-24 07:59:11 +0000984/* -------------------------------------------------------------------- */
985/* unicode character name tables */
986
987/* data file generated by Tools/unicode/makeunicodedata.py */
988#include "unicodename_db.h"
989
990/* -------------------------------------------------------------------- */
991/* database code (cut and pasted from the unidb package) */
992
/* Case-insensitive hash of the first `len` bytes of `s`.
 * Each byte is upper-cased with Py_TOUPPER before being mixed in with the
 * multiplier `scale`.  Whenever bits 24..31 become set they are XORed
 * into the low byte and the value is cut back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        high = hash & 0xff000000;
        if (high != 0) {
            hash ^= (high >> 24) & 0xff;
            hash &= 0x00ffffff;
        }
        pos++;
    }
    return hash;
}
1007
/* Name fragments for Hangul syllables, one column per jamo position:
 * column 0 is indexed by L, column 1 by V and column 2 by T.
 * The columns have different lengths; unused cells are NULL. */
static const char * const hangul_syllables[][3] = {
    /*  L      V      T   */
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
1038
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001039/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001040static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001041is_unified_ideograph(Py_UCS4 code)
1042{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001043 return
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001044 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1045 (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */
1046 (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001047 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -05001048 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -07001049 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001050 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1051 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001052}
1053
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that an expression can be passed safely;
 * note that it is still evaluated twice, so it must have no side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1059
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001060static int
Victor Stinner920cb642020-10-26 19:19:36 +01001061_getucname(PyObject *self,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001062 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001063{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001064 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001065 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1066 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001067 int offset;
1068 int i;
1069 int word;
Inada Naoki6fec9052019-04-17 08:40:34 +09001070 const unsigned char* w;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001071
Martin v. Löwisc3509122006-03-11 12:16:23 +00001072 if (code >= 0x110000)
1073 return 0;
1074
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001075 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001076 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1077 return 0;
1078
Victor Stinner920cb642020-10-26 19:19:36 +01001079 if (UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001080 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001081 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001082 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1083 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001084 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001085 if (old->category_changed == 0) {
1086 /* unassigned */
1087 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001088 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001089 }
1090
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001091 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001092 /* Hangul syllable. */
1093 int SIndex = code - SBase;
1094 int L = SIndex / NCount;
1095 int V = (SIndex % NCount) / TCount;
1096 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001097
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098 if (buflen < 27)
1099 /* Worst case: HANGUL SYLLABLE <10chars>. */
1100 return 0;
1101 strcpy(buffer, "HANGUL SYLLABLE ");
1102 buffer += 16;
1103 strcpy(buffer, hangul_syllables[L][0]);
1104 buffer += strlen(hangul_syllables[L][0]);
1105 strcpy(buffer, hangul_syllables[V][1]);
1106 buffer += strlen(hangul_syllables[V][1]);
1107 strcpy(buffer, hangul_syllables[T][2]);
1108 buffer += strlen(hangul_syllables[T][2]);
1109 *buffer = '\0';
1110 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001111 }
1112
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001113 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001114 if (buflen < 28)
1115 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1116 return 0;
1117 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1118 return 1;
1119 }
1120
Fredrik Lundh06d12682001-01-24 07:59:11 +00001121 /* get offset into phrasebook */
1122 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1123 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1124 (code&((1<<phrasebook_shift)-1))];
1125 if (!offset)
1126 return 0;
1127
1128 i = 0;
1129
1130 for (;;) {
1131 /* get word index */
1132 word = phrasebook[offset] - phrasebook_short;
1133 if (word >= 0) {
1134 word = (word << 8) + phrasebook[offset+1];
1135 offset += 2;
1136 } else
1137 word = phrasebook[offset++];
1138 if (i) {
1139 if (i > buflen)
1140 return 0; /* buffer overflow */
1141 buffer[i++] = ' ';
1142 }
1143 /* copy word string from lexicon. the last character in the
1144 word has bit 7 set. the last word in a string ends with
1145 0x80 */
1146 w = lexicon + lexicon_offset[word];
1147 while (*w < 128) {
1148 if (i >= buflen)
1149 return 0; /* buffer overflow */
1150 buffer[i++] = *w++;
1151 }
1152 if (i >= buflen)
1153 return 0; /* buffer overflow */
1154 buffer[i++] = *w & 127;
1155 if (*w == 128)
1156 break; /* end of word */
1157 }
1158
1159 return 1;
1160}
1161
1162static int
Victor Stinner920cb642020-10-26 19:19:36 +01001163capi_getucname(Py_UCS4 code,
Victor Stinner47e1afd2020-10-26 16:43:47 +01001164 char* buffer, int buflen,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001165 int with_alias_and_seq)
1166{
Victor Stinner920cb642020-10-26 19:19:36 +01001167 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001168
1169}
1170
1171static int
Victor Stinner920cb642020-10-26 19:19:36 +01001172_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001173{
1174 /* check if code corresponds to the given name */
1175 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001176 char buffer[NAME_MAXLEN+1];
Victor Stinner920cb642020-10-26 19:19:36 +01001177 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001178 return 0;
1179 for (i = 0; i < namelen; i++) {
Jordon Xu2ec70102019-09-11 00:04:08 +08001180 if (Py_TOUPPER(name[i]) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001181 return 0;
1182 }
1183 return buffer[namelen] == '\0';
1184}
1185
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001186static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001187find_syllable(const char *str, int *len, int *pos, int count, int column)
1188{
1189 int i, len1;
1190 *len = -1;
1191 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001192 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001193 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001194 if (len1 <= *len)
1195 continue;
1196 if (strncmp(str, s, len1) == 0) {
1197 *len = len1;
1198 *pos = i;
1199 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001200 }
1201 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001202 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001203 }
1204}
1205
Fredrik Lundh06d12682001-01-24 07:59:11 +00001206static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001207_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001208{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001209 /* check if named sequences are allowed */
1210 if (!with_named_seq && IS_NAMED_SEQ(cp))
1211 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001212 /* if the code point is in the PUA range that we use for aliases,
1213 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001214 if (IS_ALIAS(cp))
1215 *code = name_aliases[cp-aliases_start];
1216 else
1217 *code = cp;
1218 return 1;
1219}
1220
1221static int
Victor Stinner920cb642020-10-26 19:19:36 +01001222_getcode(PyObject* self,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001223 const char* name, int namelen, Py_UCS4* code, int with_named_seq)
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001224{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001225 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001226 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001227 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001228 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001229 unsigned int h, v;
1230 unsigned int mask = code_size-1;
1231 unsigned int i, incr;
1232
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001233 /* Check for hangul syllables. */
1234 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001235 int len, L = -1, V = -1, T = -1;
1236 const char *pos = name + 16;
1237 find_syllable(pos, &len, &L, LCount, 0);
1238 pos += len;
1239 find_syllable(pos, &len, &V, VCount, 1);
1240 pos += len;
1241 find_syllable(pos, &len, &T, TCount, 2);
1242 pos += len;
1243 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1244 *code = SBase + (L*VCount+V)*TCount + T;
1245 return 1;
1246 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001247 /* Otherwise, it's an illegal syllable name. */
1248 return 0;
1249 }
1250
1251 /* Check for unified ideographs. */
1252 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1253 /* Four or five hexdigits must follow. */
1254 v = 0;
1255 name += 22;
1256 namelen -= 22;
1257 if (namelen != 4 && namelen != 5)
1258 return 0;
1259 while (namelen--) {
1260 v *= 16;
1261 if (*name >= '0' && *name <= '9')
1262 v += *name - '0';
1263 else if (*name >= 'A' && *name <= 'F')
1264 v += *name - 'A' + 10;
1265 else
1266 return 0;
1267 name++;
1268 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001269 if (!is_unified_ideograph(v))
1270 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001271 *code = v;
1272 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001273 }
1274
Fredrik Lundh06d12682001-01-24 07:59:11 +00001275 /* the following is the same as python's dictionary lookup, with
1276 only minor changes. see the makeunicodedata script for more
1277 details */
1278
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001279 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001280 i = (~h) & mask;
1281 v = code_hash[i];
1282 if (!v)
1283 return 0;
Victor Stinner920cb642020-10-26 19:19:36 +01001284 if (_cmpname(self, v, name, namelen)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001285 return _check_alias_and_seq(v, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001286 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001287 incr = (h ^ (h >> 3)) & mask;
1288 if (!incr)
1289 incr = mask;
1290 for (;;) {
1291 i = (i + incr) & mask;
1292 v = code_hash[i];
1293 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001294 return 0;
Victor Stinner920cb642020-10-26 19:19:36 +01001295 if (_cmpname(self, v, name, namelen)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001296 return _check_alias_and_seq(v, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001297 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001298 incr = incr << 1;
1299 if (incr > mask)
1300 incr = incr ^ code_poly;
1301 }
1302}
1303
Victor Stinnere6b8c522020-10-15 16:22:19 +02001304static int
Victor Stinner920cb642020-10-26 19:19:36 +01001305capi_getcode(const char* name, int namelen, Py_UCS4* code,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001306 int with_named_seq)
1307{
Victor Stinner920cb642020-10-26 19:19:36 +01001308 return _getcode(NULL, name, namelen, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001309
1310}
1311
Erlend Egeberg Aasland61d26392021-01-20 12:03:53 +01001312static void
1313unicodedata_destroy_capi(PyObject *capsule)
Victor Stinner920cb642020-10-26 19:19:36 +01001314{
Erlend Egeberg Aasland61d26392021-01-20 12:03:53 +01001315 void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1316 PyMem_Free(capi);
1317}
1318
1319static PyObject *
1320unicodedata_create_capi(void)
1321{
1322 _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1323 if (capi == NULL) {
1324 PyErr_NoMemory();
1325 return NULL;
1326 }
1327 capi->getname = capi_getucname;
1328 capi->getcode = capi_getcode;
1329
1330 PyObject *capsule = PyCapsule_New(capi,
1331 PyUnicodeData_CAPSULE_NAME,
1332 unicodedata_destroy_capi);
1333 if (capsule == NULL) {
1334 PyMem_Free(capi);
1335 }
1336 return capsule;
Victor Stinner920cb642020-10-26 19:19:36 +01001337};
1338
1339
Fredrik Lundh06d12682001-01-24 07:59:11 +00001340/* -------------------------------------------------------------------- */
1341/* Python bindings */
1342
Serhiy Storchaka63596412015-04-17 21:18:49 +03001343/*[clinic input]
1344unicodedata.UCD.name
1345
1346 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001347 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001348 default: object=NULL
1349 /
1350
1351Returns the name assigned to the character chr as a string.
1352
1353If no name is defined, default is returned, or, if not given,
1354ValueError is raised.
1355[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001356
Fredrik Lundh06d12682001-01-24 07:59:11 +00001357static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001358unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001359/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001360{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001361 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001362 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001363
Victor Stinner920cb642020-10-26 19:19:36 +01001364 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001365 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001367 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001368 }
1369 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001370 Py_INCREF(default_value);
1371 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001372 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001373 }
1374
Walter Dörwald4254e762007-06-05 16:04:09 +00001375 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001376}
1377
Serhiy Storchaka63596412015-04-17 21:18:49 +03001378/*[clinic input]
1379unicodedata.UCD.lookup
1380
1381 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001382 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001383 /
1384
1385Look up character by name.
1386
1387If a character with the given name is found, return the
1388corresponding character. If not found, KeyError is raised.
1389[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001390
Fredrik Lundh06d12682001-01-24 07:59:11 +00001391static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001392unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1393 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001394/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001395{
1396 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001397 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001398 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001399 PyErr_SetString(PyExc_KeyError, "name too long");
1400 return NULL;
1401 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001402
Victor Stinner920cb642020-10-26 19:19:36 +01001403 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001404 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001405 return NULL;
1406 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001407 /* check if code is in the PUA range that we use for named sequences
1408 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001409 if (IS_NAMED_SEQ(code)) {
1410 index = code-named_sequences_start;
1411 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1412 named_sequences[index].seq,
1413 named_sequences[index].seqlen);
1414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001416}
1417
Victor Stinner920cb642020-10-26 19:19:36 +01001418// List of functions used to define module functions *AND* unicodedata.UCD
1419// methods. For module functions, self is the module. For UCD methods, self
1420// is an UCD instance. The UCD_Check() macro is used to check if self is
1421// an UCD instance.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001422static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001423 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001424 UNICODEDATA_UCD_DIGIT_METHODDEF
1425 UNICODEDATA_UCD_NUMERIC_METHODDEF
1426 UNICODEDATA_UCD_CATEGORY_METHODDEF
1427 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1428 UNICODEDATA_UCD_COMBINING_METHODDEF
1429 UNICODEDATA_UCD_MIRRORED_METHODDEF
1430 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1431 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1432 UNICODEDATA_UCD_NAME_METHODDEF
1433 UNICODEDATA_UCD_LOOKUP_METHODDEF
Max Bélanger2810dd72018-11-04 15:58:24 -08001434 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001435 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001436 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001437};
1438
Miss Islington (bot)0bf05002021-05-27 08:26:15 -07001439static int
1440ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1441{
1442 Py_VISIT(Py_TYPE(self));
1443 return 0;
1444}
1445
Victor Stinnerc8c42002020-10-26 23:19:22 +01001446static void
1447ucd_dealloc(PreviousDBVersion *self)
1448{
1449 PyTypeObject *tp = Py_TYPE(self);
Miss Islington (bot)0bf05002021-05-27 08:26:15 -07001450 PyObject_GC_UnTrack(self);
1451 PyObject_GC_Del(self);
Victor Stinnerc8c42002020-10-26 23:19:22 +01001452 Py_DECREF(tp);
1453}
1454
1455static PyType_Slot ucd_type_slots[] = {
1456 {Py_tp_dealloc, ucd_dealloc},
Miss Islington (bot)0bf05002021-05-27 08:26:15 -07001457 {Py_tp_traverse, ucd_traverse},
Victor Stinnerc8c42002020-10-26 23:19:22 +01001458 {Py_tp_getattro, PyObject_GenericGetAttr},
1459 {Py_tp_methods, unicodedata_functions},
1460 {Py_tp_members, DB_members},
1461 {0, 0}
1462};
1463
1464static PyType_Spec ucd_type_spec = {
1465 .name = "unicodedata.UCD",
1466 .basicsize = sizeof(PreviousDBVersion),
Miss Islington (bot)0bf05002021-05-27 08:26:15 -07001467 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
Miss Islington (bot)7297d742021-06-17 03:19:44 -07001468 Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
Victor Stinnerc8c42002020-10-26 23:19:22 +01001469 .slots = ucd_type_slots
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001470};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001471
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001472PyDoc_STRVAR(unicodedata_docstring,
1473"This module provides access to the Unicode Character Database which\n\
1474defines character properties for all Unicode characters. The data in\n\
1475this database is based on the UnicodeData.txt file version\n\
luzpaza5293b42017-11-05 07:37:50 -06001476" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001477\n\
1478The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001479UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001480
Victor Stinner920cb642020-10-26 19:19:36 +01001481static int
1482unicodedata_exec(PyObject *module)
1483{
Victor Stinner920cb642020-10-26 19:19:36 +01001484 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1485 return -1;
1486 }
1487
Victor Stinnerc8c42002020-10-26 23:19:22 +01001488 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1489 if (ucd_type == NULL) {
1490 return -1;
1491 }
1492
Victor Stinner920cb642020-10-26 19:19:36 +01001493 if (PyModule_AddType(module, ucd_type) < 0) {
Victor Stinnerc8c42002020-10-26 23:19:22 +01001494 Py_DECREF(ucd_type);
Victor Stinner920cb642020-10-26 19:19:36 +01001495 return -1;
1496 }
1497
Victor Stinner84f73822020-10-27 04:36:22 +01001498 // Unicode database version 3.2.0 used by the IDNA encoding
Victor Stinner920cb642020-10-26 19:19:36 +01001499 PyObject *v;
1500 v = new_previous_version(ucd_type, "3.2.0",
1501 get_change_3_2_0, normalization_3_2_0);
Victor Stinnerc8c42002020-10-26 23:19:22 +01001502 Py_DECREF(ucd_type);
Victor Stinner920cb642020-10-26 19:19:36 +01001503 if (v == NULL) {
1504 return -1;
1505 }
1506 if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1507 Py_DECREF(v);
1508 return -1;
1509 }
1510
1511 /* Export C API */
Erlend Egeberg Aasland61d26392021-01-20 12:03:53 +01001512 PyObject *capsule = unicodedata_create_capi();
1513 if (capsule == NULL) {
Victor Stinner920cb642020-10-26 19:19:36 +01001514 return -1;
1515 }
Erlend Egeberg Aasland61d26392021-01-20 12:03:53 +01001516 int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1517 Py_DECREF(capsule);
1518 if (rc < 0) {
Victor Stinner920cb642020-10-26 19:19:36 +01001519 return -1;
1520 }
1521 return 0;
1522}
1523
Victor Stinnerc8c42002020-10-26 23:19:22 +01001524static PyModuleDef_Slot unicodedata_slots[] = {
1525 {Py_mod_exec, unicodedata_exec},
1526 {0, NULL}
1527};
1528
1529static struct PyModuleDef unicodedata_module = {
1530 PyModuleDef_HEAD_INIT,
1531 .m_name = "unicodedata",
1532 .m_doc = unicodedata_docstring,
1533 .m_size = 0,
1534 .m_methods = unicodedata_functions,
1535 .m_slots = unicodedata_slots,
1536};
Victor Stinner920cb642020-10-26 19:19:36 +01001537
Mark Hammond62b1ab12002-07-23 06:31:15 +00001538PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001539PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001540{
Victor Stinnerc8c42002020-10-26 23:19:22 +01001541 return PyModuleDef_Init(&unicodedata_module);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001542}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001543
Victor Stinner920cb642020-10-26 19:19:36 +01001544
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001545/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001546Local variables:
1547c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001548indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001549End:
1550*/