/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#define PY_SSIZE_T_CLEAN

#include "Python.h"
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
#include "structmember.h"         // PyMemberDef

#include <stdbool.h>

/* Static identifiers for the four normalization form names.  Their users
   are outside this part of the file. */
_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
_Py_IDENTIFIER(NFKD);

/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/

/* character properties */

/* One row of the per-character property table
   (_PyUnicode_Database_Records); rows are selected by _getrecord_ex().
   Every field is a single unsigned byte. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;

/* Difference between the current database and a previous Unicode version
   for one character.

   As the accessors in this file use it:
     - category_changed == 0 marks a character that was unassigned in the
       previous version;
     - 0xFF in one of the unsigned char fields means "no override", so the
       value from the current database is used.
   numeric_changed is not read anywhere in the accessors visible here. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;

Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059/* data file generated by Tools/unicode/makeunicodedata.py */
60#include "unicodedata_db.h"
61
62static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000063_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000065 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000066 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000067 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74}
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076/* ------------- Previous-version API ------------------------------------- */
77typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82} PreviousDBVersion;
83
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030084#include "clinic/unicodedata.c.h"
85
/* Call the getrecord callback of the PreviousDBVersion object `self` for
   code point `v` and yield the resulting `const change_record *`.
   `self` is cast without any check, so callers must guard with UCD_Check().
   Both arguments are parenthesized so arbitrary expressions are safe. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))

Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090 {NULL}
91};
92
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
// Note: the argument is evaluated twice, so pass a side-effect-free expression.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))

99static PyObject*
Victor Stinner920cb642020-10-26 19:19:36 +0100100new_previous_version(PyTypeObject *ucd_type,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200101 const char*name, const change_record* (*getrecord)(Py_UCS4),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000102 Py_UCS4 (*normalization)(Py_UCS4))
103{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200104 PreviousDBVersion *self;
Victor Stinner920cb642020-10-26 19:19:36 +0100105 self = PyObject_New(PreviousDBVersion, ucd_type);
Victor Stinnere6b8c522020-10-15 16:22:19 +0200106 if (self == NULL)
107 return NULL;
108 self->name = name;
109 self->getrecord = getrecord;
110 self->normalization = normalization;
111 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000112}
113
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000114
/* --- Module API --------------------------------------------------------- */

Larry Hastings61272b72014-01-07 12:41:53 -0800117/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800118unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700119
Serhiy Storchaka63596412015-04-17 21:18:49 +0300120 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700121 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700122 default: object=NULL
123 /
124
125Converts a Unicode character into its equivalent decimal value.
126
Serhiy Storchaka63596412015-04-17 21:18:49 +0300127Returns the decimal value assigned to the character chr as integer.
128If no such value is defined, default is returned, or, if not given,
129ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800130[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700131
Larry Hastings31826802013-10-19 00:09:25 -0700132static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300133unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400134 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700135/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700136{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000137 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000138 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300139 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000140
Victor Stinner920cb642020-10-26 19:19:36 +0100141 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000142 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000143 if (old->category_changed == 0) {
144 /* unassigned */
145 have_old = 1;
146 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000148 else if (old->decimal_changed != 0xFF) {
149 have_old = 1;
150 rc = old->decimal_changed;
151 }
152 }
153
154 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000155 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700157 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 PyErr_SetString(PyExc_ValueError,
159 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000160 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 }
162 else {
Larry Hastings31826802013-10-19 00:09:25 -0700163 Py_INCREF(default_value);
164 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000166 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000167 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168}
169
Serhiy Storchaka63596412015-04-17 21:18:49 +0300170/*[clinic input]
171unicodedata.UCD.digit
172
173 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700174 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300175 default: object=NULL
176 /
177
178Converts a Unicode character into its equivalent digit value.
179
180Returns the digit value assigned to the character chr as integer.
181If no such value is defined, default is returned, or, if not given,
182ValueError is raised.
183[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000184
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300186unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700187/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000188{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300190 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000191 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300193 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000195 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 }
197 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300198 Py_INCREF(default_value);
199 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000202 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203}
204
Serhiy Storchaka63596412015-04-17 21:18:49 +0300205/*[clinic input]
206unicodedata.UCD.numeric
207
208 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700209 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300210 default: object=NULL
211 /
212
213Converts a Unicode character into its equivalent numeric value.
214
215Returns the numeric value assigned to the character chr as float.
216If no such value is defined, default is returned, or, if not given,
217ValueError is raised.
218[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000219
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000220static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300221unicodedata_UCD_numeric_impl(PyObject *self, int chr,
222 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700223/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000224{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000226 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300227 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000228
Victor Stinner920cb642020-10-26 19:19:36 +0100229 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000231 if (old->category_changed == 0) {
232 /* unassigned */
233 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000236 else if (old->decimal_changed != 0xFF) {
237 have_old = 1;
238 rc = old->decimal_changed;
239 }
240 }
241
242 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000243 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000244 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300245 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000246 PyErr_SetString(PyExc_ValueError, "not a numeric character");
247 return NULL;
248 }
249 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300250 Py_INCREF(default_value);
251 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253 }
254 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000255}
256
Serhiy Storchaka63596412015-04-17 21:18:49 +0300257/*[clinic input]
258unicodedata.UCD.category
259
260 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700261 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300262 /
263
264Returns the general category assigned to the character chr as string.
265[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000266
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000267static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300268unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700269/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300272 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000273 index = (int) _getrecord_ex(c)->category;
Victor Stinner920cb642020-10-26 19:19:36 +0100274 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000275 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000276 if (old->category_changed != 0xFF)
277 index = old->category_changed;
278 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000279 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280}
281
Serhiy Storchaka63596412015-04-17 21:18:49 +0300282/*[clinic input]
283unicodedata.UCD.bidirectional
284
285 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700286 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300287 /
288
289Returns the bidirectional class assigned to the character chr as string.
290
291If no such value is defined, an empty string is returned.
292[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000293
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300295unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700296/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000298 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300299 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000300 index = (int) _getrecord_ex(c)->bidirectional;
Victor Stinner920cb642020-10-26 19:19:36 +0100301 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000302 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000303 if (old->category_changed == 0)
304 index = 0; /* unassigned */
305 else if (old->bidir_changed != 0xFF)
306 index = old->bidir_changed;
307 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000308 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000309}
310
Serhiy Storchaka63596412015-04-17 21:18:49 +0300311/*[clinic input]
312unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000313
Serhiy Storchaka63596412015-04-17 21:18:49 +0300314 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700315 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300316 /
317
318Returns the canonical combining class assigned to the character chr as integer.
319
320Returns 0 if no combining class is defined.
321[clinic start generated code]*/
322
323static int
324unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700325/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300328 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000329 index = (int) _getrecord_ex(c)->combining;
Victor Stinner920cb642020-10-26 19:19:36 +0100330 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000331 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000332 if (old->category_changed == 0)
333 index = 0; /* unassigned */
334 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300335 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000336}
337
Serhiy Storchaka63596412015-04-17 21:18:49 +0300338/*[clinic input]
339unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000340
Serhiy Storchaka63596412015-04-17 21:18:49 +0300341 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700342 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300343 /
344
345Returns the mirrored property assigned to the character chr as integer.
346
347Returns 1 if the character has been identified as a "mirrored"
348character in bidirectional text, 0 otherwise.
349[clinic start generated code]*/
350
351static int
352unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700353/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000354{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300356 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000357 index = (int) _getrecord_ex(c)->mirrored;
Victor Stinner920cb642020-10-26 19:19:36 +0100358 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000359 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000360 if (old->category_changed == 0)
361 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000362 else if (old->mirrored_changed != 0xFF)
363 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000364 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300365 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000366}
367
Serhiy Storchaka63596412015-04-17 21:18:49 +0300368/*[clinic input]
369unicodedata.UCD.east_asian_width
370
371 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700372 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300373 /
374
375Returns the east asian width assigned to the character chr as string.
376[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000377
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000378static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300379unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700380/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000381{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000382 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300383 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000384 index = (int) _getrecord_ex(c)->east_asian_width;
Victor Stinner920cb642020-10-26 19:19:36 +0100385 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000386 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000387 if (old->category_changed == 0)
388 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700389 else if (old->east_asian_width_changed != 0xFF)
390 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000391 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000392 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000393}
394
Serhiy Storchaka63596412015-04-17 21:18:49 +0300395/*[clinic input]
396unicodedata.UCD.decomposition
397
398 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700399 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300400 /
401
402Returns the character decomposition mapping assigned to the character chr as string.
403
404An empty string is returned in case no such mapping is defined.
405[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000406
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000407static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300408unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700409/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000410{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000411 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000412 int code, index, count;
413 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000414 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300415 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000417 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000418
Victor Stinner920cb642020-10-26 19:19:36 +0100419 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000420 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000422 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000423 }
424
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000425 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000426 index = 0;
427 else {
428 index = decomp_index1[(code>>DECOMP_SHIFT)];
429 index = decomp_index2[(index<<DECOMP_SHIFT)+
430 (code&((1<<DECOMP_SHIFT)-1))];
431 }
432
Tim Peters69b83b12001-11-30 07:23:05 +0000433 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000434 is prefix code (from*/
435 count = decomp_data[index] >> 8;
436
437 /* XXX: could allocate the PyString up front instead
438 (strlen(prefix) + 5 * count + 1 bytes) */
439
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000440 /* Based on how index is calculated above and decomp_data is generated
441 from Tools/unicode/makeunicodedata.py, it should not be possible
442 to overflow decomp_prefix. */
443 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200444 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000445
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000447 i = strlen(decomp_prefix[prefix_index]);
448 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000449
450 while (count-- > 0) {
451 if (i)
452 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000453 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000454 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
455 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000456 i += strlen(decomp + i);
457 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000458 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000459}
460
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000461static void
Victor Stinner920cb642020-10-26 19:19:36 +0100462get_decomp_record(PyObject *self, Py_UCS4 code,
463 int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000465 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000466 *index = 0;
Victor Stinner920cb642020-10-26 19:19:36 +0100467 }
468 else if (UCD_Check(self)
469 && get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000470 /* unassigned in old version */
471 *index = 0;
472 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000473 else {
474 *index = decomp_index1[(code>>DECOMP_SHIFT)];
475 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
476 (code&((1<<DECOMP_SHIFT)-1))];
477 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000478
Martin v. Löwis677bde22002-11-23 22:08:15 +0000479 /* high byte is number of hex bytes (usually one or two), low byte
480 is prefix code (from*/
481 *count = decomp_data[*index] >> 8;
482 *prefix = decomp_data[*index] & 255;
483
484 (*index)++;
485}
486
/* Constants for the arithmetic Hangul syllable decomposition used by
   nfd_nfkd(): a syllable S in [SBase, SBase+SCount) is split into
   L = LBase + SIndex / NCount, V = VBase + (SIndex % NCount) / TCount and,
   when non-zero, T = TBase + SIndex % TCount, with SIndex = S - SBase. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

497static PyObject*
Victor Stinner920cb642020-10-26 19:19:36 +0100498nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000499{
500 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200501 Py_UCS4 *output;
502 Py_ssize_t i, o, osize;
503 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300504 const void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200506 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 Py_ssize_t space, isize;
508 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000509 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000510
Martin v. Löwis677bde22002-11-23 22:08:15 +0000511 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200512 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500513 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300514 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500515 if (space > 10) {
516 if (space <= PY_SSIZE_T_MAX - 10)
517 space += 10;
518 }
519 else {
520 space *= 2;
521 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200522 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500523 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200524 if (!output) {
525 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000526 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200527 }
528 i = o = 0;
529 kind = PyUnicode_KIND(input);
530 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000531
Martin v. Löwis22970662011-09-29 13:39:38 +0200532 while (i < isize) {
533 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000534 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200535 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000536 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300537 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000538 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000539 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200540 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000541 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000542 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
543 if (new_output == NULL) {
544 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200545 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200547 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000548 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000549 }
550 /* Hangul Decomposition. */
551 if (SBase <= code && code < (SBase+SCount)) {
552 int SIndex = code - SBase;
553 int L = LBase + SIndex / NCount;
554 int V = VBase + (SIndex % NCount) / TCount;
555 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200556 output[o++] = L;
557 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000558 space -= 2;
559 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200560 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000561 space --;
562 }
563 continue;
564 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000565 /* normalization changes */
Victor Stinner920cb642020-10-26 19:19:36 +0100566 if (UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000567 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
568 if (value != 0) {
569 stack[stackptr++] = value;
570 continue;
571 }
572 }
573
574 /* Other decompositions. */
Victor Stinner920cb642020-10-26 19:19:36 +0100575 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000576
577 /* Copy character if it is not decomposable, or has a
578 compatibility decomposition, but we do NFD. */
579 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200580 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000581 space--;
582 continue;
583 }
584 /* Copy decomposition onto the stack, in reverse
585 order. */
586 while(count) {
587 code = decomp_data[index + (--count)];
588 stack[stackptr++] = code;
589 }
590 }
591 }
592
Martin v. Löwis22970662011-09-29 13:39:38 +0200593 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
594 output, o);
595 PyMem_Free(output);
596 if (!result)
597 return NULL;
598 /* result is guaranteed to be ready, as it is compact. */
599 kind = PyUnicode_KIND(result);
600 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000601
602 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200603 i = 0;
604 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
605 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
606 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000607 if (prev == 0 || cur == 0 || prev <= cur) {
608 prev = cur;
609 continue;
610 }
611 /* Non-canonical order. Need to switch *i with previous. */
612 o = i - 1;
613 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200614 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
615 PyUnicode_WRITE(kind, data, o+1,
616 PyUnicode_READ(kind, data, o));
617 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000618 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200619 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200621 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622 if (prev == 0 || prev <= cur)
623 break;
624 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200625 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000626 }
627 return result;
628}
629
630static int
Andy Lester982307b2020-03-17 11:38:12 -0500631find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000632{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200633 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200635 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000636 if (code < start)
637 return -1;
638 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200639 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 return nfc[index].index + delta;
641 }
642 }
643 return -1;
644}
645
646static PyObject*
Victor Stinner920cb642020-10-26 19:19:36 +0100647nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000648{
649 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200650 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300651 const void *data;
Martin v. Löwis22970662011-09-29 13:39:38 +0200652 Py_UCS4 *output;
653 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200655 Py_UCS4 code;
656 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000657 int cskipped = 0;
658
Victor Stinner920cb642020-10-26 19:19:36 +0100659 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000660 if (!result)
661 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200662 /* result will be "ready". */
663 kind = PyUnicode_KIND(result);
664 data = PyUnicode_DATA(result);
665 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666
Martin v. Löwis22970662011-09-29 13:39:38 +0200667 /* We allocate a buffer for the output.
668 If we find that we made no changes, we still return
669 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500670 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200671 if (!output) {
672 PyErr_NoMemory();
673 Py_DECREF(result);
674 return 0;
675 }
676 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000677
Martin v. Löwis677bde22002-11-23 22:08:15 +0000678 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200679 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000680 for (index = 0; index < cskipped; index++) {
681 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000683 Remove from list. */
684 skipped[index] = skipped[cskipped-1];
685 cskipped--;
686 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000687 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000688 }
689 }
690 /* Hangul Composition. We don't need to check for <LV,T>
691 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200692 code = PyUnicode_READ(kind, data, i);
693 if (LBase <= code && code < (LBase+LCount) &&
694 i + 1 < len &&
695 VBase <= PyUnicode_READ(kind, data, i+1) &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900696 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
697 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
698 and V character is a modern vowel (0x1161 ~ 0x1175). */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000699 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200700 LIndex = code - LBase;
701 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000702 code = SBase + (LIndex*VCount+VIndex)*TCount;
703 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200704 if (i < len &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900705 TBase < PyUnicode_READ(kind, data, i) &&
706 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
707 /* check T character is a modern trailing consonant
708 (0x11A8 ~ 0x11C2). */
Martin v. Löwis22970662011-09-29 13:39:38 +0200709 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000710 i++;
711 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200712 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000713 continue;
714 }
715
Martin v. Löwis22970662011-09-29 13:39:38 +0200716 /* code is still input[i] here */
Andy Lester982307b2020-03-17 11:38:12 -0500717 f = find_nfc_index(nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000718 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200719 output[o++] = code;
720 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 continue;
722 }
723 /* Find next unblocked character. */
724 i1 = i+1;
725 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200726 /* output base character for now; might be updated later. */
727 output[o] = PyUnicode_READ(kind, data, i);
728 while (i1 < len) {
729 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
730 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000731 if (comb) {
732 if (comb1 == 0)
733 break;
734 if (comb >= comb1) {
735 /* Character is blocked. */
736 i1++;
737 continue;
738 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000739 }
Andy Lester982307b2020-03-17 11:38:12 -0500740 l = find_nfc_index(nfc_last, code1);
Martin v. Löwis22970662011-09-29 13:39:38 +0200741 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 is a starter, we don't need to look further.
743 Otherwise, record the combining class. */
744 if (l == -1) {
745 not_combinable:
746 if (comb1 == 0)
747 break;
748 comb = comb1;
749 i1++;
750 continue;
751 }
752 index = f*TOTAL_LAST + l;
753 index1 = comp_index[index >> COMP_SHIFT];
754 code = comp_data[(index1<<COMP_SHIFT)+
755 (index&((1<<COMP_SHIFT)-1))];
756 if (code == 0)
757 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000758
Martin v. Löwis677bde22002-11-23 22:08:15 +0000759 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200760 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000761 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000762 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000763 skipped[cskipped++] = i1;
764 i1++;
Andy Lester982307b2020-03-17 11:38:12 -0500765 f = find_nfc_index(nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000766 if (f == -1)
767 break;
768 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200769 /* Output character was already written.
770 Just advance the indices. */
771 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000772 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200773 if (o == len) {
774 /* No changes. Return original string. */
775 PyMem_Free(output);
776 return result;
777 }
778 Py_DECREF(result);
779 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
780 output, o);
781 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000782 return result;
783}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000784
// Outcome of a normalization quickcheck.  The numeric values need to match
// the logic in makeunicodedata.py, which constructs the quickcheck data.
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
Max Bélanger2810dd72018-11-04 15:58:24 -0800788
Greg Price2f094132019-09-03 19:45:44 -0700789/* Run the Unicode normalization "quickcheck" algorithm.
790 *
791 * Return YES or NO if quickcheck determines the input is certainly
792 * normalized or certainly not, and MAYBE if quickcheck is unable to
793 * tell.
794 *
795 * If `yes_only` is true, then return MAYBE as soon as we determine
796 * the answer is not YES.
797 *
798 * For background and details on the algorithm, see UAX #15:
799 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
800 */
801static QuickcheckResult
Victor Stinner920cb642020-10-26 19:19:36 +0100802is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
803 bool yes_only)
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000804{
Victor Stinner920cb642020-10-26 19:19:36 +0100805 /* UCD 3.2.0 is requested, quickchecks must be disabled. */
806 if (UCD_Check(self)) {
Max Bélanger2810dd72018-11-04 15:58:24 -0800807 return NO;
Victor Stinner920cb642020-10-26 19:19:36 +0100808 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000809
Greg Price2f094132019-09-03 19:45:44 -0700810 Py_ssize_t i, len;
811 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300812 const void *data;
Greg Price2f094132019-09-03 19:45:44 -0700813 unsigned char prev_combining = 0;
814
815 /* The two quickcheck bits at this shift have type QuickcheckResult. */
816 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
817
818 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000819
Martin v. Löwis22970662011-09-29 13:39:38 +0200820 i = 0;
821 kind = PyUnicode_KIND(input);
822 data = PyUnicode_DATA(input);
823 len = PyUnicode_GET_LENGTH(input);
824 while (i < len) {
825 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
826 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000827
Greg Price2f094132019-09-03 19:45:44 -0700828 unsigned char combining = record->combining;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000829 if (combining && prev_combining > combining)
Max Bélanger2810dd72018-11-04 15:58:24 -0800830 return NO; /* non-canonical sort order, not normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000831 prev_combining = combining;
Greg Price2f094132019-09-03 19:45:44 -0700832
833 unsigned char quickcheck_whole = record->normalization_quick_check;
834 if (yes_only) {
835 if (quickcheck_whole & (3 << quickcheck_shift))
836 return MAYBE;
837 } else {
838 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
839 case NO:
840 return NO;
841 case MAYBE:
842 result = MAYBE; /* this string might need normalization */
843 }
844 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000845 }
Greg Price2f094132019-09-03 19:45:44 -0700846 return result;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000847}
848
Serhiy Storchaka63596412015-04-17 21:18:49 +0300849/*[clinic input]
Max Bélanger2810dd72018-11-04 15:58:24 -0800850unicodedata.UCD.is_normalized
851
852 self: self
853 form: unicode
854 unistr as input: unicode
855 /
856
857Return whether the Unicode string unistr is in the normal form 'form'.
858
859Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
860[clinic start generated code]*/
861
862static PyObject *
863unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
864 PyObject *input)
865/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
866{
867 if (PyUnicode_READY(input) == -1) {
868 return NULL;
869 }
870
871 if (PyUnicode_GET_LENGTH(input) == 0) {
872 /* special case empty input strings. */
873 Py_RETURN_TRUE;
874 }
875
876 PyObject *result;
Greg Price7669cb82019-09-09 02:16:31 -0700877 bool nfc = false;
878 bool k = false;
Greg Price2f094132019-09-03 19:45:44 -0700879 QuickcheckResult m;
Max Bélanger2810dd72018-11-04 15:58:24 -0800880
881 PyObject *cmp;
882 int match = 0;
883
884 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700885 nfc = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800886 }
887 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700888 nfc = true;
889 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800890 }
891 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
892 /* matches default values for `nfc` and `k` */
893 }
894 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700895 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800896 }
897 else {
898 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
899 return NULL;
900 }
901
Victor Stinner920cb642020-10-26 19:19:36 +0100902 m = is_normalized_quickcheck(self, input, nfc, k, false);
Max Bélanger2810dd72018-11-04 15:58:24 -0800903
904 if (m == MAYBE) {
Victor Stinner920cb642020-10-26 19:19:36 +0100905 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
Max Bélanger2810dd72018-11-04 15:58:24 -0800906 if (cmp == NULL) {
907 return NULL;
908 }
909 match = PyUnicode_Compare(input, cmp);
910 Py_DECREF(cmp);
911 result = (match == 0) ? Py_True : Py_False;
912 }
913 else {
914 result = (m == YES) ? Py_True : Py_False;
915 }
916
917 Py_INCREF(result);
918 return result;
919}
920
921
922/*[clinic input]
Serhiy Storchaka63596412015-04-17 21:18:49 +0300923unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000924
Serhiy Storchaka63596412015-04-17 21:18:49 +0300925 self: self
Max Bélanger2810dd72018-11-04 15:58:24 -0800926 form: unicode
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300927 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300928 /
929
930Return the normal form 'form' for the Unicode string unistr.
931
932Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
933[clinic start generated code]*/
934
935static PyObject *
Max Bélanger2810dd72018-11-04 15:58:24 -0800936unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
Serhiy Storchaka63596412015-04-17 21:18:49 +0300937 PyObject *input)
Max Bélanger2810dd72018-11-04 15:58:24 -0800938/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000939{
Martin v. Löwis22970662011-09-29 13:39:38 +0200940 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000941 /* Special case empty input strings, since resizing
942 them later would cause internal errors. */
943 Py_INCREF(input);
944 return input;
945 }
946
Max Bélanger2810dd72018-11-04 15:58:24 -0800947 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100948 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200949 true, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000950 Py_INCREF(input);
951 return input;
952 }
Victor Stinner920cb642020-10-26 19:19:36 +0100953 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000954 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800955 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100956 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200957 true, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000958 Py_INCREF(input);
959 return input;
960 }
Victor Stinner920cb642020-10-26 19:19:36 +0100961 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000962 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800963 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100964 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200965 false, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000966 Py_INCREF(input);
967 return input;
968 }
Victor Stinner920cb642020-10-26 19:19:36 +0100969 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000970 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800971 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100972 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200973 false, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000974 Py_INCREF(input);
975 return input;
976 }
Victor Stinner920cb642020-10-26 19:19:36 +0100977 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000978 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000979 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
980 return NULL;
981}
982
Fredrik Lundh06d12682001-01-24 07:59:11 +0000983/* -------------------------------------------------------------------- */
984/* unicode character name tables */
985
986/* data file generated by Tools/unicode/makeunicodedata.py */
987#include "unicodename_db.h"
988
989/* -------------------------------------------------------------------- */
990/* database code (cut and pasted from the unidb package) */
991
/* Hash the first `len` bytes of `s`, upper-casing each byte with
 * Py_TOUPPER first.  After every step, any bits above the low 24 are
 * folded back into the low byte, so the result always fits in 24 bits.
 * `scale` is the multiplier applied to the running hash.
 */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        pos++;
        high = hash & 0xff000000;
        if (high) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
1006
/* Name fragments for Hangul syllables.  Each column is indexed on its own:
 * _getucname() concatenates [L][0], [V][1] and [T][2].  A column that is
 * shorter than the table is padded with NULL. */
static const char * const hangul_syllables[][3] = {
    /*  [.][0]  [.][1]  [.][2] */
    {   "G",    "A",    ""   },
    {   "GG",   "AE",   "G"  },
    {   "N",    "YA",   "GG" },
    {   "D",    "YAE",  "GS" },
    {   "DD",   "EO",   "N"  },
    {   "R",    "E",    "NJ" },
    {   "M",    "YEO",  "NH" },
    {   "B",    "YE",   "D"  },
    {   "BB",   "O",    "L"  },
    {   "S",    "WA",   "LG" },
    {   "SS",   "WAE",  "LM" },
    {   "",     "OE",   "LB" },
    {   "J",    "YO",   "LS" },
    {   "JJ",   "U",    "LT" },
    {   "C",    "WEO",  "LP" },
    {   "K",    "WE",   "LH" },
    {   "T",    "WI",   "M"  },
    {   "P",    "YU",   "B"  },
    {   "H",    "EU",   "BS" },
    {   NULL,   "YI",   "S"  },
    {   NULL,   "I",    "SS" },
    {   NULL,   NULL,   "NG" },
    {   NULL,   NULL,   "J"  },
    {   NULL,   NULL,   "C"  },
    {   NULL,   NULL,   "K"  },
    {   NULL,   NULL,   "T"  },
    {   NULL,   NULL,   "P"  },
    {   NULL,   NULL,   "H"  }
};
1037
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001038/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001039static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001040is_unified_ideograph(Py_UCS4 code)
1041{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001042 return
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001043 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1044 (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */
1045 (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001046 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -05001047 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -07001048 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001049 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1050 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001051}
1052
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: the start value is included, the end value is not.
 * The argument is parenthesized so that any expression can be passed, but
 * it is evaluated twice and therefore must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1058
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001059static int
Victor Stinner920cb642020-10-26 19:19:36 +01001060_getucname(PyObject *self,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001061 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001062{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001063 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001064 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1065 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001066 int offset;
1067 int i;
1068 int word;
Inada Naoki6fec9052019-04-17 08:40:34 +09001069 const unsigned char* w;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001070
Martin v. Löwisc3509122006-03-11 12:16:23 +00001071 if (code >= 0x110000)
1072 return 0;
1073
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001074 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001075 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1076 return 0;
1077
Victor Stinner920cb642020-10-26 19:19:36 +01001078 if (UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001079 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001080 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001081 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1082 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001083 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001084 if (old->category_changed == 0) {
1085 /* unassigned */
1086 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001087 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001088 }
1089
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001090 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 /* Hangul syllable. */
1092 int SIndex = code - SBase;
1093 int L = SIndex / NCount;
1094 int V = (SIndex % NCount) / TCount;
1095 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001096
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 if (buflen < 27)
1098 /* Worst case: HANGUL SYLLABLE <10chars>. */
1099 return 0;
1100 strcpy(buffer, "HANGUL SYLLABLE ");
1101 buffer += 16;
1102 strcpy(buffer, hangul_syllables[L][0]);
1103 buffer += strlen(hangul_syllables[L][0]);
1104 strcpy(buffer, hangul_syllables[V][1]);
1105 buffer += strlen(hangul_syllables[V][1]);
1106 strcpy(buffer, hangul_syllables[T][2]);
1107 buffer += strlen(hangul_syllables[T][2]);
1108 *buffer = '\0';
1109 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001110 }
1111
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001112 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001113 if (buflen < 28)
1114 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1115 return 0;
1116 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1117 return 1;
1118 }
1119
Fredrik Lundh06d12682001-01-24 07:59:11 +00001120 /* get offset into phrasebook */
1121 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1122 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1123 (code&((1<<phrasebook_shift)-1))];
1124 if (!offset)
1125 return 0;
1126
1127 i = 0;
1128
1129 for (;;) {
1130 /* get word index */
1131 word = phrasebook[offset] - phrasebook_short;
1132 if (word >= 0) {
1133 word = (word << 8) + phrasebook[offset+1];
1134 offset += 2;
1135 } else
1136 word = phrasebook[offset++];
1137 if (i) {
1138 if (i > buflen)
1139 return 0; /* buffer overflow */
1140 buffer[i++] = ' ';
1141 }
1142 /* copy word string from lexicon. the last character in the
1143 word has bit 7 set. the last word in a string ends with
1144 0x80 */
1145 w = lexicon + lexicon_offset[word];
1146 while (*w < 128) {
1147 if (i >= buflen)
1148 return 0; /* buffer overflow */
1149 buffer[i++] = *w++;
1150 }
1151 if (i >= buflen)
1152 return 0; /* buffer overflow */
1153 buffer[i++] = *w & 127;
1154 if (*w == 128)
1155 break; /* end of word */
1156 }
1157
1158 return 1;
1159}
1160
1161static int
Victor Stinner920cb642020-10-26 19:19:36 +01001162capi_getucname(Py_UCS4 code,
Victor Stinner47e1afd2020-10-26 16:43:47 +01001163 char* buffer, int buflen,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001164 int with_alias_and_seq)
1165{
Victor Stinner920cb642020-10-26 19:19:36 +01001166 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001167
1168}
1169
1170static int
Victor Stinner920cb642020-10-26 19:19:36 +01001171_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001172{
1173 /* check if code corresponds to the given name */
1174 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001175 char buffer[NAME_MAXLEN+1];
Victor Stinner920cb642020-10-26 19:19:36 +01001176 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001177 return 0;
1178 for (i = 0; i < namelen; i++) {
Jordon Xu2ec70102019-09-11 00:04:08 +08001179 if (Py_TOUPPER(name[i]) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001180 return 0;
1181 }
1182 return buffer[namelen] == '\0';
1183}
1184
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001186find_syllable(const char *str, int *len, int *pos, int count, int column)
1187{
1188 int i, len1;
1189 *len = -1;
1190 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001191 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001192 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001193 if (len1 <= *len)
1194 continue;
1195 if (strncmp(str, s, len1) == 0) {
1196 *len = len1;
1197 *pos = i;
1198 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001199 }
1200 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001201 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001202 }
1203}
1204
Fredrik Lundh06d12682001-01-24 07:59:11 +00001205static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001206_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001207{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001208 /* check if named sequences are allowed */
1209 if (!with_named_seq && IS_NAMED_SEQ(cp))
1210 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001211 /* if the code point is in the PUA range that we use for aliases,
1212 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001213 if (IS_ALIAS(cp))
1214 *code = name_aliases[cp-aliases_start];
1215 else
1216 *code = cp;
1217 return 1;
1218}
1219
1220static int
Victor Stinner920cb642020-10-26 19:19:36 +01001221_getcode(PyObject* self,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001222 const char* name, int namelen, Py_UCS4* code, int with_named_seq)
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001223{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001224 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001225 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001226 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001227 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001228 unsigned int h, v;
1229 unsigned int mask = code_size-1;
1230 unsigned int i, incr;
1231
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001232 /* Check for hangul syllables. */
1233 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001234 int len, L = -1, V = -1, T = -1;
1235 const char *pos = name + 16;
1236 find_syllable(pos, &len, &L, LCount, 0);
1237 pos += len;
1238 find_syllable(pos, &len, &V, VCount, 1);
1239 pos += len;
1240 find_syllable(pos, &len, &T, TCount, 2);
1241 pos += len;
1242 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1243 *code = SBase + (L*VCount+V)*TCount + T;
1244 return 1;
1245 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001246 /* Otherwise, it's an illegal syllable name. */
1247 return 0;
1248 }
1249
1250 /* Check for unified ideographs. */
1251 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1252 /* Four or five hexdigits must follow. */
1253 v = 0;
1254 name += 22;
1255 namelen -= 22;
1256 if (namelen != 4 && namelen != 5)
1257 return 0;
1258 while (namelen--) {
1259 v *= 16;
1260 if (*name >= '0' && *name <= '9')
1261 v += *name - '0';
1262 else if (*name >= 'A' && *name <= 'F')
1263 v += *name - 'A' + 10;
1264 else
1265 return 0;
1266 name++;
1267 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001268 if (!is_unified_ideograph(v))
1269 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001270 *code = v;
1271 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001272 }
1273
Fredrik Lundh06d12682001-01-24 07:59:11 +00001274 /* the following is the same as python's dictionary lookup, with
1275 only minor changes. see the makeunicodedata script for more
1276 details */
1277
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001278 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001279 i = (~h) & mask;
1280 v = code_hash[i];
1281 if (!v)
1282 return 0;
Victor Stinner920cb642020-10-26 19:19:36 +01001283 if (_cmpname(self, v, name, namelen)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001284 return _check_alias_and_seq(v, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001285 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001286 incr = (h ^ (h >> 3)) & mask;
1287 if (!incr)
1288 incr = mask;
1289 for (;;) {
1290 i = (i + incr) & mask;
1291 v = code_hash[i];
1292 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001293 return 0;
Victor Stinner920cb642020-10-26 19:19:36 +01001294 if (_cmpname(self, v, name, namelen)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001295 return _check_alias_and_seq(v, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001296 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001297 incr = incr << 1;
1298 if (incr > mask)
1299 incr = incr ^ code_poly;
1300 }
1301}
1302
Victor Stinnere6b8c522020-10-15 16:22:19 +02001303static int
Victor Stinner920cb642020-10-26 19:19:36 +01001304capi_getcode(const char* name, int namelen, Py_UCS4* code,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001305 int with_named_seq)
1306{
Victor Stinner920cb642020-10-26 19:19:36 +01001307 return _getcode(NULL, name, namelen, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001308
1309}
1310
Victor Stinner920cb642020-10-26 19:19:36 +01001311static const _PyUnicode_Name_CAPI unicodedata_capi =
1312{
1313 .getname = capi_getucname,
1314 .getcode = capi_getcode,
1315};
1316
1317
Fredrik Lundh06d12682001-01-24 07:59:11 +00001318/* -------------------------------------------------------------------- */
1319/* Python bindings */
1320
Serhiy Storchaka63596412015-04-17 21:18:49 +03001321/*[clinic input]
1322unicodedata.UCD.name
1323
1324 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001325 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001326 default: object=NULL
1327 /
1328
1329Returns the name assigned to the character chr as a string.
1330
1331If no name is defined, default is returned, or, if not given,
1332ValueError is raised.
1333[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001334
Fredrik Lundh06d12682001-01-24 07:59:11 +00001335static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001336unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001337/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001338{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001339 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001340 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001341
Victor Stinner920cb642020-10-26 19:19:36 +01001342 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001343 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001344 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001345 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 }
1347 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001348 Py_INCREF(default_value);
1349 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001350 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001351 }
1352
Walter Dörwald4254e762007-06-05 16:04:09 +00001353 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001354}
1355
Serhiy Storchaka63596412015-04-17 21:18:49 +03001356/*[clinic input]
1357unicodedata.UCD.lookup
1358
1359 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001360 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001361 /
1362
1363Look up character by name.
1364
1365If a character with the given name is found, return the
1366corresponding character. If not found, KeyError is raised.
1367[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001368
Fredrik Lundh06d12682001-01-24 07:59:11 +00001369static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001370unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1371 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001372/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001373{
1374 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001375 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001376 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001377 PyErr_SetString(PyExc_KeyError, "name too long");
1378 return NULL;
1379 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001380
Victor Stinner920cb642020-10-26 19:19:36 +01001381 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001382 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001383 return NULL;
1384 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001385 /* check if code is in the PUA range that we use for named sequences
1386 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001387 if (IS_NAMED_SEQ(code)) {
1388 index = code-named_sequences_start;
1389 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1390 named_sequences[index].seq,
1391 named_sequences[index].seqlen);
1392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001394}
1395
// List of functions used to define module functions *AND* unicodedata.UCD
// methods. For module functions, self is the module. For UCD methods, self
// is a UCD instance. The UCD_Check() macro is used to check if self is
// a UCD instance.
//
// The same table is referenced twice in this file: as the Py_tp_methods
// slot of the UCD type and as m_methods of the module definition. Each
// *_METHODDEF macro supplies one table entry.
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}                /* sentinel */
};
1416
Victor Stinnerc8c42002020-10-26 23:19:22 +01001417static void
1418ucd_dealloc(PreviousDBVersion *self)
1419{
1420 PyTypeObject *tp = Py_TYPE(self);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001421 PyObject_Free(self);
Victor Stinnerc8c42002020-10-26 23:19:22 +01001422 Py_DECREF(tp);
1423}
1424
/* Slots of the unicodedata.UCD type (referenced by ucd_type_spec).
   Methods come from the table shared with the module-level functions;
   the list is terminated by a {0, 0} entry. */
static PyType_Slot ucd_type_slots[] = {
    {Py_tp_dealloc, ucd_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_methods, unicodedata_functions},
    {Py_tp_members, DB_members},
    {0, 0}
};
1432
/* Specification of the unicodedata.UCD type; unicodedata_exec() passes it
   to PyType_FromSpec() to create the type object.  Instances are
   PreviousDBVersion structs. */
static PyType_Spec ucd_type_spec = {
    .name = "unicodedata.UCD",
    .basicsize = sizeof(PreviousDBVersion),
    .flags = Py_TPFLAGS_DEFAULT,
    .slots = ucd_type_slots
};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001439
/* Module docstring (used as m_doc of unicodedata_module).  UNIDATA_VERSION
   is spliced in twice through adjacent string-literal concatenation. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001448
Victor Stinner920cb642020-10-26 19:19:36 +01001449static int
1450unicodedata_exec(PyObject *module)
1451{
Victor Stinner920cb642020-10-26 19:19:36 +01001452 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1453 return -1;
1454 }
1455
Victor Stinnerc8c42002020-10-26 23:19:22 +01001456 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1457 if (ucd_type == NULL) {
1458 return -1;
1459 }
1460
Victor Stinner920cb642020-10-26 19:19:36 +01001461 if (PyModule_AddType(module, ucd_type) < 0) {
Victor Stinnerc8c42002020-10-26 23:19:22 +01001462 Py_DECREF(ucd_type);
Victor Stinner920cb642020-10-26 19:19:36 +01001463 return -1;
1464 }
1465
Victor Stinner84f73822020-10-27 04:36:22 +01001466 // Unicode database version 3.2.0 used by the IDNA encoding
Victor Stinner920cb642020-10-26 19:19:36 +01001467 PyObject *v;
1468 v = new_previous_version(ucd_type, "3.2.0",
1469 get_change_3_2_0, normalization_3_2_0);
Victor Stinnerc8c42002020-10-26 23:19:22 +01001470 Py_DECREF(ucd_type);
Victor Stinner920cb642020-10-26 19:19:36 +01001471 if (v == NULL) {
1472 return -1;
1473 }
1474 if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1475 Py_DECREF(v);
1476 return -1;
1477 }
1478
1479 /* Export C API */
1480 v = PyCapsule_New((void *)&unicodedata_capi, PyUnicodeData_CAPSULE_NAME,
1481 NULL);
1482 if (v == NULL) {
1483 return -1;
1484 }
Victor Stinner84f73822020-10-27 04:36:22 +01001485 if (PyModule_AddObject(module, "_ucnhash_CAPI", v) < 0) {
Victor Stinner920cb642020-10-26 19:19:36 +01001486 Py_DECREF(v);
1487 return -1;
1488 }
1489 return 0;
1490}
1491
/* Module definition slots: unicodedata_exec() is registered as the
   Py_mod_exec function.  The {0, NULL} entry terminates the list. */
static PyModuleDef_Slot unicodedata_slots[] = {
    {Py_mod_exec, unicodedata_exec},
    {0, NULL}
};
1496
/* Definition of the unicodedata module, handed to PyModuleDef_Init() by
   PyInit_unicodedata().  The module functions come from the same table
   that provides the UCD type's methods. */
static struct PyModuleDef unicodedata_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "unicodedata",
    .m_doc = unicodedata_docstring,
    .m_size = 0,
    .m_methods = unicodedata_functions,
    .m_slots = unicodedata_slots,
};
Victor Stinner920cb642020-10-26 19:19:36 +01001505
/* Module initialization entry point: returns the result of
   PyModuleDef_Init() on unicodedata_module.  Populating the module is
   left to the Py_mod_exec slot (unicodedata_exec). */
PyMODINIT_FUNC
PyInit_unicodedata(void)
{
    return PyModuleDef_Init(&unicodedata_module);
}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001511
Victor Stinner920cb642020-10-26 19:19:36 +01001512
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001514Local variables:
1515c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001516indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001517End:
1518*/