blob: aebae7da576561f9b10f59d33ca082328fb72f7c [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010019#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
Victor Stinner4a21e572020-04-15 02:35:41 +020020#include "structmember.h" // PyMemberDef
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Greg Price2f094132019-09-03 19:45:44 -070022#include <stdbool.h>
23
/* Identifier declarations for the names of the four Unicode normalization
   forms.
   NOTE(review): presumably consumed by the normalization entry points
   further down in this file -- confirm against their call sites. */
_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
_Py_IDENTIFIER(NFKD);
28
Larry Hastings61272b72014-01-07 12:41:53 -080029/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080030module unicodedata
Victor Stinnerc8c42002020-10-26 23:19:22 +010031class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
Larry Hastings61272b72014-01-07 12:41:53 -080032[clinic start generated code]*/
Victor Stinnerc8c42002020-10-26 23:19:22 +010033/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080034
Fredrik Lundh06d12682001-01-24 07:59:11 +000035/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000036
/* One record of the generated character database.  Every member is a
   single byte; several of them are indexes into name tables rather than
   the property value itself.  The member order must not change: the
   records are emitted by the generator as positional initializers. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
48
/* Describes how one code point differs between an older database version
   and the current one.

   Conventions used by the accessors in this file:
     - a byte member equal to 0xFF means "no override recorded";
     - category_changed == 0 means the code point is treated as
       unassigned in the older version.

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
58
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059/* data file generated by Tools/unicode/makeunicodedata.py */
60#include "unicodedata_db.h"
61
62static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000063_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000065 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000066 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000067 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74}
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076/* ------------- Previous-version API ------------------------------------- */
77typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82} PreviousDBVersion;
83
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030084#include "clinic/unicodedata.c.h"
85
/* Fetch the change record of code point `v` from the old-version object
   `self`.  `self` is cast to PreviousDBVersion* without any check, so the
   caller must already know it is such an object (see UCD_Check).
   The macro parameter is parenthesized so that an expression argument is
   cast as a whole rather than only its first operand. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
87
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090 {NULL}
91};
92
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
// The argument is parenthesized in the comparison so that an expression
// argument is compared as a whole.  It is still expanded twice, so it must
// be free of side effects.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000098
99static PyObject*
Victor Stinner920cb642020-10-26 19:19:36 +0100100new_previous_version(PyTypeObject *ucd_type,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200101 const char*name, const change_record* (*getrecord)(Py_UCS4),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000102 Py_UCS4 (*normalization)(Py_UCS4))
103{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200104 PreviousDBVersion *self;
Victor Stinner920cb642020-10-26 19:19:36 +0100105 self = PyObject_New(PreviousDBVersion, ucd_type);
Victor Stinnere6b8c522020-10-15 16:22:19 +0200106 if (self == NULL)
107 return NULL;
108 self->name = name;
109 self->getrecord = getrecord;
110 self->normalization = normalization;
111 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000112}
113
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000114
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000115/* --- Module API --------------------------------------------------------- */
116
Larry Hastings61272b72014-01-07 12:41:53 -0800117/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800118unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700119
Serhiy Storchaka63596412015-04-17 21:18:49 +0300120 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700121 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700122 default: object=NULL
123 /
124
125Converts a Unicode character into its equivalent decimal value.
126
Serhiy Storchaka63596412015-04-17 21:18:49 +0300127Returns the decimal value assigned to the character chr as integer.
128If no such value is defined, default is returned, or, if not given,
129ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800130[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700131
Larry Hastings31826802013-10-19 00:09:25 -0700132static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300133unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400134 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700135/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700136{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000137 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000138 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300139 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000140
Victor Stinner920cb642020-10-26 19:19:36 +0100141 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000142 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000143 if (old->category_changed == 0) {
144 /* unassigned */
145 have_old = 1;
146 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000148 else if (old->decimal_changed != 0xFF) {
149 have_old = 1;
150 rc = old->decimal_changed;
151 }
152 }
153
154 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000155 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700157 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 PyErr_SetString(PyExc_ValueError,
159 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000160 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 }
162 else {
Larry Hastings31826802013-10-19 00:09:25 -0700163 Py_INCREF(default_value);
164 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000165 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000166 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000167 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168}
169
Serhiy Storchaka63596412015-04-17 21:18:49 +0300170/*[clinic input]
171unicodedata.UCD.digit
172
173 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700174 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300175 default: object=NULL
176 /
177
178Converts a Unicode character into its equivalent digit value.
179
180Returns the digit value assigned to the character chr as integer.
181If no such value is defined, default is returned, or, if not given,
182ValueError is raised.
183[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000184
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300186unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700187/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000188{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300190 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000191 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300193 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000195 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 }
197 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300198 Py_INCREF(default_value);
199 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000202 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203}
204
Serhiy Storchaka63596412015-04-17 21:18:49 +0300205/*[clinic input]
206unicodedata.UCD.numeric
207
208 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700209 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300210 default: object=NULL
211 /
212
213Converts a Unicode character into its equivalent numeric value.
214
215Returns the numeric value assigned to the character chr as float.
216If no such value is defined, default is returned, or, if not given,
217ValueError is raised.
218[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000219
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000220static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300221unicodedata_UCD_numeric_impl(PyObject *self, int chr,
222 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700223/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000224{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000226 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300227 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000228
Victor Stinner920cb642020-10-26 19:19:36 +0100229 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000231 if (old->category_changed == 0) {
232 /* unassigned */
233 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000236 else if (old->decimal_changed != 0xFF) {
237 have_old = 1;
238 rc = old->decimal_changed;
239 }
240 }
241
242 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000243 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000244 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300245 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000246 PyErr_SetString(PyExc_ValueError, "not a numeric character");
247 return NULL;
248 }
249 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300250 Py_INCREF(default_value);
251 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253 }
254 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000255}
256
Serhiy Storchaka63596412015-04-17 21:18:49 +0300257/*[clinic input]
258unicodedata.UCD.category
259
260 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700261 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300262 /
263
264Returns the general category assigned to the character chr as string.
265[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000266
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000267static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300268unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700269/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000271 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300272 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000273 index = (int) _getrecord_ex(c)->category;
Victor Stinner920cb642020-10-26 19:19:36 +0100274 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000275 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000276 if (old->category_changed != 0xFF)
277 index = old->category_changed;
278 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000279 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280}
281
Serhiy Storchaka63596412015-04-17 21:18:49 +0300282/*[clinic input]
283unicodedata.UCD.bidirectional
284
285 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700286 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300287 /
288
289Returns the bidirectional class assigned to the character chr as string.
290
291If no such value is defined, an empty string is returned.
292[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000293
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300295unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700296/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000298 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300299 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000300 index = (int) _getrecord_ex(c)->bidirectional;
Victor Stinner920cb642020-10-26 19:19:36 +0100301 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000302 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000303 if (old->category_changed == 0)
304 index = 0; /* unassigned */
305 else if (old->bidir_changed != 0xFF)
306 index = old->bidir_changed;
307 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000308 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000309}
310
Serhiy Storchaka63596412015-04-17 21:18:49 +0300311/*[clinic input]
312unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000313
Serhiy Storchaka63596412015-04-17 21:18:49 +0300314 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700315 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300316 /
317
318Returns the canonical combining class assigned to the character chr as integer.
319
320Returns 0 if no combining class is defined.
321[clinic start generated code]*/
322
323static int
324unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700325/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000327 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300328 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000329 index = (int) _getrecord_ex(c)->combining;
Victor Stinner920cb642020-10-26 19:19:36 +0100330 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000331 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000332 if (old->category_changed == 0)
333 index = 0; /* unassigned */
334 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300335 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000336}
337
Serhiy Storchaka63596412015-04-17 21:18:49 +0300338/*[clinic input]
339unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000340
Serhiy Storchaka63596412015-04-17 21:18:49 +0300341 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700342 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300343 /
344
345Returns the mirrored property assigned to the character chr as integer.
346
347Returns 1 if the character has been identified as a "mirrored"
348character in bidirectional text, 0 otherwise.
349[clinic start generated code]*/
350
351static int
352unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700353/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000354{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300356 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000357 index = (int) _getrecord_ex(c)->mirrored;
Victor Stinner920cb642020-10-26 19:19:36 +0100358 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000359 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000360 if (old->category_changed == 0)
361 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000362 else if (old->mirrored_changed != 0xFF)
363 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000364 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300365 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000366}
367
Serhiy Storchaka63596412015-04-17 21:18:49 +0300368/*[clinic input]
369unicodedata.UCD.east_asian_width
370
371 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700372 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300373 /
374
375Returns the east asian width assigned to the character chr as string.
376[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000377
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000378static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300379unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700380/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000381{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000382 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300383 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000384 index = (int) _getrecord_ex(c)->east_asian_width;
Victor Stinner920cb642020-10-26 19:19:36 +0100385 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000386 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000387 if (old->category_changed == 0)
388 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700389 else if (old->east_asian_width_changed != 0xFF)
390 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000391 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000392 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000393}
394
Serhiy Storchaka63596412015-04-17 21:18:49 +0300395/*[clinic input]
396unicodedata.UCD.decomposition
397
398 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700399 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300400 /
401
402Returns the character decomposition mapping assigned to the character chr as string.
403
404An empty string is returned in case no such mapping is defined.
405[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000406
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000407static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300408unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700409/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000410{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000411 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000412 int code, index, count;
413 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000414 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300415 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000417 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000418
Victor Stinner920cb642020-10-26 19:19:36 +0100419 if (UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000420 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000422 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000423 }
424
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000425 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000426 index = 0;
427 else {
428 index = decomp_index1[(code>>DECOMP_SHIFT)];
429 index = decomp_index2[(index<<DECOMP_SHIFT)+
430 (code&((1<<DECOMP_SHIFT)-1))];
431 }
432
Tim Peters69b83b12001-11-30 07:23:05 +0000433 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000434 is prefix code (from*/
435 count = decomp_data[index] >> 8;
436
437 /* XXX: could allocate the PyString up front instead
438 (strlen(prefix) + 5 * count + 1 bytes) */
439
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000440 /* Based on how index is calculated above and decomp_data is generated
441 from Tools/unicode/makeunicodedata.py, it should not be possible
442 to overflow decomp_prefix. */
443 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200444 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000445
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000447 i = strlen(decomp_prefix[prefix_index]);
448 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000449
450 while (count-- > 0) {
451 if (i)
452 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000453 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000454 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
455 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000456 i += strlen(decomp + i);
457 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000458 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000459}
460
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000461static void
Victor Stinner920cb642020-10-26 19:19:36 +0100462get_decomp_record(PyObject *self, Py_UCS4 code,
463 int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000465 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000466 *index = 0;
Victor Stinner920cb642020-10-26 19:19:36 +0100467 }
468 else if (UCD_Check(self)
469 && get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000470 /* unassigned in old version */
471 *index = 0;
472 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000473 else {
474 *index = decomp_index1[(code>>DECOMP_SHIFT)];
475 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
476 (code&((1<<DECOMP_SHIFT)-1))];
477 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000478
Martin v. Löwis677bde22002-11-23 22:08:15 +0000479 /* high byte is number of hex bytes (usually one or two), low byte
480 is prefix code (from*/
481 *count = decomp_data[*index] >> 8;
482 *prefix = decomp_data[*index] & 255;
483
484 (*index)++;
485}
486
/* Constants for the arithmetic decomposition of Hangul syllables.
   A syllable S in [SBase, SBase+SCount) splits into a leading part
   (LBase + ...), a vowel part (VBase + ...) and an optional trailing
   part (TBase + ...). */
#define SBase   0xAC00              /* first syllable code point */
#define LBase   0x1100              /* base of the leading parts */
#define VBase   0x1161              /* base of the vowel parts */
#define TBase   0x11A7              /* base of the trailing parts */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading part */
#define SCount  (LCount*NCount)     /* total number of syllables */
496
497static PyObject*
Victor Stinner920cb642020-10-26 19:19:36 +0100498nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000499{
500 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200501 Py_UCS4 *output;
502 Py_ssize_t i, o, osize;
503 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300504 const void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200506 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 Py_ssize_t space, isize;
508 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000509 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000510
Martin v. Löwis677bde22002-11-23 22:08:15 +0000511 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200512 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500513 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300514 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500515 if (space > 10) {
516 if (space <= PY_SSIZE_T_MAX - 10)
517 space += 10;
518 }
519 else {
520 space *= 2;
521 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200522 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500523 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200524 if (!output) {
525 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000526 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200527 }
528 i = o = 0;
529 kind = PyUnicode_KIND(input);
530 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000531
Martin v. Löwis22970662011-09-29 13:39:38 +0200532 while (i < isize) {
533 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000534 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200535 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000536 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300537 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000538 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000539 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200540 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000541 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000542 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
543 if (new_output == NULL) {
544 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200545 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200547 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000548 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000549 }
550 /* Hangul Decomposition. */
551 if (SBase <= code && code < (SBase+SCount)) {
552 int SIndex = code - SBase;
553 int L = LBase + SIndex / NCount;
554 int V = VBase + (SIndex % NCount) / TCount;
555 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200556 output[o++] = L;
557 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000558 space -= 2;
559 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200560 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000561 space --;
562 }
563 continue;
564 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000565 /* normalization changes */
Victor Stinner920cb642020-10-26 19:19:36 +0100566 if (UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000567 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
568 if (value != 0) {
569 stack[stackptr++] = value;
570 continue;
571 }
572 }
573
574 /* Other decompositions. */
Victor Stinner920cb642020-10-26 19:19:36 +0100575 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000576
577 /* Copy character if it is not decomposable, or has a
578 compatibility decomposition, but we do NFD. */
579 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200580 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000581 space--;
582 continue;
583 }
584 /* Copy decomposition onto the stack, in reverse
585 order. */
586 while(count) {
587 code = decomp_data[index + (--count)];
588 stack[stackptr++] = code;
589 }
590 }
591 }
592
Martin v. Löwis22970662011-09-29 13:39:38 +0200593 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
594 output, o);
595 PyMem_Free(output);
596 if (!result)
597 return NULL;
598 /* result is guaranteed to be ready, as it is compact. */
599 kind = PyUnicode_KIND(result);
600 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000601
602 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200603 i = 0;
604 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
605 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
606 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000607 if (prev == 0 || cur == 0 || prev <= cur) {
608 prev = cur;
609 continue;
610 }
611 /* Non-canonical order. Need to switch *i with previous. */
612 o = i - 1;
613 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200614 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
615 PyUnicode_WRITE(kind, data, o+1,
616 PyUnicode_READ(kind, data, o));
617 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000618 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200619 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200621 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000622 if (prev == 0 || prev <= cur)
623 break;
624 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200625 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000626 }
627 return result;
628}
629
630static int
Andy Lester982307b2020-03-17 11:38:12 -0500631find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000632{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200633 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200635 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000636 if (code < start)
637 return -1;
638 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200639 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 return nfc[index].index + delta;
641 }
642 }
643 return -1;
644}
645
/* Compose `input`: decompose it with nfd_nfkd(self, input, k) and then
 * recombine characters of the decomposed string into composed characters.
 *
 * `k` is forwarded unchanged to nfd_nfkd(), which selects between the
 * canonical and the compatibility decomposition.
 *
 * Returns a new reference to the resulting string, or NULL with an
 * exception set if the decomposition or an allocation fails.  When no
 * character was combined, the decomposed string itself is returned.
 */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    const void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions in the decomposed string that were already merged into an
       earlier character and must not be emitted again. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    /* i indexes the decomposed string, o indexes the output buffer. */
    i = o = 0;

  again:
    while (i < len) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            /* The comparison with TBase is strict: a character equal to
               TBase itself is not accepted as a trailing consonant. */
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(nfc_first, code);
        if (f == -1) {
            /* Not listed in nfc_first, so it cannot be the first half of
               a composition: copy it through unchanged. */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        /* comb holds the combining class of the last character that could
           not be combined; 0 means no such character has been seen yet. */
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
            not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* Two-level table lookup of the composition of (f, l);
               a zero entry means the pair does not compose. */
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* The composed character may itself start a further
               composition; keep scanning only if it can. */
            f = find_nfc_index(nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string.
           (That is the decomposed string held in result.) */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000784
/* Outcome of the normalization quickcheck.
 *
 * This needs to match the logic in makeunicodedata.py, which constructs
 * the quickcheck data. */
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
Max Bélanger2810dd72018-11-04 15:58:24 -0800788
Greg Price2f094132019-09-03 19:45:44 -0700789/* Run the Unicode normalization "quickcheck" algorithm.
790 *
791 * Return YES or NO if quickcheck determines the input is certainly
792 * normalized or certainly not, and MAYBE if quickcheck is unable to
793 * tell.
794 *
795 * If `yes_only` is true, then return MAYBE as soon as we determine
796 * the answer is not YES.
797 *
798 * For background and details on the algorithm, see UAX #15:
799 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
800 */
801static QuickcheckResult
Victor Stinner920cb642020-10-26 19:19:36 +0100802is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
803 bool yes_only)
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000804{
Victor Stinner920cb642020-10-26 19:19:36 +0100805 /* UCD 3.2.0 is requested, quickchecks must be disabled. */
806 if (UCD_Check(self)) {
Max Bélanger2810dd72018-11-04 15:58:24 -0800807 return NO;
Victor Stinner920cb642020-10-26 19:19:36 +0100808 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000809
Greg Price2f094132019-09-03 19:45:44 -0700810 Py_ssize_t i, len;
811 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300812 const void *data;
Greg Price2f094132019-09-03 19:45:44 -0700813 unsigned char prev_combining = 0;
814
815 /* The two quickcheck bits at this shift have type QuickcheckResult. */
816 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
817
818 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000819
Martin v. Löwis22970662011-09-29 13:39:38 +0200820 i = 0;
821 kind = PyUnicode_KIND(input);
822 data = PyUnicode_DATA(input);
823 len = PyUnicode_GET_LENGTH(input);
824 while (i < len) {
825 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
826 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000827
Greg Price2f094132019-09-03 19:45:44 -0700828 unsigned char combining = record->combining;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000829 if (combining && prev_combining > combining)
Max Bélanger2810dd72018-11-04 15:58:24 -0800830 return NO; /* non-canonical sort order, not normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000831 prev_combining = combining;
Greg Price2f094132019-09-03 19:45:44 -0700832
833 unsigned char quickcheck_whole = record->normalization_quick_check;
834 if (yes_only) {
835 if (quickcheck_whole & (3 << quickcheck_shift))
836 return MAYBE;
837 } else {
838 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
839 case NO:
840 return NO;
841 case MAYBE:
842 result = MAYBE; /* this string might need normalization */
843 }
844 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000845 }
Greg Price2f094132019-09-03 19:45:44 -0700846 return result;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000847}
848
Serhiy Storchaka63596412015-04-17 21:18:49 +0300849/*[clinic input]
Max Bélanger2810dd72018-11-04 15:58:24 -0800850unicodedata.UCD.is_normalized
851
852 self: self
853 form: unicode
854 unistr as input: unicode
855 /
856
857Return whether the Unicode string unistr is in the normal form 'form'.
858
859Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
860[clinic start generated code]*/
861
862static PyObject *
863unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
864 PyObject *input)
865/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
866{
867 if (PyUnicode_READY(input) == -1) {
868 return NULL;
869 }
870
871 if (PyUnicode_GET_LENGTH(input) == 0) {
872 /* special case empty input strings. */
873 Py_RETURN_TRUE;
874 }
875
876 PyObject *result;
Greg Price7669cb82019-09-09 02:16:31 -0700877 bool nfc = false;
878 bool k = false;
Greg Price2f094132019-09-03 19:45:44 -0700879 QuickcheckResult m;
Max Bélanger2810dd72018-11-04 15:58:24 -0800880
881 PyObject *cmp;
882 int match = 0;
883
884 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700885 nfc = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800886 }
887 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700888 nfc = true;
889 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800890 }
891 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
892 /* matches default values for `nfc` and `k` */
893 }
894 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700895 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800896 }
897 else {
898 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
899 return NULL;
900 }
901
Victor Stinner920cb642020-10-26 19:19:36 +0100902 m = is_normalized_quickcheck(self, input, nfc, k, false);
Max Bélanger2810dd72018-11-04 15:58:24 -0800903
904 if (m == MAYBE) {
Victor Stinner920cb642020-10-26 19:19:36 +0100905 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
Max Bélanger2810dd72018-11-04 15:58:24 -0800906 if (cmp == NULL) {
907 return NULL;
908 }
909 match = PyUnicode_Compare(input, cmp);
910 Py_DECREF(cmp);
911 result = (match == 0) ? Py_True : Py_False;
912 }
913 else {
914 result = (m == YES) ? Py_True : Py_False;
915 }
916
917 Py_INCREF(result);
918 return result;
919}
920
921
922/*[clinic input]
Serhiy Storchaka63596412015-04-17 21:18:49 +0300923unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000924
Serhiy Storchaka63596412015-04-17 21:18:49 +0300925 self: self
Max Bélanger2810dd72018-11-04 15:58:24 -0800926 form: unicode
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300927 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300928 /
929
930Return the normal form 'form' for the Unicode string unistr.
931
932Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
933[clinic start generated code]*/
934
935static PyObject *
Max Bélanger2810dd72018-11-04 15:58:24 -0800936unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
Serhiy Storchaka63596412015-04-17 21:18:49 +0300937 PyObject *input)
Max Bélanger2810dd72018-11-04 15:58:24 -0800938/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000939{
Martin v. Löwis22970662011-09-29 13:39:38 +0200940 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000941 /* Special case empty input strings, since resizing
942 them later would cause internal errors. */
943 Py_INCREF(input);
944 return input;
945 }
946
Max Bélanger2810dd72018-11-04 15:58:24 -0800947 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100948 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200949 true, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000950 Py_INCREF(input);
951 return input;
952 }
Victor Stinner920cb642020-10-26 19:19:36 +0100953 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000954 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800955 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100956 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200957 true, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000958 Py_INCREF(input);
959 return input;
960 }
Victor Stinner920cb642020-10-26 19:19:36 +0100961 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000962 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800963 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100964 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200965 false, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000966 Py_INCREF(input);
967 return input;
968 }
Victor Stinner920cb642020-10-26 19:19:36 +0100969 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000970 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800971 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Victor Stinner920cb642020-10-26 19:19:36 +0100972 if (is_normalized_quickcheck(self, input,
Victor Stinnere6b8c522020-10-15 16:22:19 +0200973 false, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000974 Py_INCREF(input);
975 return input;
976 }
Victor Stinner920cb642020-10-26 19:19:36 +0100977 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000978 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000979 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
980 return NULL;
981}
982
Fredrik Lundh06d12682001-01-24 07:59:11 +0000983/* -------------------------------------------------------------------- */
984/* unicode character name tables */
985
986/* data file generated by Tools/unicode/makeunicodedata.py */
987#include "unicodename_db.h"
988
989/* -------------------------------------------------------------------- */
990/* database code (cut and pasted from the unidb package) */
991
/* Hash the first `len` characters of `s`, ignoring case.
 *
 * Each character is upper-cased with Py_TOUPPER and mixed in as
 * hash = hash * scale + ch.  Whenever bits 24-31 of the running value
 * become set, they are XOR-ed into the low byte and the value is cut
 * back to 24 bits.
 */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int remaining;

    for (remaining = len; remaining > 0; remaining--, s++) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(*s);
        high = hash & 0xff000000;
        if (high) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
1006
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
 * Column 0 has 19 entries, column 1 has 21 and column 2 has 28; the unused
 * slots of the shorter columns are NULL. */
static const char * const hangul_syllables[][3] = {
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
1037
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001038/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001039static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001040is_unified_ideograph(Py_UCS4 code)
1041{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001042 return
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001043 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1044 (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */
1045 (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001046 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -05001047 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -07001048 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001049 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1050 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001051}
1052
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * The argument is parenthesized in the expansion so that an arbitrary
 * expression can be passed safely.  Note that it is evaluated twice, so it
 * must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1058
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001059static int
Victor Stinner920cb642020-10-26 19:19:36 +01001060_getucname(PyObject *self,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001061 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001062{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001063 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001064 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1065 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001066 int offset;
1067 int i;
1068 int word;
Inada Naoki6fec9052019-04-17 08:40:34 +09001069 const unsigned char* w;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001070
Martin v. Löwisc3509122006-03-11 12:16:23 +00001071 if (code >= 0x110000)
1072 return 0;
1073
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001074 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001075 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1076 return 0;
1077
Victor Stinner920cb642020-10-26 19:19:36 +01001078 if (UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001079 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001080 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001081 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1082 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001083 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001084 if (old->category_changed == 0) {
1085 /* unassigned */
1086 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001087 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001088 }
1089
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001090 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 /* Hangul syllable. */
1092 int SIndex = code - SBase;
1093 int L = SIndex / NCount;
1094 int V = (SIndex % NCount) / TCount;
1095 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001096
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 if (buflen < 27)
1098 /* Worst case: HANGUL SYLLABLE <10chars>. */
1099 return 0;
1100 strcpy(buffer, "HANGUL SYLLABLE ");
1101 buffer += 16;
1102 strcpy(buffer, hangul_syllables[L][0]);
1103 buffer += strlen(hangul_syllables[L][0]);
1104 strcpy(buffer, hangul_syllables[V][1]);
1105 buffer += strlen(hangul_syllables[V][1]);
1106 strcpy(buffer, hangul_syllables[T][2]);
1107 buffer += strlen(hangul_syllables[T][2]);
1108 *buffer = '\0';
1109 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001110 }
1111
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001112 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001113 if (buflen < 28)
1114 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1115 return 0;
1116 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1117 return 1;
1118 }
1119
Fredrik Lundh06d12682001-01-24 07:59:11 +00001120 /* get offset into phrasebook */
1121 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1122 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1123 (code&((1<<phrasebook_shift)-1))];
1124 if (!offset)
1125 return 0;
1126
1127 i = 0;
1128
1129 for (;;) {
1130 /* get word index */
1131 word = phrasebook[offset] - phrasebook_short;
1132 if (word >= 0) {
1133 word = (word << 8) + phrasebook[offset+1];
1134 offset += 2;
1135 } else
1136 word = phrasebook[offset++];
1137 if (i) {
1138 if (i > buflen)
1139 return 0; /* buffer overflow */
1140 buffer[i++] = ' ';
1141 }
1142 /* copy word string from lexicon. the last character in the
1143 word has bit 7 set. the last word in a string ends with
1144 0x80 */
1145 w = lexicon + lexicon_offset[word];
1146 while (*w < 128) {
1147 if (i >= buflen)
1148 return 0; /* buffer overflow */
1149 buffer[i++] = *w++;
1150 }
1151 if (i >= buflen)
1152 return 0; /* buffer overflow */
1153 buffer[i++] = *w & 127;
1154 if (*w == 128)
1155 break; /* end of word */
1156 }
1157
1158 return 1;
1159}
1160
1161static int
Victor Stinner920cb642020-10-26 19:19:36 +01001162capi_getucname(Py_UCS4 code,
Victor Stinner47e1afd2020-10-26 16:43:47 +01001163 char* buffer, int buflen,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001164 int with_alias_and_seq)
1165{
Victor Stinner920cb642020-10-26 19:19:36 +01001166 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001167
1168}
1169
1170static int
Victor Stinner920cb642020-10-26 19:19:36 +01001171_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001172{
1173 /* check if code corresponds to the given name */
1174 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001175 char buffer[NAME_MAXLEN+1];
Victor Stinner920cb642020-10-26 19:19:36 +01001176 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001177 return 0;
1178 for (i = 0; i < namelen; i++) {
Jordon Xu2ec70102019-09-11 00:04:08 +08001179 if (Py_TOUPPER(name[i]) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001180 return 0;
1181 }
1182 return buffer[namelen] == '\0';
1183}
1184
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001185static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001186find_syllable(const char *str, int *len, int *pos, int count, int column)
1187{
1188 int i, len1;
1189 *len = -1;
1190 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001191 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001192 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001193 if (len1 <= *len)
1194 continue;
1195 if (strncmp(str, s, len1) == 0) {
1196 *len = len1;
1197 *pos = i;
1198 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001199 }
1200 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001201 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001202 }
1203}
1204
Fredrik Lundh06d12682001-01-24 07:59:11 +00001205static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001206_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001207{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001208 /* check if named sequences are allowed */
1209 if (!with_named_seq && IS_NAMED_SEQ(cp))
1210 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001211 /* if the code point is in the PUA range that we use for aliases,
1212 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001213 if (IS_ALIAS(cp))
1214 *code = name_aliases[cp-aliases_start];
1215 else
1216 *code = cp;
1217 return 1;
1218}
1219
1220static int
Victor Stinner920cb642020-10-26 19:19:36 +01001221_getcode(PyObject* self,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001222 const char* name, int namelen, Py_UCS4* code, int with_named_seq)
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001223{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001224 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001225 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001226 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001227 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001228 unsigned int h, v;
1229 unsigned int mask = code_size-1;
1230 unsigned int i, incr;
1231
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001232 /* Check for hangul syllables. */
1233 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001234 int len, L = -1, V = -1, T = -1;
1235 const char *pos = name + 16;
1236 find_syllable(pos, &len, &L, LCount, 0);
1237 pos += len;
1238 find_syllable(pos, &len, &V, VCount, 1);
1239 pos += len;
1240 find_syllable(pos, &len, &T, TCount, 2);
1241 pos += len;
1242 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1243 *code = SBase + (L*VCount+V)*TCount + T;
1244 return 1;
1245 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001246 /* Otherwise, it's an illegal syllable name. */
1247 return 0;
1248 }
1249
1250 /* Check for unified ideographs. */
1251 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1252 /* Four or five hexdigits must follow. */
1253 v = 0;
1254 name += 22;
1255 namelen -= 22;
1256 if (namelen != 4 && namelen != 5)
1257 return 0;
1258 while (namelen--) {
1259 v *= 16;
1260 if (*name >= '0' && *name <= '9')
1261 v += *name - '0';
1262 else if (*name >= 'A' && *name <= 'F')
1263 v += *name - 'A' + 10;
1264 else
1265 return 0;
1266 name++;
1267 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001268 if (!is_unified_ideograph(v))
1269 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001270 *code = v;
1271 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001272 }
1273
Fredrik Lundh06d12682001-01-24 07:59:11 +00001274 /* the following is the same as python's dictionary lookup, with
1275 only minor changes. see the makeunicodedata script for more
1276 details */
1277
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001278 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001279 i = (~h) & mask;
1280 v = code_hash[i];
1281 if (!v)
1282 return 0;
Victor Stinner920cb642020-10-26 19:19:36 +01001283 if (_cmpname(self, v, name, namelen)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001284 return _check_alias_and_seq(v, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001285 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001286 incr = (h ^ (h >> 3)) & mask;
1287 if (!incr)
1288 incr = mask;
1289 for (;;) {
1290 i = (i + incr) & mask;
1291 v = code_hash[i];
1292 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001293 return 0;
Victor Stinner920cb642020-10-26 19:19:36 +01001294 if (_cmpname(self, v, name, namelen)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001295 return _check_alias_and_seq(v, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001296 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001297 incr = incr << 1;
1298 if (incr > mask)
1299 incr = incr ^ code_poly;
1300 }
1301}
1302
Victor Stinnere6b8c522020-10-15 16:22:19 +02001303static int
Victor Stinner920cb642020-10-26 19:19:36 +01001304capi_getcode(const char* name, int namelen, Py_UCS4* code,
Victor Stinnere6b8c522020-10-15 16:22:19 +02001305 int with_named_seq)
1306{
Victor Stinner920cb642020-10-26 19:19:36 +01001307 return _getcode(NULL, name, namelen, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001308
1309}
1310
Erlend Egeberg Aasland61d26392021-01-20 12:03:53 +01001311static void
1312unicodedata_destroy_capi(PyObject *capsule)
Victor Stinner920cb642020-10-26 19:19:36 +01001313{
Erlend Egeberg Aasland61d26392021-01-20 12:03:53 +01001314 void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1315 PyMem_Free(capi);
1316}
1317
1318static PyObject *
1319unicodedata_create_capi(void)
1320{
1321 _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1322 if (capi == NULL) {
1323 PyErr_NoMemory();
1324 return NULL;
1325 }
1326 capi->getname = capi_getucname;
1327 capi->getcode = capi_getcode;
1328
1329 PyObject *capsule = PyCapsule_New(capi,
1330 PyUnicodeData_CAPSULE_NAME,
1331 unicodedata_destroy_capi);
1332 if (capsule == NULL) {
1333 PyMem_Free(capi);
1334 }
1335 return capsule;
Victor Stinner920cb642020-10-26 19:19:36 +01001336};
1337
1338
Fredrik Lundh06d12682001-01-24 07:59:11 +00001339/* -------------------------------------------------------------------- */
1340/* Python bindings */
1341
Serhiy Storchaka63596412015-04-17 21:18:49 +03001342/*[clinic input]
1343unicodedata.UCD.name
1344
1345 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001346 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001347 default: object=NULL
1348 /
1349
1350Returns the name assigned to the character chr as a string.
1351
1352If no name is defined, default is returned, or, if not given,
1353ValueError is raised.
1354[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001355
Fredrik Lundh06d12682001-01-24 07:59:11 +00001356static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001357unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001358/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001359{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001360 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001361 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001362
Victor Stinner920cb642020-10-26 19:19:36 +01001363 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001364 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001365 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001366 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 }
1368 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001369 Py_INCREF(default_value);
1370 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001372 }
1373
Walter Dörwald4254e762007-06-05 16:04:09 +00001374 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001375}
1376
Serhiy Storchaka63596412015-04-17 21:18:49 +03001377/*[clinic input]
1378unicodedata.UCD.lookup
1379
1380 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001381 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001382 /
1383
1384Look up character by name.
1385
1386If a character with the given name is found, return the
1387corresponding character. If not found, KeyError is raised.
1388[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001389
Fredrik Lundh06d12682001-01-24 07:59:11 +00001390static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001391unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1392 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001393/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001394{
1395 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001396 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001397 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001398 PyErr_SetString(PyExc_KeyError, "name too long");
1399 return NULL;
1400 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001401
Victor Stinner920cb642020-10-26 19:19:36 +01001402 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001403 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001404 return NULL;
1405 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001406 /* check if code is in the PUA range that we use for named sequences
1407 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001408 if (IS_NAMED_SEQ(code)) {
1409 index = code-named_sequences_start;
1410 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1411 named_sequences[index].seq,
1412 named_sequences[index].seqlen);
1413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001415}
1416
Victor Stinner920cb642020-10-26 19:19:36 +01001417// List of functions used to define module functions *AND* unicodedata.UCD
1418// methods. For module functions, self is the module. For UCD methods, self
1419// is an UCD instance. The UCD_Check() macro is used to check if self is
1420// an UCD instance.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001421static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001422 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001423 UNICODEDATA_UCD_DIGIT_METHODDEF
1424 UNICODEDATA_UCD_NUMERIC_METHODDEF
1425 UNICODEDATA_UCD_CATEGORY_METHODDEF
1426 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1427 UNICODEDATA_UCD_COMBINING_METHODDEF
1428 UNICODEDATA_UCD_MIRRORED_METHODDEF
1429 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1430 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1431 UNICODEDATA_UCD_NAME_METHODDEF
1432 UNICODEDATA_UCD_LOOKUP_METHODDEF
Max Bélanger2810dd72018-11-04 15:58:24 -08001433 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001434 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001436};
1437
Victor Stinnerc8c42002020-10-26 23:19:22 +01001438static void
1439ucd_dealloc(PreviousDBVersion *self)
1440{
1441 PyTypeObject *tp = Py_TYPE(self);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001442 PyObject_Free(self);
Victor Stinnerc8c42002020-10-26 23:19:22 +01001443 Py_DECREF(tp);
1444}
1445
1446static PyType_Slot ucd_type_slots[] = {
1447 {Py_tp_dealloc, ucd_dealloc},
1448 {Py_tp_getattro, PyObject_GenericGetAttr},
1449 {Py_tp_methods, unicodedata_functions},
1450 {Py_tp_members, DB_members},
1451 {0, 0}
1452};
1453
1454static PyType_Spec ucd_type_spec = {
1455 .name = "unicodedata.UCD",
1456 .basicsize = sizeof(PreviousDBVersion),
1457 .flags = Py_TPFLAGS_DEFAULT,
1458 .slots = ucd_type_slots
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001459};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001460
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001461PyDoc_STRVAR(unicodedata_docstring,
1462"This module provides access to the Unicode Character Database which\n\
1463defines character properties for all Unicode characters. The data in\n\
1464this database is based on the UnicodeData.txt file version\n\
luzpaza5293b42017-11-05 07:37:50 -06001465" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001466\n\
1467The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001468UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001469
Victor Stinner920cb642020-10-26 19:19:36 +01001470static int
1471unicodedata_exec(PyObject *module)
1472{
Victor Stinner920cb642020-10-26 19:19:36 +01001473 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1474 return -1;
1475 }
1476
Victor Stinnerc8c42002020-10-26 23:19:22 +01001477 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1478 if (ucd_type == NULL) {
1479 return -1;
1480 }
1481
Victor Stinner920cb642020-10-26 19:19:36 +01001482 if (PyModule_AddType(module, ucd_type) < 0) {
Victor Stinnerc8c42002020-10-26 23:19:22 +01001483 Py_DECREF(ucd_type);
Victor Stinner920cb642020-10-26 19:19:36 +01001484 return -1;
1485 }
1486
Victor Stinner84f73822020-10-27 04:36:22 +01001487 // Unicode database version 3.2.0 used by the IDNA encoding
Victor Stinner920cb642020-10-26 19:19:36 +01001488 PyObject *v;
1489 v = new_previous_version(ucd_type, "3.2.0",
1490 get_change_3_2_0, normalization_3_2_0);
Victor Stinnerc8c42002020-10-26 23:19:22 +01001491 Py_DECREF(ucd_type);
Victor Stinner920cb642020-10-26 19:19:36 +01001492 if (v == NULL) {
1493 return -1;
1494 }
1495 if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1496 Py_DECREF(v);
1497 return -1;
1498 }
1499
1500 /* Export C API */
Erlend Egeberg Aasland61d26392021-01-20 12:03:53 +01001501 PyObject *capsule = unicodedata_create_capi();
1502 if (capsule == NULL) {
Victor Stinner920cb642020-10-26 19:19:36 +01001503 return -1;
1504 }
Erlend Egeberg Aasland61d26392021-01-20 12:03:53 +01001505 int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1506 Py_DECREF(capsule);
1507 if (rc < 0) {
Victor Stinner920cb642020-10-26 19:19:36 +01001508 return -1;
1509 }
1510 return 0;
1511}
1512
Victor Stinnerc8c42002020-10-26 23:19:22 +01001513static PyModuleDef_Slot unicodedata_slots[] = {
1514 {Py_mod_exec, unicodedata_exec},
1515 {0, NULL}
1516};
1517
1518static struct PyModuleDef unicodedata_module = {
1519 PyModuleDef_HEAD_INIT,
1520 .m_name = "unicodedata",
1521 .m_doc = unicodedata_docstring,
1522 .m_size = 0,
1523 .m_methods = unicodedata_functions,
1524 .m_slots = unicodedata_slots,
1525};
Victor Stinner920cb642020-10-26 19:19:36 +01001526
Mark Hammond62b1ab12002-07-23 06:31:15 +00001527PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001528PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001529{
Victor Stinnerc8c42002020-10-26 23:19:22 +01001530 return PyModuleDef_Init(&unicodedata_module);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001531}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001532
Victor Stinner920cb642020-10-26 19:19:36 +01001533
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001534/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001535Local variables:
1536c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001537indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001538End:
1539*/