blob: 941fd2faa742a60542c7e57ae847ceb3fd01678e [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Victor Stinner4a21e572020-04-15 02:35:41 +020020#include "structmember.h" // PyMemberDef
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Greg Price2f094132019-09-03 19:45:44 -070022#include <stdbool.h>
23
Max Bélanger2810dd72018-11-04 15:58:24 -080024_Py_IDENTIFIER(NFC);
25_Py_IDENTIFIER(NFD);
26_Py_IDENTIFIER(NFKC);
27_Py_IDENTIFIER(NFKD);
28
Larry Hastings61272b72014-01-07 12:41:53 -080029/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080030module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080031class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080032[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080033/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080034
Fredrik Lundh06d12682001-01-24 07:59:11 +000035/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000036
/* One entry of _PyUnicode_Database_Records: the per-character properties,
   each stored in a single byte.  The field order must not change, since the
   table itself comes from the generated unicodedata_db.h. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
48
/* Per-character difference between the current database and a previous
   Unicode version.  The accessors in this file treat 0xFF in a byte field
   as "no change recorded" and category_changed == 0 as "unassigned in the
   old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;             /* old bidirectional index */
    const unsigned char category_changed;          /* old category index */
    const unsigned char decimal_changed;           /* old decimal value */
    const unsigned char mirrored_changed;          /* old mirrored flag */
    const unsigned char east_asian_width_changed;  /* old width index */
    const double numeric_changed;
} change_record;
58
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059/* data file generated by Tools/unicode/makeunicodedata.py */
60#include "unicodedata_db.h"
61
62static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000063_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000065 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000066 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000067 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74}
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076/* ------------- Previous-version API ------------------------------------- */
77typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82} PreviousDBVersion;
83
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030084#include "clinic/unicodedata.c.h"
85
/* Look up the change record of code point `v` through the previous-version
   object `self`.  `self` must point to a PreviousDBVersion; the argument is
   parenthesized so that the cast applies to the whole argument expression. */
#define get_old_record(self, v) ((((PreviousDBVersion*)(self))->getrecord)(v))
87
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090 {NULL}
91};
92
Thomas Wouters89f507f2006-12-13 04:49:30 +000093/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000094static PyTypeObject UCD_Type;
Mohamed Koubaaddc0dd02020-09-23 05:38:16 -050095
Victor Stinnere6b8c522020-10-15 16:22:19 +020096typedef struct {
97 // Borrowed reference to &UCD_Type. It is used to prepare the code
98 // to convert the UCD_Type static type to a heap type.
99 PyTypeObject *ucd_type;
100} unicodedata_module_state;
101
102// bpo-1635741: Temporary global state until the unicodedata module
103// gets a real module state.
104static unicodedata_module_state global_module_state;
105
// Check if self is an instance of ucd_type.
// Return 0 if self is NULL (when the PyCapsule C API is used).
// Both arguments are parenthesized so that compound expressions expand
// correctly.  Note that self is evaluated twice.
#define UCD_Check(self, ucd_type) \
    ((self) != NULL && Py_IS_TYPE((self), (ucd_type)))
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000109
110static PyObject*
Victor Stinnere6b8c522020-10-15 16:22:19 +0200111new_previous_version(unicodedata_module_state *state,
112 const char*name, const change_record* (*getrecord)(Py_UCS4),
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000113 Py_UCS4 (*normalization)(Py_UCS4))
114{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200115 PreviousDBVersion *self;
116 self = PyObject_New(PreviousDBVersion, state->ucd_type);
117 if (self == NULL)
118 return NULL;
119 self->name = name;
120 self->getrecord = getrecord;
121 self->normalization = normalization;
122 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000123}
124
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000125
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000126/* --- Module API --------------------------------------------------------- */
127
Larry Hastings61272b72014-01-07 12:41:53 -0800128/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800129unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700130
Serhiy Storchaka63596412015-04-17 21:18:49 +0300131 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700132 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700133 default: object=NULL
134 /
135
136Converts a Unicode character into its equivalent decimal value.
137
Serhiy Storchaka63596412015-04-17 21:18:49 +0300138Returns the decimal value assigned to the character chr as integer.
139If no such value is defined, default is returned, or, if not given,
140ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800141[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700142
Larry Hastings31826802013-10-19 00:09:25 -0700143static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300144unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400145 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700146/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700147{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200148 unicodedata_module_state *state = &global_module_state;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000149 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000150 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300151 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000152
Victor Stinnere6b8c522020-10-15 16:22:19 +0200153 if (UCD_Check(self, state->ucd_type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000154 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000155 if (old->category_changed == 0) {
156 /* unassigned */
157 have_old = 1;
158 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000160 else if (old->decimal_changed != 0xFF) {
161 have_old = 1;
162 rc = old->decimal_changed;
163 }
164 }
165
166 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000167 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700169 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 PyErr_SetString(PyExc_ValueError,
171 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000172 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 }
174 else {
Larry Hastings31826802013-10-19 00:09:25 -0700175 Py_INCREF(default_value);
176 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000178 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000179 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000180}
181
Serhiy Storchaka63596412015-04-17 21:18:49 +0300182/*[clinic input]
183unicodedata.UCD.digit
184
185 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700186 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300187 default: object=NULL
188 /
189
190Converts a Unicode character into its equivalent digit value.
191
192Returns the digit value assigned to the character chr as integer.
193If no such value is defined, default is returned, or, if not given,
194ValueError is raised.
195[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000196
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000197static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300198unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700199/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000200{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300202 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000203 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300205 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000206 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000207 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000208 }
209 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300210 Py_INCREF(default_value);
211 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000212 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000213 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000214 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000215}
216
Serhiy Storchaka63596412015-04-17 21:18:49 +0300217/*[clinic input]
218unicodedata.UCD.numeric
219
220 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700221 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300222 default: object=NULL
223 /
224
225Converts a Unicode character into its equivalent numeric value.
226
227Returns the numeric value assigned to the character chr as float.
228If no such value is defined, default is returned, or, if not given,
229ValueError is raised.
230[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000231
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000232static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300233unicodedata_UCD_numeric_impl(PyObject *self, int chr,
234 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700235/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000236{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200237 unicodedata_module_state *state = &global_module_state;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000238 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000239 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300240 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000241
Victor Stinnere6b8c522020-10-15 16:22:19 +0200242 if (UCD_Check(self, state->ucd_type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000243 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000244 if (old->category_changed == 0) {
245 /* unassigned */
246 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000247 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000249 else if (old->decimal_changed != 0xFF) {
250 have_old = 1;
251 rc = old->decimal_changed;
252 }
253 }
254
255 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000256 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300258 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 PyErr_SetString(PyExc_ValueError, "not a numeric character");
260 return NULL;
261 }
262 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300263 Py_INCREF(default_value);
264 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000266 }
267 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000268}
269
Serhiy Storchaka63596412015-04-17 21:18:49 +0300270/*[clinic input]
271unicodedata.UCD.category
272
273 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700274 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300275 /
276
277Returns the general category assigned to the character chr as string.
278[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000279
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000280static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300281unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700282/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000283{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200284 unicodedata_module_state *state = &global_module_state;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300286 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000287 index = (int) _getrecord_ex(c)->category;
Victor Stinnere6b8c522020-10-15 16:22:19 +0200288 if (UCD_Check(self, state->ucd_type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000289 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000290 if (old->category_changed != 0xFF)
291 index = old->category_changed;
292 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000293 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294}
295
Serhiy Storchaka63596412015-04-17 21:18:49 +0300296/*[clinic input]
297unicodedata.UCD.bidirectional
298
299 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700300 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300301 /
302
303Returns the bidirectional class assigned to the character chr as string.
304
305If no such value is defined, an empty string is returned.
306[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000307
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000308static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300309unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700310/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000311{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200312 unicodedata_module_state *state = &global_module_state;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300314 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000315 index = (int) _getrecord_ex(c)->bidirectional;
Victor Stinnere6b8c522020-10-15 16:22:19 +0200316 if (UCD_Check(self, state->ucd_type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000317 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000318 if (old->category_changed == 0)
319 index = 0; /* unassigned */
320 else if (old->bidir_changed != 0xFF)
321 index = old->bidir_changed;
322 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000323 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000324}
325
Serhiy Storchaka63596412015-04-17 21:18:49 +0300326/*[clinic input]
327unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000328
Serhiy Storchaka63596412015-04-17 21:18:49 +0300329 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700330 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300331 /
332
333Returns the canonical combining class assigned to the character chr as integer.
334
335Returns 0 if no combining class is defined.
336[clinic start generated code]*/
337
338static int
339unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700340/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200342 unicodedata_module_state *state = &global_module_state;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000343 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300344 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000345 index = (int) _getrecord_ex(c)->combining;
Victor Stinnere6b8c522020-10-15 16:22:19 +0200346 if (UCD_Check(self, state->ucd_type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000347 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000348 if (old->category_changed == 0)
349 index = 0; /* unassigned */
350 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300351 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000352}
353
Serhiy Storchaka63596412015-04-17 21:18:49 +0300354/*[clinic input]
355unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000356
Serhiy Storchaka63596412015-04-17 21:18:49 +0300357 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700358 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300359 /
360
361Returns the mirrored property assigned to the character chr as integer.
362
363Returns 1 if the character has been identified as a "mirrored"
364character in bidirectional text, 0 otherwise.
365[clinic start generated code]*/
366
367static int
368unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700369/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000370{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200371 unicodedata_module_state *state = &global_module_state;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000372 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300373 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000374 index = (int) _getrecord_ex(c)->mirrored;
Victor Stinnere6b8c522020-10-15 16:22:19 +0200375 if (UCD_Check(self, state->ucd_type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000376 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000377 if (old->category_changed == 0)
378 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000379 else if (old->mirrored_changed != 0xFF)
380 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000381 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300382 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000383}
384
Serhiy Storchaka63596412015-04-17 21:18:49 +0300385/*[clinic input]
386unicodedata.UCD.east_asian_width
387
388 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700389 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300390 /
391
392Returns the east asian width assigned to the character chr as string.
393[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000394
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000395static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300396unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700397/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000398{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200399 unicodedata_module_state *state = &global_module_state;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000400 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300401 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000402 index = (int) _getrecord_ex(c)->east_asian_width;
Victor Stinnere6b8c522020-10-15 16:22:19 +0200403 if (UCD_Check(self, state->ucd_type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000404 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000405 if (old->category_changed == 0)
406 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700407 else if (old->east_asian_width_changed != 0xFF)
408 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000409 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000410 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000411}
412
Serhiy Storchaka63596412015-04-17 21:18:49 +0300413/*[clinic input]
414unicodedata.UCD.decomposition
415
416 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700417 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300418 /
419
420Returns the character decomposition mapping assigned to the character chr as string.
421
422An empty string is returned in case no such mapping is defined.
423[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000424
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000425static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300426unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700427/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000428{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200429 unicodedata_module_state *state = &global_module_state;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000430 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000431 int code, index, count;
432 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000433 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300434 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000435
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000436 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000437
Victor Stinnere6b8c522020-10-15 16:22:19 +0200438 if (UCD_Check(self, state->ucd_type)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000439 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000440 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000441 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000442 }
443
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000444 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445 index = 0;
446 else {
447 index = decomp_index1[(code>>DECOMP_SHIFT)];
448 index = decomp_index2[(index<<DECOMP_SHIFT)+
449 (code&((1<<DECOMP_SHIFT)-1))];
450 }
451
Tim Peters69b83b12001-11-30 07:23:05 +0000452 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000453 is prefix code (from*/
454 count = decomp_data[index] >> 8;
455
456 /* XXX: could allocate the PyString up front instead
457 (strlen(prefix) + 5 * count + 1 bytes) */
458
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000459 /* Based on how index is calculated above and decomp_data is generated
460 from Tools/unicode/makeunicodedata.py, it should not be possible
461 to overflow decomp_prefix. */
462 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200463 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000464
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000465 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000466 i = strlen(decomp_prefix[prefix_index]);
467 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000468
469 while (count-- > 0) {
470 if (i)
471 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000472 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000473 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
474 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000475 i += strlen(decomp + i);
476 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000477 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000478}
479
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000480static void
Victor Stinnere6b8c522020-10-15 16:22:19 +0200481get_decomp_record(unicodedata_module_state *state, PyObject *self,
482 Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000483{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000484 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000485 *index = 0;
Victor Stinnere6b8c522020-10-15 16:22:19 +0200486 } else if (UCD_Check(self, state->ucd_type) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000487 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000488 /* unassigned in old version */
489 *index = 0;
490 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000491 else {
492 *index = decomp_index1[(code>>DECOMP_SHIFT)];
493 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
494 (code&((1<<DECOMP_SHIFT)-1))];
495 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000496
Martin v. Löwis677bde22002-11-23 22:08:15 +0000497 /* high byte is number of hex bytes (usually one or two), low byte
498 is prefix code (from*/
499 *count = decomp_data[*index] >> 8;
500 *prefix = decomp_data[*index] & 255;
501
502 (*index)++;
503}
504
/* Constants for the arithmetic decomposition of Hangul syllables. */
#define SBase   0xAC00              /* first code point handled as a syllable */
#define LBase   0x1100              /* base added to the L index */
#define VBase   0x1161              /* base added to the V index */
#define TBase   0x11A7              /* base added to the T index */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount * TCount)   /* syllables per L value */
#define SCount  (LCount * NCount)   /* size of the syllable range */
514
515static PyObject*
Victor Stinnere6b8c522020-10-15 16:22:19 +0200516nfd_nfkd(unicodedata_module_state *state, PyObject *self,
517 PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000518{
519 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200520 Py_UCS4 *output;
521 Py_ssize_t i, o, osize;
522 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300523 const void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000524 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200525 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000526 Py_ssize_t space, isize;
527 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000528 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000529
Martin v. Löwis677bde22002-11-23 22:08:15 +0000530 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200531 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500532 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300533 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500534 if (space > 10) {
535 if (space <= PY_SSIZE_T_MAX - 10)
536 space += 10;
537 }
538 else {
539 space *= 2;
540 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200541 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500542 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200543 if (!output) {
544 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000545 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200546 }
547 i = o = 0;
548 kind = PyUnicode_KIND(input);
549 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000550
Martin v. Löwis22970662011-09-29 13:39:38 +0200551 while (i < isize) {
552 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000553 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200554 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000555 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300556 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000557 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000558 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200559 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000560 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000561 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
562 if (new_output == NULL) {
563 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200564 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000565 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200566 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000567 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000568 }
569 /* Hangul Decomposition. */
570 if (SBase <= code && code < (SBase+SCount)) {
571 int SIndex = code - SBase;
572 int L = LBase + SIndex / NCount;
573 int V = VBase + (SIndex % NCount) / TCount;
574 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200575 output[o++] = L;
576 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000577 space -= 2;
578 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200579 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000580 space --;
581 }
582 continue;
583 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000584 /* normalization changes */
Victor Stinnere6b8c522020-10-15 16:22:19 +0200585 if (UCD_Check(self, state->ucd_type)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000586 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
587 if (value != 0) {
588 stack[stackptr++] = value;
589 continue;
590 }
591 }
592
593 /* Other decompositions. */
Victor Stinnere6b8c522020-10-15 16:22:19 +0200594 get_decomp_record(state, self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000595
596 /* Copy character if it is not decomposable, or has a
597 compatibility decomposition, but we do NFD. */
598 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200599 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000600 space--;
601 continue;
602 }
603 /* Copy decomposition onto the stack, in reverse
604 order. */
605 while(count) {
606 code = decomp_data[index + (--count)];
607 stack[stackptr++] = code;
608 }
609 }
610 }
611
Martin v. Löwis22970662011-09-29 13:39:38 +0200612 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
613 output, o);
614 PyMem_Free(output);
615 if (!result)
616 return NULL;
617 /* result is guaranteed to be ready, as it is compact. */
618 kind = PyUnicode_KIND(result);
619 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620
621 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200622 i = 0;
623 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
624 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
625 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000626 if (prev == 0 || cur == 0 || prev <= cur) {
627 prev = cur;
628 continue;
629 }
630 /* Non-canonical order. Need to switch *i with previous. */
631 o = i - 1;
632 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200633 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
634 PyUnicode_WRITE(kind, data, o+1,
635 PyUnicode_READ(kind, data, o));
636 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000637 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200638 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000639 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200640 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000641 if (prev == 0 || prev <= cur)
642 break;
643 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200644 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000645 }
646 return result;
647}
648
649static int
Andy Lester982307b2020-03-17 11:38:12 -0500650find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200652 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000653 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200654 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000655 if (code < start)
656 return -1;
657 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200658 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000659 return nfc[index].index + delta;
660 }
661 }
662 return -1;
663}
664
665static PyObject*
Victor Stinnere6b8c522020-10-15 16:22:19 +0200666nfc_nfkc(unicodedata_module_state *state, PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000667{
668 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200669 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300670 const void *data;
Martin v. Löwis22970662011-09-29 13:39:38 +0200671 Py_UCS4 *output;
672 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000673 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200674 Py_UCS4 code;
675 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000676 int cskipped = 0;
677
Victor Stinnere6b8c522020-10-15 16:22:19 +0200678 result = nfd_nfkd(state, self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000679 if (!result)
680 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200681 /* result will be "ready". */
682 kind = PyUnicode_KIND(result);
683 data = PyUnicode_DATA(result);
684 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000685
Martin v. Löwis22970662011-09-29 13:39:38 +0200686 /* We allocate a buffer for the output.
687 If we find that we made no changes, we still return
688 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500689 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200690 if (!output) {
691 PyErr_NoMemory();
692 Py_DECREF(result);
693 return 0;
694 }
695 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200698 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000699 for (index = 0; index < cskipped; index++) {
700 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000701 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000702 Remove from list. */
703 skipped[index] = skipped[cskipped-1];
704 cskipped--;
705 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000706 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000707 }
708 }
709 /* Hangul Composition. We don't need to check for <LV,T>
710 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200711 code = PyUnicode_READ(kind, data, i);
712 if (LBase <= code && code < (LBase+LCount) &&
713 i + 1 < len &&
714 VBase <= PyUnicode_READ(kind, data, i+1) &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900715 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
716 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
717 and V character is a modern vowel (0x1161 ~ 0x1175). */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000718 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200719 LIndex = code - LBase;
720 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 code = SBase + (LIndex*VCount+VIndex)*TCount;
722 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200723 if (i < len &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900724 TBase < PyUnicode_READ(kind, data, i) &&
725 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
726 /* check T character is a modern trailing consonant
727 (0x11A8 ~ 0x11C2). */
Martin v. Löwis22970662011-09-29 13:39:38 +0200728 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000729 i++;
730 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200731 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000732 continue;
733 }
734
Martin v. Löwis22970662011-09-29 13:39:38 +0200735 /* code is still input[i] here */
Andy Lester982307b2020-03-17 11:38:12 -0500736 f = find_nfc_index(nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000737 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200738 output[o++] = code;
739 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000740 continue;
741 }
742 /* Find next unblocked character. */
743 i1 = i+1;
744 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200745 /* output base character for now; might be updated later. */
746 output[o] = PyUnicode_READ(kind, data, i);
747 while (i1 < len) {
748 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
749 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000750 if (comb) {
751 if (comb1 == 0)
752 break;
753 if (comb >= comb1) {
754 /* Character is blocked. */
755 i1++;
756 continue;
757 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000758 }
Andy Lester982307b2020-03-17 11:38:12 -0500759 l = find_nfc_index(nfc_last, code1);
Martin v. Löwis22970662011-09-29 13:39:38 +0200760 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000761 is a starter, we don't need to look further.
762 Otherwise, record the combining class. */
763 if (l == -1) {
764 not_combinable:
765 if (comb1 == 0)
766 break;
767 comb = comb1;
768 i1++;
769 continue;
770 }
771 index = f*TOTAL_LAST + l;
772 index1 = comp_index[index >> COMP_SHIFT];
773 code = comp_data[(index1<<COMP_SHIFT)+
774 (index&((1<<COMP_SHIFT)-1))];
775 if (code == 0)
776 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777
Martin v. Löwis677bde22002-11-23 22:08:15 +0000778 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200779 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000780 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000781 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000782 skipped[cskipped++] = i1;
783 i1++;
Andy Lester982307b2020-03-17 11:38:12 -0500784 f = find_nfc_index(nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000785 if (f == -1)
786 break;
787 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200788 /* Output character was already written.
789 Just advance the indices. */
790 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000791 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200792 if (o == len) {
793 /* No changes. Return original string. */
794 PyMem_Free(output);
795 return result;
796 }
797 Py_DECREF(result);
798 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
799 output, o);
800 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000801 return result;
802}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000803
/* Outcome of the normalization quickcheck.
 *
 * The numeric values need to match the logic in makeunicodedata.py,
 * which constructs the quickcheck data.
 */
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
Max Bélanger2810dd72018-11-04 15:58:24 -0800807
Greg Price2f094132019-09-03 19:45:44 -0700808/* Run the Unicode normalization "quickcheck" algorithm.
809 *
810 * Return YES or NO if quickcheck determines the input is certainly
811 * normalized or certainly not, and MAYBE if quickcheck is unable to
812 * tell.
813 *
814 * If `yes_only` is true, then return MAYBE as soon as we determine
815 * the answer is not YES.
816 *
817 * For background and details on the algorithm, see UAX #15:
818 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
819 */
820static QuickcheckResult
Victor Stinnere6b8c522020-10-15 16:22:19 +0200821is_normalized_quickcheck(unicodedata_module_state *state, PyObject *self,
822 PyObject *input, bool nfc, bool k, bool yes_only)
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000823{
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000824 /* An older version of the database is requested, quickchecks must be
825 disabled. */
Victor Stinnere6b8c522020-10-15 16:22:19 +0200826 if (UCD_Check(self, state->ucd_type))
Max Bélanger2810dd72018-11-04 15:58:24 -0800827 return NO;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000828
Greg Price2f094132019-09-03 19:45:44 -0700829 Py_ssize_t i, len;
830 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300831 const void *data;
Greg Price2f094132019-09-03 19:45:44 -0700832 unsigned char prev_combining = 0;
833
834 /* The two quickcheck bits at this shift have type QuickcheckResult. */
835 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
836
837 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000838
Martin v. Löwis22970662011-09-29 13:39:38 +0200839 i = 0;
840 kind = PyUnicode_KIND(input);
841 data = PyUnicode_DATA(input);
842 len = PyUnicode_GET_LENGTH(input);
843 while (i < len) {
844 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
845 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000846
Greg Price2f094132019-09-03 19:45:44 -0700847 unsigned char combining = record->combining;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000848 if (combining && prev_combining > combining)
Max Bélanger2810dd72018-11-04 15:58:24 -0800849 return NO; /* non-canonical sort order, not normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000850 prev_combining = combining;
Greg Price2f094132019-09-03 19:45:44 -0700851
852 unsigned char quickcheck_whole = record->normalization_quick_check;
853 if (yes_only) {
854 if (quickcheck_whole & (3 << quickcheck_shift))
855 return MAYBE;
856 } else {
857 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
858 case NO:
859 return NO;
860 case MAYBE:
861 result = MAYBE; /* this string might need normalization */
862 }
863 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000864 }
Greg Price2f094132019-09-03 19:45:44 -0700865 return result;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000866}
867
Serhiy Storchaka63596412015-04-17 21:18:49 +0300868/*[clinic input]
Max Bélanger2810dd72018-11-04 15:58:24 -0800869unicodedata.UCD.is_normalized
870
871 self: self
872 form: unicode
873 unistr as input: unicode
874 /
875
876Return whether the Unicode string unistr is in the normal form 'form'.
877
878Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
879[clinic start generated code]*/
880
881static PyObject *
882unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
883 PyObject *input)
884/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
885{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200886 unicodedata_module_state *state = &global_module_state;
Max Bélanger2810dd72018-11-04 15:58:24 -0800887 if (PyUnicode_READY(input) == -1) {
888 return NULL;
889 }
890
891 if (PyUnicode_GET_LENGTH(input) == 0) {
892 /* special case empty input strings. */
893 Py_RETURN_TRUE;
894 }
895
896 PyObject *result;
Greg Price7669cb82019-09-09 02:16:31 -0700897 bool nfc = false;
898 bool k = false;
Greg Price2f094132019-09-03 19:45:44 -0700899 QuickcheckResult m;
Max Bélanger2810dd72018-11-04 15:58:24 -0800900
901 PyObject *cmp;
902 int match = 0;
903
904 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700905 nfc = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800906 }
907 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700908 nfc = true;
909 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800910 }
911 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
912 /* matches default values for `nfc` and `k` */
913 }
914 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700915 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800916 }
917 else {
918 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
919 return NULL;
920 }
921
Victor Stinnere6b8c522020-10-15 16:22:19 +0200922 m = is_normalized_quickcheck(state, self, input, nfc, k, false);
Max Bélanger2810dd72018-11-04 15:58:24 -0800923
924 if (m == MAYBE) {
Victor Stinnere6b8c522020-10-15 16:22:19 +0200925 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(state, self, input, k);
Max Bélanger2810dd72018-11-04 15:58:24 -0800926 if (cmp == NULL) {
927 return NULL;
928 }
929 match = PyUnicode_Compare(input, cmp);
930 Py_DECREF(cmp);
931 result = (match == 0) ? Py_True : Py_False;
932 }
933 else {
934 result = (m == YES) ? Py_True : Py_False;
935 }
936
937 Py_INCREF(result);
938 return result;
939}
940
941
942/*[clinic input]
Serhiy Storchaka63596412015-04-17 21:18:49 +0300943unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000944
Serhiy Storchaka63596412015-04-17 21:18:49 +0300945 self: self
Max Bélanger2810dd72018-11-04 15:58:24 -0800946 form: unicode
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300947 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300948 /
949
950Return the normal form 'form' for the Unicode string unistr.
951
952Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
953[clinic start generated code]*/
954
955static PyObject *
Max Bélanger2810dd72018-11-04 15:58:24 -0800956unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
Serhiy Storchaka63596412015-04-17 21:18:49 +0300957 PyObject *input)
Max Bélanger2810dd72018-11-04 15:58:24 -0800958/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000959{
Victor Stinnere6b8c522020-10-15 16:22:19 +0200960 unicodedata_module_state *state = &global_module_state;
Martin v. Löwis22970662011-09-29 13:39:38 +0200961 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000962 /* Special case empty input strings, since resizing
963 them later would cause internal errors. */
964 Py_INCREF(input);
965 return input;
966 }
967
Max Bélanger2810dd72018-11-04 15:58:24 -0800968 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Victor Stinnere6b8c522020-10-15 16:22:19 +0200969 if (is_normalized_quickcheck(state, self, input,
970 true, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000971 Py_INCREF(input);
972 return input;
973 }
Victor Stinnere6b8c522020-10-15 16:22:19 +0200974 return nfc_nfkc(state, self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000975 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800976 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Victor Stinnere6b8c522020-10-15 16:22:19 +0200977 if (is_normalized_quickcheck(state, self, input,
978 true, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000979 Py_INCREF(input);
980 return input;
981 }
Victor Stinnere6b8c522020-10-15 16:22:19 +0200982 return nfc_nfkc(state, self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000983 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800984 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
Victor Stinnere6b8c522020-10-15 16:22:19 +0200985 if (is_normalized_quickcheck(state, self, input,
986 false, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000987 Py_INCREF(input);
988 return input;
989 }
Victor Stinnere6b8c522020-10-15 16:22:19 +0200990 return nfd_nfkd(state, self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000991 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800992 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Victor Stinnere6b8c522020-10-15 16:22:19 +0200993 if (is_normalized_quickcheck(state, self, input,
994 false, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000995 Py_INCREF(input);
996 return input;
997 }
Victor Stinnere6b8c522020-10-15 16:22:19 +0200998 return nfd_nfkd(state, self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000999 }
Martin v. Löwis677bde22002-11-23 22:08:15 +00001000 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
1001 return NULL;
1002}
1003
Fredrik Lundh06d12682001-01-24 07:59:11 +00001004/* -------------------------------------------------------------------- */
1005/* unicode character name tables */
1006
1007/* data file generated by Tools/unicode/makeunicodedata.py */
1008#include "unicodename_db.h"
1009
1010/* -------------------------------------------------------------------- */
1011/* database code (cut and pasted from the unidb package) */
1012
/* Hash the first `len` bytes of `s`, upper-casing each byte first so the
 * hash is case-insensitive.  `scale` is the multiplier applied to the
 * running hash before each byte is added.  Whenever bits 24..31 become
 * set they are folded back into the low byte and the hash is truncated
 * to 24 bits.
 */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
1027
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
 * Column 0 is indexed by the L index, column 1 by the V index and
 * column 2 by the T index; the columns have different lengths, so the
 * unused trailing slots of the shorter ones are NULL.
 */
static const char * const hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
1058
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001059/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001060static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001061is_unified_ideograph(Py_UCS4 code)
1062{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001063 return
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001064 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1065 (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */
1066 (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001067 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -05001068 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -07001069 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001070 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1071 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001072}
1073
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * The argument is parenthesized so that an expression argument (for
 * example `x & mask`) is compared as a whole.  It is still evaluated
 * twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1079
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001080static int
Victor Stinnere6b8c522020-10-15 16:22:19 +02001081_getucname(unicodedata_module_state *state, PyObject *self,
1082 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001083{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001084 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001085 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1086 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001087 int offset;
1088 int i;
1089 int word;
Inada Naoki6fec9052019-04-17 08:40:34 +09001090 const unsigned char* w;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001091
Martin v. Löwisc3509122006-03-11 12:16:23 +00001092 if (code >= 0x110000)
1093 return 0;
1094
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001095 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001096 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1097 return 0;
1098
Victor Stinnere6b8c522020-10-15 16:22:19 +02001099 if (UCD_Check(self, state->ucd_type)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001100 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001101 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001102 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1103 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001104 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001105 if (old->category_changed == 0) {
1106 /* unassigned */
1107 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001108 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001109 }
1110
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001111 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001112 /* Hangul syllable. */
1113 int SIndex = code - SBase;
1114 int L = SIndex / NCount;
1115 int V = (SIndex % NCount) / TCount;
1116 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001117
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001118 if (buflen < 27)
1119 /* Worst case: HANGUL SYLLABLE <10chars>. */
1120 return 0;
1121 strcpy(buffer, "HANGUL SYLLABLE ");
1122 buffer += 16;
1123 strcpy(buffer, hangul_syllables[L][0]);
1124 buffer += strlen(hangul_syllables[L][0]);
1125 strcpy(buffer, hangul_syllables[V][1]);
1126 buffer += strlen(hangul_syllables[V][1]);
1127 strcpy(buffer, hangul_syllables[T][2]);
1128 buffer += strlen(hangul_syllables[T][2]);
1129 *buffer = '\0';
1130 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001131 }
1132
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001133 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001134 if (buflen < 28)
1135 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1136 return 0;
1137 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1138 return 1;
1139 }
1140
Fredrik Lundh06d12682001-01-24 07:59:11 +00001141 /* get offset into phrasebook */
1142 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1143 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1144 (code&((1<<phrasebook_shift)-1))];
1145 if (!offset)
1146 return 0;
1147
1148 i = 0;
1149
1150 for (;;) {
1151 /* get word index */
1152 word = phrasebook[offset] - phrasebook_short;
1153 if (word >= 0) {
1154 word = (word << 8) + phrasebook[offset+1];
1155 offset += 2;
1156 } else
1157 word = phrasebook[offset++];
1158 if (i) {
1159 if (i > buflen)
1160 return 0; /* buffer overflow */
1161 buffer[i++] = ' ';
1162 }
1163 /* copy word string from lexicon. the last character in the
1164 word has bit 7 set. the last word in a string ends with
1165 0x80 */
1166 w = lexicon + lexicon_offset[word];
1167 while (*w < 128) {
1168 if (i >= buflen)
1169 return 0; /* buffer overflow */
1170 buffer[i++] = *w++;
1171 }
1172 if (i >= buflen)
1173 return 0; /* buffer overflow */
1174 buffer[i++] = *w & 127;
1175 if (*w == 128)
1176 break; /* end of word */
1177 }
1178
1179 return 1;
1180}
1181
1182static int
Victor Stinnere6b8c522020-10-15 16:22:19 +02001183capi_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1184 int with_alias_and_seq)
1185{
1186 unicodedata_module_state *state = &global_module_state;
1187 return _getucname(state, self, code, buffer, buflen, with_alias_and_seq);
1188
1189}
1190
1191static int
1192_cmpname(unicodedata_module_state *state, PyObject *self,
1193 int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001194{
1195 /* check if code corresponds to the given name */
1196 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001197 char buffer[NAME_MAXLEN+1];
Victor Stinnere6b8c522020-10-15 16:22:19 +02001198 if (!_getucname(state, self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001199 return 0;
1200 for (i = 0; i < namelen; i++) {
Jordon Xu2ec70102019-09-11 00:04:08 +08001201 if (Py_TOUPPER(name[i]) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001202 return 0;
1203 }
1204 return buffer[namelen] == '\0';
1205}
1206
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001207static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001208find_syllable(const char *str, int *len, int *pos, int count, int column)
1209{
1210 int i, len1;
1211 *len = -1;
1212 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001213 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001214 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 if (len1 <= *len)
1216 continue;
1217 if (strncmp(str, s, len1) == 0) {
1218 *len = len1;
1219 *pos = i;
1220 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001221 }
1222 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001224 }
1225}
1226
Fredrik Lundh06d12682001-01-24 07:59:11 +00001227static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001228_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001229{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001230 /* check if named sequences are allowed */
1231 if (!with_named_seq && IS_NAMED_SEQ(cp))
1232 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001233 /* if the code point is in the PUA range that we use for aliases,
1234 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001235 if (IS_ALIAS(cp))
1236 *code = name_aliases[cp-aliases_start];
1237 else
1238 *code = cp;
1239 return 1;
1240}
1241
1242static int
Victor Stinnere6b8c522020-10-15 16:22:19 +02001243_getcode(unicodedata_module_state *state, PyObject* self,
1244 const char* name, int namelen, Py_UCS4* code, int with_named_seq)
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001245{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001246 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001247 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001248 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001249 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001250 unsigned int h, v;
1251 unsigned int mask = code_size-1;
1252 unsigned int i, incr;
1253
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001254 /* Check for hangul syllables. */
1255 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001256 int len, L = -1, V = -1, T = -1;
1257 const char *pos = name + 16;
1258 find_syllable(pos, &len, &L, LCount, 0);
1259 pos += len;
1260 find_syllable(pos, &len, &V, VCount, 1);
1261 pos += len;
1262 find_syllable(pos, &len, &T, TCount, 2);
1263 pos += len;
1264 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1265 *code = SBase + (L*VCount+V)*TCount + T;
1266 return 1;
1267 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001268 /* Otherwise, it's an illegal syllable name. */
1269 return 0;
1270 }
1271
1272 /* Check for unified ideographs. */
1273 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1274 /* Four or five hexdigits must follow. */
1275 v = 0;
1276 name += 22;
1277 namelen -= 22;
1278 if (namelen != 4 && namelen != 5)
1279 return 0;
1280 while (namelen--) {
1281 v *= 16;
1282 if (*name >= '0' && *name <= '9')
1283 v += *name - '0';
1284 else if (*name >= 'A' && *name <= 'F')
1285 v += *name - 'A' + 10;
1286 else
1287 return 0;
1288 name++;
1289 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001290 if (!is_unified_ideograph(v))
1291 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001292 *code = v;
1293 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001294 }
1295
Fredrik Lundh06d12682001-01-24 07:59:11 +00001296 /* the following is the same as python's dictionary lookup, with
1297 only minor changes. see the makeunicodedata script for more
1298 details */
1299
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001300 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001301 i = (~h) & mask;
1302 v = code_hash[i];
1303 if (!v)
1304 return 0;
Victor Stinnere6b8c522020-10-15 16:22:19 +02001305 if (_cmpname(state, self, v, name, namelen)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001306 return _check_alias_and_seq(v, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001307 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001308 incr = (h ^ (h >> 3)) & mask;
1309 if (!incr)
1310 incr = mask;
1311 for (;;) {
1312 i = (i + incr) & mask;
1313 v = code_hash[i];
1314 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001315 return 0;
Victor Stinnere6b8c522020-10-15 16:22:19 +02001316 if (_cmpname(state, self, v, name, namelen)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001317 return _check_alias_and_seq(v, code, with_named_seq);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001318 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001319 incr = incr << 1;
1320 if (incr > mask)
1321 incr = incr ^ code_poly;
1322 }
1323}
1324
Victor Stinnere6b8c522020-10-15 16:22:19 +02001325static int
1326capi_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1327 int with_named_seq)
1328{
1329 unicodedata_module_state *state = &global_module_state;
1330 return _getcode(state, self, name, namelen, code, with_named_seq);
1331
1332}
1333
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001335{
1336 sizeof(_PyUnicode_Name_CAPI),
Victor Stinnere6b8c522020-10-15 16:22:19 +02001337 capi_getucname,
1338 capi_getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001339};
1340
/* -------------------------------------------------------------------- */
/* Python bindings */
1343
Serhiy Storchaka63596412015-04-17 21:18:49 +03001344/*[clinic input]
1345unicodedata.UCD.name
1346
1347 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001348 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001349 default: object=NULL
1350 /
1351
1352Returns the name assigned to the character chr as a string.
1353
1354If no name is defined, default is returned, or, if not given,
1355ValueError is raised.
1356[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001357
Fredrik Lundh06d12682001-01-24 07:59:11 +00001358static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001359unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001360/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001361{
Victor Stinnere6b8c522020-10-15 16:22:19 +02001362 unicodedata_module_state *state = &global_module_state;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001363 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001364 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001365
Victor Stinnere6b8c522020-10-15 16:22:19 +02001366 if (!_getucname(state, self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001367 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001368 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001369 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001370 }
1371 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001372 Py_INCREF(default_value);
1373 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001375 }
1376
Walter Dörwald4254e762007-06-05 16:04:09 +00001377 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001378}
1379
Serhiy Storchaka63596412015-04-17 21:18:49 +03001380/*[clinic input]
1381unicodedata.UCD.lookup
1382
1383 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001384 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001385 /
1386
1387Look up character by name.
1388
1389If a character with the given name is found, return the
1390corresponding character. If not found, KeyError is raised.
1391[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001392
Fredrik Lundh06d12682001-01-24 07:59:11 +00001393static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001394unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1395 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001396/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001397{
Victor Stinnere6b8c522020-10-15 16:22:19 +02001398 unicodedata_module_state *state = &global_module_state;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001399 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001400 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001401 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001402 PyErr_SetString(PyExc_KeyError, "name too long");
1403 return NULL;
1404 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001405
Victor Stinnere6b8c522020-10-15 16:22:19 +02001406 if (!_getcode(state, self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001407 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001408 return NULL;
1409 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001410 /* check if code is in the PUA range that we use for named sequences
1411 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001412 if (IS_NAMED_SEQ(code)) {
1413 index = code-named_sequences_start;
1414 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1415 named_sequences[index].seq,
1416 named_sequences[index].seqlen);
1417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001419}
1420
/* XXX Add doc strings. */
1422
1423static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001424 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001425 UNICODEDATA_UCD_DIGIT_METHODDEF
1426 UNICODEDATA_UCD_NUMERIC_METHODDEF
1427 UNICODEDATA_UCD_CATEGORY_METHODDEF
1428 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1429 UNICODEDATA_UCD_COMBINING_METHODDEF
1430 UNICODEDATA_UCD_MIRRORED_METHODDEF
1431 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1432 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1433 UNICODEDATA_UCD_NAME_METHODDEF
1434 UNICODEDATA_UCD_LOOKUP_METHODDEF
Max Bélanger2810dd72018-11-04 15:58:24 -08001435 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001436 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001437 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001438};
1439
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001440static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001441 /* The ob_type field must be initialized in the module init function
1442 * to be portable to Windows without using C++. */
1443 PyVarObject_HEAD_INIT(NULL, 0)
1444 "unicodedata.UCD", /*tp_name*/
1445 sizeof(PreviousDBVersion), /*tp_basicsize*/
1446 0, /*tp_itemsize*/
1447 /* methods */
1448 (destructor)PyObject_Del, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02001449 0, /*tp_vectorcall_offset*/
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001450 0, /*tp_getattr*/
1451 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02001452 0, /*tp_as_async*/
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001453 0, /*tp_repr*/
1454 0, /*tp_as_number*/
1455 0, /*tp_as_sequence*/
1456 0, /*tp_as_mapping*/
1457 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001458 0, /*tp_call*/
1459 0, /*tp_str*/
1460 PyObject_GenericGetAttr,/*tp_getattro*/
1461 0, /*tp_setattro*/
1462 0, /*tp_as_buffer*/
1463 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1464 0, /*tp_doc*/
1465 0, /*tp_traverse*/
1466 0, /*tp_clear*/
1467 0, /*tp_richcompare*/
1468 0, /*tp_weaklistoffset*/
1469 0, /*tp_iter*/
1470 0, /*tp_iternext*/
1471 unicodedata_functions, /*tp_methods*/
1472 DB_members, /*tp_members*/
1473 0, /*tp_getset*/
1474 0, /*tp_base*/
1475 0, /*tp_dict*/
1476 0, /*tp_descr_get*/
1477 0, /*tp_descr_set*/
1478 0, /*tp_dictoffset*/
1479 0, /*tp_init*/
1480 0, /*tp_alloc*/
1481 0, /*tp_new*/
1482 0, /*tp_free*/
1483 0, /*tp_is_gc*/
1484};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001485
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001486PyDoc_STRVAR(unicodedata_docstring,
1487"This module provides access to the Unicode Character Database which\n\
1488defines character properties for all Unicode characters. The data in\n\
1489this database is based on the UnicodeData.txt file version\n\
luzpaza5293b42017-11-05 07:37:50 -06001490" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001491\n\
1492The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001493UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001494
1495static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001496 PyModuleDef_HEAD_INIT,
1497 "unicodedata",
1498 unicodedata_docstring,
1499 -1,
1500 unicodedata_functions,
1501 NULL,
1502 NULL,
1503 NULL,
1504 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001505};
1506
Mark Hammond62b1ab12002-07-23 06:31:15 +00001507PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001508PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001509{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001510 PyObject *m, *v;
Victor Stinnere6b8c522020-10-15 16:22:19 +02001511 unicodedata_module_state *state = &global_module_state;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001512
Victor Stinnerd2ec81a2020-02-07 09:17:07 +01001513 Py_SET_TYPE(&UCD_Type, &PyType_Type);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001514 state->ucd_type = &UCD_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001515
Martin v. Löwis1a214512008-06-11 05:26:20 +00001516 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001517 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001518 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001519
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001520 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Victor Stinnere6b8c522020-10-15 16:22:19 +02001521 Py_INCREF(state->ucd_type);
1522 PyModule_AddObject(m, "UCD", (PyObject*)state->ucd_type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001523
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001524 /* Previous versions */
Victor Stinnere6b8c522020-10-15 16:22:19 +02001525 v = new_previous_version(state, "3.2.0",
1526 get_change_3_2_0, normalization_3_2_0);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001527 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001528 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001529
Fredrik Lundh06d12682001-01-24 07:59:11 +00001530 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001531 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001532 if (v != NULL)
1533 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001534 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001535}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001536
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/