blob: 8a1198a2b712d931d6c209e4a02678d284fa317a [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Victor Stinner4a21e572020-04-15 02:35:41 +020020#include "structmember.h" // PyMemberDef
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Greg Price2f094132019-09-03 19:45:44 -070022#include <stdbool.h>
23
Max Bélanger2810dd72018-11-04 15:58:24 -080024_Py_IDENTIFIER(NFC);
25_Py_IDENTIFIER(NFD);
26_Py_IDENTIFIER(NFKC);
27_Py_IDENTIFIER(NFKD);
28
Larry Hastings61272b72014-01-07 12:41:53 -080029/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080030module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080031class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080032[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080033/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080034
Fredrik Lundh06d12682001-01-24 07:59:11 +000035/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000036
/* One row of the generated property table, as returned by _getrecord_ex().
   Several fields are small indices into name tables rather than the
   property values themselves. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
48
/* Per-code-point difference between the current database and a previous
   Unicode version.  Readers in this file treat 0xFF in a byte field as
   "not changed", and category_changed == 0 as "unassigned in the old
   version".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
58
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059/* data file generated by Tools/unicode/makeunicodedata.py */
60#include "unicodedata_db.h"
61
62static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000063_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000064{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000065 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000066 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000067 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74}
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076/* ------------- Previous-version API ------------------------------------- */
77typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82} PreviousDBVersion;
83
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030084#include "clinic/unicodedata.c.h"
85
/* Fetch the change record for code point v from a PreviousDBVersion
   object.  The caller must have verified the type of self (UCD_Check). */
#define get_old_record(self, v) \
    (((PreviousDBVersion *)(self))->getrecord(v))
87
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090 {NULL}
91};
92
Thomas Wouters89f507f2006-12-13 04:49:30 +000093/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000094static PyTypeObject UCD_Type;
Dong-hee Na1b55b652020-02-17 19:09:15 +090095#define UCD_Check(o) Py_IS_TYPE(o, &UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000096
97static PyObject*
98new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
99 Py_UCS4 (*normalization)(Py_UCS4))
100{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000101 PreviousDBVersion *self;
102 self = PyObject_New(PreviousDBVersion, &UCD_Type);
103 if (self == NULL)
104 return NULL;
105 self->name = name;
106 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000107 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000108 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000109}
110
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000111
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000112/* --- Module API --------------------------------------------------------- */
113
Larry Hastings61272b72014-01-07 12:41:53 -0800114/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800115unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700116
Serhiy Storchaka63596412015-04-17 21:18:49 +0300117 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700118 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700119 default: object=NULL
120 /
121
122Converts a Unicode character into its equivalent decimal value.
123
Serhiy Storchaka63596412015-04-17 21:18:49 +0300124Returns the decimal value assigned to the character chr as integer.
125If no such value is defined, default is returned, or, if not given,
126ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800127[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700128
Larry Hastings31826802013-10-19 00:09:25 -0700129static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300130unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400131 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700132/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700133{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000135 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300136 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000137
Martin v. Löwis1a214512008-06-11 05:26:20 +0000138 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000139 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000140 if (old->category_changed == 0) {
141 /* unassigned */
142 have_old = 1;
143 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000144 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000145 else if (old->decimal_changed != 0xFF) {
146 have_old = 1;
147 rc = old->decimal_changed;
148 }
149 }
150
151 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000152 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000153 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700154 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 PyErr_SetString(PyExc_ValueError,
156 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000157 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000158 }
159 else {
Larry Hastings31826802013-10-19 00:09:25 -0700160 Py_INCREF(default_value);
161 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000162 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000163 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000164 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000165}
166
Serhiy Storchaka63596412015-04-17 21:18:49 +0300167/*[clinic input]
168unicodedata.UCD.digit
169
170 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700171 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300172 default: object=NULL
173 /
174
175Converts a Unicode character into its equivalent digit value.
176
177Returns the digit value assigned to the character chr as integer.
178If no such value is defined, default is returned, or, if not given,
179ValueError is raised.
180[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000181
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000182static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300183unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700184/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000186 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300187 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000188 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300190 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000192 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193 }
194 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300195 Py_INCREF(default_value);
196 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000197 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000198 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000199 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000200}
201
Serhiy Storchaka63596412015-04-17 21:18:49 +0300202/*[clinic input]
203unicodedata.UCD.numeric
204
205 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700206 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300207 default: object=NULL
208 /
209
210Converts a Unicode character into its equivalent numeric value.
211
212Returns the numeric value assigned to the character chr as float.
213If no such value is defined, default is returned, or, if not given,
214ValueError is raised.
215[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000216
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000217static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300218unicodedata_UCD_numeric_impl(PyObject *self, int chr,
219 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700220/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000221{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000222 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000223 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300224 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225
Martin v. Löwis1a214512008-06-11 05:26:20 +0000226 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000227 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000228 if (old->category_changed == 0) {
229 /* unassigned */
230 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000233 else if (old->decimal_changed != 0xFF) {
234 have_old = 1;
235 rc = old->decimal_changed;
236 }
237 }
238
239 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000240 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000241 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300242 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 PyErr_SetString(PyExc_ValueError, "not a numeric character");
244 return NULL;
245 }
246 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300247 Py_INCREF(default_value);
248 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000249 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250 }
251 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252}
253
Serhiy Storchaka63596412015-04-17 21:18:49 +0300254/*[clinic input]
255unicodedata.UCD.category
256
257 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700258 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300259 /
260
261Returns the general category assigned to the character chr as string.
262[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000263
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000264static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300265unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700266/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000267{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000268 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300269 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000270 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000271 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000272 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000273 if (old->category_changed != 0xFF)
274 index = old->category_changed;
275 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000276 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000277}
278
Serhiy Storchaka63596412015-04-17 21:18:49 +0300279/*[clinic input]
280unicodedata.UCD.bidirectional
281
282 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700283 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300284 /
285
286Returns the bidirectional class assigned to the character chr as string.
287
288If no such value is defined, an empty string is returned.
289[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000290
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000291static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300292unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700293/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000295 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300296 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000297 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000298 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000299 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000300 if (old->category_changed == 0)
301 index = 0; /* unassigned */
302 else if (old->bidir_changed != 0xFF)
303 index = old->bidir_changed;
304 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000305 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000306}
307
Serhiy Storchaka63596412015-04-17 21:18:49 +0300308/*[clinic input]
309unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000310
Serhiy Storchaka63596412015-04-17 21:18:49 +0300311 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700312 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300313 /
314
315Returns the canonical combining class assigned to the character chr as integer.
316
317Returns 0 if no combining class is defined.
318[clinic start generated code]*/
319
320static int
321unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700322/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000323{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000324 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300325 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000326 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000327 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000328 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000329 if (old->category_changed == 0)
330 index = 0; /* unassigned */
331 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300332 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000333}
334
Serhiy Storchaka63596412015-04-17 21:18:49 +0300335/*[clinic input]
336unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000337
Serhiy Storchaka63596412015-04-17 21:18:49 +0300338 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700339 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300340 /
341
342Returns the mirrored property assigned to the character chr as integer.
343
344Returns 1 if the character has been identified as a "mirrored"
345character in bidirectional text, 0 otherwise.
346[clinic start generated code]*/
347
348static int
349unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700350/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000351{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000352 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300353 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000355 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000356 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000361 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300362 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000363}
364
Serhiy Storchaka63596412015-04-17 21:18:49 +0300365/*[clinic input]
366unicodedata.UCD.east_asian_width
367
368 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700369 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300370 /
371
372Returns the east asian width assigned to the character chr as string.
373[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000374
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000375static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300376unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700377/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000378{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000379 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300380 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000381 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000382 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000383 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000384 if (old->category_changed == 0)
385 index = 0; /* unassigned */
Benjamin Peterson67752312016-09-14 23:53:47 -0700386 else if (old->east_asian_width_changed != 0xFF)
387 index = old->east_asian_width_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000388 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000389 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000390}
391
Serhiy Storchaka63596412015-04-17 21:18:49 +0300392/*[clinic input]
393unicodedata.UCD.decomposition
394
395 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700396 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300397 /
398
399Returns the character decomposition mapping assigned to the character chr as string.
400
401An empty string is returned in case no such mapping is defined.
402[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000403
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000404static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300405unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700406/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000408 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000409 int code, index, count;
410 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000411 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300412 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000413
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000414 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000415
Martin v. Löwis1a214512008-06-11 05:26:20 +0000416 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000417 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000418 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000419 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000420 }
421
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000422 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000423 index = 0;
424 else {
425 index = decomp_index1[(code>>DECOMP_SHIFT)];
426 index = decomp_index2[(index<<DECOMP_SHIFT)+
427 (code&((1<<DECOMP_SHIFT)-1))];
428 }
429
Tim Peters69b83b12001-11-30 07:23:05 +0000430 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000431 is prefix code (from*/
432 count = decomp_data[index] >> 8;
433
434 /* XXX: could allocate the PyString up front instead
435 (strlen(prefix) + 5 * count + 1 bytes) */
436
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000437 /* Based on how index is calculated above and decomp_data is generated
438 from Tools/unicode/makeunicodedata.py, it should not be possible
439 to overflow decomp_prefix. */
440 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200441 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000442
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000443 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000444 i = strlen(decomp_prefix[prefix_index]);
445 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446
447 while (count-- > 0) {
448 if (i)
449 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000450 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000451 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
452 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000453 i += strlen(decomp + i);
454 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000455 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000456}
457
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000458static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000459get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000460{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000461 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000462 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000464 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000465 /* unassigned in old version */
466 *index = 0;
467 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000468 else {
469 *index = decomp_index1[(code>>DECOMP_SHIFT)];
470 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
471 (code&((1<<DECOMP_SHIFT)-1))];
472 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473
Martin v. Löwis677bde22002-11-23 22:08:15 +0000474 /* high byte is number of hex bytes (usually one or two), low byte
475 is prefix code (from*/
476 *count = decomp_data[*index] >> 8;
477 *prefix = decomp_data[*index] & 255;
478
479 (*index)++;
480}
481
/* Constants for the arithmetic Hangul decomposition in nfd_nfkd(): each
   code point in [SBase, SBase+SCount) splits into L, V and (optionally)
   T parts computed from these bases and counts. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount * TCount)
#define SCount  (LCount * NCount)
491
492static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000493nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000494{
495 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200496 Py_UCS4 *output;
497 Py_ssize_t i, o, osize;
498 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300499 const void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000500 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200501 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 Py_ssize_t space, isize;
503 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000504 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000505
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200507 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500508 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300509 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500510 if (space > 10) {
511 if (space <= PY_SSIZE_T_MAX - 10)
512 space += 10;
513 }
514 else {
515 space *= 2;
516 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200517 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500518 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200519 if (!output) {
520 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000521 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200522 }
523 i = o = 0;
524 kind = PyUnicode_KIND(input);
525 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000526
Martin v. Löwis22970662011-09-29 13:39:38 +0200527 while (i < isize) {
528 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000529 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000531 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300532 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000533 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000534 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200535 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000536 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000537 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
538 if (new_output == NULL) {
539 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200540 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000541 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200542 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000543 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000544 }
545 /* Hangul Decomposition. */
546 if (SBase <= code && code < (SBase+SCount)) {
547 int SIndex = code - SBase;
548 int L = LBase + SIndex / NCount;
549 int V = VBase + (SIndex % NCount) / TCount;
550 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200551 output[o++] = L;
552 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000553 space -= 2;
554 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200555 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000556 space --;
557 }
558 continue;
559 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000560 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000561 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000562 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
563 if (value != 0) {
564 stack[stackptr++] = value;
565 continue;
566 }
567 }
568
569 /* Other decompositions. */
570 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000571
572 /* Copy character if it is not decomposable, or has a
573 compatibility decomposition, but we do NFD. */
574 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200575 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000576 space--;
577 continue;
578 }
579 /* Copy decomposition onto the stack, in reverse
580 order. */
581 while(count) {
582 code = decomp_data[index + (--count)];
583 stack[stackptr++] = code;
584 }
585 }
586 }
587
Martin v. Löwis22970662011-09-29 13:39:38 +0200588 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
589 output, o);
590 PyMem_Free(output);
591 if (!result)
592 return NULL;
593 /* result is guaranteed to be ready, as it is compact. */
594 kind = PyUnicode_KIND(result);
595 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000596
597 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200598 i = 0;
599 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
600 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
601 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000602 if (prev == 0 || cur == 0 || prev <= cur) {
603 prev = cur;
604 continue;
605 }
606 /* Non-canonical order. Need to switch *i with previous. */
607 o = i - 1;
608 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200609 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
610 PyUnicode_WRITE(kind, data, o+1,
611 PyUnicode_READ(kind, data, o));
612 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000613 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200614 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000615 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200616 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000617 if (prev == 0 || prev <= cur)
618 break;
619 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200620 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000621 }
622 return result;
623}
624
625static int
Andy Lester982307b2020-03-17 11:38:12 -0500626find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000627{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200628 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000629 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200630 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000631 if (code < start)
632 return -1;
633 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200634 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000635 return nfc[index].index + delta;
636 }
637 }
638 return -1;
639}
640
641static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000642nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000643{
644 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200645 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300646 const void *data;
Martin v. Löwis22970662011-09-29 13:39:38 +0200647 Py_UCS4 *output;
648 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000649 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200650 Py_UCS4 code;
651 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652 int cskipped = 0;
653
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000654 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000655 if (!result)
656 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200657 /* result will be "ready". */
658 kind = PyUnicode_KIND(result);
659 data = PyUnicode_DATA(result);
660 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000661
Martin v. Löwis22970662011-09-29 13:39:38 +0200662 /* We allocate a buffer for the output.
663 If we find that we made no changes, we still return
664 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500665 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200666 if (!output) {
667 PyErr_NoMemory();
668 Py_DECREF(result);
669 return 0;
670 }
671 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672
Martin v. Löwis677bde22002-11-23 22:08:15 +0000673 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200674 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000675 for (index = 0; index < cskipped; index++) {
676 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000677 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000678 Remove from list. */
679 skipped[index] = skipped[cskipped-1];
680 cskipped--;
681 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000682 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000683 }
684 }
685 /* Hangul Composition. We don't need to check for <LV,T>
686 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200687 code = PyUnicode_READ(kind, data, i);
688 if (LBase <= code && code < (LBase+LCount) &&
689 i + 1 < len &&
690 VBase <= PyUnicode_READ(kind, data, i+1) &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900691 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
692 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
693 and V character is a modern vowel (0x1161 ~ 0x1175). */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200695 LIndex = code - LBase;
696 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 code = SBase + (LIndex*VCount+VIndex)*TCount;
698 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200699 if (i < len &&
Wonsup Yoond1348092018-06-15 21:03:14 +0900700 TBase < PyUnicode_READ(kind, data, i) &&
701 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
702 /* check T character is a modern trailing consonant
703 (0x11A8 ~ 0x11C2). */
Martin v. Löwis22970662011-09-29 13:39:38 +0200704 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000705 i++;
706 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200707 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000708 continue;
709 }
710
Martin v. Löwis22970662011-09-29 13:39:38 +0200711 /* code is still input[i] here */
Andy Lester982307b2020-03-17 11:38:12 -0500712 f = find_nfc_index(nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000713 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200714 output[o++] = code;
715 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000716 continue;
717 }
718 /* Find next unblocked character. */
719 i1 = i+1;
720 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200721 /* output base character for now; might be updated later. */
722 output[o] = PyUnicode_READ(kind, data, i);
723 while (i1 < len) {
724 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
725 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000726 if (comb) {
727 if (comb1 == 0)
728 break;
729 if (comb >= comb1) {
730 /* Character is blocked. */
731 i1++;
732 continue;
733 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000734 }
Andy Lester982307b2020-03-17 11:38:12 -0500735 l = find_nfc_index(nfc_last, code1);
Martin v. Löwis22970662011-09-29 13:39:38 +0200736 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000737 is a starter, we don't need to look further.
738 Otherwise, record the combining class. */
739 if (l == -1) {
740 not_combinable:
741 if (comb1 == 0)
742 break;
743 comb = comb1;
744 i1++;
745 continue;
746 }
747 index = f*TOTAL_LAST + l;
748 index1 = comp_index[index >> COMP_SHIFT];
749 code = comp_data[(index1<<COMP_SHIFT)+
750 (index&((1<<COMP_SHIFT)-1))];
751 if (code == 0)
752 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000753
Martin v. Löwis677bde22002-11-23 22:08:15 +0000754 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200755 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000756 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000757 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000758 skipped[cskipped++] = i1;
759 i1++;
Andy Lester982307b2020-03-17 11:38:12 -0500760 f = find_nfc_index(nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000761 if (f == -1)
762 break;
763 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200764 /* Output character was already written.
765 Just advance the indices. */
766 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000767 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200768 if (o == len) {
769 /* No changes. Return original string. */
770 PyMem_Free(output);
771 return result;
772 }
773 Py_DECREF(result);
774 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
775 output, o);
776 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000777 return result;
778}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000779
// Outcome of a normalization quickcheck.
// The numeric values need to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
Max Bélanger2810dd72018-11-04 15:58:24 -0800783
Greg Price2f094132019-09-03 19:45:44 -0700784/* Run the Unicode normalization "quickcheck" algorithm.
785 *
786 * Return YES or NO if quickcheck determines the input is certainly
787 * normalized or certainly not, and MAYBE if quickcheck is unable to
788 * tell.
789 *
790 * If `yes_only` is true, then return MAYBE as soon as we determine
791 * the answer is not YES.
792 *
793 * For background and details on the algorithm, see UAX #15:
794 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
795 */
796static QuickcheckResult
797is_normalized_quickcheck(PyObject *self, PyObject *input,
Greg Price7669cb82019-09-09 02:16:31 -0700798 bool nfc, bool k, bool yes_only)
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000799{
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000800 /* An older version of the database is requested, quickchecks must be
801 disabled. */
802 if (self && UCD_Check(self))
Max Bélanger2810dd72018-11-04 15:58:24 -0800803 return NO;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000804
Greg Price2f094132019-09-03 19:45:44 -0700805 Py_ssize_t i, len;
806 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300807 const void *data;
Greg Price2f094132019-09-03 19:45:44 -0700808 unsigned char prev_combining = 0;
809
810 /* The two quickcheck bits at this shift have type QuickcheckResult. */
811 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
812
813 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000814
Martin v. Löwis22970662011-09-29 13:39:38 +0200815 i = 0;
816 kind = PyUnicode_KIND(input);
817 data = PyUnicode_DATA(input);
818 len = PyUnicode_GET_LENGTH(input);
819 while (i < len) {
820 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
821 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000822
Greg Price2f094132019-09-03 19:45:44 -0700823 unsigned char combining = record->combining;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000824 if (combining && prev_combining > combining)
Max Bélanger2810dd72018-11-04 15:58:24 -0800825 return NO; /* non-canonical sort order, not normalized */
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000826 prev_combining = combining;
Greg Price2f094132019-09-03 19:45:44 -0700827
828 unsigned char quickcheck_whole = record->normalization_quick_check;
829 if (yes_only) {
830 if (quickcheck_whole & (3 << quickcheck_shift))
831 return MAYBE;
832 } else {
833 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
834 case NO:
835 return NO;
836 case MAYBE:
837 result = MAYBE; /* this string might need normalization */
838 }
839 }
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000840 }
Greg Price2f094132019-09-03 19:45:44 -0700841 return result;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000842}
843
Serhiy Storchaka63596412015-04-17 21:18:49 +0300844/*[clinic input]
Max Bélanger2810dd72018-11-04 15:58:24 -0800845unicodedata.UCD.is_normalized
846
847 self: self
848 form: unicode
849 unistr as input: unicode
850 /
851
852Return whether the Unicode string unistr is in the normal form 'form'.
853
854Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
855[clinic start generated code]*/
856
857static PyObject *
858unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
859 PyObject *input)
860/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
861{
862 if (PyUnicode_READY(input) == -1) {
863 return NULL;
864 }
865
866 if (PyUnicode_GET_LENGTH(input) == 0) {
867 /* special case empty input strings. */
868 Py_RETURN_TRUE;
869 }
870
871 PyObject *result;
Greg Price7669cb82019-09-09 02:16:31 -0700872 bool nfc = false;
873 bool k = false;
Greg Price2f094132019-09-03 19:45:44 -0700874 QuickcheckResult m;
Max Bélanger2810dd72018-11-04 15:58:24 -0800875
876 PyObject *cmp;
877 int match = 0;
878
879 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700880 nfc = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800881 }
882 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700883 nfc = true;
884 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800885 }
886 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
887 /* matches default values for `nfc` and `k` */
888 }
889 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700890 k = true;
Max Bélanger2810dd72018-11-04 15:58:24 -0800891 }
892 else {
893 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
894 return NULL;
895 }
896
Greg Price2f094132019-09-03 19:45:44 -0700897 m = is_normalized_quickcheck(self, input, nfc, k, false);
Max Bélanger2810dd72018-11-04 15:58:24 -0800898
899 if (m == MAYBE) {
900 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
901 if (cmp == NULL) {
902 return NULL;
903 }
904 match = PyUnicode_Compare(input, cmp);
905 Py_DECREF(cmp);
906 result = (match == 0) ? Py_True : Py_False;
907 }
908 else {
909 result = (m == YES) ? Py_True : Py_False;
910 }
911
912 Py_INCREF(result);
913 return result;
914}
915
916
917/*[clinic input]
Serhiy Storchaka63596412015-04-17 21:18:49 +0300918unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000919
Serhiy Storchaka63596412015-04-17 21:18:49 +0300920 self: self
Max Bélanger2810dd72018-11-04 15:58:24 -0800921 form: unicode
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +0300922 unistr as input: unicode
Serhiy Storchaka63596412015-04-17 21:18:49 +0300923 /
924
925Return the normal form 'form' for the Unicode string unistr.
926
927Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
928[clinic start generated code]*/
929
930static PyObject *
Max Bélanger2810dd72018-11-04 15:58:24 -0800931unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
Serhiy Storchaka63596412015-04-17 21:18:49 +0300932 PyObject *input)
Max Bélanger2810dd72018-11-04 15:58:24 -0800933/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000934{
Martin v. Löwis22970662011-09-29 13:39:38 +0200935 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000936 /* Special case empty input strings, since resizing
937 them later would cause internal errors. */
938 Py_INCREF(input);
939 return input;
940 }
941
Max Bélanger2810dd72018-11-04 15:58:24 -0800942 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700943 if (is_normalized_quickcheck(self, input, true, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000944 Py_INCREF(input);
945 return input;
946 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000947 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000948 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800949 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
Greg Price7669cb82019-09-09 02:16:31 -0700950 if (is_normalized_quickcheck(self, input, true, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000951 Py_INCREF(input);
952 return input;
953 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000954 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000955 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800956 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700957 if (is_normalized_quickcheck(self, input, false, false, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000958 Py_INCREF(input);
959 return input;
960 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000961 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000962 }
Max Bélanger2810dd72018-11-04 15:58:24 -0800963 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
Greg Price7669cb82019-09-09 02:16:31 -0700964 if (is_normalized_quickcheck(self, input, false, true, true) == YES) {
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000965 Py_INCREF(input);
966 return input;
967 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000968 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000969 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000970 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
971 return NULL;
972}
973
Fredrik Lundh06d12682001-01-24 07:59:11 +0000974/* -------------------------------------------------------------------- */
975/* unicode character name tables */
976
977/* data file generated by Tools/unicode/makeunicodedata.py */
978#include "unicodename_db.h"
979
980/* -------------------------------------------------------------------- */
981/* database code (cut and pasted from the unidb package) */
982
/* Hash the first `len` bytes of `s`, passing each byte through Py_TOUPPER
   so that the result does not depend on letter case.  `scale` is the
   multiplier applied to the running value before each byte is added.
   Whenever any of bits 24..31 become set they are folded back into the low
   byte and the value is cut down to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
        pos++;
    }
    return hash;
}
997
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Row i holds, per column:
     [0] the fragment for leading consonant i   (19 entries, then NULL)
     [1] the fragment for vowel i               (21 entries, then NULL)
     [2] the fragment for trailing consonant i  (28 entries)
   An empty string is a valid fragment; NULL marks "no such index". */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
1028
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001029/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001030static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001031is_unified_ideograph(Py_UCS4 code)
1032{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001033 return
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001034 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1035 (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */
1036 (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +00001037 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -05001038 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
Benjamin Peterson279a9622017-06-22 22:31:08 -07001039 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
Benjamin Peterson051b9d02020-03-10 20:41:34 -07001040 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1041 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001042}
1043
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * Each range is half-open: start is included, end is not.
 * The argument is parenthesized so that an expression (e.g. `x & mask`) is
 * tested as a whole; it is evaluated twice, so it must have no side
 * effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1049
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001050static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001051_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1052 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001053{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001054 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001055 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1056 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001057 int offset;
1058 int i;
1059 int word;
Inada Naoki6fec9052019-04-17 08:40:34 +09001060 const unsigned char* w;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001061
Martin v. Löwisc3509122006-03-11 12:16:23 +00001062 if (code >= 0x110000)
1063 return 0;
1064
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001065 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001066 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1067 return 0;
1068
Martin v. Löwis1a214512008-06-11 05:26:20 +00001069 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001070 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001071 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001072 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1073 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001074 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001075 if (old->category_changed == 0) {
1076 /* unassigned */
1077 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001079 }
1080
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001081 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001082 /* Hangul syllable. */
1083 int SIndex = code - SBase;
1084 int L = SIndex / NCount;
1085 int V = (SIndex % NCount) / TCount;
1086 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001087
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001088 if (buflen < 27)
1089 /* Worst case: HANGUL SYLLABLE <10chars>. */
1090 return 0;
1091 strcpy(buffer, "HANGUL SYLLABLE ");
1092 buffer += 16;
1093 strcpy(buffer, hangul_syllables[L][0]);
1094 buffer += strlen(hangul_syllables[L][0]);
1095 strcpy(buffer, hangul_syllables[V][1]);
1096 buffer += strlen(hangul_syllables[V][1]);
1097 strcpy(buffer, hangul_syllables[T][2]);
1098 buffer += strlen(hangul_syllables[T][2]);
1099 *buffer = '\0';
1100 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001101 }
1102
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001103 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001104 if (buflen < 28)
1105 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1106 return 0;
1107 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1108 return 1;
1109 }
1110
Fredrik Lundh06d12682001-01-24 07:59:11 +00001111 /* get offset into phrasebook */
1112 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1113 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1114 (code&((1<<phrasebook_shift)-1))];
1115 if (!offset)
1116 return 0;
1117
1118 i = 0;
1119
1120 for (;;) {
1121 /* get word index */
1122 word = phrasebook[offset] - phrasebook_short;
1123 if (word >= 0) {
1124 word = (word << 8) + phrasebook[offset+1];
1125 offset += 2;
1126 } else
1127 word = phrasebook[offset++];
1128 if (i) {
1129 if (i > buflen)
1130 return 0; /* buffer overflow */
1131 buffer[i++] = ' ';
1132 }
1133 /* copy word string from lexicon. the last character in the
1134 word has bit 7 set. the last word in a string ends with
1135 0x80 */
1136 w = lexicon + lexicon_offset[word];
1137 while (*w < 128) {
1138 if (i >= buflen)
1139 return 0; /* buffer overflow */
1140 buffer[i++] = *w++;
1141 }
1142 if (i >= buflen)
1143 return 0; /* buffer overflow */
1144 buffer[i++] = *w & 127;
1145 if (*w == 128)
1146 break; /* end of word */
1147 }
1148
1149 return 1;
1150}
1151
1152static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001153_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001154{
1155 /* check if code corresponds to the given name */
1156 int i;
Christian Heimes2f366ca2016-09-23 20:20:27 +02001157 char buffer[NAME_MAXLEN+1];
1158 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001159 return 0;
1160 for (i = 0; i < namelen; i++) {
Jordon Xu2ec70102019-09-11 00:04:08 +08001161 if (Py_TOUPPER(name[i]) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001162 return 0;
1163 }
1164 return buffer[namelen] == '\0';
1165}
1166
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001167static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001168find_syllable(const char *str, int *len, int *pos, int count, int column)
1169{
1170 int i, len1;
1171 *len = -1;
1172 for (i = 0; i < count; i++) {
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02001173 const char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001174 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001175 if (len1 <= *len)
1176 continue;
1177 if (strncmp(str, s, len1) == 0) {
1178 *len = len1;
1179 *pos = i;
1180 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001181 }
1182 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001183 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001184 }
1185}
1186
Fredrik Lundh06d12682001-01-24 07:59:11 +00001187static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001188_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001189{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001190 /* check if named sequences are allowed */
1191 if (!with_named_seq && IS_NAMED_SEQ(cp))
1192 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001193 /* if the code point is in the PUA range that we use for aliases,
1194 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001195 if (IS_ALIAS(cp))
1196 *code = name_aliases[cp-aliases_start];
1197 else
1198 *code = cp;
1199 return 1;
1200}
1201
1202static int
1203_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1204 int with_named_seq)
1205{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001206 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001207 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001208 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001209 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001210 unsigned int h, v;
1211 unsigned int mask = code_size-1;
1212 unsigned int i, incr;
1213
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001214 /* Check for hangul syllables. */
1215 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001216 int len, L = -1, V = -1, T = -1;
1217 const char *pos = name + 16;
1218 find_syllable(pos, &len, &L, LCount, 0);
1219 pos += len;
1220 find_syllable(pos, &len, &V, VCount, 1);
1221 pos += len;
1222 find_syllable(pos, &len, &T, TCount, 2);
1223 pos += len;
1224 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1225 *code = SBase + (L*VCount+V)*TCount + T;
1226 return 1;
1227 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001228 /* Otherwise, it's an illegal syllable name. */
1229 return 0;
1230 }
1231
1232 /* Check for unified ideographs. */
1233 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1234 /* Four or five hexdigits must follow. */
1235 v = 0;
1236 name += 22;
1237 namelen -= 22;
1238 if (namelen != 4 && namelen != 5)
1239 return 0;
1240 while (namelen--) {
1241 v *= 16;
1242 if (*name >= '0' && *name <= '9')
1243 v += *name - '0';
1244 else if (*name >= 'A' && *name <= 'F')
1245 v += *name - 'A' + 10;
1246 else
1247 return 0;
1248 name++;
1249 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001250 if (!is_unified_ideograph(v))
1251 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001252 *code = v;
1253 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001254 }
1255
Fredrik Lundh06d12682001-01-24 07:59:11 +00001256 /* the following is the same as python's dictionary lookup, with
1257 only minor changes. see the makeunicodedata script for more
1258 details */
1259
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001260 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001261 i = (~h) & mask;
1262 v = code_hash[i];
1263 if (!v)
1264 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001265 if (_cmpname(self, v, name, namelen))
1266 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001267 incr = (h ^ (h >> 3)) & mask;
1268 if (!incr)
1269 incr = mask;
1270 for (;;) {
1271 i = (i + incr) & mask;
1272 v = code_hash[i];
1273 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001274 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001275 if (_cmpname(self, v, name, namelen))
1276 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001277 incr = incr << 1;
1278 if (incr > mask)
1279 incr = incr ^ code_poly;
1280 }
1281}
1282
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001283static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001284{
1285 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001286 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001287 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001288};
1289
1290/* -------------------------------------------------------------------- */
1291/* Python bindings */
1292
Serhiy Storchaka63596412015-04-17 21:18:49 +03001293/*[clinic input]
1294unicodedata.UCD.name
1295
1296 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001297 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001298 default: object=NULL
1299 /
1300
1301Returns the name assigned to the character chr as a string.
1302
1303If no name is defined, default is returned, or, if not given,
1304ValueError is raised.
1305[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001306
Fredrik Lundh06d12682001-01-24 07:59:11 +00001307static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001308unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001309/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001310{
Christian Heimes2f366ca2016-09-23 20:20:27 +02001311 char name[NAME_MAXLEN+1];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001312 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001313
Christian Heimes2f366ca2016-09-23 20:20:27 +02001314 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001315 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001317 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001318 }
1319 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001320 Py_INCREF(default_value);
1321 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001322 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001323 }
1324
Walter Dörwald4254e762007-06-05 16:04:09 +00001325 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001326}
1327
Serhiy Storchaka63596412015-04-17 21:18:49 +03001328/*[clinic input]
1329unicodedata.UCD.lookup
1330
1331 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001332 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001333 /
1334
1335Look up character by name.
1336
1337If a character with the given name is found, return the
1338corresponding character. If not found, KeyError is raised.
1339[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001340
Fredrik Lundh06d12682001-01-24 07:59:11 +00001341static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001342unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1343 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001344/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001345{
1346 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001347 unsigned int index;
Christian Heimes7ce20132016-09-14 10:25:46 +02001348 if (name_length > NAME_MAXLEN) {
Victor Stinner65a31442014-07-01 16:45:52 +02001349 PyErr_SetString(PyExc_KeyError, "name too long");
1350 return NULL;
1351 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001352
Serhiy Storchaka63596412015-04-17 21:18:49 +03001353 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001354 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001355 return NULL;
1356 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001357 /* check if code is in the PUA range that we use for named sequences
1358 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001359 if (IS_NAMED_SEQ(code)) {
1360 index = code-named_sequences_start;
1361 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1362 named_sequences[index].seq,
1363 named_sequences[index].seqlen);
1364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001366}
1367
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001368/* XXX Add doc strings. */
1369
1370static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001371 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001372 UNICODEDATA_UCD_DIGIT_METHODDEF
1373 UNICODEDATA_UCD_NUMERIC_METHODDEF
1374 UNICODEDATA_UCD_CATEGORY_METHODDEF
1375 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1376 UNICODEDATA_UCD_COMBINING_METHODDEF
1377 UNICODEDATA_UCD_MIRRORED_METHODDEF
1378 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1379 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1380 UNICODEDATA_UCD_NAME_METHODDEF
1381 UNICODEDATA_UCD_LOOKUP_METHODDEF
Max Bélanger2810dd72018-11-04 15:58:24 -08001382 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001383 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001384 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001385};
1386
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001387static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001388 /* The ob_type field must be initialized in the module init function
1389 * to be portable to Windows without using C++. */
1390 PyVarObject_HEAD_INIT(NULL, 0)
1391 "unicodedata.UCD", /*tp_name*/
1392 sizeof(PreviousDBVersion), /*tp_basicsize*/
1393 0, /*tp_itemsize*/
1394 /* methods */
1395 (destructor)PyObject_Del, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02001396 0, /*tp_vectorcall_offset*/
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001397 0, /*tp_getattr*/
1398 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02001399 0, /*tp_as_async*/
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001400 0, /*tp_repr*/
1401 0, /*tp_as_number*/
1402 0, /*tp_as_sequence*/
1403 0, /*tp_as_mapping*/
1404 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001405 0, /*tp_call*/
1406 0, /*tp_str*/
1407 PyObject_GenericGetAttr,/*tp_getattro*/
1408 0, /*tp_setattro*/
1409 0, /*tp_as_buffer*/
1410 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1411 0, /*tp_doc*/
1412 0, /*tp_traverse*/
1413 0, /*tp_clear*/
1414 0, /*tp_richcompare*/
1415 0, /*tp_weaklistoffset*/
1416 0, /*tp_iter*/
1417 0, /*tp_iternext*/
1418 unicodedata_functions, /*tp_methods*/
1419 DB_members, /*tp_members*/
1420 0, /*tp_getset*/
1421 0, /*tp_base*/
1422 0, /*tp_dict*/
1423 0, /*tp_descr_get*/
1424 0, /*tp_descr_set*/
1425 0, /*tp_dictoffset*/
1426 0, /*tp_init*/
1427 0, /*tp_alloc*/
1428 0, /*tp_new*/
1429 0, /*tp_free*/
1430 0, /*tp_is_gc*/
1431};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001432
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001433PyDoc_STRVAR(unicodedata_docstring,
1434"This module provides access to the Unicode Character Database which\n\
1435defines character properties for all Unicode characters. The data in\n\
1436this database is based on the UnicodeData.txt file version\n\
luzpaza5293b42017-11-05 07:37:50 -06001437" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001438\n\
1439The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001440UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001441
1442static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001443 PyModuleDef_HEAD_INIT,
1444 "unicodedata",
1445 unicodedata_docstring,
1446 -1,
1447 unicodedata_functions,
1448 NULL,
1449 NULL,
1450 NULL,
1451 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001452};
1453
Mark Hammond62b1ab12002-07-23 06:31:15 +00001454PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001455PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001456{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001457 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001458
Victor Stinnerd2ec81a2020-02-07 09:17:07 +01001459 Py_SET_TYPE(&UCD_Type, &PyType_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001460
Martin v. Löwis1a214512008-06-11 05:26:20 +00001461 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001462 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001463 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001464
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001465 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001466 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001467 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001468
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001469 /* Previous versions */
1470 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1471 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001472 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001473
Fredrik Lundh06d12682001-01-24 07:59:11 +00001474 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001475 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001476 if (v != NULL)
1477 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001478 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001479}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001480
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001481/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001482Local variables:
1483c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001484indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001485End:
1486*/