blob: fe4e90822a3096f3840d02029b20e1baec4de9b7 [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Larry Hastings61272b72014-01-07 12:41:53 -080022/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080023module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080024class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080025[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080026/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080027
Fredrik Lundh06d12682001-01-24 07:59:11 +000028/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000029
/* One record of per-character properties.  The field order must stay in
   sync with the initializers in the generated unicodedata_db.h. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
41
/* Per-character differences between the current database and a previous
   Unicode version.  The accessors in this file treat 0xFF in a byte field
   as "use the current value" and category_changed == 0 as "unassigned in
   the previous version".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
50
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051/* data file generated by Tools/unicode/makeunicodedata.py */
52#include "unicodedata_db.h"
53
54static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000055_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000056{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000058 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059 index = 0;
60 else {
61 index = index1[(code>>SHIFT)];
62 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
63 }
64
65 return &_PyUnicode_Database_Records[index];
66}
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068/* ------------- Previous-version API ------------------------------------- */
69typedef struct previous_version {
70 PyObject_HEAD
71 const char *name;
72 const change_record* (*getrecord)(Py_UCS4);
73 Py_UCS4 (*normalization)(Py_UCS4);
74} PreviousDBVersion;
75
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030076#include "clinic/unicodedata.c.h"
77
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
79
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000082 {NULL}
83};
84
Thomas Wouters89f507f2006-12-13 04:49:30 +000085/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000086static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000087#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088
89static PyObject*
90new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
91 Py_UCS4 (*normalization)(Py_UCS4))
92{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000093 PreviousDBVersion *self;
94 self = PyObject_New(PreviousDBVersion, &UCD_Type);
95 if (self == NULL)
96 return NULL;
97 self->name = name;
98 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000099 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000101}
102
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000104/* --- Module API --------------------------------------------------------- */
105
Larry Hastings61272b72014-01-07 12:41:53 -0800106/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800107unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700108
Serhiy Storchaka63596412015-04-17 21:18:49 +0300109 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700110 chr: int(accept={str})
Larry Hastings31826802013-10-19 00:09:25 -0700111 default: object=NULL
112 /
113
114Converts a Unicode character into its equivalent decimal value.
115
Serhiy Storchaka63596412015-04-17 21:18:49 +0300116Returns the decimal value assigned to the character chr as integer.
117If no such value is defined, default is returned, or, if not given,
118ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800119[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700120
Larry Hastings31826802013-10-19 00:09:25 -0700121static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300122unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400123 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700124/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
Larry Hastings31826802013-10-19 00:09:25 -0700125{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000126 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000127 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300128 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000129
Martin v. Löwis1a214512008-06-11 05:26:20 +0000130 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000131 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000132 if (old->category_changed == 0) {
133 /* unassigned */
134 have_old = 1;
135 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000136 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000137 else if (old->decimal_changed != 0xFF) {
138 have_old = 1;
139 rc = old->decimal_changed;
140 }
141 }
142
143 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000144 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000145 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700146 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 PyErr_SetString(PyExc_ValueError,
148 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000149 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 }
151 else {
Larry Hastings31826802013-10-19 00:09:25 -0700152 Py_INCREF(default_value);
153 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000155 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000156 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157}
158
Serhiy Storchaka63596412015-04-17 21:18:49 +0300159/*[clinic input]
160unicodedata.UCD.digit
161
162 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700163 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300164 default: object=NULL
165 /
166
167Converts a Unicode character into its equivalent digit value.
168
169Returns the digit value assigned to the character chr as integer.
170If no such value is defined, default is returned, or, if not given,
171ValueError is raised.
172[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000173
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300175unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700176/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000177{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000178 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300179 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300182 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000183 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 }
186 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300187 Py_INCREF(default_value);
188 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000189 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000191 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192}
193
Serhiy Storchaka63596412015-04-17 21:18:49 +0300194/*[clinic input]
195unicodedata.UCD.numeric
196
197 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700198 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300199 default: object=NULL
200 /
201
202Converts a Unicode character into its equivalent numeric value.
203
204Returns the numeric value assigned to the character chr as float.
205If no such value is defined, default is returned, or, if not given,
206ValueError is raised.
207[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000208
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300210unicodedata_UCD_numeric_impl(PyObject *self, int chr,
211 PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700212/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000213{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000214 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000215 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300216 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000217
Martin v. Löwis1a214512008-06-11 05:26:20 +0000218 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000219 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000220 if (old->category_changed == 0) {
221 /* unassigned */
222 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000224 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225 else if (old->decimal_changed != 0xFF) {
226 have_old = 1;
227 rc = old->decimal_changed;
228 }
229 }
230
231 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000232 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300234 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 PyErr_SetString(PyExc_ValueError, "not a numeric character");
236 return NULL;
237 }
238 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300239 Py_INCREF(default_value);
240 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242 }
243 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000244}
245
Serhiy Storchaka63596412015-04-17 21:18:49 +0300246/*[clinic input]
247unicodedata.UCD.category
248
249 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700250 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300251 /
252
253Returns the general category assigned to the character chr as string.
254[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000255
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300257unicodedata_UCD_category_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700258/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000259{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000260 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300261 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000262 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000263 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000264 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000265 if (old->category_changed != 0xFF)
266 index = old->category_changed;
267 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000268 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000269}
270
Serhiy Storchaka63596412015-04-17 21:18:49 +0300271/*[clinic input]
272unicodedata.UCD.bidirectional
273
274 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700275 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300276 /
277
278Returns the bidirectional class assigned to the character chr as string.
279
280If no such value is defined, an empty string is returned.
281[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000282
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000283static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300284unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700285/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300288 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000289 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000290 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000291 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000292 if (old->category_changed == 0)
293 index = 0; /* unassigned */
294 else if (old->bidir_changed != 0xFF)
295 index = old->bidir_changed;
296 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000297 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000298}
299
Serhiy Storchaka63596412015-04-17 21:18:49 +0300300/*[clinic input]
301unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000302
Serhiy Storchaka63596412015-04-17 21:18:49 +0300303 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700304 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300305 /
306
307Returns the canonical combining class assigned to the character chr as integer.
308
309Returns 0 if no combining class is defined.
310[clinic start generated code]*/
311
312static int
313unicodedata_UCD_combining_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700314/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000315{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000316 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300317 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000318 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000319 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000320 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000321 if (old->category_changed == 0)
322 index = 0; /* unassigned */
323 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300324 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000325}
326
Serhiy Storchaka63596412015-04-17 21:18:49 +0300327/*[clinic input]
328unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000329
Serhiy Storchaka63596412015-04-17 21:18:49 +0300330 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700331 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300332 /
333
334Returns the mirrored property assigned to the character chr as integer.
335
336Returns 1 if the character has been identified as a "mirrored"
337character in bidirectional text, 0 otherwise.
338[clinic start generated code]*/
339
340static int
341unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700342/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000343{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300345 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000346 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000347 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000348 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000349 if (old->category_changed == 0)
350 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000351 else if (old->mirrored_changed != 0xFF)
352 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000353 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300354 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000355}
356
Serhiy Storchaka63596412015-04-17 21:18:49 +0300357/*[clinic input]
358unicodedata.UCD.east_asian_width
359
360 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700361 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300362 /
363
364Returns the east asian width assigned to the character chr as string.
365[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000366
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000367static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300368unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700369/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000370{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300372 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000373 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000374 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000375 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000376 if (old->category_changed == 0)
377 index = 0; /* unassigned */
378 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000379 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000380}
381
Serhiy Storchaka63596412015-04-17 21:18:49 +0300382/*[clinic input]
383unicodedata.UCD.decomposition
384
385 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700386 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +0300387 /
388
389Returns the character decomposition mapping assigned to the character chr as string.
390
391An empty string is returned in case no such mapping is defined.
392[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000393
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000394static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300395unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
Larry Hastingsdbfdc382015-05-04 06:59:46 -0700396/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000397{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000398 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000399 int code, index, count;
400 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000401 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300402 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000403
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000404 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000405
Martin v. Löwis1a214512008-06-11 05:26:20 +0000406 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000407 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000408 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000409 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000410 }
411
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000412 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000413 index = 0;
414 else {
415 index = decomp_index1[(code>>DECOMP_SHIFT)];
416 index = decomp_index2[(index<<DECOMP_SHIFT)+
417 (code&((1<<DECOMP_SHIFT)-1))];
418 }
419
Tim Peters69b83b12001-11-30 07:23:05 +0000420 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000421 is prefix code (from*/
422 count = decomp_data[index] >> 8;
423
424 /* XXX: could allocate the PyString up front instead
425 (strlen(prefix) + 5 * count + 1 bytes) */
426
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000427 /* Based on how index is calculated above and decomp_data is generated
428 from Tools/unicode/makeunicodedata.py, it should not be possible
429 to overflow decomp_prefix. */
430 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200431 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000432
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000433 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000434 i = strlen(decomp_prefix[prefix_index]);
435 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000436
437 while (count-- > 0) {
438 if (i)
439 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000440 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000441 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
442 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000443 i += strlen(decomp + i);
444 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000445 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000446}
447
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000448static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000449get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000450{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000451 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000452 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000453 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000454 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000455 /* unassigned in old version */
456 *index = 0;
457 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000458 else {
459 *index = decomp_index1[(code>>DECOMP_SHIFT)];
460 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
461 (code&((1<<DECOMP_SHIFT)-1))];
462 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464 /* high byte is number of hex bytes (usually one or two), low byte
465 is prefix code (from*/
466 *count = decomp_data[*index] >> 8;
467 *prefix = decomp_data[*index] & 255;
468
469 (*index)++;
470}
471
/* Constants for the arithmetic Hangul syllable decomposition:
   a syllable S in [SBase, SBase+SCount) splits into a leading part
   (from LBase), a vowel part (from VBase) and an optional trailing part
   (from TBase; an offset of 0 means "no trailing part"). */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading part */
#define SCount  (LCount*NCount)     /* total number of syllables */
481
482static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000483nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000484{
485 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200486 Py_UCS4 *output;
487 Py_ssize_t i, o, osize;
488 int kind;
489 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000490 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200491 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000492 Py_ssize_t space, isize;
493 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000494 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495
Martin v. Löwis677bde22002-11-23 22:08:15 +0000496 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200497 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500498 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300499 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500500 if (space > 10) {
501 if (space <= PY_SSIZE_T_MAX - 10)
502 space += 10;
503 }
504 else {
505 space *= 2;
506 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200507 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500508 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200509 if (!output) {
510 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000511 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200512 }
513 i = o = 0;
514 kind = PyUnicode_KIND(input);
515 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000516
Martin v. Löwis22970662011-09-29 13:39:38 +0200517 while (i < isize) {
518 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000519 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200520 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000521 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300522 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000523 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000524 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200525 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000527 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
528 if (new_output == NULL) {
529 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000531 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200532 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000533 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000534 }
535 /* Hangul Decomposition. */
536 if (SBase <= code && code < (SBase+SCount)) {
537 int SIndex = code - SBase;
538 int L = LBase + SIndex / NCount;
539 int V = VBase + (SIndex % NCount) / TCount;
540 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200541 output[o++] = L;
542 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000543 space -= 2;
544 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200545 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 space --;
547 }
548 continue;
549 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000550 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000551 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000552 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
553 if (value != 0) {
554 stack[stackptr++] = value;
555 continue;
556 }
557 }
558
559 /* Other decompositions. */
560 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000561
562 /* Copy character if it is not decomposable, or has a
563 compatibility decomposition, but we do NFD. */
564 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200565 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000566 space--;
567 continue;
568 }
569 /* Copy decomposition onto the stack, in reverse
570 order. */
571 while(count) {
572 code = decomp_data[index + (--count)];
573 stack[stackptr++] = code;
574 }
575 }
576 }
577
Martin v. Löwis22970662011-09-29 13:39:38 +0200578 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
579 output, o);
580 PyMem_Free(output);
581 if (!result)
582 return NULL;
583 /* result is guaranteed to be ready, as it is compact. */
584 kind = PyUnicode_KIND(result);
585 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000586
587 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200588 i = 0;
589 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
590 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
591 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000592 if (prev == 0 || cur == 0 || prev <= cur) {
593 prev = cur;
594 continue;
595 }
596 /* Non-canonical order. Need to switch *i with previous. */
597 o = i - 1;
598 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200599 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
600 PyUnicode_WRITE(kind, data, o+1,
601 PyUnicode_READ(kind, data, o));
602 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000603 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200604 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000605 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200606 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000607 if (prev == 0 || prev <= cur)
608 break;
609 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200610 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000611 }
612 return result;
613}
614
615static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200616find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000617{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200618 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000619 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200620 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000621 if (code < start)
622 return -1;
623 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200624 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000625 return nfc[index].index + delta;
626 }
627 }
628 return -1;
629}
630
631static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000632nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000633{
634 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200635 int kind;
636 void *data;
637 Py_UCS4 *output;
638 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000639 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200640 Py_UCS4 code;
641 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000642 int cskipped = 0;
643
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000644 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000645 if (!result)
646 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200647 /* result will be "ready". */
648 kind = PyUnicode_KIND(result);
649 data = PyUnicode_DATA(result);
650 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651
Martin v. Löwis22970662011-09-29 13:39:38 +0200652 /* We allocate a buffer for the output.
653 If we find that we made no changes, we still return
654 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500655 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200656 if (!output) {
657 PyErr_NoMemory();
658 Py_DECREF(result);
659 return 0;
660 }
661 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000662
Martin v. Löwis677bde22002-11-23 22:08:15 +0000663 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200664 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000665 for (index = 0; index < cskipped; index++) {
666 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000667 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 Remove from list. */
669 skipped[index] = skipped[cskipped-1];
670 cskipped--;
671 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000672 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000673 }
674 }
675 /* Hangul Composition. We don't need to check for <LV,T>
676 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200677 code = PyUnicode_READ(kind, data, i);
678 if (LBase <= code && code < (LBase+LCount) &&
679 i + 1 < len &&
680 VBase <= PyUnicode_READ(kind, data, i+1) &&
681 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000682 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200683 LIndex = code - LBase;
684 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000685 code = SBase + (LIndex*VCount+VIndex)*TCount;
686 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200687 if (i < len &&
688 TBase <= PyUnicode_READ(kind, data, i) &&
689 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
690 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000691 i++;
692 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200693 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 continue;
695 }
696
Martin v. Löwis22970662011-09-29 13:39:38 +0200697 /* code is still input[i] here */
698 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000699 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200700 output[o++] = code;
701 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000702 continue;
703 }
704 /* Find next unblocked character. */
705 i1 = i+1;
706 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200707 /* output base character for now; might be updated later. */
708 output[o] = PyUnicode_READ(kind, data, i);
709 while (i1 < len) {
710 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
711 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000712 if (comb) {
713 if (comb1 == 0)
714 break;
715 if (comb >= comb1) {
716 /* Character is blocked. */
717 i1++;
718 continue;
719 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000720 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200721 l = find_nfc_index(self, nfc_last, code1);
722 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000723 is a starter, we don't need to look further.
724 Otherwise, record the combining class. */
725 if (l == -1) {
726 not_combinable:
727 if (comb1 == 0)
728 break;
729 comb = comb1;
730 i1++;
731 continue;
732 }
733 index = f*TOTAL_LAST + l;
734 index1 = comp_index[index >> COMP_SHIFT];
735 code = comp_data[(index1<<COMP_SHIFT)+
736 (index&((1<<COMP_SHIFT)-1))];
737 if (code == 0)
738 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739
Martin v. Löwis677bde22002-11-23 22:08:15 +0000740 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200741 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000743 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000744 skipped[cskipped++] = i1;
745 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200746 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 if (f == -1)
748 break;
749 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200750 /* Output character was already written.
751 Just advance the indices. */
752 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000753 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200754 if (o == len) {
755 /* No changes. Return original string. */
756 PyMem_Free(output);
757 return result;
758 }
759 Py_DECREF(result);
760 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
761 output, o);
762 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000763 return result;
764}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000765
766/* Return 1 if the input is certainly normalized, 0 if it might not be. */
767static int
768is_normalized(PyObject *self, PyObject *input, int nfc, int k)
769{
Martin v. Löwis22970662011-09-29 13:39:38 +0200770 Py_ssize_t i, len;
771 int kind;
772 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000773 unsigned char prev_combining = 0, quickcheck_mask;
774
775 /* An older version of the database is requested, quickchecks must be
776 disabled. */
777 if (self && UCD_Check(self))
778 return 0;
779
780 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
781 as described in http://unicode.org/reports/tr15/#Annex8. */
782 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
783
Martin v. Löwis22970662011-09-29 13:39:38 +0200784 i = 0;
785 kind = PyUnicode_KIND(input);
786 data = PyUnicode_DATA(input);
787 len = PyUnicode_GET_LENGTH(input);
788 while (i < len) {
789 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
790 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000791 unsigned char combining = record->combining;
792 unsigned char quickcheck = record->normalization_quick_check;
793
794 if (quickcheck & quickcheck_mask)
795 return 0; /* this string might need normalization */
796 if (combining && prev_combining > combining)
797 return 0; /* non-canonical sort order, not normalized */
798 prev_combining = combining;
799 }
800 return 1; /* certainly normalized */
801}
802
Serhiy Storchaka63596412015-04-17 21:18:49 +0300803/*[clinic input]
804unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000805
Serhiy Storchaka63596412015-04-17 21:18:49 +0300806 self: self
807 form: str
808 unistr as input: object(subclass_of='&PyUnicode_Type')
809 /
810
811Return the normal form 'form' for the Unicode string unistr.
812
813Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
814[clinic start generated code]*/
815
816static PyObject *
817unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
818 PyObject *input)
819/*[clinic end generated code: output=62d1f8870027efdc input=cd092e631cf11883]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000820{
Martin v. Löwis22970662011-09-29 13:39:38 +0200821 if (PyUnicode_READY(input) == -1)
822 return NULL;
823
824 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000825 /* Special case empty input strings, since resizing
826 them later would cause internal errors. */
827 Py_INCREF(input);
828 return input;
829 }
830
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000831 if (strcmp(form, "NFC") == 0) {
832 if (is_normalized(self, input, 1, 0)) {
833 Py_INCREF(input);
834 return input;
835 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000836 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000837 }
838 if (strcmp(form, "NFKC") == 0) {
839 if (is_normalized(self, input, 1, 1)) {
840 Py_INCREF(input);
841 return input;
842 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000843 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000844 }
845 if (strcmp(form, "NFD") == 0) {
846 if (is_normalized(self, input, 0, 0)) {
847 Py_INCREF(input);
848 return input;
849 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000850 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000851 }
852 if (strcmp(form, "NFKD") == 0) {
853 if (is_normalized(self, input, 0, 1)) {
854 Py_INCREF(input);
855 return input;
856 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000857 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000858 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000859 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
860 return NULL;
861}
862
Fredrik Lundh06d12682001-01-24 07:59:11 +0000863/* -------------------------------------------------------------------- */
864/* unicode character name tables */
865
866/* data file generated by Tools/unicode/makeunicodedata.py */
867#include "unicodename_db.h"
868
869/* -------------------------------------------------------------------- */
870/* database code (cut and pasted from the unidb package) */
871
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and mixed in with multiplier scale; whenever
   bits 24..31 become set they are folded back into the low byte and the
   value is cut down to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
886
/* Jamo short names used to build and parse Hangul syllable names.
   Column 0: leading consonant, column 1: vowel, column 2: trailing
   consonant.  Columns are of different length; unused cells are NULL. */
static char *hangul_syllables[][3] = {
    { "G",   "A",    ""   },
    { "GG",  "AE",   "G"  },
    { "N",   "YA",   "GG" },
    { "D",   "YAE",  "GS" },
    { "DD",  "EO",   "N"  },
    { "R",   "E",    "NJ" },
    { "M",   "YEO",  "NH" },
    { "B",   "YE",   "D"  },
    { "BB",  "O",    "L"  },
    { "S",   "WA",   "LG" },
    { "SS",  "WAE",  "LM" },
    { "",    "OE",   "LB" },
    { "J",   "YO",   "LS" },
    { "JJ",  "U",    "LT" },
    { "C",   "WEO",  "LP" },
    { "K",   "WE",   "LH" },
    { "T",   "WI",   "M"  },
    { "P",   "YU",   "B"  },
    { "H",   "EU",   "BS" },
    { NULL,  "YI",   "S"  },
    { NULL,  "I",    "SS" },
    { NULL,  NULL,   "NG" },
    { NULL,  NULL,   "J"  },
    { NULL,  NULL,   "C"  },
    { NULL,  NULL,   "K"  },
    { NULL,  NULL,   "T"  },
    { NULL,  NULL,   "P"  },
    { NULL,  NULL,   "H"  }
};
917
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000918/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000919static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000920is_unified_ideograph(Py_UCS4 code)
921{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000922 return
923 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson48013832015-06-27 15:45:56 -0500924 (0x4E00 <= code && code <= 0x9FD5) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000925 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
926 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
Benjamin Peterson48013832015-06-27 15:45:56 -0500927 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
928 (0x2B820 <= code && code <= 0x2CEA1); /* CJK Ideograph Extension E */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000929}
930
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that expression arguments (for example
 * "a | b") are compared as a whole; note that it is still evaluated twice,
 * so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
936
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000937static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300938_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
939 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000940{
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200941 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300942 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
943 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000944 int offset;
945 int i;
946 int word;
947 unsigned char* w;
948
Martin v. Löwisc3509122006-03-11 12:16:23 +0000949 if (code >= 0x110000)
950 return 0;
951
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200952 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300953 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
954 return 0;
955
Martin v. Löwis1a214512008-06-11 05:26:20 +0000956 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300957 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300958 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300959 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
960 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300961 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000962 if (old->category_changed == 0) {
963 /* unassigned */
964 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000965 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000966 }
967
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000968 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000969 /* Hangul syllable. */
970 int SIndex = code - SBase;
971 int L = SIndex / NCount;
972 int V = (SIndex % NCount) / TCount;
973 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000974
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000975 if (buflen < 27)
976 /* Worst case: HANGUL SYLLABLE <10chars>. */
977 return 0;
978 strcpy(buffer, "HANGUL SYLLABLE ");
979 buffer += 16;
980 strcpy(buffer, hangul_syllables[L][0]);
981 buffer += strlen(hangul_syllables[L][0]);
982 strcpy(buffer, hangul_syllables[V][1]);
983 buffer += strlen(hangul_syllables[V][1]);
984 strcpy(buffer, hangul_syllables[T][2]);
985 buffer += strlen(hangul_syllables[T][2]);
986 *buffer = '\0';
987 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000988 }
989
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000990 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000991 if (buflen < 28)
992 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
993 return 0;
994 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
995 return 1;
996 }
997
Fredrik Lundh06d12682001-01-24 07:59:11 +0000998 /* get offset into phrasebook */
999 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1000 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1001 (code&((1<<phrasebook_shift)-1))];
1002 if (!offset)
1003 return 0;
1004
1005 i = 0;
1006
1007 for (;;) {
1008 /* get word index */
1009 word = phrasebook[offset] - phrasebook_short;
1010 if (word >= 0) {
1011 word = (word << 8) + phrasebook[offset+1];
1012 offset += 2;
1013 } else
1014 word = phrasebook[offset++];
1015 if (i) {
1016 if (i > buflen)
1017 return 0; /* buffer overflow */
1018 buffer[i++] = ' ';
1019 }
1020 /* copy word string from lexicon. the last character in the
1021 word has bit 7 set. the last word in a string ends with
1022 0x80 */
1023 w = lexicon + lexicon_offset[word];
1024 while (*w < 128) {
1025 if (i >= buflen)
1026 return 0; /* buffer overflow */
1027 buffer[i++] = *w++;
1028 }
1029 if (i >= buflen)
1030 return 0; /* buffer overflow */
1031 buffer[i++] = *w & 127;
1032 if (*w == 128)
1033 break; /* end of word */
1034 }
1035
1036 return 1;
1037}
1038
1039static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001040_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001041{
1042 /* check if code corresponds to the given name */
1043 int i;
1044 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001045 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001046 return 0;
1047 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001048 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001049 return 0;
1050 }
1051 return buffer[namelen] == '\0';
1052}
1053
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001054static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001055find_syllable(const char *str, int *len, int *pos, int count, int column)
1056{
1057 int i, len1;
1058 *len = -1;
1059 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001060 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001061 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001062 if (len1 <= *len)
1063 continue;
1064 if (strncmp(str, s, len1) == 0) {
1065 *len = len1;
1066 *pos = i;
1067 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001068 }
1069 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001070 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001071 }
1072}
1073
Fredrik Lundh06d12682001-01-24 07:59:11 +00001074static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001075_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001076{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001077 /* check if named sequences are allowed */
1078 if (!with_named_seq && IS_NAMED_SEQ(cp))
1079 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001080 /* if the code point is in the PUA range that we use for aliases,
1081 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001082 if (IS_ALIAS(cp))
1083 *code = name_aliases[cp-aliases_start];
1084 else
1085 *code = cp;
1086 return 1;
1087}
1088
1089static int
1090_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1091 int with_named_seq)
1092{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001093 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001094 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001095 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001096 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001097 unsigned int h, v;
1098 unsigned int mask = code_size-1;
1099 unsigned int i, incr;
1100
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001101 /* Check for hangul syllables. */
1102 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001103 int len, L = -1, V = -1, T = -1;
1104 const char *pos = name + 16;
1105 find_syllable(pos, &len, &L, LCount, 0);
1106 pos += len;
1107 find_syllable(pos, &len, &V, VCount, 1);
1108 pos += len;
1109 find_syllable(pos, &len, &T, TCount, 2);
1110 pos += len;
1111 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1112 *code = SBase + (L*VCount+V)*TCount + T;
1113 return 1;
1114 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001115 /* Otherwise, it's an illegal syllable name. */
1116 return 0;
1117 }
1118
1119 /* Check for unified ideographs. */
1120 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1121 /* Four or five hexdigits must follow. */
1122 v = 0;
1123 name += 22;
1124 namelen -= 22;
1125 if (namelen != 4 && namelen != 5)
1126 return 0;
1127 while (namelen--) {
1128 v *= 16;
1129 if (*name >= '0' && *name <= '9')
1130 v += *name - '0';
1131 else if (*name >= 'A' && *name <= 'F')
1132 v += *name - 'A' + 10;
1133 else
1134 return 0;
1135 name++;
1136 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001137 if (!is_unified_ideograph(v))
1138 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001139 *code = v;
1140 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001141 }
1142
Fredrik Lundh06d12682001-01-24 07:59:11 +00001143 /* the following is the same as python's dictionary lookup, with
1144 only minor changes. see the makeunicodedata script for more
1145 details */
1146
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001147 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001148 i = (~h) & mask;
1149 v = code_hash[i];
1150 if (!v)
1151 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001152 if (_cmpname(self, v, name, namelen))
1153 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001154 incr = (h ^ (h >> 3)) & mask;
1155 if (!incr)
1156 incr = mask;
1157 for (;;) {
1158 i = (i + incr) & mask;
1159 v = code_hash[i];
1160 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001161 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001162 if (_cmpname(self, v, name, namelen))
1163 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001164 incr = incr << 1;
1165 if (incr > mask)
1166 incr = incr ^ code_poly;
1167 }
1168}
1169
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001171{
1172 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001173 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001174 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001175};
1176
1177/* -------------------------------------------------------------------- */
1178/* Python bindings */
1179
Serhiy Storchaka63596412015-04-17 21:18:49 +03001180/*[clinic input]
1181unicodedata.UCD.name
1182
1183 self: self
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001184 chr: int(accept={str})
Serhiy Storchaka63596412015-04-17 21:18:49 +03001185 default: object=NULL
1186 /
1187
1188Returns the name assigned to the character chr as a string.
1189
1190If no name is defined, default is returned, or, if not given,
1191ValueError is raised.
1192[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001193
Fredrik Lundh06d12682001-01-24 07:59:11 +00001194static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001195unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
Larry Hastingsdbfdc382015-05-04 06:59:46 -07001196/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001197{
1198 char name[NAME_MAXLEN];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001199 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001200
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001201 if (!_getucname(self, c, name, sizeof(name), 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001202 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001203 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001204 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 }
1206 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001207 Py_INCREF(default_value);
1208 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001209 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001210 }
1211
Walter Dörwald4254e762007-06-05 16:04:09 +00001212 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001213}
1214
Serhiy Storchaka63596412015-04-17 21:18:49 +03001215/*[clinic input]
1216unicodedata.UCD.lookup
1217
1218 self: self
Larry Hastings38337d12015-05-07 23:30:09 -07001219 name: str(accept={str, robuffer}, zeroes=True)
Serhiy Storchaka63596412015-04-17 21:18:49 +03001220 /
1221
1222Look up character by name.
1223
1224If a character with the given name is found, return the
1225corresponding character. If not found, KeyError is raised.
1226[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001227
Fredrik Lundh06d12682001-01-24 07:59:11 +00001228static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001229unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1230 Py_ssize_clean_t name_length)
Larry Hastings38337d12015-05-07 23:30:09 -07001231/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001232{
1233 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001234 unsigned int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +03001235 if (name_length > INT_MAX) {
Victor Stinner65a31442014-07-01 16:45:52 +02001236 PyErr_SetString(PyExc_KeyError, "name too long");
1237 return NULL;
1238 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001239
Serhiy Storchaka63596412015-04-17 21:18:49 +03001240 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001241 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001242 return NULL;
1243 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001244 /* check if code is in the PUA range that we use for named sequences
1245 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001246 if (IS_NAMED_SEQ(code)) {
1247 index = code-named_sequences_start;
1248 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1249 named_sequences[index].seq,
1250 named_sequences[index].seqlen);
1251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001253}
1254
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001255/* XXX Add doc strings. */
1256
1257static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001258 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001259 UNICODEDATA_UCD_DIGIT_METHODDEF
1260 UNICODEDATA_UCD_NUMERIC_METHODDEF
1261 UNICODEDATA_UCD_CATEGORY_METHODDEF
1262 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1263 UNICODEDATA_UCD_COMBINING_METHODDEF
1264 UNICODEDATA_UCD_MIRRORED_METHODDEF
1265 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1266 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1267 UNICODEDATA_UCD_NAME_METHODDEF
1268 UNICODEDATA_UCD_LOOKUP_METHODDEF
1269 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001270 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001271};
1272
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001273static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 /* The ob_type field must be initialized in the module init function
1275 * to be portable to Windows without using C++. */
1276 PyVarObject_HEAD_INIT(NULL, 0)
1277 "unicodedata.UCD", /*tp_name*/
1278 sizeof(PreviousDBVersion), /*tp_basicsize*/
1279 0, /*tp_itemsize*/
1280 /* methods */
1281 (destructor)PyObject_Del, /*tp_dealloc*/
1282 0, /*tp_print*/
1283 0, /*tp_getattr*/
1284 0, /*tp_setattr*/
1285 0, /*tp_reserved*/
1286 0, /*tp_repr*/
1287 0, /*tp_as_number*/
1288 0, /*tp_as_sequence*/
1289 0, /*tp_as_mapping*/
1290 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001291 0, /*tp_call*/
1292 0, /*tp_str*/
1293 PyObject_GenericGetAttr,/*tp_getattro*/
1294 0, /*tp_setattro*/
1295 0, /*tp_as_buffer*/
1296 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1297 0, /*tp_doc*/
1298 0, /*tp_traverse*/
1299 0, /*tp_clear*/
1300 0, /*tp_richcompare*/
1301 0, /*tp_weaklistoffset*/
1302 0, /*tp_iter*/
1303 0, /*tp_iternext*/
1304 unicodedata_functions, /*tp_methods*/
1305 DB_members, /*tp_members*/
1306 0, /*tp_getset*/
1307 0, /*tp_base*/
1308 0, /*tp_dict*/
1309 0, /*tp_descr_get*/
1310 0, /*tp_descr_set*/
1311 0, /*tp_dictoffset*/
1312 0, /*tp_init*/
1313 0, /*tp_alloc*/
1314 0, /*tp_new*/
1315 0, /*tp_free*/
1316 0, /*tp_is_gc*/
1317};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001318
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001319PyDoc_STRVAR(unicodedata_docstring,
1320"This module provides access to the Unicode Character Database which\n\
1321defines character properties for all Unicode characters. The data in\n\
1322this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001323" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001324\n\
1325The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001326UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001327
1328static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 PyModuleDef_HEAD_INIT,
1330 "unicodedata",
1331 unicodedata_docstring,
1332 -1,
1333 unicodedata_functions,
1334 NULL,
1335 NULL,
1336 NULL,
1337 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001338};
1339
Mark Hammond62b1ab12002-07-23 06:31:15 +00001340PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001341PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001342{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001343 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001344
Christian Heimes90aa7642007-12-19 02:45:37 +00001345 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001346
Martin v. Löwis1a214512008-06-11 05:26:20 +00001347 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001348 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001349 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001350
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001351 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001352 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001353 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001354
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001355 /* Previous versions */
1356 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1357 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001358 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001359
Fredrik Lundh06d12682001-01-24 07:59:11 +00001360 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001361 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001362 if (v != NULL)
1363 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001364 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001365}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001366
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001368Local variables:
1369c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001370indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001371End:
1372*/