/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
Victor Stinner65a31442014-07-01 16:45:52 +020016#define PY_SSIZE_T_CLEAN
17
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000018#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000020#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000021
Larry Hastings61272b72014-01-07 12:41:53 -080022/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080023module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080024class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080025[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080026/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080027
Fredrik Lundh06d12682001-01-24 07:59:11 +000028/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000029
/* One row of the per-code-point property table.  _getrecord_ex() hands out
   pointers to these rows; the *_impl functions in this file read one field
   each and, where the field is an index, map it through the matching
   names table. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
41
/* Per-code-point difference record returned by a previous-version object's
   getrecord callback.

   As read by the lookups in this file: category_changed == 0 is treated as
   "unassigned in the old version", and the bidir/category/decimal/mirrored
   fields are only applied when they differ from 0xFF. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
50
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051/* data file generated by Tools/unicode/makeunicodedata.py */
52#include "unicodedata_db.h"
53
54static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000055_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000056{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000058 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000059 index = 0;
60 else {
61 index = index1[(code>>SHIFT)];
62 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
63 }
64
65 return &_PyUnicode_Database_Records[index];
66}
67
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000068/* ------------- Previous-version API ------------------------------------- */
69typedef struct previous_version {
70 PyObject_HEAD
71 const char *name;
72 const change_record* (*getrecord)(Py_UCS4);
73 Py_UCS4 (*normalization)(Py_UCS4);
74} PreviousDBVersion;
75
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030076#include "clinic/unicodedata.c.h"
77
/* Fetch the change record for code point `v` from previous-version object
   `self`.  `self` is parenthesized so the cast applies to the whole argument
   expression, not only to its first operand. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
79
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000080static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000081 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000082 {NULL}
83};
84
Thomas Wouters89f507f2006-12-13 04:49:30 +000085/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000086static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000087#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088
89static PyObject*
90new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
91 Py_UCS4 (*normalization)(Py_UCS4))
92{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000093 PreviousDBVersion *self;
94 self = PyObject_New(PreviousDBVersion, &UCD_Type);
95 if (self == NULL)
96 return NULL;
97 self->name = name;
98 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000099 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000100 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000101}
102
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000104/* --- Module API --------------------------------------------------------- */
105
Larry Hastings61272b72014-01-07 12:41:53 -0800106/*[clinic input]
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800107unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700108
Serhiy Storchaka63596412015-04-17 21:18:49 +0300109 self: self
110 chr: int(types={'str'})
Larry Hastings31826802013-10-19 00:09:25 -0700111 default: object=NULL
112 /
113
114Converts a Unicode character into its equivalent decimal value.
115
Serhiy Storchaka63596412015-04-17 21:18:49 +0300116Returns the decimal value assigned to the character chr as integer.
117If no such value is defined, default is returned, or, if not given,
118ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800119[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700120
Larry Hastings31826802013-10-19 00:09:25 -0700121static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300122unicodedata_UCD_decimal_impl(PyObject *self, int chr,
Larry Hastings89964c42015-04-14 18:07:59 -0400123 PyObject *default_value)
Serhiy Storchaka63596412015-04-17 21:18:49 +0300124/*[clinic end generated code: output=be23376e1a185231 input=3acf7f2238874a49]*/
Larry Hastings31826802013-10-19 00:09:25 -0700125{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000126 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000127 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300128 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000129
Martin v. Löwis1a214512008-06-11 05:26:20 +0000130 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000131 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000132 if (old->category_changed == 0) {
133 /* unassigned */
134 have_old = 1;
135 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000136 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000137 else if (old->decimal_changed != 0xFF) {
138 have_old = 1;
139 rc = old->decimal_changed;
140 }
141 }
142
143 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000144 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000145 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700146 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 PyErr_SetString(PyExc_ValueError,
148 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000149 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 }
151 else {
Larry Hastings31826802013-10-19 00:09:25 -0700152 Py_INCREF(default_value);
153 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000155 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000156 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000157}
158
Serhiy Storchaka63596412015-04-17 21:18:49 +0300159/*[clinic input]
160unicodedata.UCD.digit
161
162 self: self
163 chr: int(types={'str'})
164 default: object=NULL
165 /
166
167Converts a Unicode character into its equivalent digit value.
168
169Returns the digit value assigned to the character chr as integer.
170If no such value is defined, default is returned, or, if not given,
171ValueError is raised.
172[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000173
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300175unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
176/*[clinic end generated code: output=96e18c950171fd2f input=733f093b399f5ab6]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000177{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000178 long rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300179 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 if (rc < 0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300182 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000183 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 }
186 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300187 Py_INCREF(default_value);
188 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000189 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000191 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192}
193
Serhiy Storchaka63596412015-04-17 21:18:49 +0300194/*[clinic input]
195unicodedata.UCD.numeric
196
197 self: self
198 chr: int(types={'str'})
199 default: object=NULL
200 /
201
202Converts a Unicode character into its equivalent numeric value.
203
204Returns the numeric value assigned to the character chr as float.
205If no such value is defined, default is returned, or, if not given,
206ValueError is raised.
207[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000208
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300210unicodedata_UCD_numeric_impl(PyObject *self, int chr,
211 PyObject *default_value)
212/*[clinic end generated code: output=53ce281fe85b10c4 input=c5875fa7cc768fb2]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000213{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000214 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000215 double rc;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300216 Py_UCS4 c = (Py_UCS4)chr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000217
Martin v. Löwis1a214512008-06-11 05:26:20 +0000218 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000219 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000220 if (old->category_changed == 0) {
221 /* unassigned */
222 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000224 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000225 else if (old->decimal_changed != 0xFF) {
226 have_old = 1;
227 rc = old->decimal_changed;
228 }
229 }
230
231 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000232 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233 if (rc == -1.0) {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300234 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 PyErr_SetString(PyExc_ValueError, "not a numeric character");
236 return NULL;
237 }
238 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +0300239 Py_INCREF(default_value);
240 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242 }
243 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000244}
245
Serhiy Storchaka63596412015-04-17 21:18:49 +0300246/*[clinic input]
247unicodedata.UCD.category
248
249 self: self
250 chr: int(types={'str'})
251 /
252
253Returns the general category assigned to the character chr as string.
254[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000255
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300257unicodedata_UCD_category_impl(PyObject *self, int chr)
258/*[clinic end generated code: output=8571539ee2e6783a input=f5edd6fd04bd455d]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000259{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000260 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300261 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000262 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000263 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000264 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000265 if (old->category_changed != 0xFF)
266 index = old->category_changed;
267 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000268 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000269}
270
Serhiy Storchaka63596412015-04-17 21:18:49 +0300271/*[clinic input]
272unicodedata.UCD.bidirectional
273
274 self: self
275 chr: int(types={'str'})
276 /
277
278Returns the bidirectional class assigned to the character chr as string.
279
280If no such value is defined, an empty string is returned.
281[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000282
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000283static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300284unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
285/*[clinic end generated code: output=d36310ce2039bb92 input=5ce2f877b35305b5]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286{
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300288 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000289 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000290 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000291 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000292 if (old->category_changed == 0)
293 index = 0; /* unassigned */
294 else if (old->bidir_changed != 0xFF)
295 index = old->bidir_changed;
296 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000297 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000298}
299
Serhiy Storchaka63596412015-04-17 21:18:49 +0300300/*[clinic input]
301unicodedata.UCD.combining -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000302
Serhiy Storchaka63596412015-04-17 21:18:49 +0300303 self: self
304 chr: int(types={'str'})
305 /
306
307Returns the canonical combining class assigned to the character chr as integer.
308
309Returns 0 if no combining class is defined.
310[clinic start generated code]*/
311
312static int
313unicodedata_UCD_combining_impl(PyObject *self, int chr)
314/*[clinic end generated code: output=cad056d0cb6a5920 input=9125ea7d50b319e7]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000315{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000316 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300317 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000318 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000319 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000320 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000321 if (old->category_changed == 0)
322 index = 0; /* unassigned */
323 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300324 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000325}
326
Serhiy Storchaka63596412015-04-17 21:18:49 +0300327/*[clinic input]
328unicodedata.UCD.mirrored -> int
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000329
Serhiy Storchaka63596412015-04-17 21:18:49 +0300330 self: self
331 chr: int(types={'str'})
332 /
333
334Returns the mirrored property assigned to the character chr as integer.
335
336Returns 1 if the character has been identified as a "mirrored"
337character in bidirectional text, 0 otherwise.
338[clinic start generated code]*/
339
340static int
341unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
342/*[clinic end generated code: output=2532dbf8121b50e6 input=4e51e8aaf8d7e23e]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000343{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000344 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300345 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000346 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000347 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000348 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000349 if (old->category_changed == 0)
350 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000351 else if (old->mirrored_changed != 0xFF)
352 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000353 }
Serhiy Storchaka63596412015-04-17 21:18:49 +0300354 return index;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000355}
356
Serhiy Storchaka63596412015-04-17 21:18:49 +0300357/*[clinic input]
358unicodedata.UCD.east_asian_width
359
360 self: self
361 chr: int(types={'str'})
362 /
363
364Returns the east asian width assigned to the character chr as string.
365[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000366
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000367static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300368unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
369/*[clinic end generated code: output=484e8537d9ee8197 input=f93c61f37276c8f0]*/
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000370{
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000371 int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300372 Py_UCS4 c = (Py_UCS4)chr;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000373 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000374 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000375 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000376 if (old->category_changed == 0)
377 index = 0; /* unassigned */
378 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000379 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000380}
381
Serhiy Storchaka63596412015-04-17 21:18:49 +0300382/*[clinic input]
383unicodedata.UCD.decomposition
384
385 self: self
386 chr: int(types={'str'})
387 /
388
389Returns the character decomposition mapping assigned to the character chr as string.
390
391An empty string is returned in case no such mapping is defined.
392[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000393
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000394static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +0300395unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
396/*[clinic end generated code: output=7d699f3ec7565d27 input=7f2c0ee66d75468f]*/
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000397{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000398 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000399 int code, index, count;
400 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000401 unsigned int prefix_index;
Serhiy Storchaka63596412015-04-17 21:18:49 +0300402 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000403
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000404 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000405
Martin v. Löwis1a214512008-06-11 05:26:20 +0000406 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000407 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000408 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000409 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000410 }
411
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000412 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000413 index = 0;
414 else {
415 index = decomp_index1[(code>>DECOMP_SHIFT)];
416 index = decomp_index2[(index<<DECOMP_SHIFT)+
417 (code&((1<<DECOMP_SHIFT)-1))];
418 }
419
Tim Peters69b83b12001-11-30 07:23:05 +0000420 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000421 is prefix code (from*/
422 count = decomp_data[index] >> 8;
423
424 /* XXX: could allocate the PyString up front instead
425 (strlen(prefix) + 5 * count + 1 bytes) */
426
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000427 /* Based on how index is calculated above and decomp_data is generated
428 from Tools/unicode/makeunicodedata.py, it should not be possible
429 to overflow decomp_prefix. */
430 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200431 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000432
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000433 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000434 i = strlen(decomp_prefix[prefix_index]);
435 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000436
437 while (count-- > 0) {
438 if (i)
439 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000440 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000441 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
442 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000443 i += strlen(decomp + i);
444 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000445 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000446}
447
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000448static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000449get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000450{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000451 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000452 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000453 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000454 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000455 /* unassigned in old version */
456 *index = 0;
457 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000458 else {
459 *index = decomp_index1[(code>>DECOMP_SHIFT)];
460 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
461 (code&((1<<DECOMP_SHIFT)-1))];
462 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464 /* high byte is number of hex bytes (usually one or two), low byte
465 is prefix code (from*/
466 *count = decomp_data[*index] >> 8;
467 *prefix = decomp_data[*index] & 255;
468
469 (*index)++;
470}
471
/* Constants for the arithmetic Hangul syllable (de)composition used by
   nfd_nfkd(): code points in [SBase, SBase+SCount) are split into an L, a
   V and (when the remainder is non-zero) a T code point. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
481
482static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000483nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000484{
485 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200486 Py_UCS4 *output;
487 Py_ssize_t i, o, osize;
488 int kind;
489 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000490 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200491 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000492 Py_ssize_t space, isize;
493 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000494 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000495
Martin v. Löwis677bde22002-11-23 22:08:15 +0000496 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200497 isize = PyUnicode_GET_LENGTH(input);
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500498 space = isize;
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300499 /* Overallocate at most 10 characters. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500500 if (space > 10) {
501 if (space <= PY_SSIZE_T_MAX - 10)
502 space += 10;
503 }
504 else {
505 space *= 2;
506 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200507 osize = space;
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500508 output = PyMem_NEW(Py_UCS4, space);
Martin v. Löwis22970662011-09-29 13:39:38 +0200509 if (!output) {
510 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000511 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200512 }
513 i = o = 0;
514 kind = PyUnicode_KIND(input);
515 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000516
Martin v. Löwis22970662011-09-29 13:39:38 +0200517 while (i < isize) {
518 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000519 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200520 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000521 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300522 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000523 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000524 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200525 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000527 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
528 if (new_output == NULL) {
529 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200530 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000531 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200532 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000533 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000534 }
535 /* Hangul Decomposition. */
536 if (SBase <= code && code < (SBase+SCount)) {
537 int SIndex = code - SBase;
538 int L = LBase + SIndex / NCount;
539 int V = VBase + (SIndex % NCount) / TCount;
540 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200541 output[o++] = L;
542 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000543 space -= 2;
544 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200545 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 space --;
547 }
548 continue;
549 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000550 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000551 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000552 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
553 if (value != 0) {
554 stack[stackptr++] = value;
555 continue;
556 }
557 }
558
559 /* Other decompositions. */
560 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000561
562 /* Copy character if it is not decomposable, or has a
563 compatibility decomposition, but we do NFD. */
564 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200565 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000566 space--;
567 continue;
568 }
569 /* Copy decomposition onto the stack, in reverse
570 order. */
571 while(count) {
572 code = decomp_data[index + (--count)];
573 stack[stackptr++] = code;
574 }
575 }
576 }
577
Martin v. Löwis22970662011-09-29 13:39:38 +0200578 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
579 output, o);
580 PyMem_Free(output);
581 if (!result)
582 return NULL;
583 /* result is guaranteed to be ready, as it is compact. */
584 kind = PyUnicode_KIND(result);
585 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000586
587 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200588 i = 0;
589 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
590 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
591 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000592 if (prev == 0 || cur == 0 || prev <= cur) {
593 prev = cur;
594 continue;
595 }
596 /* Non-canonical order. Need to switch *i with previous. */
597 o = i - 1;
598 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200599 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
600 PyUnicode_WRITE(kind, data, o+1,
601 PyUnicode_READ(kind, data, o));
602 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000603 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200604 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000605 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200606 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000607 if (prev == 0 || prev <= cur)
608 break;
609 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200610 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000611 }
612 return result;
613}
614
615static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200616find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000617{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200618 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000619 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200620 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000621 if (code < start)
622 return -1;
623 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200624 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000625 return nfc[index].index + delta;
626 }
627 }
628 return -1;
629}
630
631static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000632nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000633{
634 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200635 int kind;
636 void *data;
637 Py_UCS4 *output;
638 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000639 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200640 Py_UCS4 code;
641 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000642 int cskipped = 0;
643
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000644 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000645 if (!result)
646 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200647 /* result will be "ready". */
648 kind = PyUnicode_KIND(result);
649 data = PyUnicode_DATA(result);
650 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651
Martin v. Löwis22970662011-09-29 13:39:38 +0200652 /* We allocate a buffer for the output.
653 If we find that we made no changes, we still return
654 the NFD result. */
Benjamin Petersonb779bfb2015-03-02 11:17:05 -0500655 output = PyMem_NEW(Py_UCS4, len);
Martin v. Löwis22970662011-09-29 13:39:38 +0200656 if (!output) {
657 PyErr_NoMemory();
658 Py_DECREF(result);
659 return 0;
660 }
661 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000662
Martin v. Löwis677bde22002-11-23 22:08:15 +0000663 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200664 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000665 for (index = 0; index < cskipped; index++) {
666 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000667 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 Remove from list. */
669 skipped[index] = skipped[cskipped-1];
670 cskipped--;
671 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000672 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000673 }
674 }
675 /* Hangul Composition. We don't need to check for <LV,T>
676 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200677 code = PyUnicode_READ(kind, data, i);
678 if (LBase <= code && code < (LBase+LCount) &&
679 i + 1 < len &&
680 VBase <= PyUnicode_READ(kind, data, i+1) &&
681 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000682 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200683 LIndex = code - LBase;
684 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000685 code = SBase + (LIndex*VCount+VIndex)*TCount;
686 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200687 if (i < len &&
688 TBase <= PyUnicode_READ(kind, data, i) &&
689 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
690 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000691 i++;
692 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200693 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 continue;
695 }
696
Martin v. Löwis22970662011-09-29 13:39:38 +0200697 /* code is still input[i] here */
698 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000699 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200700 output[o++] = code;
701 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000702 continue;
703 }
704 /* Find next unblocked character. */
705 i1 = i+1;
706 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200707 /* output base character for now; might be updated later. */
708 output[o] = PyUnicode_READ(kind, data, i);
709 while (i1 < len) {
710 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
711 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000712 if (comb) {
713 if (comb1 == 0)
714 break;
715 if (comb >= comb1) {
716 /* Character is blocked. */
717 i1++;
718 continue;
719 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000720 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200721 l = find_nfc_index(self, nfc_last, code1);
722 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000723 is a starter, we don't need to look further.
724 Otherwise, record the combining class. */
725 if (l == -1) {
726 not_combinable:
727 if (comb1 == 0)
728 break;
729 comb = comb1;
730 i1++;
731 continue;
732 }
733 index = f*TOTAL_LAST + l;
734 index1 = comp_index[index >> COMP_SHIFT];
735 code = comp_data[(index1<<COMP_SHIFT)+
736 (index&((1<<COMP_SHIFT)-1))];
737 if (code == 0)
738 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739
Martin v. Löwis677bde22002-11-23 22:08:15 +0000740 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200741 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000743 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000744 skipped[cskipped++] = i1;
745 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200746 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 if (f == -1)
748 break;
749 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200750 /* Output character was already written.
751 Just advance the indices. */
752 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000753 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200754 if (o == len) {
755 /* No changes. Return original string. */
756 PyMem_Free(output);
757 return result;
758 }
759 Py_DECREF(result);
760 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
761 output, o);
762 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000763 return result;
764}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000765
766/* Return 1 if the input is certainly normalized, 0 if it might not be. */
767static int
768is_normalized(PyObject *self, PyObject *input, int nfc, int k)
769{
Martin v. Löwis22970662011-09-29 13:39:38 +0200770 Py_ssize_t i, len;
771 int kind;
772 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000773 unsigned char prev_combining = 0, quickcheck_mask;
774
775 /* An older version of the database is requested, quickchecks must be
776 disabled. */
777 if (self && UCD_Check(self))
778 return 0;
779
780 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
781 as described in http://unicode.org/reports/tr15/#Annex8. */
782 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
783
Martin v. Löwis22970662011-09-29 13:39:38 +0200784 i = 0;
785 kind = PyUnicode_KIND(input);
786 data = PyUnicode_DATA(input);
787 len = PyUnicode_GET_LENGTH(input);
788 while (i < len) {
789 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
790 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000791 unsigned char combining = record->combining;
792 unsigned char quickcheck = record->normalization_quick_check;
793
794 if (quickcheck & quickcheck_mask)
795 return 0; /* this string might need normalization */
796 if (combining && prev_combining > combining)
797 return 0; /* non-canonical sort order, not normalized */
798 prev_combining = combining;
799 }
800 return 1; /* certainly normalized */
801}
802
Serhiy Storchaka63596412015-04-17 21:18:49 +0300803/*[clinic input]
804unicodedata.UCD.normalize
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000805
Serhiy Storchaka63596412015-04-17 21:18:49 +0300806 self: self
807 form: str
808 unistr as input: object(subclass_of='&PyUnicode_Type')
809 /
810
811Return the normal form 'form' for the Unicode string unistr.
812
813Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
814[clinic start generated code]*/
815
816static PyObject *
817unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
818 PyObject *input)
819/*[clinic end generated code: output=62d1f8870027efdc input=cd092e631cf11883]*/
Martin v. Löwis677bde22002-11-23 22:08:15 +0000820{
Martin v. Löwis22970662011-09-29 13:39:38 +0200821 if (PyUnicode_READY(input) == -1)
822 return NULL;
823
824 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000825 /* Special case empty input strings, since resizing
826 them later would cause internal errors. */
827 Py_INCREF(input);
828 return input;
829 }
830
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000831 if (strcmp(form, "NFC") == 0) {
832 if (is_normalized(self, input, 1, 0)) {
833 Py_INCREF(input);
834 return input;
835 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000836 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000837 }
838 if (strcmp(form, "NFKC") == 0) {
839 if (is_normalized(self, input, 1, 1)) {
840 Py_INCREF(input);
841 return input;
842 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000843 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000844 }
845 if (strcmp(form, "NFD") == 0) {
846 if (is_normalized(self, input, 0, 0)) {
847 Py_INCREF(input);
848 return input;
849 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000850 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000851 }
852 if (strcmp(form, "NFKD") == 0) {
853 if (is_normalized(self, input, 0, 1)) {
854 Py_INCREF(input);
855 return input;
856 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000857 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000858 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000859 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
860 return NULL;
861}
862
Fredrik Lundh06d12682001-01-24 07:59:11 +0000863/* -------------------------------------------------------------------- */
864/* unicode character name tables */
865
866/* data file generated by Tools/unicode/makeunicodedata.py */
867#include "unicodename_db.h"
868
869/* -------------------------------------------------------------------- */
870/* database code (cut and pasted from the unidb package) */
871
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and mixed in as hash * scale + byte.  Whenever
   any of bits 24..31 become set they are folded back into the low byte
   and the value is masked to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    const char *p = s;
    int remaining;

    for (remaining = len; remaining > 0; remaining--, p++) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(*p));
        top = hash & 0xff000000;
        if (top != 0) {
            hash = (hash ^ ((top >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
886
/* Name fragments of the Hangul jamo.  A row holds up to three unrelated
   entries: column 0 is indexed by the leading consonant, column 1 by the
   vowel and column 2 by the trailing consonant (row 0 of column 2 is the
   empty "no trailing consonant" fragment).  Columns 0 and 1 are shorter
   than column 2 and are padded with NULL. */
static char *hangul_syllables[][3] = {
    /* leading  vowel   trailing */
    { "G",      "A",    ""   },
    { "GG",     "AE",   "G"  },
    { "N",      "YA",   "GG" },
    { "D",      "YAE",  "GS" },
    { "DD",     "EO",   "N"  },
    { "R",      "E",    "NJ" },
    { "M",      "YEO",  "NH" },
    { "B",      "YE",   "D"  },
    { "BB",     "O",    "L"  },
    { "S",      "WA",   "LG" },
    { "SS",     "WAE",  "LM" },
    { "",       "OE",   "LB" },
    { "J",      "YO",   "LS" },
    { "JJ",     "U",    "LT" },
    { "C",      "WEO",  "LP" },
    { "K",      "WE",   "LH" },
    { "T",      "WI",   "M"  },
    { "P",      "YU",   "B"  },
    { "H",      "EU",   "BS" },
    { NULL,     "YI",   "S"  },
    { NULL,     "I",    "SS" },
    { NULL,     NULL,   "NG" },
    { NULL,     NULL,   "J"  },
    { NULL,     NULL,   "C"  },
    { NULL,     NULL,   "K"  },
    { NULL,     NULL,   "T"  },
    { NULL,     NULL,   "P"  },
    { NULL,     NULL,   "H"  }
};
917
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000918/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000919static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000920is_unified_ideograph(Py_UCS4 code)
921{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000922 return
923 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500924 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000925 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
926 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
927 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000928}
929
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: the start value is included, the end value is not.
 * The argument is parenthesized so that an expression can be passed
 * safely; note that it is still evaluated twice. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
935
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000936static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300937_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
938 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000939{
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200940 /* Find the name associated with the given code point.
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300941 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
942 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000943 int offset;
944 int i;
945 int word;
946 unsigned char* w;
947
Martin v. Löwisc3509122006-03-11 12:16:23 +0000948 if (code >= 0x110000)
949 return 0;
950
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200951 /* XXX should we just skip all the code points in the PUAs here? */
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300952 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
953 return 0;
954
Martin v. Löwis1a214512008-06-11 05:26:20 +0000955 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300956 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300957 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300958 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
959 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +0300960 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +0000961 if (old->category_changed == 0) {
962 /* unassigned */
963 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000964 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000965 }
966
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000967 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000968 /* Hangul syllable. */
969 int SIndex = code - SBase;
970 int L = SIndex / NCount;
971 int V = (SIndex % NCount) / TCount;
972 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000973
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000974 if (buflen < 27)
975 /* Worst case: HANGUL SYLLABLE <10chars>. */
976 return 0;
977 strcpy(buffer, "HANGUL SYLLABLE ");
978 buffer += 16;
979 strcpy(buffer, hangul_syllables[L][0]);
980 buffer += strlen(hangul_syllables[L][0]);
981 strcpy(buffer, hangul_syllables[V][1]);
982 buffer += strlen(hangul_syllables[V][1]);
983 strcpy(buffer, hangul_syllables[T][2]);
984 buffer += strlen(hangul_syllables[T][2]);
985 *buffer = '\0';
986 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000987 }
988
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000989 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000990 if (buflen < 28)
991 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
992 return 0;
993 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
994 return 1;
995 }
996
Fredrik Lundh06d12682001-01-24 07:59:11 +0000997 /* get offset into phrasebook */
998 offset = phrasebook_offset1[(code>>phrasebook_shift)];
999 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1000 (code&((1<<phrasebook_shift)-1))];
1001 if (!offset)
1002 return 0;
1003
1004 i = 0;
1005
1006 for (;;) {
1007 /* get word index */
1008 word = phrasebook[offset] - phrasebook_short;
1009 if (word >= 0) {
1010 word = (word << 8) + phrasebook[offset+1];
1011 offset += 2;
1012 } else
1013 word = phrasebook[offset++];
1014 if (i) {
1015 if (i > buflen)
1016 return 0; /* buffer overflow */
1017 buffer[i++] = ' ';
1018 }
1019 /* copy word string from lexicon. the last character in the
1020 word has bit 7 set. the last word in a string ends with
1021 0x80 */
1022 w = lexicon + lexicon_offset[word];
1023 while (*w < 128) {
1024 if (i >= buflen)
1025 return 0; /* buffer overflow */
1026 buffer[i++] = *w++;
1027 }
1028 if (i >= buflen)
1029 return 0; /* buffer overflow */
1030 buffer[i++] = *w & 127;
1031 if (*w == 128)
1032 break; /* end of word */
1033 }
1034
1035 return 1;
1036}
1037
1038static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001039_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001040{
1041 /* check if code corresponds to the given name */
1042 int i;
1043 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001044 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001045 return 0;
1046 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001047 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001048 return 0;
1049 }
1050 return buffer[namelen] == '\0';
1051}
1052
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001053static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001054find_syllable(const char *str, int *len, int *pos, int count, int column)
1055{
1056 int i, len1;
1057 *len = -1;
1058 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001059 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001060 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 if (len1 <= *len)
1062 continue;
1063 if (strncmp(str, s, len1) == 0) {
1064 *len = len1;
1065 *pos = i;
1066 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001067 }
1068 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001070 }
1071}
1072
Fredrik Lundh06d12682001-01-24 07:59:11 +00001073static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001074_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001075{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001076 /* check if named sequences are allowed */
1077 if (!with_named_seq && IS_NAMED_SEQ(cp))
1078 return 0;
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001079 /* if the code point is in the PUA range that we use for aliases,
1080 * convert it to obtain the right code point */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001081 if (IS_ALIAS(cp))
1082 *code = name_aliases[cp-aliases_start];
1083 else
1084 *code = cp;
1085 return 1;
1086}
1087
1088static int
1089_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1090 int with_named_seq)
1091{
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001092 /* Return the code point associated with the given name.
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001093 * Named aliases are resolved too (unless self != NULL (i.e. we are using
Serhiy Storchakad3faf432015-01-18 11:28:37 +02001094 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001095 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001096 unsigned int h, v;
1097 unsigned int mask = code_size-1;
1098 unsigned int i, incr;
1099
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001100 /* Check for hangul syllables. */
1101 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001102 int len, L = -1, V = -1, T = -1;
1103 const char *pos = name + 16;
1104 find_syllable(pos, &len, &L, LCount, 0);
1105 pos += len;
1106 find_syllable(pos, &len, &V, VCount, 1);
1107 pos += len;
1108 find_syllable(pos, &len, &T, TCount, 2);
1109 pos += len;
1110 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1111 *code = SBase + (L*VCount+V)*TCount + T;
1112 return 1;
1113 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001114 /* Otherwise, it's an illegal syllable name. */
1115 return 0;
1116 }
1117
1118 /* Check for unified ideographs. */
1119 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1120 /* Four or five hexdigits must follow. */
1121 v = 0;
1122 name += 22;
1123 namelen -= 22;
1124 if (namelen != 4 && namelen != 5)
1125 return 0;
1126 while (namelen--) {
1127 v *= 16;
1128 if (*name >= '0' && *name <= '9')
1129 v += *name - '0';
1130 else if (*name >= 'A' && *name <= 'F')
1131 v += *name - 'A' + 10;
1132 else
1133 return 0;
1134 name++;
1135 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001136 if (!is_unified_ideograph(v))
1137 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001138 *code = v;
1139 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001140 }
1141
Fredrik Lundh06d12682001-01-24 07:59:11 +00001142 /* the following is the same as python's dictionary lookup, with
1143 only minor changes. see the makeunicodedata script for more
1144 details */
1145
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001146 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001147 i = (~h) & mask;
1148 v = code_hash[i];
1149 if (!v)
1150 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001151 if (_cmpname(self, v, name, namelen))
1152 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001153 incr = (h ^ (h >> 3)) & mask;
1154 if (!incr)
1155 incr = mask;
1156 for (;;) {
1157 i = (i + incr) & mask;
1158 v = code_hash[i];
1159 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001160 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001161 if (_cmpname(self, v, name, namelen))
1162 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001163 incr = incr << 1;
1164 if (incr > mask)
1165 incr = incr ^ code_poly;
1166 }
1167}
1168
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001169static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001170{
1171 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001172 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001173 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001174};
1175
1176/* -------------------------------------------------------------------- */
1177/* Python bindings */
1178
Serhiy Storchaka63596412015-04-17 21:18:49 +03001179/*[clinic input]
1180unicodedata.UCD.name
1181
1182 self: self
1183 chr: int(types={'str'})
1184 default: object=NULL
1185 /
1186
1187Returns the name assigned to the character chr as a string.
1188
1189If no name is defined, default is returned, or, if not given,
1190ValueError is raised.
1191[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001192
Fredrik Lundh06d12682001-01-24 07:59:11 +00001193static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001194unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1195/*[clinic end generated code: output=6bbb37a326407707 input=51ee2f971c918113]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001196{
1197 char name[NAME_MAXLEN];
Serhiy Storchaka63596412015-04-17 21:18:49 +03001198 Py_UCS4 c = (Py_UCS4)chr;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001199
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001200 if (!_getucname(self, c, name, sizeof(name), 0)) {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001201 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001202 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001203 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001204 }
1205 else {
Serhiy Storchaka63596412015-04-17 21:18:49 +03001206 Py_INCREF(default_value);
1207 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001209 }
1210
Walter Dörwald4254e762007-06-05 16:04:09 +00001211 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001212}
1213
Serhiy Storchaka63596412015-04-17 21:18:49 +03001214/*[clinic input]
1215unicodedata.UCD.lookup
1216
1217 self: self
1218 name: str(types={'str', 'robuffer'}, length=True)
1219 /
1220
1221Look up character by name.
1222
1223If a character with the given name is found, return the
1224corresponding character. If not found, KeyError is raised.
1225[clinic start generated code]*/
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001226
Fredrik Lundh06d12682001-01-24 07:59:11 +00001227static PyObject *
Serhiy Storchaka63596412015-04-17 21:18:49 +03001228unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1229 Py_ssize_clean_t name_length)
1230/*[clinic end generated code: output=765cb8186788e6be input=f2bf29706135a590]*/
Fredrik Lundh06d12682001-01-24 07:59:11 +00001231{
1232 Py_UCS4 code;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001233 unsigned int index;
Serhiy Storchaka63596412015-04-17 21:18:49 +03001234 if (name_length > INT_MAX) {
Victor Stinner65a31442014-07-01 16:45:52 +02001235 PyErr_SetString(PyExc_KeyError, "name too long");
1236 return NULL;
1237 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001238
Serhiy Storchaka63596412015-04-17 21:18:49 +03001239 if (!_getcode(self, name, (int)name_length, &code, 1)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001240 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001241 return NULL;
1242 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001243 /* check if code is in the PUA range that we use for named sequences
1244 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001245 if (IS_NAMED_SEQ(code)) {
1246 index = code-named_sequences_start;
1247 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1248 named_sequences[index].seq,
1249 named_sequences[index].seqlen);
1250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001252}
1253
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001254/* XXX Add doc strings. */
1255
1256static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001257 UNICODEDATA_UCD_DECIMAL_METHODDEF
Serhiy Storchaka63596412015-04-17 21:18:49 +03001258 UNICODEDATA_UCD_DIGIT_METHODDEF
1259 UNICODEDATA_UCD_NUMERIC_METHODDEF
1260 UNICODEDATA_UCD_CATEGORY_METHODDEF
1261 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1262 UNICODEDATA_UCD_COMBINING_METHODDEF
1263 UNICODEDATA_UCD_MIRRORED_METHODDEF
1264 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1265 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1266 UNICODEDATA_UCD_NAME_METHODDEF
1267 UNICODEDATA_UCD_LOOKUP_METHODDEF
1268 UNICODEDATA_UCD_NORMALIZE_METHODDEF
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001269 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001270};
1271
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001272static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 /* The ob_type field must be initialized in the module init function
1274 * to be portable to Windows without using C++. */
1275 PyVarObject_HEAD_INIT(NULL, 0)
1276 "unicodedata.UCD", /*tp_name*/
1277 sizeof(PreviousDBVersion), /*tp_basicsize*/
1278 0, /*tp_itemsize*/
1279 /* methods */
1280 (destructor)PyObject_Del, /*tp_dealloc*/
1281 0, /*tp_print*/
1282 0, /*tp_getattr*/
1283 0, /*tp_setattr*/
1284 0, /*tp_reserved*/
1285 0, /*tp_repr*/
1286 0, /*tp_as_number*/
1287 0, /*tp_as_sequence*/
1288 0, /*tp_as_mapping*/
1289 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001290 0, /*tp_call*/
1291 0, /*tp_str*/
1292 PyObject_GenericGetAttr,/*tp_getattro*/
1293 0, /*tp_setattro*/
1294 0, /*tp_as_buffer*/
1295 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1296 0, /*tp_doc*/
1297 0, /*tp_traverse*/
1298 0, /*tp_clear*/
1299 0, /*tp_richcompare*/
1300 0, /*tp_weaklistoffset*/
1301 0, /*tp_iter*/
1302 0, /*tp_iternext*/
1303 unicodedata_functions, /*tp_methods*/
1304 DB_members, /*tp_members*/
1305 0, /*tp_getset*/
1306 0, /*tp_base*/
1307 0, /*tp_dict*/
1308 0, /*tp_descr_get*/
1309 0, /*tp_descr_set*/
1310 0, /*tp_dictoffset*/
1311 0, /*tp_init*/
1312 0, /*tp_alloc*/
1313 0, /*tp_new*/
1314 0, /*tp_free*/
1315 0, /*tp_is_gc*/
1316};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001317
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001318PyDoc_STRVAR(unicodedata_docstring,
1319"This module provides access to the Unicode Character Database which\n\
1320defines character properties for all Unicode characters. The data in\n\
1321this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001322" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001323\n\
1324The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001325UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001326
1327static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 PyModuleDef_HEAD_INIT,
1329 "unicodedata",
1330 unicodedata_docstring,
1331 -1,
1332 unicodedata_functions,
1333 NULL,
1334 NULL,
1335 NULL,
1336 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001337};
1338
Mark Hammond62b1ab12002-07-23 06:31:15 +00001339PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001340PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001341{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001342 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001343
Christian Heimes90aa7642007-12-19 02:45:37 +00001344 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001345
Martin v. Löwis1a214512008-06-11 05:26:20 +00001346 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001347 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001348 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001349
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001350 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001351 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001352 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001353
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001354 /* Previous versions */
1355 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1356 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001357 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001358
Fredrik Lundh06d12682001-01-24 07:59:11 +00001359 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001360 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001361 if (v != NULL)
1362 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001363 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001364}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001365
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001367Local variables:
1368c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001369indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001370End:
1371*/