blob: 3253db21f1b94098aa2e7f5d373b5851989b9d33 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
Larry Hastings61272b72014-01-07 12:41:53 -080020/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080021module unicodedata
Larry Hastingsc2047262014-01-25 20:43:29 -080022class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Larry Hastings61272b72014-01-07 12:41:53 -080023[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080024/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080025
Fredrik Lundh06d12682001-01-24 07:59:11 +000026/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000027
/* One entry of the character property table (_PyUnicode_Database_Records).
   Entries are located through _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
39
/* Per-character differences between the current database and a previous
   database version.

   The sequence of fields should be the same as in merge_old_version.

   For the unsigned char fields the readers in this file treat 0xFF as
   "no change".  A category_changed of 0 is treated as "unassigned in the
   previous version". */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    /* NOTE(review): encoding of "no change" for this field is not visible
       from the code here -- confirm against the generator. */
    const double numeric_changed;
} change_record;
48
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049/* data file generated by Tools/unicode/makeunicodedata.py */
50#include "unicodedata_db.h"
51
52static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000053_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000054{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000055 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000056 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 index = 0;
58 else {
59 index = index1[(code>>SHIFT)];
60 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
61 }
62
63 return &_PyUnicode_Database_Records[index];
64}
65
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000066/* ------------- Previous-version API ------------------------------------- */
67typedef struct previous_version {
68 PyObject_HEAD
69 const char *name;
70 const change_record* (*getrecord)(Py_UCS4);
71 Py_UCS4 (*normalization)(Py_UCS4);
72} PreviousDBVersion;
73
74#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078 {NULL}
79};
80
Thomas Wouters89f507f2006-12-13 04:49:30 +000081/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000082static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000083#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000084
85static PyObject*
86new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
87 Py_UCS4 (*normalization)(Py_UCS4))
88{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 PreviousDBVersion *self;
90 self = PyObject_New(PreviousDBVersion, &UCD_Type);
91 if (self == NULL)
92 return NULL;
93 self->name = name;
94 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000095 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000096 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000097}
98
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000099
100static Py_UCS4 getuchar(PyUnicodeObject *obj)
101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200102 if (PyUnicode_READY(obj))
103 return (Py_UCS4)-1;
104 if (PyUnicode_GET_LENGTH(obj) == 1) {
105 if (PyUnicode_READY(obj))
106 return (Py_UCS4)-1;
107 return PyUnicode_READ_CHAR(obj, 0);
108 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000109 PyErr_SetString(PyExc_TypeError,
110 "need a single Unicode character as parameter");
111 return (Py_UCS4)-1;
112}
113
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000114/* --- Module API --------------------------------------------------------- */
115
Larry Hastings61272b72014-01-07 12:41:53 -0800116/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -0800117
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800118unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700119
Larry Hastings77561cc2014-01-07 12:13:13 -0800120 unichr: object(type='PyUnicodeObject *', subclass_of='&PyUnicode_Type')
Larry Hastings31826802013-10-19 00:09:25 -0700121 default: object=NULL
122 /
123
124Converts a Unicode character into its equivalent decimal value.
125
126Returns the decimal value assigned to the Unicode character unichr
127as integer. If no such value is defined, default is returned, or, if
128not given, ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800129[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700130
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800131PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -0800132"decimal($self, unichr, default=None, /)\n"
133"--\n"
134"\n"
Larry Hastings31826802013-10-19 00:09:25 -0700135"Converts a Unicode character into its equivalent decimal value.\n"
136"\n"
Larry Hastings31826802013-10-19 00:09:25 -0700137"Returns the decimal value assigned to the Unicode character unichr\n"
138"as integer. If no such value is defined, default is returned, or, if\n"
139"not given, ValueError is raised.");
140
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800141#define UNICODEDATA_UCD_DECIMAL_METHODDEF \
142 {"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__},
Larry Hastings31826802013-10-19 00:09:25 -0700143
144static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800145unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value);
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000146
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800148unicodedata_UCD_decimal(PreviousDBVersion *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000149{
Larry Hastings31826802013-10-19 00:09:25 -0700150 PyObject *return_value = NULL;
Larry Hastings77561cc2014-01-07 12:13:13 -0800151 PyUnicodeObject *unichr;
Larry Hastings31826802013-10-19 00:09:25 -0700152 PyObject *default_value = NULL;
153
154 if (!PyArg_ParseTuple(args,
155 "O!|O:decimal",
156 &PyUnicode_Type, &unichr, &default_value))
157 goto exit;
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800158 return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value);
Larry Hastings31826802013-10-19 00:09:25 -0700159
160exit:
161 return return_value;
162}
163
164static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800165unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value)
Larry Hastings2623c8c2014-02-08 22:15:29 -0800166/*[clinic end generated code: output=8689669896d293df input=c25c9d2b4de076b1]*/
Larry Hastings31826802013-10-19 00:09:25 -0700167{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000168 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000170 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171
Larry Hastingsc2047262014-01-25 20:43:29 -0800172 c = getuchar(unichr);
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000173 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000174 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000175
Martin v. Löwis1a214512008-06-11 05:26:20 +0000176 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000177 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000178 if (old->category_changed == 0) {
179 /* unassigned */
180 have_old = 1;
181 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000183 else if (old->decimal_changed != 0xFF) {
184 have_old = 1;
185 rc = old->decimal_changed;
186 }
187 }
188
189 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000190 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000191 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700192 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193 PyErr_SetString(PyExc_ValueError,
194 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000195 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 }
197 else {
Larry Hastings31826802013-10-19 00:09:25 -0700198 Py_INCREF(default_value);
199 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000202 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203}
204
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000205PyDoc_STRVAR(unicodedata_digit__doc__,
206"digit(unichr[, default])\n\
207\n\
208Returns the digit value assigned to the Unicode character unichr as\n\
209integer. If no such value is defined, default is returned, or, if\n\
210not given, ValueError is raised.");
211
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000212static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000213unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000214{
215 PyUnicodeObject *v;
216 PyObject *defobj = NULL;
217 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000218 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000219
Fredrik Lundh06d12682001-01-24 07:59:11 +0000220 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000221 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000222 c = getuchar(v);
223 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000224 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000225 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000226 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 if (defobj == NULL) {
228 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000229 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000230 }
231 else {
232 Py_INCREF(defobj);
233 return defobj;
234 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000235 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000236 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000237}
238
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000239PyDoc_STRVAR(unicodedata_numeric__doc__,
240"numeric(unichr[, default])\n\
241\n\
242Returns the numeric value assigned to the Unicode character unichr\n\
243as float. If no such value is defined, default is returned, or, if\n\
244not given, ValueError is raised.");
245
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000246static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000247unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000248{
249 PyUnicodeObject *v;
250 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000251 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000253 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254
Fredrik Lundh06d12682001-01-24 07:59:11 +0000255 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000256 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000257 c = getuchar(v);
258 if (c == (Py_UCS4)-1)
259 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000260
Martin v. Löwis1a214512008-06-11 05:26:20 +0000261 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000262 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000263 if (old->category_changed == 0) {
264 /* unassigned */
265 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000268 else if (old->decimal_changed != 0xFF) {
269 have_old = 1;
270 rc = old->decimal_changed;
271 }
272 }
273
274 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000275 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000276 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000277 if (defobj == NULL) {
278 PyErr_SetString(PyExc_ValueError, "not a numeric character");
279 return NULL;
280 }
281 else {
282 Py_INCREF(defobj);
283 return defobj;
284 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285 }
286 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000287}
288
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000289PyDoc_STRVAR(unicodedata_category__doc__,
290"category(unichr)\n\
291\n\
292Returns the general category assigned to the Unicode character\n\
293unichr as string.");
294
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000295static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000296unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000297{
298 PyUnicodeObject *v;
299 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000300 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000301
302 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000303 &PyUnicode_Type, &v))
304 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000305 c = getuchar(v);
306 if (c == (Py_UCS4)-1)
307 return NULL;
308 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000309 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000310 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000311 if (old->category_changed != 0xFF)
312 index = old->category_changed;
313 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000314 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000315}
316
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000317PyDoc_STRVAR(unicodedata_bidirectional__doc__,
318"bidirectional(unichr)\n\
319\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200320Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000321unichr as string. If no such value is defined, an empty string is\n\
322returned.");
323
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000324static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000325unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000326{
327 PyUnicodeObject *v;
328 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000329 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000330
331 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000332 &PyUnicode_Type, &v))
333 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000334 c = getuchar(v);
335 if (c == (Py_UCS4)-1)
336 return NULL;
337 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000338 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000339 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000340 if (old->category_changed == 0)
341 index = 0; /* unassigned */
342 else if (old->bidir_changed != 0xFF)
343 index = old->bidir_changed;
344 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000345 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000346}
347
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000348PyDoc_STRVAR(unicodedata_combining__doc__,
349"combining(unichr)\n\
350\n\
351Returns the canonical combining class assigned to the Unicode\n\
352character unichr as integer. Returns 0 if no combining class is\n\
353defined.");
354
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000355static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000356unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000357{
358 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000359 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000360 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000361
362 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000363 &PyUnicode_Type, &v))
364 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000365 c = getuchar(v);
366 if (c == (Py_UCS4)-1)
367 return NULL;
368 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000369 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000370 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000371 if (old->category_changed == 0)
372 index = 0; /* unassigned */
373 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000374 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000375}
376
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000377PyDoc_STRVAR(unicodedata_mirrored__doc__,
378"mirrored(unichr)\n\
379\n\
380Returns the mirrored property assigned to the Unicode character\n\
381unichr as integer. Returns 1 if the character has been identified as\n\
382a \"mirrored\" character in bidirectional text, 0 otherwise.");
383
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000384static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000385unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000386{
387 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000388 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000389 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000390
391 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000392 &PyUnicode_Type, &v))
393 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000394 c = getuchar(v);
395 if (c == (Py_UCS4)-1)
396 return NULL;
397 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000398 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000399 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000400 if (old->category_changed == 0)
401 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000402 else if (old->mirrored_changed != 0xFF)
403 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000404 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000405 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000406}
407
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000408PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
409"east_asian_width(unichr)\n\
410\n\
411Returns the east asian width assigned to the Unicode character\n\
412unichr as string.");
413
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000414static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000415unicodedata_east_asian_width(PyObject *self, PyObject *args)
416{
417 PyUnicodeObject *v;
418 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000419 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000420
421 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 &PyUnicode_Type, &v))
423 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000424 c = getuchar(v);
425 if (c == (Py_UCS4)-1)
426 return NULL;
427 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000428 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000429 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000430 if (old->category_changed == 0)
431 index = 0; /* unassigned */
432 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000433 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000434}
435
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000436PyDoc_STRVAR(unicodedata_decomposition__doc__,
437"decomposition(unichr)\n\
438\n\
439Returns the character decomposition mapping assigned to the Unicode\n\
440character unichr as string. An empty string is returned in case no\n\
441such mapping is defined.");
442
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000443static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000445{
446 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000448 int code, index, count;
449 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000450 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000451 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000452
453 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000454 &PyUnicode_Type, &v))
455 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000456 c = getuchar(v);
457 if (c == (Py_UCS4)-1)
458 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000459
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000460 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000461
Martin v. Löwis1a214512008-06-11 05:26:20 +0000462 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000463 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000464 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000465 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000466 }
467
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000468 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000469 index = 0;
470 else {
471 index = decomp_index1[(code>>DECOMP_SHIFT)];
472 index = decomp_index2[(index<<DECOMP_SHIFT)+
473 (code&((1<<DECOMP_SHIFT)-1))];
474 }
475
Tim Peters69b83b12001-11-30 07:23:05 +0000476 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000477 is prefix code (from*/
478 count = decomp_data[index] >> 8;
479
480 /* XXX: could allocate the PyString up front instead
481 (strlen(prefix) + 5 * count + 1 bytes) */
482
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000483 /* Based on how index is calculated above and decomp_data is generated
484 from Tools/unicode/makeunicodedata.py, it should not be possible
485 to overflow decomp_prefix. */
486 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200487 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000488
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000489 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000490 i = strlen(decomp_prefix[prefix_index]);
491 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000492
493 while (count-- > 0) {
494 if (i)
495 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000496 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000497 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
498 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000499 i += strlen(decomp + i);
500 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000501 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000502}
503
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000504static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000505get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000507 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000508 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000509 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000510 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000511 /* unassigned in old version */
512 *index = 0;
513 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000514 else {
515 *index = decomp_index1[(code>>DECOMP_SHIFT)];
516 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
517 (code&((1<<DECOMP_SHIFT)-1))];
518 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000519
Martin v. Löwis677bde22002-11-23 22:08:15 +0000520 /* high byte is number of hex bytes (usually one or two), low byte
521 is prefix code (from*/
522 *count = decomp_data[*index] >> 8;
523 *prefix = decomp_data[*index] & 255;
524
525 (*index)++;
526}
527
/* Constants for the arithmetic decomposition of Hangul syllables:
   first code point of the syllable block and of the three jamo ranges,
   followed by the sizes of those ranges. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
/* derived sizes */
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
537
538static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000539nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000540{
541 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200542 Py_UCS4 *output;
543 Py_ssize_t i, o, osize;
544 int kind;
545 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200547 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000548 Py_ssize_t space, isize;
549 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000550 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551
Martin v. Löwis677bde22002-11-23 22:08:15 +0000552 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200553 isize = PyUnicode_GET_LENGTH(input);
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300554 /* Overallocate at most 10 characters. */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000555 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200556 osize = space;
557 output = PyMem_Malloc(space * sizeof(Py_UCS4));
558 if (!output) {
559 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000560 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200561 }
562 i = o = 0;
563 kind = PyUnicode_KIND(input);
564 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000565
Martin v. Löwis22970662011-09-29 13:39:38 +0200566 while (i < isize) {
567 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000568 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200569 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000570 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300571 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000572 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000573 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200574 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000575 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000576 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
577 if (new_output == NULL) {
578 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200579 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000580 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200581 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000582 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000583 }
584 /* Hangul Decomposition. */
585 if (SBase <= code && code < (SBase+SCount)) {
586 int SIndex = code - SBase;
587 int L = LBase + SIndex / NCount;
588 int V = VBase + (SIndex % NCount) / TCount;
589 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200590 output[o++] = L;
591 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000592 space -= 2;
593 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200594 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000595 space --;
596 }
597 continue;
598 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000599 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000600 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000601 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
602 if (value != 0) {
603 stack[stackptr++] = value;
604 continue;
605 }
606 }
607
608 /* Other decompositions. */
609 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000610
611 /* Copy character if it is not decomposable, or has a
612 compatibility decomposition, but we do NFD. */
613 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200614 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000615 space--;
616 continue;
617 }
618 /* Copy decomposition onto the stack, in reverse
619 order. */
620 while(count) {
621 code = decomp_data[index + (--count)];
622 stack[stackptr++] = code;
623 }
624 }
625 }
626
Martin v. Löwis22970662011-09-29 13:39:38 +0200627 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
628 output, o);
629 PyMem_Free(output);
630 if (!result)
631 return NULL;
632 /* result is guaranteed to be ready, as it is compact. */
633 kind = PyUnicode_KIND(result);
634 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000635
636 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200637 i = 0;
638 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
639 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
640 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000641 if (prev == 0 || cur == 0 || prev <= cur) {
642 prev = cur;
643 continue;
644 }
645 /* Non-canonical order. Need to switch *i with previous. */
646 o = i - 1;
647 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200648 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
649 PyUnicode_WRITE(kind, data, o+1,
650 PyUnicode_READ(kind, data, o));
651 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200653 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200655 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000656 if (prev == 0 || prev <= cur)
657 break;
658 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200659 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000660 }
661 return result;
662}
663
664static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200665find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200667 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200669 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000670 if (code < start)
671 return -1;
672 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200673 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000674 return nfc[index].index + delta;
675 }
676 }
677 return -1;
678}
679
680static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000681nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000682{
683 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200684 int kind;
685 void *data;
686 Py_UCS4 *output;
687 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000688 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200689 Py_UCS4 code;
690 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000691 int cskipped = 0;
692
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000693 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 if (!result)
695 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200696 /* result will be "ready". */
697 kind = PyUnicode_KIND(result);
698 data = PyUnicode_DATA(result);
699 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000700
Martin v. Löwis22970662011-09-29 13:39:38 +0200701 /* We allocate a buffer for the output.
702 If we find that we made no changes, we still return
703 the NFD result. */
704 output = PyMem_Malloc(len * sizeof(Py_UCS4));
705 if (!output) {
706 PyErr_NoMemory();
707 Py_DECREF(result);
708 return 0;
709 }
710 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711
Martin v. Löwis677bde22002-11-23 22:08:15 +0000712 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200713 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000714 for (index = 0; index < cskipped; index++) {
715 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000716 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000717 Remove from list. */
718 skipped[index] = skipped[cskipped-1];
719 cskipped--;
720 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000721 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000722 }
723 }
724 /* Hangul Composition. We don't need to check for <LV,T>
725 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200726 code = PyUnicode_READ(kind, data, i);
727 if (LBase <= code && code < (LBase+LCount) &&
728 i + 1 < len &&
729 VBase <= PyUnicode_READ(kind, data, i+1) &&
730 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000731 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200732 LIndex = code - LBase;
733 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000734 code = SBase + (LIndex*VCount+VIndex)*TCount;
735 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200736 if (i < len &&
737 TBase <= PyUnicode_READ(kind, data, i) &&
738 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
739 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000740 i++;
741 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200742 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000743 continue;
744 }
745
Martin v. Löwis22970662011-09-29 13:39:38 +0200746 /* code is still input[i] here */
747 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000748 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200749 output[o++] = code;
750 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000751 continue;
752 }
753 /* Find next unblocked character. */
754 i1 = i+1;
755 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200756 /* output base character for now; might be updated later. */
757 output[o] = PyUnicode_READ(kind, data, i);
758 while (i1 < len) {
759 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
760 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000761 if (comb) {
762 if (comb1 == 0)
763 break;
764 if (comb >= comb1) {
765 /* Character is blocked. */
766 i1++;
767 continue;
768 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000769 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200770 l = find_nfc_index(self, nfc_last, code1);
771 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000772 is a starter, we don't need to look further.
773 Otherwise, record the combining class. */
774 if (l == -1) {
775 not_combinable:
776 if (comb1 == 0)
777 break;
778 comb = comb1;
779 i1++;
780 continue;
781 }
782 index = f*TOTAL_LAST + l;
783 index1 = comp_index[index >> COMP_SHIFT];
784 code = comp_data[(index1<<COMP_SHIFT)+
785 (index&((1<<COMP_SHIFT)-1))];
786 if (code == 0)
787 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788
Martin v. Löwis677bde22002-11-23 22:08:15 +0000789 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200790 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000791 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000792 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000793 skipped[cskipped++] = i1;
794 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200795 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000796 if (f == -1)
797 break;
798 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200799 /* Output character was already written.
800 Just advance the indices. */
801 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000802 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200803 if (o == len) {
804 /* No changes. Return original string. */
805 PyMem_Free(output);
806 return result;
807 }
808 Py_DECREF(result);
809 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
810 output, o);
811 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000812 return result;
813}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000814
815/* Return 1 if the input is certainly normalized, 0 if it might not be. */
816static int
817is_normalized(PyObject *self, PyObject *input, int nfc, int k)
818{
Martin v. Löwis22970662011-09-29 13:39:38 +0200819 Py_ssize_t i, len;
820 int kind;
821 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000822 unsigned char prev_combining = 0, quickcheck_mask;
823
824 /* An older version of the database is requested, quickchecks must be
825 disabled. */
826 if (self && UCD_Check(self))
827 return 0;
828
829 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
830 as described in http://unicode.org/reports/tr15/#Annex8. */
831 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
832
Martin v. Löwis22970662011-09-29 13:39:38 +0200833 i = 0;
834 kind = PyUnicode_KIND(input);
835 data = PyUnicode_DATA(input);
836 len = PyUnicode_GET_LENGTH(input);
837 while (i < len) {
838 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
839 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000840 unsigned char combining = record->combining;
841 unsigned char quickcheck = record->normalization_quick_check;
842
843 if (quickcheck & quickcheck_mask)
844 return 0; /* this string might need normalization */
845 if (combining && prev_combining > combining)
846 return 0; /* non-canonical sort order, not normalized */
847 prev_combining = combining;
848 }
849 return 1; /* certainly normalized */
850}
851
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000852PyDoc_STRVAR(unicodedata_normalize__doc__,
853"normalize(form, unistr)\n\
854\n\
855Return the normal form 'form' for the Unicode string unistr. Valid\n\
856values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
857
Martin v. Löwis677bde22002-11-23 22:08:15 +0000858static PyObject*
859unicodedata_normalize(PyObject *self, PyObject *args)
860{
861 char *form;
862 PyObject *input;
863
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000864 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000865 &form, &PyUnicode_Type, &input))
866 return NULL;
867
Martin v. Löwis22970662011-09-29 13:39:38 +0200868 if (PyUnicode_READY(input) == -1)
869 return NULL;
870
871 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000872 /* Special case empty input strings, since resizing
873 them later would cause internal errors. */
874 Py_INCREF(input);
875 return input;
876 }
877
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000878 if (strcmp(form, "NFC") == 0) {
879 if (is_normalized(self, input, 1, 0)) {
880 Py_INCREF(input);
881 return input;
882 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000883 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000884 }
885 if (strcmp(form, "NFKC") == 0) {
886 if (is_normalized(self, input, 1, 1)) {
887 Py_INCREF(input);
888 return input;
889 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000890 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000891 }
892 if (strcmp(form, "NFD") == 0) {
893 if (is_normalized(self, input, 0, 0)) {
894 Py_INCREF(input);
895 return input;
896 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000897 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000898 }
899 if (strcmp(form, "NFKD") == 0) {
900 if (is_normalized(self, input, 0, 1)) {
901 Py_INCREF(input);
902 return input;
903 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000904 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000905 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000906 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
907 return NULL;
908}
909
Fredrik Lundh06d12682001-01-24 07:59:11 +0000910/* -------------------------------------------------------------------- */
911/* unicode character name tables */
912
913/* data file generated by Tools/unicode/makeunicodedata.py */
914#include "unicodename_db.h"
915
916/* -------------------------------------------------------------------- */
917/* database code (cut and pasted from the unidb package) */
918
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and mixed in as h = h * scale + byte.
   Whenever any of bits 24..31 become set, that byte is XORed into the
   low bits and the value is cut back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        pos++;
        high = hash & 0xff000000;
        if (high)
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
933
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the leading-consonant index, column 1 by the
   vowel index and column 2 by the trailing-consonant index.  The columns
   have different lengths; the shorter ones are padded with NULL.  Note
   that the empty string is a real entry, not padding. */
static char *hangul_syllables[][3] = {
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
964
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000965/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000966static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000967is_unified_ideograph(Py_UCS4 code)
968{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000969 return
970 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500971 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000972 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
973 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
974 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000975}
976
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 * Both ranges are half-open: [start, end).
 * The argument is parenthesized so that expression arguments (e.g.
 * `x & mask`) are compared as a whole; it is still evaluated twice, so do
 * not pass an expression with side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
982
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000983static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300984_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
985 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000986{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300987 /* Find the name associated with the given codepoint.
988 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
989 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000990 int offset;
991 int i;
992 int word;
993 unsigned char* w;
994
Martin v. Löwisc3509122006-03-11 12:16:23 +0000995 if (code >= 0x110000)
996 return 0;
997
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300998 /* XXX should we just skip all the codepoints in the PUAs here? */
999 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1000 return 0;
1001
Martin v. Löwis1a214512008-06-11 05:26:20 +00001002 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001003 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001004 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001005 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1006 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001007 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001008 if (old->category_changed == 0) {
1009 /* unassigned */
1010 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001011 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001012 }
1013
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001014 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001015 /* Hangul syllable. */
1016 int SIndex = code - SBase;
1017 int L = SIndex / NCount;
1018 int V = (SIndex % NCount) / TCount;
1019 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001020
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021 if (buflen < 27)
1022 /* Worst case: HANGUL SYLLABLE <10chars>. */
1023 return 0;
1024 strcpy(buffer, "HANGUL SYLLABLE ");
1025 buffer += 16;
1026 strcpy(buffer, hangul_syllables[L][0]);
1027 buffer += strlen(hangul_syllables[L][0]);
1028 strcpy(buffer, hangul_syllables[V][1]);
1029 buffer += strlen(hangul_syllables[V][1]);
1030 strcpy(buffer, hangul_syllables[T][2]);
1031 buffer += strlen(hangul_syllables[T][2]);
1032 *buffer = '\0';
1033 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001034 }
1035
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001036 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001037 if (buflen < 28)
1038 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1039 return 0;
1040 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1041 return 1;
1042 }
1043
Fredrik Lundh06d12682001-01-24 07:59:11 +00001044 /* get offset into phrasebook */
1045 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1046 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1047 (code&((1<<phrasebook_shift)-1))];
1048 if (!offset)
1049 return 0;
1050
1051 i = 0;
1052
1053 for (;;) {
1054 /* get word index */
1055 word = phrasebook[offset] - phrasebook_short;
1056 if (word >= 0) {
1057 word = (word << 8) + phrasebook[offset+1];
1058 offset += 2;
1059 } else
1060 word = phrasebook[offset++];
1061 if (i) {
1062 if (i > buflen)
1063 return 0; /* buffer overflow */
1064 buffer[i++] = ' ';
1065 }
1066 /* copy word string from lexicon. the last character in the
1067 word has bit 7 set. the last word in a string ends with
1068 0x80 */
1069 w = lexicon + lexicon_offset[word];
1070 while (*w < 128) {
1071 if (i >= buflen)
1072 return 0; /* buffer overflow */
1073 buffer[i++] = *w++;
1074 }
1075 if (i >= buflen)
1076 return 0; /* buffer overflow */
1077 buffer[i++] = *w & 127;
1078 if (*w == 128)
1079 break; /* end of word */
1080 }
1081
1082 return 1;
1083}
1084
1085static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001086_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001087{
1088 /* check if code corresponds to the given name */
1089 int i;
1090 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001091 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001092 return 0;
1093 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001094 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001095 return 0;
1096 }
1097 return buffer[namelen] == '\0';
1098}
1099
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001100static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001101find_syllable(const char *str, int *len, int *pos, int count, int column)
1102{
1103 int i, len1;
1104 *len = -1;
1105 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001107 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001108 if (len1 <= *len)
1109 continue;
1110 if (strncmp(str, s, len1) == 0) {
1111 *len = len1;
1112 *pos = i;
1113 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001114 }
1115 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001116 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001117 }
1118}
1119
Fredrik Lundh06d12682001-01-24 07:59:11 +00001120static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001121_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001122{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001123 /* check if named sequences are allowed */
1124 if (!with_named_seq && IS_NAMED_SEQ(cp))
1125 return 0;
1126 /* if the codepoint is in the PUA range that we use for aliases,
1127 * convert it to obtain the right codepoint */
1128 if (IS_ALIAS(cp))
1129 *code = name_aliases[cp-aliases_start];
1130 else
1131 *code = cp;
1132 return 1;
1133}
1134
1135static int
1136_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1137 int with_named_seq)
1138{
1139 /* Return the codepoint associated with the given name.
1140 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1141 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1142 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001143 unsigned int h, v;
1144 unsigned int mask = code_size-1;
1145 unsigned int i, incr;
1146
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001147 /* Check for hangul syllables. */
1148 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001149 int len, L = -1, V = -1, T = -1;
1150 const char *pos = name + 16;
1151 find_syllable(pos, &len, &L, LCount, 0);
1152 pos += len;
1153 find_syllable(pos, &len, &V, VCount, 1);
1154 pos += len;
1155 find_syllable(pos, &len, &T, TCount, 2);
1156 pos += len;
1157 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1158 *code = SBase + (L*VCount+V)*TCount + T;
1159 return 1;
1160 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001161 /* Otherwise, it's an illegal syllable name. */
1162 return 0;
1163 }
1164
1165 /* Check for unified ideographs. */
1166 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1167 /* Four or five hexdigits must follow. */
1168 v = 0;
1169 name += 22;
1170 namelen -= 22;
1171 if (namelen != 4 && namelen != 5)
1172 return 0;
1173 while (namelen--) {
1174 v *= 16;
1175 if (*name >= '0' && *name <= '9')
1176 v += *name - '0';
1177 else if (*name >= 'A' && *name <= 'F')
1178 v += *name - 'A' + 10;
1179 else
1180 return 0;
1181 name++;
1182 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001183 if (!is_unified_ideograph(v))
1184 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001185 *code = v;
1186 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001187 }
1188
Fredrik Lundh06d12682001-01-24 07:59:11 +00001189 /* the following is the same as python's dictionary lookup, with
1190 only minor changes. see the makeunicodedata script for more
1191 details */
1192
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001193 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001194 i = (~h) & mask;
1195 v = code_hash[i];
1196 if (!v)
1197 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001198 if (_cmpname(self, v, name, namelen))
1199 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001200 incr = (h ^ (h >> 3)) & mask;
1201 if (!incr)
1202 incr = mask;
1203 for (;;) {
1204 i = (i + incr) & mask;
1205 v = code_hash[i];
1206 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001207 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001208 if (_cmpname(self, v, name, namelen))
1209 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001210 incr = incr << 1;
1211 if (incr > mask)
1212 incr = incr ^ code_poly;
1213 }
1214}
1215
/* Name-lookup C API table: the structure's own size followed by the
   code-point-to-name and name-to-code-point functions of this file.
   PyInit_unicodedata wraps a pointer to it in a capsule and stores that
   as the module attribute "ucnhash_CAPI". */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1222
1223/* -------------------------------------------------------------------- */
1224/* Python bindings */
1225
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001226PyDoc_STRVAR(unicodedata_name__doc__,
1227"name(unichr[, default])\n\
1228Returns the name assigned to the Unicode character unichr as a\n\
1229string. If no name is defined, default is returned, or, if not\n\
1230given, ValueError is raised.");
1231
Fredrik Lundh06d12682001-01-24 07:59:11 +00001232static PyObject *
1233unicodedata_name(PyObject* self, PyObject* args)
1234{
1235 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001236 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001237
1238 PyUnicodeObject* v;
1239 PyObject* defobj = NULL;
1240 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1241 return NULL;
1242
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001243 c = getuchar(v);
1244 if (c == (Py_UCS4)-1)
1245 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001246
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001247 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001248 if (defobj == NULL) {
1249 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001250 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001251 }
1252 else {
1253 Py_INCREF(defobj);
1254 return defobj;
1255 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001256 }
1257
Walter Dörwald4254e762007-06-05 16:04:09 +00001258 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001259}
1260
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001261PyDoc_STRVAR(unicodedata_lookup__doc__,
1262"lookup(name)\n\
1263\n\
1264Look up character by name. If a character with the\n\
1265given name is found, return the corresponding Unicode\n\
1266character. If not found, KeyError is raised.");
1267
Fredrik Lundh06d12682001-01-24 07:59:11 +00001268static PyObject *
1269unicodedata_lookup(PyObject* self, PyObject* args)
1270{
1271 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001272
1273 char* name;
1274 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001275 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001276 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1277 return NULL;
1278
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001279 if (!_getcode(self, name, namelen, &code, 1)) {
1280 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001281 return NULL;
1282 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001283 /* check if code is in the PUA range that we use for named sequences
1284 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001285 if (IS_NAMED_SEQ(code)) {
1286 index = code-named_sequences_start;
1287 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1288 named_sequences[index].seq,
1289 named_sequences[index].seqlen);
1290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001292}
1293
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001294/* XXX Add doc strings. */
1295
/* Method table.  It is used twice: as the function list of the module
   (see unicodedatamodule) and as tp_methods of UCD_Type, so every entry
   is callable both as unicodedata.xxx() and on a UCD object. */
static PyMethodDef unicodedata_functions[] = {
    /* Macro that expands to the table entry for "decimal"; it carries
       its own trailing comma. */
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
1318
/* Type object for "unicodedata.UCD".  Instances have the layout of
   PreviousDBVersion, expose the module's function table as methods and
   DB_members as attributes.
   NOTE(review): tp_new is 0, so instances are presumably only created
   from C via new_previous_version() -- confirm. */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_reserved*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001364
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001365PyDoc_STRVAR(unicodedata_docstring,
1366"This module provides access to the Unicode Character Database which\n\
1367defines character properties for all Unicode characters. The data in\n\
1368this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001369" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001370\n\
1371The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001372UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001373
/* Module definition passed to PyModule_Create() in PyInit_unicodedata.
   The entries after the head are, in order: module name, docstring,
   per-module state size, method table, and four unused slots. */
static struct PyModuleDef unicodedatamodule = {
        PyModuleDef_HEAD_INIT,
        "unicodedata",
        unicodedata_docstring,
        -1,
        unicodedata_functions,
        NULL,
        NULL,
        NULL,
        NULL
};
1385
Mark Hammond62b1ab12002-07-23 06:31:15 +00001386PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001387PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001388{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001389 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001390
Christian Heimes90aa7642007-12-19 02:45:37 +00001391 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001392
Martin v. Löwis1a214512008-06-11 05:26:20 +00001393 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001394 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001395 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001396
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001397 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001398 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001399 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001400
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001401 /* Previous versions */
1402 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1403 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001404 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001405
Fredrik Lundh06d12682001-01-24 07:59:11 +00001406 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001407 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001408 if (v != NULL)
1409 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001410 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001411}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001412
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001414Local variables:
1415c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001416indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001417End:
1418*/