blob: eca0054b45aefe0bbb8afdae8b9b294786d0e10c [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
Larry Hastings44e2eaa2013-11-23 15:37:55 -080020/*[clinic]
21module unicodedata
22class unicodedata.UCD
23[clinic]*/
24/*[clinic checksum: da39a3ee5e6b4b0d3255bfef95601890afd80709]*/
25
Fredrik Lundh06d12682001-01-24 07:59:11 +000026/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000027
/* One entry of _PyUnicode_Database_Records, the per-character property
   table that _getrecord_ex() indexes.  NOTE(review): the field order
   presumably has to match the initializers emitted into unicodedata_db.h
   by Tools/unicode/makeunicodedata.py -- do not reorder. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
39
/* Per-character delta describing how an older database version differs
   from the current one.  In the code below, 0xFF in bidir_changed,
   category_changed, decimal_changed or mirrored_changed means "same as the
   current database", and category_changed == 0 marks the character as
   unassigned in the old version. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
48
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049/* data file generated by Tools/unicode/makeunicodedata.py */
50#include "unicodedata_db.h"
51
52static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000053_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000054{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000055 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000056 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 index = 0;
58 else {
59 index = index1[(code>>SHIFT)];
60 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
61 }
62
63 return &_PyUnicode_Database_Records[index];
64}
65
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000066/* ------------- Previous-version API ------------------------------------- */
67typedef struct previous_version {
68 PyObject_HEAD
69 const char *name;
70 const change_record* (*getrecord)(Py_UCS4);
71 Py_UCS4 (*normalization)(Py_UCS4);
72} PreviousDBVersion;
73
74#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078 {NULL}
79};
80
Thomas Wouters89f507f2006-12-13 04:49:30 +000081/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000082static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000083#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000084
85static PyObject*
86new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
87 Py_UCS4 (*normalization)(Py_UCS4))
88{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 PreviousDBVersion *self;
90 self = PyObject_New(PreviousDBVersion, &UCD_Type);
91 if (self == NULL)
92 return NULL;
93 self->name = name;
94 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000095 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000096 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000097}
98
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000099
100static Py_UCS4 getuchar(PyUnicodeObject *obj)
101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200102 if (PyUnicode_READY(obj))
103 return (Py_UCS4)-1;
104 if (PyUnicode_GET_LENGTH(obj) == 1) {
105 if (PyUnicode_READY(obj))
106 return (Py_UCS4)-1;
107 return PyUnicode_READ_CHAR(obj, 0);
108 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000109 PyErr_SetString(PyExc_TypeError,
110 "need a single Unicode character as parameter");
111 return (Py_UCS4)-1;
112}
113
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000114/* --- Module API --------------------------------------------------------- */
115
/* Argument Clinic declaration for UCD.decimal(unichr[, default]).
   NOTE(review): the text from the closing [clinic] marker down to the
   "clinic checksum" comment appears to be tool-generated and guarded by
   that checksum -- change the declaration and regenerate rather than
   editing it by hand. */
/*[clinic]

unicodedata.UCD.decimal

    unichr: object(type='str')
    default: object=NULL
    /

Converts a Unicode character into its equivalent decimal value.

Returns the decimal value assigned to the Unicode character unichr
as integer. If no such value is defined, default is returned, or, if
not given, ValueError is raised.
[clinic]*/

PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
"decimal(unichr, default=None)\n"
"Converts a Unicode character into its equivalent decimal value.\n"
"\n"
"Returns the decimal value assigned to the Unicode character unichr\n"
"as integer. If no such value is defined, default is returned, or, if\n"
"not given, ValueError is raised.");

#define UNICODEDATA_UCD_DECIMAL_METHODDEF \
    {"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__},

static PyObject *
unicodedata_UCD_decimal_impl(PyObject *self, PyObject *unichr, PyObject *default_value);

static PyObject *
unicodedata_UCD_decimal(PyObject *self, PyObject *args)
{
    PyObject *return_value = NULL;
    PyObject *unichr;
    PyObject *default_value = NULL;

    if (!PyArg_ParseTuple(args,
        "O!|O:decimal",
        &PyUnicode_Type, &unichr, &default_value))
        goto exit;
    return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value);

exit:
    return return_value;
}
161
162static PyObject *
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800163unicodedata_UCD_decimal_impl(PyObject *self, PyObject *unichr, PyObject *default_value)
Larry Hastings44e2eaa2013-11-23 15:37:55 -0800164/*[clinic checksum: 9576fa55f4ea0be82968af39dc9d0283e634beeb]*/
Larry Hastings31826802013-10-19 00:09:25 -0700165{
166 PyUnicodeObject *v = (PyUnicodeObject *)unichr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000167 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000168 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000169 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000171 c = getuchar(v);
172 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000173 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000174
Martin v. Löwis1a214512008-06-11 05:26:20 +0000175 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000176 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000177 if (old->category_changed == 0) {
178 /* unassigned */
179 have_old = 1;
180 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000181 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000182 else if (old->decimal_changed != 0xFF) {
183 have_old = 1;
184 rc = old->decimal_changed;
185 }
186 }
187
188 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000189 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700191 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000192 PyErr_SetString(PyExc_ValueError,
193 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000194 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 }
196 else {
Larry Hastings31826802013-10-19 00:09:25 -0700197 Py_INCREF(default_value);
198 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000199 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000200 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000201 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000202}
203
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000204PyDoc_STRVAR(unicodedata_digit__doc__,
205"digit(unichr[, default])\n\
206\n\
207Returns the digit value assigned to the Unicode character unichr as\n\
208integer. If no such value is defined, default is returned, or, if\n\
209not given, ValueError is raised.");
210
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000211static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000212unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000213{
214 PyUnicodeObject *v;
215 PyObject *defobj = NULL;
216 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000217 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000218
Fredrik Lundh06d12682001-01-24 07:59:11 +0000219 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000220 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000221 c = getuchar(v);
222 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000223 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000224 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000225 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000226 if (defobj == NULL) {
227 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000228 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 }
230 else {
231 Py_INCREF(defobj);
232 return defobj;
233 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000234 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000235 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000236}
237
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000238PyDoc_STRVAR(unicodedata_numeric__doc__,
239"numeric(unichr[, default])\n\
240\n\
241Returns the numeric value assigned to the Unicode character unichr\n\
242as float. If no such value is defined, default is returned, or, if\n\
243not given, ValueError is raised.");
244
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000245static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000246unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000247{
248 PyUnicodeObject *v;
249 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000250 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000251 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000252 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253
Fredrik Lundh06d12682001-01-24 07:59:11 +0000254 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000255 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000256 c = getuchar(v);
257 if (c == (Py_UCS4)-1)
258 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000259
Martin v. Löwis1a214512008-06-11 05:26:20 +0000260 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000261 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000262 if (old->category_changed == 0) {
263 /* unassigned */
264 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000265 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000267 else if (old->decimal_changed != 0xFF) {
268 have_old = 1;
269 rc = old->decimal_changed;
270 }
271 }
272
273 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000274 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000276 if (defobj == NULL) {
277 PyErr_SetString(PyExc_ValueError, "not a numeric character");
278 return NULL;
279 }
280 else {
281 Py_INCREF(defobj);
282 return defobj;
283 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000284 }
285 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000286}
287
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000288PyDoc_STRVAR(unicodedata_category__doc__,
289"category(unichr)\n\
290\n\
291Returns the general category assigned to the Unicode character\n\
292unichr as string.");
293
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000294static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000295unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000296{
297 PyUnicodeObject *v;
298 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000299 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000300
301 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000302 &PyUnicode_Type, &v))
303 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000304 c = getuchar(v);
305 if (c == (Py_UCS4)-1)
306 return NULL;
307 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000308 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000309 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000310 if (old->category_changed != 0xFF)
311 index = old->category_changed;
312 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000313 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000314}
315
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000316PyDoc_STRVAR(unicodedata_bidirectional__doc__,
317"bidirectional(unichr)\n\
318\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200319Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000320unichr as string. If no such value is defined, an empty string is\n\
321returned.");
322
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000323static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000324unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000325{
326 PyUnicodeObject *v;
327 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000328 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000329
330 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 &PyUnicode_Type, &v))
332 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000333 c = getuchar(v);
334 if (c == (Py_UCS4)-1)
335 return NULL;
336 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000337 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000338 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000339 if (old->category_changed == 0)
340 index = 0; /* unassigned */
341 else if (old->bidir_changed != 0xFF)
342 index = old->bidir_changed;
343 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000344 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000345}
346
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000347PyDoc_STRVAR(unicodedata_combining__doc__,
348"combining(unichr)\n\
349\n\
350Returns the canonical combining class assigned to the Unicode\n\
351character unichr as integer. Returns 0 if no combining class is\n\
352defined.");
353
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000354static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000355unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000356{
357 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000358 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000359 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000360
361 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000362 &PyUnicode_Type, &v))
363 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000364 c = getuchar(v);
365 if (c == (Py_UCS4)-1)
366 return NULL;
367 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000368 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000369 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000370 if (old->category_changed == 0)
371 index = 0; /* unassigned */
372 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000373 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000374}
375
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000376PyDoc_STRVAR(unicodedata_mirrored__doc__,
377"mirrored(unichr)\n\
378\n\
379Returns the mirrored property assigned to the Unicode character\n\
380unichr as integer. Returns 1 if the character has been identified as\n\
381a \"mirrored\" character in bidirectional text, 0 otherwise.");
382
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000383static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000384unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000385{
386 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000387 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000388 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000389
390 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000391 &PyUnicode_Type, &v))
392 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000393 c = getuchar(v);
394 if (c == (Py_UCS4)-1)
395 return NULL;
396 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000397 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000398 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000399 if (old->category_changed == 0)
400 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000401 else if (old->mirrored_changed != 0xFF)
402 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000403 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000404 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000405}
406
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000407PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
408"east_asian_width(unichr)\n\
409\n\
410Returns the east asian width assigned to the Unicode character\n\
411unichr as string.");
412
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000413static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000414unicodedata_east_asian_width(PyObject *self, PyObject *args)
415{
416 PyUnicodeObject *v;
417 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000418 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000419
420 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000421 &PyUnicode_Type, &v))
422 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000423 c = getuchar(v);
424 if (c == (Py_UCS4)-1)
425 return NULL;
426 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000427 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000428 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000429 if (old->category_changed == 0)
430 index = 0; /* unassigned */
431 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000432 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000433}
434
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000435PyDoc_STRVAR(unicodedata_decomposition__doc__,
436"decomposition(unichr)\n\
437\n\
438Returns the character decomposition mapping assigned to the Unicode\n\
439character unichr as string. An empty string is returned in case no\n\
440such mapping is defined.");
441
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000442static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000443unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000444{
445 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000447 int code, index, count;
448 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000449 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000450 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000451
452 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000453 &PyUnicode_Type, &v))
454 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000455 c = getuchar(v);
456 if (c == (Py_UCS4)-1)
457 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000458
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000459 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000460
Martin v. Löwis1a214512008-06-11 05:26:20 +0000461 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000462 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000463 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000464 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000465 }
466
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000467 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000468 index = 0;
469 else {
470 index = decomp_index1[(code>>DECOMP_SHIFT)];
471 index = decomp_index2[(index<<DECOMP_SHIFT)+
472 (code&((1<<DECOMP_SHIFT)-1))];
473 }
474
Tim Peters69b83b12001-11-30 07:23:05 +0000475 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000476 is prefix code (from*/
477 count = decomp_data[index] >> 8;
478
479 /* XXX: could allocate the PyString up front instead
480 (strlen(prefix) + 5 * count + 1 bytes) */
481
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000482 /* Based on how index is calculated above and decomp_data is generated
483 from Tools/unicode/makeunicodedata.py, it should not be possible
484 to overflow decomp_prefix. */
485 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200486 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000487
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000488 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000489 i = strlen(decomp_prefix[prefix_index]);
490 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000491
492 while (count-- > 0) {
493 if (i)
494 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000495 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000496 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
497 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000498 i += strlen(decomp + i);
499 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000500 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000501}
502
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000504get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000506 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000508 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000509 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000510 /* unassigned in old version */
511 *index = 0;
512 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000513 else {
514 *index = decomp_index1[(code>>DECOMP_SHIFT)];
515 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
516 (code&((1<<DECOMP_SHIFT)-1))];
517 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000518
Martin v. Löwis677bde22002-11-23 22:08:15 +0000519 /* high byte is number of hex bytes (usually one or two), low byte
520 is prefix code (from*/
521 *count = decomp_data[*index] >> 8;
522 *prefix = decomp_data[*index] & 255;
523
524 (*index)++;
525}
526
/* Constants for the arithmetic Hangul syllable decomposition used by
   nfd_nfkd(): base code points of the syllable (S), leading (L), vowel (V)
   and trailing (T) ranges, and the size of each range. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
536
537static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000538nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000539{
540 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200541 Py_UCS4 *output;
542 Py_ssize_t i, o, osize;
543 int kind;
544 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000545 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200546 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000547 Py_ssize_t space, isize;
548 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000549 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000550
Martin v. Löwis677bde22002-11-23 22:08:15 +0000551 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200552 isize = PyUnicode_GET_LENGTH(input);
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300553 /* Overallocate at most 10 characters. */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000554 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200555 osize = space;
556 output = PyMem_Malloc(space * sizeof(Py_UCS4));
557 if (!output) {
558 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000559 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200560 }
561 i = o = 0;
562 kind = PyUnicode_KIND(input);
563 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000564
Martin v. Löwis22970662011-09-29 13:39:38 +0200565 while (i < isize) {
566 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000567 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200568 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000569 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300570 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000571 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000572 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200573 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000574 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000575 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
576 if (new_output == NULL) {
577 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200578 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000579 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200580 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000581 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000582 }
583 /* Hangul Decomposition. */
584 if (SBase <= code && code < (SBase+SCount)) {
585 int SIndex = code - SBase;
586 int L = LBase + SIndex / NCount;
587 int V = VBase + (SIndex % NCount) / TCount;
588 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200589 output[o++] = L;
590 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000591 space -= 2;
592 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200593 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000594 space --;
595 }
596 continue;
597 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000598 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000599 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000600 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
601 if (value != 0) {
602 stack[stackptr++] = value;
603 continue;
604 }
605 }
606
607 /* Other decompositions. */
608 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000609
610 /* Copy character if it is not decomposable, or has a
611 compatibility decomposition, but we do NFD. */
612 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200613 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000614 space--;
615 continue;
616 }
617 /* Copy decomposition onto the stack, in reverse
618 order. */
619 while(count) {
620 code = decomp_data[index + (--count)];
621 stack[stackptr++] = code;
622 }
623 }
624 }
625
Martin v. Löwis22970662011-09-29 13:39:38 +0200626 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
627 output, o);
628 PyMem_Free(output);
629 if (!result)
630 return NULL;
631 /* result is guaranteed to be ready, as it is compact. */
632 kind = PyUnicode_KIND(result);
633 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634
635 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200636 i = 0;
637 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
638 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
639 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000640 if (prev == 0 || cur == 0 || prev <= cur) {
641 prev = cur;
642 continue;
643 }
644 /* Non-canonical order. Need to switch *i with previous. */
645 o = i - 1;
646 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200647 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
648 PyUnicode_WRITE(kind, data, o+1,
649 PyUnicode_READ(kind, data, o));
650 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200652 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000653 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200654 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000655 if (prev == 0 || prev <= cur)
656 break;
657 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200658 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000659 }
660 return result;
661}
662
663static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200664find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000665{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200666 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000667 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200668 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000669 if (code < start)
670 return -1;
671 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200672 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000673 return nfc[index].index + delta;
674 }
675 }
676 return -1;
677}
678
679static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000680nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000681{
682 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200683 int kind;
684 void *data;
685 Py_UCS4 *output;
686 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000687 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200688 Py_UCS4 code;
689 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000690 int cskipped = 0;
691
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000692 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000693 if (!result)
694 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200695 /* result will be "ready". */
696 kind = PyUnicode_KIND(result);
697 data = PyUnicode_DATA(result);
698 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000699
Martin v. Löwis22970662011-09-29 13:39:38 +0200700 /* We allocate a buffer for the output.
701 If we find that we made no changes, we still return
702 the NFD result. */
703 output = PyMem_Malloc(len * sizeof(Py_UCS4));
704 if (!output) {
705 PyErr_NoMemory();
706 Py_DECREF(result);
707 return 0;
708 }
709 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710
Martin v. Löwis677bde22002-11-23 22:08:15 +0000711 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200712 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000713 for (index = 0; index < cskipped; index++) {
714 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000715 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000716 Remove from list. */
717 skipped[index] = skipped[cskipped-1];
718 cskipped--;
719 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000720 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 }
722 }
723 /* Hangul Composition. We don't need to check for <LV,T>
724 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200725 code = PyUnicode_READ(kind, data, i);
726 if (LBase <= code && code < (LBase+LCount) &&
727 i + 1 < len &&
728 VBase <= PyUnicode_READ(kind, data, i+1) &&
729 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000730 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200731 LIndex = code - LBase;
732 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000733 code = SBase + (LIndex*VCount+VIndex)*TCount;
734 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200735 if (i < len &&
736 TBase <= PyUnicode_READ(kind, data, i) &&
737 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
738 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000739 i++;
740 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200741 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 continue;
743 }
744
Martin v. Löwis22970662011-09-29 13:39:38 +0200745 /* code is still input[i] here */
746 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000747 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200748 output[o++] = code;
749 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000750 continue;
751 }
752 /* Find next unblocked character. */
753 i1 = i+1;
754 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200755 /* output base character for now; might be updated later. */
756 output[o] = PyUnicode_READ(kind, data, i);
757 while (i1 < len) {
758 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
759 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000760 if (comb) {
761 if (comb1 == 0)
762 break;
763 if (comb >= comb1) {
764 /* Character is blocked. */
765 i1++;
766 continue;
767 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000768 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200769 l = find_nfc_index(self, nfc_last, code1);
770 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000771 is a starter, we don't need to look further.
772 Otherwise, record the combining class. */
773 if (l == -1) {
774 not_combinable:
775 if (comb1 == 0)
776 break;
777 comb = comb1;
778 i1++;
779 continue;
780 }
781 index = f*TOTAL_LAST + l;
782 index1 = comp_index[index >> COMP_SHIFT];
783 code = comp_data[(index1<<COMP_SHIFT)+
784 (index&((1<<COMP_SHIFT)-1))];
785 if (code == 0)
786 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787
Martin v. Löwis677bde22002-11-23 22:08:15 +0000788 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200789 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000790 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000791 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000792 skipped[cskipped++] = i1;
793 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200794 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000795 if (f == -1)
796 break;
797 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200798 /* Output character was already written.
799 Just advance the indices. */
800 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000801 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200802 if (o == len) {
803 /* No changes. Return original string. */
804 PyMem_Free(output);
805 return result;
806 }
807 Py_DECREF(result);
808 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
809 output, o);
810 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000811 return result;
812}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000813
814/* Return 1 if the input is certainly normalized, 0 if it might not be. */
815static int
816is_normalized(PyObject *self, PyObject *input, int nfc, int k)
817{
Martin v. Löwis22970662011-09-29 13:39:38 +0200818 Py_ssize_t i, len;
819 int kind;
820 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000821 unsigned char prev_combining = 0, quickcheck_mask;
822
823 /* An older version of the database is requested, quickchecks must be
824 disabled. */
825 if (self && UCD_Check(self))
826 return 0;
827
828 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
829 as described in http://unicode.org/reports/tr15/#Annex8. */
830 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
831
Martin v. Löwis22970662011-09-29 13:39:38 +0200832 i = 0;
833 kind = PyUnicode_KIND(input);
834 data = PyUnicode_DATA(input);
835 len = PyUnicode_GET_LENGTH(input);
836 while (i < len) {
837 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
838 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000839 unsigned char combining = record->combining;
840 unsigned char quickcheck = record->normalization_quick_check;
841
842 if (quickcheck & quickcheck_mask)
843 return 0; /* this string might need normalization */
844 if (combining && prev_combining > combining)
845 return 0; /* non-canonical sort order, not normalized */
846 prev_combining = combining;
847 }
848 return 1; /* certainly normalized */
849}
850
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000851PyDoc_STRVAR(unicodedata_normalize__doc__,
852"normalize(form, unistr)\n\
853\n\
854Return the normal form 'form' for the Unicode string unistr. Valid\n\
855values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
856
Martin v. Löwis677bde22002-11-23 22:08:15 +0000857static PyObject*
858unicodedata_normalize(PyObject *self, PyObject *args)
859{
860 char *form;
861 PyObject *input;
862
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000863 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000864 &form, &PyUnicode_Type, &input))
865 return NULL;
866
Martin v. Löwis22970662011-09-29 13:39:38 +0200867 if (PyUnicode_READY(input) == -1)
868 return NULL;
869
870 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000871 /* Special case empty input strings, since resizing
872 them later would cause internal errors. */
873 Py_INCREF(input);
874 return input;
875 }
876
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000877 if (strcmp(form, "NFC") == 0) {
878 if (is_normalized(self, input, 1, 0)) {
879 Py_INCREF(input);
880 return input;
881 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000882 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000883 }
884 if (strcmp(form, "NFKC") == 0) {
885 if (is_normalized(self, input, 1, 1)) {
886 Py_INCREF(input);
887 return input;
888 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000889 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000890 }
891 if (strcmp(form, "NFD") == 0) {
892 if (is_normalized(self, input, 0, 0)) {
893 Py_INCREF(input);
894 return input;
895 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000896 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000897 }
898 if (strcmp(form, "NFKD") == 0) {
899 if (is_normalized(self, input, 0, 1)) {
900 Py_INCREF(input);
901 return input;
902 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000903 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000904 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000905 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
906 return NULL;
907}
908
Fredrik Lundh06d12682001-01-24 07:59:11 +0000909/* -------------------------------------------------------------------- */
910/* unicode character name tables */
911
912/* data file generated by Tools/unicode/makeunicodedata.py */
913#include "unicodename_db.h"
914
915/* -------------------------------------------------------------------- */
916/* database code (cut and pasted from the unidb package) */
917
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and mixed in with multiplier `scale`; whenever
   bits 24..31 become non-zero they are folded back into the low byte and
   the value is masked to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; ++pos) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
932
/* Short names of the Hangul jamo used to build and parse syllable names.
   Column 0 is indexed by the leading consonant (L), column 1 by the vowel
   (V) and column 2 by the trailing consonant (T).  The columns have
   different lengths; rows past the end of a column hold NULL. */
static char *hangul_syllables[][3] = {
    /* L       V       T    */
    { "G",    "A",    ""   },
    { "GG",   "AE",   "G"  },
    { "N",    "YA",   "GG" },
    { "D",    "YAE",  "GS" },
    { "DD",   "EO",   "N"  },
    { "R",    "E",    "NJ" },
    { "M",    "YEO",  "NH" },
    { "B",    "YE",   "D"  },
    { "BB",   "O",    "L"  },
    { "S",    "WA",   "LG" },
    { "SS",   "WAE",  "LM" },
    { "",     "OE",   "LB" },
    { "J",    "YO",   "LS" },
    { "JJ",   "U",    "LT" },
    { "C",    "WEO",  "LP" },
    { "K",    "WE",   "LH" },
    { "T",    "WI",   "M"  },
    { "P",    "YU",   "B"  },
    { "H",    "EU",   "BS" },
    { NULL,   "YI",   "S"  },
    { NULL,   "I",    "SS" },
    { NULL,   NULL,   "NG" },
    { NULL,   NULL,   "J"  },
    { NULL,   NULL,   "C"  },
    { NULL,   NULL,   "K"  },
    { NULL,   NULL,   "T"  },
    { NULL,   NULL,   "P"  },
    { NULL,   NULL,   "H"  }
};
963
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000964/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000965static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000966is_unified_ideograph(Py_UCS4 code)
967{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000968 return
969 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500970 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000971 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
972 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
973 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000974}
975
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that expressions such as `x & mask`
 * keep their meaning; note that it is still evaluated twice, so it must
 * not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
981
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000982static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300983_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
984 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000985{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300986 /* Find the name associated with the given codepoint.
987 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
988 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000989 int offset;
990 int i;
991 int word;
992 unsigned char* w;
993
Martin v. Löwisc3509122006-03-11 12:16:23 +0000994 if (code >= 0x110000)
995 return 0;
996
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300997 /* XXX should we just skip all the codepoints in the PUAs here? */
998 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
999 return 0;
1000
Martin v. Löwis1a214512008-06-11 05:26:20 +00001001 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001002 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001003 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001004 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1005 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001006 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001007 if (old->category_changed == 0) {
1008 /* unassigned */
1009 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001010 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001011 }
1012
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001013 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014 /* Hangul syllable. */
1015 int SIndex = code - SBase;
1016 int L = SIndex / NCount;
1017 int V = (SIndex % NCount) / TCount;
1018 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001019
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001020 if (buflen < 27)
1021 /* Worst case: HANGUL SYLLABLE <10chars>. */
1022 return 0;
1023 strcpy(buffer, "HANGUL SYLLABLE ");
1024 buffer += 16;
1025 strcpy(buffer, hangul_syllables[L][0]);
1026 buffer += strlen(hangul_syllables[L][0]);
1027 strcpy(buffer, hangul_syllables[V][1]);
1028 buffer += strlen(hangul_syllables[V][1]);
1029 strcpy(buffer, hangul_syllables[T][2]);
1030 buffer += strlen(hangul_syllables[T][2]);
1031 *buffer = '\0';
1032 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001033 }
1034
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001035 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001036 if (buflen < 28)
1037 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1038 return 0;
1039 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1040 return 1;
1041 }
1042
Fredrik Lundh06d12682001-01-24 07:59:11 +00001043 /* get offset into phrasebook */
1044 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1045 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1046 (code&((1<<phrasebook_shift)-1))];
1047 if (!offset)
1048 return 0;
1049
1050 i = 0;
1051
1052 for (;;) {
1053 /* get word index */
1054 word = phrasebook[offset] - phrasebook_short;
1055 if (word >= 0) {
1056 word = (word << 8) + phrasebook[offset+1];
1057 offset += 2;
1058 } else
1059 word = phrasebook[offset++];
1060 if (i) {
1061 if (i > buflen)
1062 return 0; /* buffer overflow */
1063 buffer[i++] = ' ';
1064 }
1065 /* copy word string from lexicon. the last character in the
1066 word has bit 7 set. the last word in a string ends with
1067 0x80 */
1068 w = lexicon + lexicon_offset[word];
1069 while (*w < 128) {
1070 if (i >= buflen)
1071 return 0; /* buffer overflow */
1072 buffer[i++] = *w++;
1073 }
1074 if (i >= buflen)
1075 return 0; /* buffer overflow */
1076 buffer[i++] = *w & 127;
1077 if (*w == 128)
1078 break; /* end of word */
1079 }
1080
1081 return 1;
1082}
1083
1084static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001085_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001086{
1087 /* check if code corresponds to the given name */
1088 int i;
1089 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001090 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001091 return 0;
1092 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001093 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001094 return 0;
1095 }
1096 return buffer[namelen] == '\0';
1097}
1098
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001099static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001100find_syllable(const char *str, int *len, int *pos, int count, int column)
1101{
1102 int i, len1;
1103 *len = -1;
1104 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001105 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001106 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001107 if (len1 <= *len)
1108 continue;
1109 if (strncmp(str, s, len1) == 0) {
1110 *len = len1;
1111 *pos = i;
1112 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001113 }
1114 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001115 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001116 }
1117}
1118
Fredrik Lundh06d12682001-01-24 07:59:11 +00001119static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001120_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001121{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001122 /* check if named sequences are allowed */
1123 if (!with_named_seq && IS_NAMED_SEQ(cp))
1124 return 0;
1125 /* if the codepoint is in the PUA range that we use for aliases,
1126 * convert it to obtain the right codepoint */
1127 if (IS_ALIAS(cp))
1128 *code = name_aliases[cp-aliases_start];
1129 else
1130 *code = cp;
1131 return 1;
1132}
1133
1134static int
1135_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1136 int with_named_seq)
1137{
1138 /* Return the codepoint associated with the given name.
1139 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1140 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1141 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001142 unsigned int h, v;
1143 unsigned int mask = code_size-1;
1144 unsigned int i, incr;
1145
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001146 /* Check for hangul syllables. */
1147 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001148 int len, L = -1, V = -1, T = -1;
1149 const char *pos = name + 16;
1150 find_syllable(pos, &len, &L, LCount, 0);
1151 pos += len;
1152 find_syllable(pos, &len, &V, VCount, 1);
1153 pos += len;
1154 find_syllable(pos, &len, &T, TCount, 2);
1155 pos += len;
1156 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1157 *code = SBase + (L*VCount+V)*TCount + T;
1158 return 1;
1159 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001160 /* Otherwise, it's an illegal syllable name. */
1161 return 0;
1162 }
1163
1164 /* Check for unified ideographs. */
1165 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1166 /* Four or five hexdigits must follow. */
1167 v = 0;
1168 name += 22;
1169 namelen -= 22;
1170 if (namelen != 4 && namelen != 5)
1171 return 0;
1172 while (namelen--) {
1173 v *= 16;
1174 if (*name >= '0' && *name <= '9')
1175 v += *name - '0';
1176 else if (*name >= 'A' && *name <= 'F')
1177 v += *name - 'A' + 10;
1178 else
1179 return 0;
1180 name++;
1181 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001182 if (!is_unified_ideograph(v))
1183 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001184 *code = v;
1185 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001186 }
1187
Fredrik Lundh06d12682001-01-24 07:59:11 +00001188 /* the following is the same as python's dictionary lookup, with
1189 only minor changes. see the makeunicodedata script for more
1190 details */
1191
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001192 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001193 i = (~h) & mask;
1194 v = code_hash[i];
1195 if (!v)
1196 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001197 if (_cmpname(self, v, name, namelen))
1198 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001199 incr = (h ^ (h >> 3)) & mask;
1200 if (!incr)
1201 incr = mask;
1202 for (;;) {
1203 i = (i + incr) & mask;
1204 v = code_hash[i];
1205 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001206 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001207 if (_cmpname(self, v, name, namelen))
1208 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001209 incr = incr << 1;
1210 if (incr > mask)
1211 incr = incr ^ code_poly;
1212 }
1213}
1214
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001216{
1217 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001218 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001219 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001220};
1221
1222/* -------------------------------------------------------------------- */
1223/* Python bindings */
1224
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001225PyDoc_STRVAR(unicodedata_name__doc__,
1226"name(unichr[, default])\n\
1227Returns the name assigned to the Unicode character unichr as a\n\
1228string. If no name is defined, default is returned, or, if not\n\
1229given, ValueError is raised.");
1230
Fredrik Lundh06d12682001-01-24 07:59:11 +00001231static PyObject *
1232unicodedata_name(PyObject* self, PyObject* args)
1233{
1234 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001235 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001236
1237 PyUnicodeObject* v;
1238 PyObject* defobj = NULL;
1239 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1240 return NULL;
1241
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001242 c = getuchar(v);
1243 if (c == (Py_UCS4)-1)
1244 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001245
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001246 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001247 if (defobj == NULL) {
1248 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001249 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001250 }
1251 else {
1252 Py_INCREF(defobj);
1253 return defobj;
1254 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001255 }
1256
Walter Dörwald4254e762007-06-05 16:04:09 +00001257 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001258}
1259
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001260PyDoc_STRVAR(unicodedata_lookup__doc__,
1261"lookup(name)\n\
1262\n\
1263Look up character by name. If a character with the\n\
1264given name is found, return the corresponding Unicode\n\
1265character. If not found, KeyError is raised.");
1266
Fredrik Lundh06d12682001-01-24 07:59:11 +00001267static PyObject *
1268unicodedata_lookup(PyObject* self, PyObject* args)
1269{
1270 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001271
1272 char* name;
1273 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001274 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001275 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1276 return NULL;
1277
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001278 if (!_getcode(self, name, namelen, &code, 1)) {
1279 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001280 return NULL;
1281 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001282 /* check if code is in the PUA range that we use for named sequences
1283 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001284 if (IS_NAMED_SEQ(code)) {
1285 index = code-named_sequences_start;
1286 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1287 named_sequences[index].seq,
1288 named_sequences[index].seqlen);
1289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001291}
1292
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001293/* XXX Add doc strings. */
1294
1295static PyMethodDef unicodedata_functions[] = {
Larry Hastingsed4a1c52013-11-18 09:32:13 -08001296 UNICODEDATA_UCD_DECIMAL_METHODDEF
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001297 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1298 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1299 {"category", unicodedata_category, METH_VARARGS,
1300 unicodedata_category__doc__},
1301 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1302 unicodedata_bidirectional__doc__},
1303 {"combining", unicodedata_combining, METH_VARARGS,
1304 unicodedata_combining__doc__},
1305 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1306 unicodedata_mirrored__doc__},
1307 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1308 unicodedata_east_asian_width__doc__},
1309 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1310 unicodedata_decomposition__doc__},
1311 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1312 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1313 {"normalize", unicodedata_normalize, METH_VARARGS,
1314 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001315 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001316};
1317
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001318static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 /* The ob_type field must be initialized in the module init function
1320 * to be portable to Windows without using C++. */
1321 PyVarObject_HEAD_INIT(NULL, 0)
1322 "unicodedata.UCD", /*tp_name*/
1323 sizeof(PreviousDBVersion), /*tp_basicsize*/
1324 0, /*tp_itemsize*/
1325 /* methods */
1326 (destructor)PyObject_Del, /*tp_dealloc*/
1327 0, /*tp_print*/
1328 0, /*tp_getattr*/
1329 0, /*tp_setattr*/
1330 0, /*tp_reserved*/
1331 0, /*tp_repr*/
1332 0, /*tp_as_number*/
1333 0, /*tp_as_sequence*/
1334 0, /*tp_as_mapping*/
1335 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001336 0, /*tp_call*/
1337 0, /*tp_str*/
1338 PyObject_GenericGetAttr,/*tp_getattro*/
1339 0, /*tp_setattro*/
1340 0, /*tp_as_buffer*/
1341 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1342 0, /*tp_doc*/
1343 0, /*tp_traverse*/
1344 0, /*tp_clear*/
1345 0, /*tp_richcompare*/
1346 0, /*tp_weaklistoffset*/
1347 0, /*tp_iter*/
1348 0, /*tp_iternext*/
1349 unicodedata_functions, /*tp_methods*/
1350 DB_members, /*tp_members*/
1351 0, /*tp_getset*/
1352 0, /*tp_base*/
1353 0, /*tp_dict*/
1354 0, /*tp_descr_get*/
1355 0, /*tp_descr_set*/
1356 0, /*tp_dictoffset*/
1357 0, /*tp_init*/
1358 0, /*tp_alloc*/
1359 0, /*tp_new*/
1360 0, /*tp_free*/
1361 0, /*tp_is_gc*/
1362};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001363
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001364PyDoc_STRVAR(unicodedata_docstring,
1365"This module provides access to the Unicode Character Database which\n\
1366defines character properties for all Unicode characters. The data in\n\
1367this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001368" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001369\n\
1370The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001371UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001372
1373static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 PyModuleDef_HEAD_INIT,
1375 "unicodedata",
1376 unicodedata_docstring,
1377 -1,
1378 unicodedata_functions,
1379 NULL,
1380 NULL,
1381 NULL,
1382 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001383};
1384
Mark Hammond62b1ab12002-07-23 06:31:15 +00001385PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001386PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001387{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001388 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001389
Christian Heimes90aa7642007-12-19 02:45:37 +00001390 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001391
Martin v. Löwis1a214512008-06-11 05:26:20 +00001392 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001393 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001394 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001395
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001396 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001397 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001398 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001399
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001400 /* Previous versions */
1401 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1402 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001403 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001404
Fredrik Lundh06d12682001-01-24 07:59:11 +00001405 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001406 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001407 if (v != NULL)
1408 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001409 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001410}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001411
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001413Local variables:
1414c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001415indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001416End:
1417*/