blob: ca1620feabe91b4237e82239c88778aa846ff3e7 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
[clinic start generated code]*/
/*[clinic end generated code: checksum=da39a3ee5e6b4b0d3255bfef95601890afd80709]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080025
/* character properties */

/* One record of per-character properties.  _getrecord_ex() returns a
   pointer to one of these out of _PyUnicode_Database_Records.  Every
   field is a single byte; the string-valued properties are stored as
   indexes into the corresponding name tables. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
39
/* Per-character differences between the current database and a previous
   Unicode version.  The readers in this file treat the byte fields as
   follows:
     - 0xFF in bidir_changed, category_changed, decimal_changed or
       mirrored_changed means "no change; use the current value";
     - category_changed == 0 means the character was unassigned in the
       previous version.
   Any other value is the property value in the previous version. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
48
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049/* data file generated by Tools/unicode/makeunicodedata.py */
50#include "unicodedata_db.h"
51
52static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000053_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000054{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000055 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000056 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000057 index = 0;
58 else {
59 index = index1[(code>>SHIFT)];
60 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
61 }
62
63 return &_PyUnicode_Database_Records[index];
64}
65
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000066/* ------------- Previous-version API ------------------------------------- */
67typedef struct previous_version {
68 PyObject_HEAD
69 const char *name;
70 const change_record* (*getrecord)(Py_UCS4);
71 Py_UCS4 (*normalization)(Py_UCS4);
72} PreviousDBVersion;
73
/* Look up the change_record for code point `v` through the getrecord
   callback of `self`, which must point to a PreviousDBVersion.  The
   argument is parenthesized so that expressions can be passed safely. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
75
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000076static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078 {NULL}
79};
80
Thomas Wouters89f507f2006-12-13 04:49:30 +000081/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000082static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000083#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000084
85static PyObject*
86new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
87 Py_UCS4 (*normalization)(Py_UCS4))
88{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 PreviousDBVersion *self;
90 self = PyObject_New(PreviousDBVersion, &UCD_Type);
91 if (self == NULL)
92 return NULL;
93 self->name = name;
94 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000095 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000096 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000097}
98
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000099
100static Py_UCS4 getuchar(PyUnicodeObject *obj)
101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200102 if (PyUnicode_READY(obj))
103 return (Py_UCS4)-1;
104 if (PyUnicode_GET_LENGTH(obj) == 1) {
105 if (PyUnicode_READY(obj))
106 return (Py_UCS4)-1;
107 return PyUnicode_READ_CHAR(obj, 0);
108 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000109 PyErr_SetString(PyExc_TypeError,
110 "need a single Unicode character as parameter");
111 return (Py_UCS4)-1;
112}
113
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000114/* --- Module API --------------------------------------------------------- */
115
Larry Hastings61272b72014-01-07 12:41:53 -0800116/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -0800117
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800118unicodedata.UCD.decimal
Larry Hastings31826802013-10-19 00:09:25 -0700119
Larry Hastings77561cc2014-01-07 12:13:13 -0800120 unichr: object(type='PyUnicodeObject *', subclass_of='&PyUnicode_Type')
Larry Hastings31826802013-10-19 00:09:25 -0700121 default: object=NULL
122 /
123
124Converts a Unicode character into its equivalent decimal value.
125
126Returns the decimal value assigned to the Unicode character unichr
127as integer. If no such value is defined, default is returned, or, if
128not given, ValueError is raised.
Larry Hastings61272b72014-01-07 12:41:53 -0800129[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -0700130
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800131PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
Larry Hastings5c661892014-01-24 06:17:25 -0800132"decimal(self, unichr, default=None)\n"
Larry Hastings31826802013-10-19 00:09:25 -0700133"Converts a Unicode character into its equivalent decimal value.\n"
134"\n"
Larry Hastings31826802013-10-19 00:09:25 -0700135"Returns the decimal value assigned to the Unicode character unichr\n"
136"as integer. If no such value is defined, default is returned, or, if\n"
137"not given, ValueError is raised.");
138
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800139#define UNICODEDATA_UCD_DECIMAL_METHODDEF \
140 {"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__},
Larry Hastings31826802013-10-19 00:09:25 -0700141
142static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800143unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value);
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000144
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000145static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800146unicodedata_UCD_decimal(PreviousDBVersion *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000147{
Larry Hastings31826802013-10-19 00:09:25 -0700148 PyObject *return_value = NULL;
Larry Hastings77561cc2014-01-07 12:13:13 -0800149 PyUnicodeObject *unichr;
Larry Hastings31826802013-10-19 00:09:25 -0700150 PyObject *default_value = NULL;
151
152 if (!PyArg_ParseTuple(args,
153 "O!|O:decimal",
154 &PyUnicode_Type, &unichr, &default_value))
155 goto exit;
Larry Hastingsed4a1c52013-11-18 09:32:13 -0800156 return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value);
Larry Hastings31826802013-10-19 00:09:25 -0700157
158exit:
159 return return_value;
160}
161
162static PyObject *
Larry Hastingsc2047262014-01-25 20:43:29 -0800163unicodedata_UCD_decimal_impl(PreviousDBVersion *self, PyUnicodeObject *unichr, PyObject *default_value)
164/*[clinic end generated code: checksum=e1371a1a016e19fdd3cd2c1af1d1832df095f50b]*/
Larry Hastings31826802013-10-19 00:09:25 -0700165{
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000166 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000167 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000168 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169
Larry Hastingsc2047262014-01-25 20:43:29 -0800170 c = getuchar(unichr);
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000171 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000172 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000173
Martin v. Löwis1a214512008-06-11 05:26:20 +0000174 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000175 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000176 if (old->category_changed == 0) {
177 /* unassigned */
178 have_old = 1;
179 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000180 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000181 else if (old->decimal_changed != 0xFF) {
182 have_old = 1;
183 rc = old->decimal_changed;
184 }
185 }
186
187 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000188 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000189 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700190 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191 PyErr_SetString(PyExc_ValueError,
192 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000193 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
195 else {
Larry Hastings31826802013-10-19 00:09:25 -0700196 Py_INCREF(default_value);
197 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000199 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000200 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201}
202
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000203PyDoc_STRVAR(unicodedata_digit__doc__,
204"digit(unichr[, default])\n\
205\n\
206Returns the digit value assigned to the Unicode character unichr as\n\
207integer. If no such value is defined, default is returned, or, if\n\
208not given, ValueError is raised.");
209
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000212{
213 PyUnicodeObject *v;
214 PyObject *defobj = NULL;
215 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000216 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000217
Fredrik Lundh06d12682001-01-24 07:59:11 +0000218 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000219 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000220 c = getuchar(v);
221 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000222 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000223 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000224 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000225 if (defobj == NULL) {
226 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000227 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000228 }
229 else {
230 Py_INCREF(defobj);
231 return defobj;
232 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000233 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000234 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000235}
236
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000237PyDoc_STRVAR(unicodedata_numeric__doc__,
238"numeric(unichr[, default])\n\
239\n\
240Returns the numeric value assigned to the Unicode character unichr\n\
241as float. If no such value is defined, default is returned, or, if\n\
242not given, ValueError is raised.");
243
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000244static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000245unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000246{
247 PyUnicodeObject *v;
248 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000249 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000251 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252
Fredrik Lundh06d12682001-01-24 07:59:11 +0000253 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000254 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000255 c = getuchar(v);
256 if (c == (Py_UCS4)-1)
257 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000258
Martin v. Löwis1a214512008-06-11 05:26:20 +0000259 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000260 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000261 if (old->category_changed == 0) {
262 /* unassigned */
263 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000264 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 else if (old->decimal_changed != 0xFF) {
267 have_old = 1;
268 rc = old->decimal_changed;
269 }
270 }
271
272 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000273 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000274 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 if (defobj == NULL) {
276 PyErr_SetString(PyExc_ValueError, "not a numeric character");
277 return NULL;
278 }
279 else {
280 Py_INCREF(defobj);
281 return defobj;
282 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000283 }
284 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285}
286
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000287PyDoc_STRVAR(unicodedata_category__doc__,
288"category(unichr)\n\
289\n\
290Returns the general category assigned to the Unicode character\n\
291unichr as string.");
292
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000293static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000294unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000295{
296 PyUnicodeObject *v;
297 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000298 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000299
300 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000301 &PyUnicode_Type, &v))
302 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000303 c = getuchar(v);
304 if (c == (Py_UCS4)-1)
305 return NULL;
306 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000307 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000308 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000309 if (old->category_changed != 0xFF)
310 index = old->category_changed;
311 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000312 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313}
314
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000315PyDoc_STRVAR(unicodedata_bidirectional__doc__,
316"bidirectional(unichr)\n\
317\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200318Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000319unichr as string. If no such value is defined, an empty string is\n\
320returned.");
321
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000322static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000323unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000324{
325 PyUnicodeObject *v;
326 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000327 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000328
329 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000330 &PyUnicode_Type, &v))
331 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000332 c = getuchar(v);
333 if (c == (Py_UCS4)-1)
334 return NULL;
335 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000336 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000337 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000338 if (old->category_changed == 0)
339 index = 0; /* unassigned */
340 else if (old->bidir_changed != 0xFF)
341 index = old->bidir_changed;
342 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000343 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000344}
345
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000346PyDoc_STRVAR(unicodedata_combining__doc__,
347"combining(unichr)\n\
348\n\
349Returns the canonical combining class assigned to the Unicode\n\
350character unichr as integer. Returns 0 if no combining class is\n\
351defined.");
352
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000353static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000354unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000355{
356 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000357 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000358 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000359
360 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000361 &PyUnicode_Type, &v))
362 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000363 c = getuchar(v);
364 if (c == (Py_UCS4)-1)
365 return NULL;
366 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000367 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000368 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000369 if (old->category_changed == 0)
370 index = 0; /* unassigned */
371 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000372 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000373}
374
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000375PyDoc_STRVAR(unicodedata_mirrored__doc__,
376"mirrored(unichr)\n\
377\n\
378Returns the mirrored property assigned to the Unicode character\n\
379unichr as integer. Returns 1 if the character has been identified as\n\
380a \"mirrored\" character in bidirectional text, 0 otherwise.");
381
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000382static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000383unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000384{
385 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000386 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000387 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000388
389 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000390 &PyUnicode_Type, &v))
391 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000392 c = getuchar(v);
393 if (c == (Py_UCS4)-1)
394 return NULL;
395 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000396 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000397 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000398 if (old->category_changed == 0)
399 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000400 else if (old->mirrored_changed != 0xFF)
401 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000402 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000403 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000404}
405
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000406PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
407"east_asian_width(unichr)\n\
408\n\
409Returns the east asian width assigned to the Unicode character\n\
410unichr as string.");
411
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000412static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000413unicodedata_east_asian_width(PyObject *self, PyObject *args)
414{
415 PyUnicodeObject *v;
416 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000417 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000418
419 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000420 &PyUnicode_Type, &v))
421 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000422 c = getuchar(v);
423 if (c == (Py_UCS4)-1)
424 return NULL;
425 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000426 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000427 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000428 if (old->category_changed == 0)
429 index = 0; /* unassigned */
430 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000431 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000432}
433
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000434PyDoc_STRVAR(unicodedata_decomposition__doc__,
435"decomposition(unichr)\n\
436\n\
437Returns the character decomposition mapping assigned to the Unicode\n\
438character unichr as string. An empty string is returned in case no\n\
439such mapping is defined.");
440
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000441static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000442unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000443{
444 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000445 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000446 int code, index, count;
447 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000448 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000449 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000450
451 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000452 &PyUnicode_Type, &v))
453 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000454 c = getuchar(v);
455 if (c == (Py_UCS4)-1)
456 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000457
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000458 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000459
Martin v. Löwis1a214512008-06-11 05:26:20 +0000460 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000461 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000462 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000463 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000464 }
465
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000466 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000467 index = 0;
468 else {
469 index = decomp_index1[(code>>DECOMP_SHIFT)];
470 index = decomp_index2[(index<<DECOMP_SHIFT)+
471 (code&((1<<DECOMP_SHIFT)-1))];
472 }
473
Tim Peters69b83b12001-11-30 07:23:05 +0000474 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000475 is prefix code (from*/
476 count = decomp_data[index] >> 8;
477
478 /* XXX: could allocate the PyString up front instead
479 (strlen(prefix) + 5 * count + 1 bytes) */
480
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000481 /* Based on how index is calculated above and decomp_data is generated
482 from Tools/unicode/makeunicodedata.py, it should not be possible
483 to overflow decomp_prefix. */
484 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200485 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000486
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000487 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000488 i = strlen(decomp_prefix[prefix_index]);
489 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000490
491 while (count-- > 0) {
492 if (i)
493 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000494 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000495 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
496 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000497 i += strlen(decomp + i);
498 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000499 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000500}
501
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000503get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000504{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000505 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000507 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000508 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000509 /* unassigned in old version */
510 *index = 0;
511 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000512 else {
513 *index = decomp_index1[(code>>DECOMP_SHIFT)];
514 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
515 (code&((1<<DECOMP_SHIFT)-1))];
516 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000517
Martin v. Löwis677bde22002-11-23 22:08:15 +0000518 /* high byte is number of hex bytes (usually one or two), low byte
519 is prefix code (from*/
520 *count = decomp_data[*index] >> 8;
521 *prefix = decomp_data[*index] & 255;
522
523 (*index)++;
524}
525
/* Constants for the arithmetic Hangul syllable decomposition used by
   nfd_nfkd(): a syllable at SBase + SIndex splits into a leading
   consonant (LBase + ...), a vowel (VBase + ...) and an optional
   trailing consonant (TBase + ...; TBase itself means "none"). */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
535
536static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000537nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000538{
539 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200540 Py_UCS4 *output;
541 Py_ssize_t i, o, osize;
542 int kind;
543 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000544 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200545 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000546 Py_ssize_t space, isize;
547 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000548 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000549
Martin v. Löwis677bde22002-11-23 22:08:15 +0000550 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200551 isize = PyUnicode_GET_LENGTH(input);
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300552 /* Overallocate at most 10 characters. */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000553 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200554 osize = space;
555 output = PyMem_Malloc(space * sizeof(Py_UCS4));
556 if (!output) {
557 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000558 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200559 }
560 i = o = 0;
561 kind = PyUnicode_KIND(input);
562 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000563
Martin v. Löwis22970662011-09-29 13:39:38 +0200564 while (i < isize) {
565 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000566 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200567 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000568 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300569 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000570 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000571 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200572 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000573 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000574 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
575 if (new_output == NULL) {
576 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200577 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000578 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200579 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000580 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000581 }
582 /* Hangul Decomposition. */
583 if (SBase <= code && code < (SBase+SCount)) {
584 int SIndex = code - SBase;
585 int L = LBase + SIndex / NCount;
586 int V = VBase + (SIndex % NCount) / TCount;
587 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200588 output[o++] = L;
589 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000590 space -= 2;
591 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200592 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000593 space --;
594 }
595 continue;
596 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000597 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000598 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000599 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
600 if (value != 0) {
601 stack[stackptr++] = value;
602 continue;
603 }
604 }
605
606 /* Other decompositions. */
607 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608
609 /* Copy character if it is not decomposable, or has a
610 compatibility decomposition, but we do NFD. */
611 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200612 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000613 space--;
614 continue;
615 }
616 /* Copy decomposition onto the stack, in reverse
617 order. */
618 while(count) {
619 code = decomp_data[index + (--count)];
620 stack[stackptr++] = code;
621 }
622 }
623 }
624
Martin v. Löwis22970662011-09-29 13:39:38 +0200625 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
626 output, o);
627 PyMem_Free(output);
628 if (!result)
629 return NULL;
630 /* result is guaranteed to be ready, as it is compact. */
631 kind = PyUnicode_KIND(result);
632 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000633
634 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200635 i = 0;
636 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
637 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
638 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000639 if (prev == 0 || cur == 0 || prev <= cur) {
640 prev = cur;
641 continue;
642 }
643 /* Non-canonical order. Need to switch *i with previous. */
644 o = i - 1;
645 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200646 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
647 PyUnicode_WRITE(kind, data, o+1,
648 PyUnicode_READ(kind, data, o));
649 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000650 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200651 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000652 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200653 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 if (prev == 0 || prev <= cur)
655 break;
656 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200657 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000658 }
659 return result;
660}
661
662static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200663find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000664{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200665 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000666 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200667 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 if (code < start)
669 return -1;
670 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200671 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000672 return nfc[index].index + delta;
673 }
674 }
675 return -1;
676}
677
678static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000679nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000680{
681 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200682 int kind;
683 void *data;
684 Py_UCS4 *output;
685 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000686 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200687 Py_UCS4 code;
688 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000689 int cskipped = 0;
690
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000691 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 if (!result)
693 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200694 /* result will be "ready". */
695 kind = PyUnicode_KIND(result);
696 data = PyUnicode_DATA(result);
697 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000698
Martin v. Löwis22970662011-09-29 13:39:38 +0200699 /* We allocate a buffer for the output.
700 If we find that we made no changes, we still return
701 the NFD result. */
702 output = PyMem_Malloc(len * sizeof(Py_UCS4));
703 if (!output) {
704 PyErr_NoMemory();
705 Py_DECREF(result);
706 return 0;
707 }
708 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709
Martin v. Löwis677bde22002-11-23 22:08:15 +0000710 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200711 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000712 for (index = 0; index < cskipped; index++) {
713 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000714 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000715 Remove from list. */
716 skipped[index] = skipped[cskipped-1];
717 cskipped--;
718 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000719 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000720 }
721 }
722 /* Hangul Composition. We don't need to check for <LV,T>
723 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200724 code = PyUnicode_READ(kind, data, i);
725 if (LBase <= code && code < (LBase+LCount) &&
726 i + 1 < len &&
727 VBase <= PyUnicode_READ(kind, data, i+1) &&
728 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000729 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200730 LIndex = code - LBase;
731 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000732 code = SBase + (LIndex*VCount+VIndex)*TCount;
733 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200734 if (i < len &&
735 TBase <= PyUnicode_READ(kind, data, i) &&
736 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
737 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000738 i++;
739 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200740 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000741 continue;
742 }
743
Martin v. Löwis22970662011-09-29 13:39:38 +0200744 /* code is still input[i] here */
745 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000746 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200747 output[o++] = code;
748 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000749 continue;
750 }
751 /* Find next unblocked character. */
752 i1 = i+1;
753 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200754 /* output base character for now; might be updated later. */
755 output[o] = PyUnicode_READ(kind, data, i);
756 while (i1 < len) {
757 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
758 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000759 if (comb) {
760 if (comb1 == 0)
761 break;
762 if (comb >= comb1) {
763 /* Character is blocked. */
764 i1++;
765 continue;
766 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000767 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200768 l = find_nfc_index(self, nfc_last, code1);
769 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000770 is a starter, we don't need to look further.
771 Otherwise, record the combining class. */
772 if (l == -1) {
773 not_combinable:
774 if (comb1 == 0)
775 break;
776 comb = comb1;
777 i1++;
778 continue;
779 }
780 index = f*TOTAL_LAST + l;
781 index1 = comp_index[index >> COMP_SHIFT];
782 code = comp_data[(index1<<COMP_SHIFT)+
783 (index&((1<<COMP_SHIFT)-1))];
784 if (code == 0)
785 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786
Martin v. Löwis677bde22002-11-23 22:08:15 +0000787 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200788 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000789 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000790 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000791 skipped[cskipped++] = i1;
792 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200793 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000794 if (f == -1)
795 break;
796 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200797 /* Output character was already written.
798 Just advance the indices. */
799 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000800 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200801 if (o == len) {
802 /* No changes. Return original string. */
803 PyMem_Free(output);
804 return result;
805 }
806 Py_DECREF(result);
807 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
808 output, o);
809 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000810 return result;
811}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000812
813/* Return 1 if the input is certainly normalized, 0 if it might not be. */
814static int
815is_normalized(PyObject *self, PyObject *input, int nfc, int k)
816{
Martin v. Löwis22970662011-09-29 13:39:38 +0200817 Py_ssize_t i, len;
818 int kind;
819 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000820 unsigned char prev_combining = 0, quickcheck_mask;
821
822 /* An older version of the database is requested, quickchecks must be
823 disabled. */
824 if (self && UCD_Check(self))
825 return 0;
826
827 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
828 as described in http://unicode.org/reports/tr15/#Annex8. */
829 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
830
Martin v. Löwis22970662011-09-29 13:39:38 +0200831 i = 0;
832 kind = PyUnicode_KIND(input);
833 data = PyUnicode_DATA(input);
834 len = PyUnicode_GET_LENGTH(input);
835 while (i < len) {
836 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
837 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000838 unsigned char combining = record->combining;
839 unsigned char quickcheck = record->normalization_quick_check;
840
841 if (quickcheck & quickcheck_mask)
842 return 0; /* this string might need normalization */
843 if (combining && prev_combining > combining)
844 return 0; /* non-canonical sort order, not normalized */
845 prev_combining = combining;
846 }
847 return 1; /* certainly normalized */
848}
849
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000850PyDoc_STRVAR(unicodedata_normalize__doc__,
851"normalize(form, unistr)\n\
852\n\
853Return the normal form 'form' for the Unicode string unistr. Valid\n\
854values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
855
Martin v. Löwis677bde22002-11-23 22:08:15 +0000856static PyObject*
857unicodedata_normalize(PyObject *self, PyObject *args)
858{
859 char *form;
860 PyObject *input;
861
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000862 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000863 &form, &PyUnicode_Type, &input))
864 return NULL;
865
Martin v. Löwis22970662011-09-29 13:39:38 +0200866 if (PyUnicode_READY(input) == -1)
867 return NULL;
868
869 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000870 /* Special case empty input strings, since resizing
871 them later would cause internal errors. */
872 Py_INCREF(input);
873 return input;
874 }
875
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000876 if (strcmp(form, "NFC") == 0) {
877 if (is_normalized(self, input, 1, 0)) {
878 Py_INCREF(input);
879 return input;
880 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000881 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000882 }
883 if (strcmp(form, "NFKC") == 0) {
884 if (is_normalized(self, input, 1, 1)) {
885 Py_INCREF(input);
886 return input;
887 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000888 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000889 }
890 if (strcmp(form, "NFD") == 0) {
891 if (is_normalized(self, input, 0, 0)) {
892 Py_INCREF(input);
893 return input;
894 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000895 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000896 }
897 if (strcmp(form, "NFKD") == 0) {
898 if (is_normalized(self, input, 0, 1)) {
899 Py_INCREF(input);
900 return input;
901 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000902 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000903 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000904 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
905 return NULL;
906}
907
Fredrik Lundh06d12682001-01-24 07:59:11 +0000908/* -------------------------------------------------------------------- */
909/* unicode character name tables */
910
911/* data file generated by Tools/unicode/makeunicodedata.py */
912#include "unicodename_db.h"
913
914/* -------------------------------------------------------------------- */
915/* database code (cut and pasted from the unidb package) */
916
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased before being mixed in; scale is the multiplier
   applied to the running value.  Whenever bits 24..31 of the running value
   become non-zero they are folded back into the low byte and the value is
   truncated to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long top;

        hash *= scale;
        hash += (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top != 0) {
            hash ^= (top >> 24) & 0xff;
            hash &= 0x00ffffff;
        }
        pos++;
    }
    return hash;
}
931
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.

   Row i holds, per column:
     [0] the fragment for leading consonant index i,
     [1] the fragment for vowel index i,
     [2] the fragment for trailing consonant index i (index 0 is the empty
         string: no trailing consonant).
   The columns have different lengths; cells past the end of a column are
   NULL and must not be dereferenced. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
962
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000963/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000964static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000965is_unified_ideograph(Py_UCS4 code)
966{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000967 return
968 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500969 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000970 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
971 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
972 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000973}
974
/* Macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * The argument is parenthesized so that an expression (e.g. `a & b`) is
 * compared as a whole.  It is still evaluated twice, so it must not have
 * side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
980
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000981static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300982_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
983 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000984{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300985 /* Find the name associated with the given codepoint.
986 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
987 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000988 int offset;
989 int i;
990 int word;
991 unsigned char* w;
992
Martin v. Löwisc3509122006-03-11 12:16:23 +0000993 if (code >= 0x110000)
994 return 0;
995
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300996 /* XXX should we just skip all the codepoints in the PUAs here? */
997 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
998 return 0;
999
Martin v. Löwis1a214512008-06-11 05:26:20 +00001000 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001001 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +03001002 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001003 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1004 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001005 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001006 if (old->category_changed == 0) {
1007 /* unassigned */
1008 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001009 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001010 }
1011
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001012 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001013 /* Hangul syllable. */
1014 int SIndex = code - SBase;
1015 int L = SIndex / NCount;
1016 int V = (SIndex % NCount) / TCount;
1017 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001018
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001019 if (buflen < 27)
1020 /* Worst case: HANGUL SYLLABLE <10chars>. */
1021 return 0;
1022 strcpy(buffer, "HANGUL SYLLABLE ");
1023 buffer += 16;
1024 strcpy(buffer, hangul_syllables[L][0]);
1025 buffer += strlen(hangul_syllables[L][0]);
1026 strcpy(buffer, hangul_syllables[V][1]);
1027 buffer += strlen(hangul_syllables[V][1]);
1028 strcpy(buffer, hangul_syllables[T][2]);
1029 buffer += strlen(hangul_syllables[T][2]);
1030 *buffer = '\0';
1031 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001032 }
1033
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001034 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001035 if (buflen < 28)
1036 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1037 return 0;
1038 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1039 return 1;
1040 }
1041
Fredrik Lundh06d12682001-01-24 07:59:11 +00001042 /* get offset into phrasebook */
1043 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1044 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1045 (code&((1<<phrasebook_shift)-1))];
1046 if (!offset)
1047 return 0;
1048
1049 i = 0;
1050
1051 for (;;) {
1052 /* get word index */
1053 word = phrasebook[offset] - phrasebook_short;
1054 if (word >= 0) {
1055 word = (word << 8) + phrasebook[offset+1];
1056 offset += 2;
1057 } else
1058 word = phrasebook[offset++];
1059 if (i) {
1060 if (i > buflen)
1061 return 0; /* buffer overflow */
1062 buffer[i++] = ' ';
1063 }
1064 /* copy word string from lexicon. the last character in the
1065 word has bit 7 set. the last word in a string ends with
1066 0x80 */
1067 w = lexicon + lexicon_offset[word];
1068 while (*w < 128) {
1069 if (i >= buflen)
1070 return 0; /* buffer overflow */
1071 buffer[i++] = *w++;
1072 }
1073 if (i >= buflen)
1074 return 0; /* buffer overflow */
1075 buffer[i++] = *w & 127;
1076 if (*w == 128)
1077 break; /* end of word */
1078 }
1079
1080 return 1;
1081}
1082
1083static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001084_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001085{
1086 /* check if code corresponds to the given name */
1087 int i;
1088 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001089 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001090 return 0;
1091 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001092 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001093 return 0;
1094 }
1095 return buffer[namelen] == '\0';
1096}
1097
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001099find_syllable(const char *str, int *len, int *pos, int count, int column)
1100{
1101 int i, len1;
1102 *len = -1;
1103 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001104 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001105 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001106 if (len1 <= *len)
1107 continue;
1108 if (strncmp(str, s, len1) == 0) {
1109 *len = len1;
1110 *pos = i;
1111 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001112 }
1113 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001114 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001115 }
1116}
1117
Fredrik Lundh06d12682001-01-24 07:59:11 +00001118static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001119_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001120{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001121 /* check if named sequences are allowed */
1122 if (!with_named_seq && IS_NAMED_SEQ(cp))
1123 return 0;
1124 /* if the codepoint is in the PUA range that we use for aliases,
1125 * convert it to obtain the right codepoint */
1126 if (IS_ALIAS(cp))
1127 *code = name_aliases[cp-aliases_start];
1128 else
1129 *code = cp;
1130 return 1;
1131}
1132
1133static int
1134_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1135 int with_named_seq)
1136{
1137 /* Return the codepoint associated with the given name.
1138 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1139 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1140 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001141 unsigned int h, v;
1142 unsigned int mask = code_size-1;
1143 unsigned int i, incr;
1144
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001145 /* Check for hangul syllables. */
1146 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 int len, L = -1, V = -1, T = -1;
1148 const char *pos = name + 16;
1149 find_syllable(pos, &len, &L, LCount, 0);
1150 pos += len;
1151 find_syllable(pos, &len, &V, VCount, 1);
1152 pos += len;
1153 find_syllable(pos, &len, &T, TCount, 2);
1154 pos += len;
1155 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1156 *code = SBase + (L*VCount+V)*TCount + T;
1157 return 1;
1158 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001159 /* Otherwise, it's an illegal syllable name. */
1160 return 0;
1161 }
1162
1163 /* Check for unified ideographs. */
1164 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1165 /* Four or five hexdigits must follow. */
1166 v = 0;
1167 name += 22;
1168 namelen -= 22;
1169 if (namelen != 4 && namelen != 5)
1170 return 0;
1171 while (namelen--) {
1172 v *= 16;
1173 if (*name >= '0' && *name <= '9')
1174 v += *name - '0';
1175 else if (*name >= 'A' && *name <= 'F')
1176 v += *name - 'A' + 10;
1177 else
1178 return 0;
1179 name++;
1180 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001181 if (!is_unified_ideograph(v))
1182 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001183 *code = v;
1184 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001185 }
1186
Fredrik Lundh06d12682001-01-24 07:59:11 +00001187 /* the following is the same as python's dictionary lookup, with
1188 only minor changes. see the makeunicodedata script for more
1189 details */
1190
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001191 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001192 i = (~h) & mask;
1193 v = code_hash[i];
1194 if (!v)
1195 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001196 if (_cmpname(self, v, name, namelen))
1197 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001198 incr = (h ^ (h >> 3)) & mask;
1199 if (!incr)
1200 incr = mask;
1201 for (;;) {
1202 i = (i + incr) & mask;
1203 v = code_hash[i];
1204 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001205 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001206 if (_cmpname(self, v, name, namelen))
1207 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001208 incr = incr << 1;
1209 if (incr > mask)
1210 incr = incr ^ code_poly;
1211 }
1212}
1213
/* Table of the C-level name functions.  The module init function wraps its
   address in a capsule and publishes it as "ucnhash_CAPI".  The first
   member records the structure size; the other two are the
   code-point-to-name and name-to-code-point functions. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1220
1221/* -------------------------------------------------------------------- */
1222/* Python bindings */
1223
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001224PyDoc_STRVAR(unicodedata_name__doc__,
1225"name(unichr[, default])\n\
1226Returns the name assigned to the Unicode character unichr as a\n\
1227string. If no name is defined, default is returned, or, if not\n\
1228given, ValueError is raised.");
1229
Fredrik Lundh06d12682001-01-24 07:59:11 +00001230static PyObject *
1231unicodedata_name(PyObject* self, PyObject* args)
1232{
1233 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001234 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001235
1236 PyUnicodeObject* v;
1237 PyObject* defobj = NULL;
1238 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1239 return NULL;
1240
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001241 c = getuchar(v);
1242 if (c == (Py_UCS4)-1)
1243 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001244
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001245 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001246 if (defobj == NULL) {
1247 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001248 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249 }
1250 else {
1251 Py_INCREF(defobj);
1252 return defobj;
1253 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001254 }
1255
Walter Dörwald4254e762007-06-05 16:04:09 +00001256 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001257}
1258
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001259PyDoc_STRVAR(unicodedata_lookup__doc__,
1260"lookup(name)\n\
1261\n\
1262Look up character by name. If a character with the\n\
1263given name is found, return the corresponding Unicode\n\
1264character. If not found, KeyError is raised.");
1265
Fredrik Lundh06d12682001-01-24 07:59:11 +00001266static PyObject *
1267unicodedata_lookup(PyObject* self, PyObject* args)
1268{
1269 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001270
1271 char* name;
1272 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001273 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001274 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1275 return NULL;
1276
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001277 if (!_getcode(self, name, namelen, &code, 1)) {
1278 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001279 return NULL;
1280 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001281 /* check if code is in the PUA range that we use for named sequences
1282 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001283 if (IS_NAMED_SEQ(code)) {
1284 index = code-named_sequences_start;
1285 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1286 named_sequences[index].seq,
1287 named_sequences[index].seqlen);
1288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001290}
1291
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001292/* XXX Add doc strings. */
1293
/* Method table.  It is used twice: as the function list of the module
   definition (unicodedatamodule) and as tp_methods of UCD_Type, so the same
   C functions serve both as module functions and as methods of UCD
   instances.  The entry for "decimal" comes from the
   UNICODEDATA_UCD_DECIMAL_METHODDEF macro, which is defined elsewhere. */
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
1316
/* Type object for "unicodedata.UCD".  Instances are PreviousDBVersion
   structures (see tp_basicsize); the module init function stores one such
   instance as "ucd_3_2_0".  The type reuses the module's method table, so
   every module-level function is also available as a method, and exposes
   the members listed in DB_members.  Most other slots are left empty. */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_reserved*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001362
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001363PyDoc_STRVAR(unicodedata_docstring,
1364"This module provides access to the Unicode Character Database which\n\
1365defines character properties for all Unicode characters. The data in\n\
1366this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001367" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001368\n\
1369The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001370UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001371
/* Module definition passed to PyModule_Create() in PyInit_unicodedata. */
static struct PyModuleDef unicodedatamodule = {
        PyModuleDef_HEAD_INIT,
        "unicodedata",          /* module name */
        unicodedata_docstring,  /* module docstring */
        -1,                     /* module state size */
        unicodedata_functions,  /* module-level functions */
        /* the remaining optional members are unused */
        NULL,
        NULL,
        NULL,
        NULL
};
1383
Mark Hammond62b1ab12002-07-23 06:31:15 +00001384PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001385PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001386{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001387 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001388
Christian Heimes90aa7642007-12-19 02:45:37 +00001389 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001390
Martin v. Löwis1a214512008-06-11 05:26:20 +00001391 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001392 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001393 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001394
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001395 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001396 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001397 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001398
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001399 /* Previous versions */
1400 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1401 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001402 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001403
Fredrik Lundh06d12682001-01-24 07:59:11 +00001404 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001405 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001406 if (v != NULL)
1407 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001408 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001409}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001412Local variables:
1413c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001414indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001415End:
1416*/