blob: 5097d44020eb450fe37e89a7c9e75b5f1109bdc3 [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02003 unicodedata -- Provides access to the Unicode database.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02005 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00007
Fredrik Lundhcfcea492000-09-25 08:07:06 +00008 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +000010 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000011
Fredrik Lundhcfcea492000-09-25 08:07:06 +000012 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000013
14 ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
20/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000021
/* Property record for one class of code points.  _getrecord_ex()
   returns pointers into the _PyUnicode_Database_Records table, whose
   entries have this layout. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
33
/* Differences between the current database and a previous Unicode
   version for a single code point.  In the lookups in this file a
   byte field equal to 0xFF is treated as "unchanged", and
   category_changed == 0 is treated as "unassigned in the old version".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
42
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000043/* data file generated by Tools/unicode/makeunicodedata.py */
44#include "unicodedata_db.h"
45
46static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000047_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000050 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051 index = 0;
52 else {
53 index = index1[(code>>SHIFT)];
54 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 }
56
57 return &_PyUnicode_Database_Records[index];
58}
59
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000060/* ------------- Previous-version API ------------------------------------- */
61typedef struct previous_version {
62 PyObject_HEAD
63 const char *name;
64 const change_record* (*getrecord)(Py_UCS4);
65 Py_UCS4 (*normalization)(Py_UCS4);
66} PreviousDBVersion;
67
68#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000070static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000071 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000072 {NULL}
73};
74
Thomas Wouters89f507f2006-12-13 04:49:30 +000075/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000076static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000077#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078
79static PyObject*
80new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
81 Py_UCS4 (*normalization)(Py_UCS4))
82{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 PreviousDBVersion *self;
84 self = PyObject_New(PreviousDBVersion, &UCD_Type);
85 if (self == NULL)
86 return NULL;
87 self->name = name;
88 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000091}
92
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000093
94static Py_UCS4 getuchar(PyUnicodeObject *obj)
95{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020096 if (PyUnicode_READY(obj))
97 return (Py_UCS4)-1;
98 if (PyUnicode_GET_LENGTH(obj) == 1) {
99 if (PyUnicode_READY(obj))
100 return (Py_UCS4)-1;
101 return PyUnicode_READ_CHAR(obj, 0);
102 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
106}
107
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000108/* --- Module API --------------------------------------------------------- */
109
Larry Hastings31826802013-10-19 00:09:25 -0700110/*[clinic]
111module unicodedata
112unicodedata.decimal
113
114 unichr: object(type='str')
115 default: object=NULL
116 /
117
118Converts a Unicode character into its equivalent decimal value.
119
120Returns the decimal value assigned to the Unicode character unichr
121as integer. If no such value is defined, default is returned, or, if
122not given, ValueError is raised.
123[clinic]*/
124
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000125PyDoc_STRVAR(unicodedata_decimal__doc__,
Larry Hastings31826802013-10-19 00:09:25 -0700126"Converts a Unicode character into its equivalent decimal value.\n"
127"\n"
128"unicodedata.decimal(unichr, default=None)\n"
129"\n"
130"Returns the decimal value assigned to the Unicode character unichr\n"
131"as integer. If no such value is defined, default is returned, or, if\n"
132"not given, ValueError is raised.");
133
134#define UNICODEDATA_DECIMAL_METHODDEF \
135 {"decimal", (PyCFunction)unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
136
137static PyObject *
138unicodedata_decimal_impl(PyObject *self, PyObject *unichr, PyObject *default_value);
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000139
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000140static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000141unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000142{
Larry Hastings31826802013-10-19 00:09:25 -0700143 PyObject *return_value = NULL;
144 PyObject *unichr;
145 PyObject *default_value = NULL;
146
147 if (!PyArg_ParseTuple(args,
148 "O!|O:decimal",
149 &PyUnicode_Type, &unichr, &default_value))
150 goto exit;
151 return_value = unicodedata_decimal_impl(self, unichr, default_value);
152
153exit:
154 return return_value;
155}
156
157static PyObject *
158unicodedata_decimal_impl(PyObject *self, PyObject *unichr, PyObject *default_value)
159/*[clinic checksum: 76c8d1c3dbee495d4cfd86ca6829543a3129344a]*/
160{
161 PyUnicodeObject *v = (PyUnicodeObject *)unichr;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000162 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000163 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000164 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000165
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000166 c = getuchar(v);
167 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000169
Martin v. Löwis1a214512008-06-11 05:26:20 +0000170 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000171 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000172 if (old->category_changed == 0) {
173 /* unassigned */
174 have_old = 1;
175 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000176 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000177 else if (old->decimal_changed != 0xFF) {
178 have_old = 1;
179 rc = old->decimal_changed;
180 }
181 }
182
183 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000184 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185 if (rc < 0) {
Larry Hastings31826802013-10-19 00:09:25 -0700186 if (default_value == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000187 PyErr_SetString(PyExc_ValueError,
188 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000189 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 }
191 else {
Larry Hastings31826802013-10-19 00:09:25 -0700192 Py_INCREF(default_value);
193 return default_value;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000195 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000196 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000197}
198
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000199PyDoc_STRVAR(unicodedata_digit__doc__,
200"digit(unichr[, default])\n\
201\n\
202Returns the digit value assigned to the Unicode character unichr as\n\
203integer. If no such value is defined, default is returned, or, if\n\
204not given, ValueError is raised.");
205
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000206static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000207unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000208{
209 PyUnicodeObject *v;
210 PyObject *defobj = NULL;
211 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000212 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000213
Fredrik Lundh06d12682001-01-24 07:59:11 +0000214 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000215 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000216 c = getuchar(v);
217 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000218 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000219 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000220 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000221 if (defobj == NULL) {
222 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000223 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000224 }
225 else {
226 Py_INCREF(defobj);
227 return defobj;
228 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000229 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000230 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000231}
232
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000233PyDoc_STRVAR(unicodedata_numeric__doc__,
234"numeric(unichr[, default])\n\
235\n\
236Returns the numeric value assigned to the Unicode character unichr\n\
237as float. If no such value is defined, default is returned, or, if\n\
238not given, ValueError is raised.");
239
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000240static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000241unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242{
243 PyUnicodeObject *v;
244 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000245 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000246 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000247 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000248
Fredrik Lundh06d12682001-01-24 07:59:11 +0000249 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000250 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000251 c = getuchar(v);
252 if (c == (Py_UCS4)-1)
253 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000254
Martin v. Löwis1a214512008-06-11 05:26:20 +0000255 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000256 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000257 if (old->category_changed == 0) {
258 /* unassigned */
259 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000260 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000262 else if (old->decimal_changed != 0xFF) {
263 have_old = 1;
264 rc = old->decimal_changed;
265 }
266 }
267
268 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000269 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000271 if (defobj == NULL) {
272 PyErr_SetString(PyExc_ValueError, "not a numeric character");
273 return NULL;
274 }
275 else {
276 Py_INCREF(defobj);
277 return defobj;
278 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000279 }
280 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281}
282
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000283PyDoc_STRVAR(unicodedata_category__doc__,
284"category(unichr)\n\
285\n\
286Returns the general category assigned to the Unicode character\n\
287unichr as string.");
288
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000289static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000290unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000291{
292 PyUnicodeObject *v;
293 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000294 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000295
296 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000297 &PyUnicode_Type, &v))
298 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000299 c = getuchar(v);
300 if (c == (Py_UCS4)-1)
301 return NULL;
302 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000303 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000304 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000305 if (old->category_changed != 0xFF)
306 index = old->category_changed;
307 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000308 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000309}
310
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000311PyDoc_STRVAR(unicodedata_bidirectional__doc__,
312"bidirectional(unichr)\n\
313\n\
Ezio Melottie3d7e542012-12-14 20:12:25 +0200314Returns the bidirectional class assigned to the Unicode character\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000315unichr as string. If no such value is defined, an empty string is\n\
316returned.");
317
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000318static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000319unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000320{
321 PyUnicodeObject *v;
322 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000323 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000324
325 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 &PyUnicode_Type, &v))
327 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000328 c = getuchar(v);
329 if (c == (Py_UCS4)-1)
330 return NULL;
331 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000332 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000333 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000334 if (old->category_changed == 0)
335 index = 0; /* unassigned */
336 else if (old->bidir_changed != 0xFF)
337 index = old->bidir_changed;
338 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000339 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000340}
341
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000342PyDoc_STRVAR(unicodedata_combining__doc__,
343"combining(unichr)\n\
344\n\
345Returns the canonical combining class assigned to the Unicode\n\
346character unichr as integer. Returns 0 if no combining class is\n\
347defined.");
348
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000349static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000350unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000351{
352 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000353 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000355
356 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000357 &PyUnicode_Type, &v))
358 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000359 c = getuchar(v);
360 if (c == (Py_UCS4)-1)
361 return NULL;
362 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000363 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000364 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000365 if (old->category_changed == 0)
366 index = 0; /* unassigned */
367 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000368 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000369}
370
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000371PyDoc_STRVAR(unicodedata_mirrored__doc__,
372"mirrored(unichr)\n\
373\n\
374Returns the mirrored property assigned to the Unicode character\n\
375unichr as integer. Returns 1 if the character has been identified as\n\
376a \"mirrored\" character in bidirectional text, 0 otherwise.");
377
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000378static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000379unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000380{
381 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000382 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000383 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000384
385 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000386 &PyUnicode_Type, &v))
387 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000388 c = getuchar(v);
389 if (c == (Py_UCS4)-1)
390 return NULL;
391 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000392 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000393 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000394 if (old->category_changed == 0)
395 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000396 else if (old->mirrored_changed != 0xFF)
397 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000398 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000399 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000400}
401
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000402PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
403"east_asian_width(unichr)\n\
404\n\
405Returns the east asian width assigned to the Unicode character\n\
406unichr as string.");
407
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000408static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000409unicodedata_east_asian_width(PyObject *self, PyObject *args)
410{
411 PyUnicodeObject *v;
412 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000413 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000414
415 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000416 &PyUnicode_Type, &v))
417 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000418 c = getuchar(v);
419 if (c == (Py_UCS4)-1)
420 return NULL;
421 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000422 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000423 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000424 if (old->category_changed == 0)
425 index = 0; /* unassigned */
426 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000427 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000428}
429
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000430PyDoc_STRVAR(unicodedata_decomposition__doc__,
431"decomposition(unichr)\n\
432\n\
433Returns the character decomposition mapping assigned to the Unicode\n\
434character unichr as string. An empty string is returned in case no\n\
435such mapping is defined.");
436
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000437static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000438unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000439{
440 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000441 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000442 int code, index, count;
443 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000444 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000445 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000446
447 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000448 &PyUnicode_Type, &v))
449 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000450 c = getuchar(v);
451 if (c == (Py_UCS4)-1)
452 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000453
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000454 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000455
Martin v. Löwis1a214512008-06-11 05:26:20 +0000456 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000457 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000458 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000459 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000460 }
461
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000462 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000463 index = 0;
464 else {
465 index = decomp_index1[(code>>DECOMP_SHIFT)];
466 index = decomp_index2[(index<<DECOMP_SHIFT)+
467 (code&((1<<DECOMP_SHIFT)-1))];
468 }
469
Tim Peters69b83b12001-11-30 07:23:05 +0000470 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000471 is prefix code (from*/
472 count = decomp_data[index] >> 8;
473
474 /* XXX: could allocate the PyString up front instead
475 (strlen(prefix) + 5 * count + 1 bytes) */
476
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000477 /* Based on how index is calculated above and decomp_data is generated
478 from Tools/unicode/makeunicodedata.py, it should not be possible
479 to overflow decomp_prefix. */
480 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200481 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000482
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000483 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000484 i = strlen(decomp_prefix[prefix_index]);
485 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000486
487 while (count-- > 0) {
488 if (i)
489 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000490 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000491 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
492 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000493 i += strlen(decomp + i);
494 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000495 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000496}
497
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000498static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000499get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000500{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000501 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000502 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000504 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000505 /* unassigned in old version */
506 *index = 0;
507 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000508 else {
509 *index = decomp_index1[(code>>DECOMP_SHIFT)];
510 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
511 (code&((1<<DECOMP_SHIFT)-1))];
512 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000513
Martin v. Löwis677bde22002-11-23 22:08:15 +0000514 /* high byte is number of hex bytes (usually one or two), low byte
515 is prefix code (from*/
516 *count = decomp_data[*index] >> 8;
517 *prefix = decomp_data[*index] & 255;
518
519 (*index)++;
520}
521
/* Constants for the arithmetic Hangul decomposition in nfd_nfkd():
   code points in [SBase, SBase+SCount) are split into parts based at
   LBase, VBase and TBase. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
531
532static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000533nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000534{
535 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200536 Py_UCS4 *output;
537 Py_ssize_t i, o, osize;
538 int kind;
539 void *data;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000540 /* Longest decomposition in Unicode 3.2: U+FDFA */
Martin v. Löwis22970662011-09-29 13:39:38 +0200541 Py_UCS4 stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000542 Py_ssize_t space, isize;
543 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000544 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000545
Martin v. Löwis677bde22002-11-23 22:08:15 +0000546 stackptr = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200547 isize = PyUnicode_GET_LENGTH(input);
Ezio Melotti7c4a7e62013-08-26 01:32:56 +0300548 /* Overallocate at most 10 characters. */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000549 space = (isize > 10 ? 10 : isize) + isize;
Martin v. Löwis22970662011-09-29 13:39:38 +0200550 osize = space;
551 output = PyMem_Malloc(space * sizeof(Py_UCS4));
552 if (!output) {
553 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000554 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200555 }
556 i = o = 0;
557 kind = PyUnicode_KIND(input);
558 data = PyUnicode_DATA(input);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000559
Martin v. Löwis22970662011-09-29 13:39:38 +0200560 while (i < isize) {
561 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000562 while(stackptr) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200563 Py_UCS4 code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000564 /* Hangul Decomposition adds three characters in
Ezio Melotti85a86292013-08-17 16:57:41 +0300565 a single step, so we need at least that much room. */
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000566 if (space < 3) {
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000567 Py_UCS4 *new_output;
Martin v. Löwis22970662011-09-29 13:39:38 +0200568 osize += 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000569 space += 10;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000570 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
571 if (new_output == NULL) {
572 PyMem_Free(output);
Martin v. Löwis22970662011-09-29 13:39:38 +0200573 PyErr_NoMemory();
Martin v. Löwis677bde22002-11-23 22:08:15 +0000574 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200575 }
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +0000576 output = new_output;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000577 }
578 /* Hangul Decomposition. */
579 if (SBase <= code && code < (SBase+SCount)) {
580 int SIndex = code - SBase;
581 int L = LBase + SIndex / NCount;
582 int V = VBase + (SIndex % NCount) / TCount;
583 int T = TBase + SIndex % TCount;
Martin v. Löwis22970662011-09-29 13:39:38 +0200584 output[o++] = L;
585 output[o++] = V;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000586 space -= 2;
587 if (T != TBase) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200588 output[o++] = T;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000589 space --;
590 }
591 continue;
592 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000593 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000594 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000595 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
596 if (value != 0) {
597 stack[stackptr++] = value;
598 continue;
599 }
600 }
601
602 /* Other decompositions. */
603 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000604
605 /* Copy character if it is not decomposable, or has a
606 compatibility decomposition, but we do NFD. */
607 if (!count || (prefix && !k)) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200608 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000609 space--;
610 continue;
611 }
612 /* Copy decomposition onto the stack, in reverse
613 order. */
614 while(count) {
615 code = decomp_data[index + (--count)];
616 stack[stackptr++] = code;
617 }
618 }
619 }
620
Martin v. Löwis22970662011-09-29 13:39:38 +0200621 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
622 output, o);
623 PyMem_Free(output);
624 if (!result)
625 return NULL;
626 /* result is guaranteed to be ready, as it is compact. */
627 kind = PyUnicode_KIND(result);
628 data = PyUnicode_DATA(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000629
630 /* Sort canonically. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200631 i = 0;
632 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
633 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
634 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000635 if (prev == 0 || cur == 0 || prev <= cur) {
636 prev = cur;
637 continue;
638 }
639 /* Non-canonical order. Need to switch *i with previous. */
640 o = i - 1;
641 while (1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200642 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
643 PyUnicode_WRITE(kind, data, o+1,
644 PyUnicode_READ(kind, data, o));
645 PyUnicode_WRITE(kind, data, o, tmp);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646 o--;
Martin v. Löwis22970662011-09-29 13:39:38 +0200647 if (o < 0)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000648 break;
Martin v. Löwis22970662011-09-29 13:39:38 +0200649 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000650 if (prev == 0 || prev <= cur)
651 break;
652 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200653 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 }
655 return result;
656}
657
658static int
Martin v. Löwis22970662011-09-29 13:39:38 +0200659find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000660{
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200661 unsigned int index;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000662 for (index = 0; nfc[index].start; index++) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200663 unsigned int start = nfc[index].start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000664 if (code < start)
665 return -1;
666 if (code <= start + nfc[index].count) {
Antoine Pitrou1d4bd252011-10-06 15:44:15 +0200667 unsigned int delta = code - start;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000668 return nfc[index].index + delta;
669 }
670 }
671 return -1;
672}
673
674static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000675nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000676{
677 PyObject *result;
Martin v. Löwis22970662011-09-29 13:39:38 +0200678 int kind;
679 void *data;
680 Py_UCS4 *output;
681 Py_ssize_t i, i1, o, len;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000682 int f,l,index,index1,comb;
Martin v. Löwis22970662011-09-29 13:39:38 +0200683 Py_UCS4 code;
684 Py_ssize_t skipped[20];
Martin v. Löwis677bde22002-11-23 22:08:15 +0000685 int cskipped = 0;
686
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000687 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000688 if (!result)
689 return NULL;
Martin v. Löwis22970662011-09-29 13:39:38 +0200690 /* result will be "ready". */
691 kind = PyUnicode_KIND(result);
692 data = PyUnicode_DATA(result);
693 len = PyUnicode_GET_LENGTH(result);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694
Martin v. Löwis22970662011-09-29 13:39:38 +0200695 /* We allocate a buffer for the output.
696 If we find that we made no changes, we still return
697 the NFD result. */
698 output = PyMem_Malloc(len * sizeof(Py_UCS4));
699 if (!output) {
700 PyErr_NoMemory();
701 Py_DECREF(result);
702 return 0;
703 }
704 i = o = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705
Martin v. Löwis677bde22002-11-23 22:08:15 +0000706 again:
Martin v. Löwis22970662011-09-29 13:39:38 +0200707 while (i < len) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000708 for (index = 0; index < cskipped; index++) {
709 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000711 Remove from list. */
712 skipped[index] = skipped[cskipped-1];
713 cskipped--;
714 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000715 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000716 }
717 }
718 /* Hangul Composition. We don't need to check for <LV,T>
719 pairs, since we always have decomposed data. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200720 code = PyUnicode_READ(kind, data, i);
721 if (LBase <= code && code < (LBase+LCount) &&
722 i + 1 < len &&
723 VBase <= PyUnicode_READ(kind, data, i+1) &&
724 PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000725 int LIndex, VIndex;
Martin v. Löwis22970662011-09-29 13:39:38 +0200726 LIndex = code - LBase;
727 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000728 code = SBase + (LIndex*VCount+VIndex)*TCount;
729 i+=2;
Martin v. Löwis22970662011-09-29 13:39:38 +0200730 if (i < len &&
731 TBase <= PyUnicode_READ(kind, data, i) &&
732 PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
733 code += PyUnicode_READ(kind, data, i)-TBase;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000734 i++;
735 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200736 output[o++] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000737 continue;
738 }
739
Martin v. Löwis22970662011-09-29 13:39:38 +0200740 /* code is still input[i] here */
741 f = find_nfc_index(self, nfc_first, code);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000742 if (f == -1) {
Martin v. Löwis22970662011-09-29 13:39:38 +0200743 output[o++] = code;
744 i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000745 continue;
746 }
747 /* Find next unblocked character. */
748 i1 = i+1;
749 comb = 0;
Martin v. Löwis22970662011-09-29 13:39:38 +0200750 /* output base character for now; might be updated later. */
751 output[o] = PyUnicode_READ(kind, data, i);
752 while (i1 < len) {
753 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
754 int comb1 = _getrecord_ex(code1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000755 if (comb) {
756 if (comb1 == 0)
757 break;
758 if (comb >= comb1) {
759 /* Character is blocked. */
760 i1++;
761 continue;
762 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000763 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200764 l = find_nfc_index(self, nfc_last, code1);
765 /* i1 cannot be combined with i. If i1
Martin v. Löwis677bde22002-11-23 22:08:15 +0000766 is a starter, we don't need to look further.
767 Otherwise, record the combining class. */
768 if (l == -1) {
769 not_combinable:
770 if (comb1 == 0)
771 break;
772 comb = comb1;
773 i1++;
774 continue;
775 }
776 index = f*TOTAL_LAST + l;
777 index1 = comp_index[index >> COMP_SHIFT];
778 code = comp_data[(index1<<COMP_SHIFT)+
779 (index&((1<<COMP_SHIFT)-1))];
780 if (code == 0)
781 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782
Martin v. Löwis677bde22002-11-23 22:08:15 +0000783 /* Replace the original character. */
Martin v. Löwis22970662011-09-29 13:39:38 +0200784 output[o] = code;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000785 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000786 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000787 skipped[cskipped++] = i1;
788 i1++;
Martin v. Löwis22970662011-09-29 13:39:38 +0200789 f = find_nfc_index(self, nfc_first, output[o]);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000790 if (f == -1)
791 break;
792 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200793 /* Output character was already written.
794 Just advance the indices. */
795 o++; i++;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000796 }
Martin v. Löwis22970662011-09-29 13:39:38 +0200797 if (o == len) {
798 /* No changes. Return original string. */
799 PyMem_Free(output);
800 return result;
801 }
802 Py_DECREF(result);
803 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
804 output, o);
805 PyMem_Free(output);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000806 return result;
807}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000808
809/* Return 1 if the input is certainly normalized, 0 if it might not be. */
810static int
811is_normalized(PyObject *self, PyObject *input, int nfc, int k)
812{
Martin v. Löwis22970662011-09-29 13:39:38 +0200813 Py_ssize_t i, len;
814 int kind;
815 void *data;
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000816 unsigned char prev_combining = 0, quickcheck_mask;
817
818 /* An older version of the database is requested, quickchecks must be
819 disabled. */
820 if (self && UCD_Check(self))
821 return 0;
822
823 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
824 as described in http://unicode.org/reports/tr15/#Annex8. */
825 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
826
Martin v. Löwis22970662011-09-29 13:39:38 +0200827 i = 0;
828 kind = PyUnicode_KIND(input);
829 data = PyUnicode_DATA(input);
830 len = PyUnicode_GET_LENGTH(input);
831 while (i < len) {
832 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
833 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000834 unsigned char combining = record->combining;
835 unsigned char quickcheck = record->normalization_quick_check;
836
837 if (quickcheck & quickcheck_mask)
838 return 0; /* this string might need normalization */
839 if (combining && prev_combining > combining)
840 return 0; /* non-canonical sort order, not normalized */
841 prev_combining = combining;
842 }
843 return 1; /* certainly normalized */
844}
845
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000846PyDoc_STRVAR(unicodedata_normalize__doc__,
847"normalize(form, unistr)\n\
848\n\
849Return the normal form 'form' for the Unicode string unistr. Valid\n\
850values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
851
Martin v. Löwis677bde22002-11-23 22:08:15 +0000852static PyObject*
853unicodedata_normalize(PyObject *self, PyObject *args)
854{
855 char *form;
856 PyObject *input;
857
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000858 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000859 &form, &PyUnicode_Type, &input))
860 return NULL;
861
Martin v. Löwis22970662011-09-29 13:39:38 +0200862 if (PyUnicode_READY(input) == -1)
863 return NULL;
864
865 if (PyUnicode_GET_LENGTH(input) == 0) {
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000866 /* Special case empty input strings, since resizing
867 them later would cause internal errors. */
868 Py_INCREF(input);
869 return input;
870 }
871
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000872 if (strcmp(form, "NFC") == 0) {
873 if (is_normalized(self, input, 1, 0)) {
874 Py_INCREF(input);
875 return input;
876 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000877 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000878 }
879 if (strcmp(form, "NFKC") == 0) {
880 if (is_normalized(self, input, 1, 1)) {
881 Py_INCREF(input);
882 return input;
883 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000884 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000885 }
886 if (strcmp(form, "NFD") == 0) {
887 if (is_normalized(self, input, 0, 0)) {
888 Py_INCREF(input);
889 return input;
890 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000891 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000892 }
893 if (strcmp(form, "NFKD") == 0) {
894 if (is_normalized(self, input, 0, 1)) {
895 Py_INCREF(input);
896 return input;
897 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000898 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000899 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000900 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
901 return NULL;
902}
903
Fredrik Lundh06d12682001-01-24 07:59:11 +0000904/* -------------------------------------------------------------------- */
905/* unicode character name tables */
906
907/* data file generated by Tools/unicode/makeunicodedata.py */
908#include "unicodename_db.h"
909
910/* -------------------------------------------------------------------- */
911/* database code (cut and pasted from the unidb package) */
912
/* Case-insensitive hash of the first len bytes of s.

   Every byte is upper-cased and mixed in as hash * scale + byte.
   Whenever bits 24..31 of the running value become non-zero they are
   folded back into the low byte and the value is cut to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        pos++;
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
927
/* Short names of the jamo that make up a Hangul syllable name.

   Column 0 is indexed by the leading consonant, column 1 by the vowel
   and column 2 by the trailing consonant (whose first entry, the empty
   string, stands for "no trailing consonant").  The columns have
   different lengths; NULL fills the unused tail of the shorter ones. */
static char *hangul_syllables[][3] = {
    /* leading  vowel   trailing */
    { "G",      "A",    ""   },
    { "GG",     "AE",   "G"  },
    { "N",      "YA",   "GG" },
    { "D",      "YAE",  "GS" },
    { "DD",     "EO",   "N"  },
    { "R",      "E",    "NJ" },
    { "M",      "YEO",  "NH" },
    { "B",      "YE",   "D"  },
    { "BB",     "O",    "L"  },
    { "S",      "WA",   "LG" },
    { "SS",     "WAE",  "LM" },
    { "",       "OE",   "LB" },
    { "J",      "YO",   "LS" },
    { "JJ",     "U",    "LT" },
    { "C",      "WEO",  "LP" },
    { "K",      "WE",   "LH" },
    { "T",      "WI",   "M"  },
    { "P",      "YU",   "B"  },
    { "H",      "EU",   "BS" },
    { NULL,     "YI",   "S"  },
    { NULL,     "I",    "SS" },
    { NULL,     NULL,   "NG" },
    { NULL,     NULL,   "J"  },
    { NULL,     NULL,   "C"  },
    { NULL,     NULL,   "K"  },
    { NULL,     NULL,   "T"  },
    { NULL,     NULL,   "P"  },
    { NULL,     NULL,   "H"  }
};
958
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000959/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000960static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000961is_unified_ideograph(Py_UCS4 code)
962{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000963 return
964 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
Benjamin Peterson71f660e2012-02-20 22:24:29 -0500965 (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000966 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
967 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
968 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000969}
970
/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * Both ranges are half-open: the start value is included, the end value is
 * not.  The argument is parenthesised so that any expression can be passed;
 * note that it is evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
976
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000977static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300978_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
979 int with_alias_and_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000980{
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300981 /* Find the name associated with the given codepoint.
982 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
983 * that we are using for aliases and named sequences. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000984 int offset;
985 int i;
986 int word;
987 unsigned char* w;
988
Martin v. Löwisc3509122006-03-11 12:16:23 +0000989 if (code >= 0x110000)
990 return 0;
991
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300992 /* XXX should we just skip all the codepoints in the PUAs here? */
993 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
994 return 0;
995
Martin v. Löwis1a214512008-06-11 05:26:20 +0000996 if (self && UCD_Check(self)) {
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300997 /* in 3.2.0 there are no aliases and named sequences */
Ezio Melotti4837e392011-10-22 00:24:17 +0300998 const change_record *old;
Ezio Melotti931b8aa2011-10-21 21:57:36 +0300999 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1000 return 0;
Ezio Melotti4837e392011-10-22 00:24:17 +03001001 old = get_old_record(self, code);
Martin v. Löwisc3509122006-03-11 12:16:23 +00001002 if (old->category_changed == 0) {
1003 /* unassigned */
1004 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001005 }
Martin v. Löwisc3509122006-03-11 12:16:23 +00001006 }
1007
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +00001008 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001009 /* Hangul syllable. */
1010 int SIndex = code - SBase;
1011 int L = SIndex / NCount;
1012 int V = (SIndex % NCount) / TCount;
1013 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001014
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001015 if (buflen < 27)
1016 /* Worst case: HANGUL SYLLABLE <10chars>. */
1017 return 0;
1018 strcpy(buffer, "HANGUL SYLLABLE ");
1019 buffer += 16;
1020 strcpy(buffer, hangul_syllables[L][0]);
1021 buffer += strlen(hangul_syllables[L][0]);
1022 strcpy(buffer, hangul_syllables[V][1]);
1023 buffer += strlen(hangul_syllables[V][1]);
1024 strcpy(buffer, hangul_syllables[T][2]);
1025 buffer += strlen(hangul_syllables[T][2]);
1026 *buffer = '\0';
1027 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001028 }
1029
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001030 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001031 if (buflen < 28)
1032 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1033 return 0;
1034 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1035 return 1;
1036 }
1037
Fredrik Lundh06d12682001-01-24 07:59:11 +00001038 /* get offset into phrasebook */
1039 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1040 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1041 (code&((1<<phrasebook_shift)-1))];
1042 if (!offset)
1043 return 0;
1044
1045 i = 0;
1046
1047 for (;;) {
1048 /* get word index */
1049 word = phrasebook[offset] - phrasebook_short;
1050 if (word >= 0) {
1051 word = (word << 8) + phrasebook[offset+1];
1052 offset += 2;
1053 } else
1054 word = phrasebook[offset++];
1055 if (i) {
1056 if (i > buflen)
1057 return 0; /* buffer overflow */
1058 buffer[i++] = ' ';
1059 }
1060 /* copy word string from lexicon. the last character in the
1061 word has bit 7 set. the last word in a string ends with
1062 0x80 */
1063 w = lexicon + lexicon_offset[word];
1064 while (*w < 128) {
1065 if (i >= buflen)
1066 return 0; /* buffer overflow */
1067 buffer[i++] = *w++;
1068 }
1069 if (i >= buflen)
1070 return 0; /* buffer overflow */
1071 buffer[i++] = *w & 127;
1072 if (*w == 128)
1073 break; /* end of word */
1074 }
1075
1076 return 1;
1077}
1078
1079static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001080_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001081{
1082 /* check if code corresponds to the given name */
1083 int i;
1084 char buffer[NAME_MAXLEN];
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001085 if (!_getucname(self, code, buffer, sizeof(buffer), 1))
Fredrik Lundh06d12682001-01-24 07:59:11 +00001086 return 0;
1087 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +02001088 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +00001089 return 0;
1090 }
1091 return buffer[namelen] == '\0';
1092}
1093
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001095find_syllable(const char *str, int *len, int *pos, int count, int column)
1096{
1097 int i, len1;
1098 *len = -1;
1099 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001100 char *s = hangul_syllables[i][column];
Antoine Pitrou1d4bd252011-10-06 15:44:15 +02001101 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001102 if (len1 <= *len)
1103 continue;
1104 if (strncmp(str, s, len1) == 0) {
1105 *len = len1;
1106 *pos = i;
1107 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001108 }
1109 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001110 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001111 }
1112}
1113
Fredrik Lundh06d12682001-01-24 07:59:11 +00001114static int
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001115_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001116{
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001117 /* check if named sequences are allowed */
1118 if (!with_named_seq && IS_NAMED_SEQ(cp))
1119 return 0;
1120 /* if the codepoint is in the PUA range that we use for aliases,
1121 * convert it to obtain the right codepoint */
1122 if (IS_ALIAS(cp))
1123 *code = name_aliases[cp-aliases_start];
1124 else
1125 *code = cp;
1126 return 1;
1127}
1128
1129static int
1130_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1131 int with_named_seq)
1132{
1133 /* Return the codepoint associated with the given name.
1134 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1135 * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1136 * using for the named sequence, and the caller must then convert it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001137 unsigned int h, v;
1138 unsigned int mask = code_size-1;
1139 unsigned int i, incr;
1140
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001141 /* Check for hangul syllables. */
1142 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001143 int len, L = -1, V = -1, T = -1;
1144 const char *pos = name + 16;
1145 find_syllable(pos, &len, &L, LCount, 0);
1146 pos += len;
1147 find_syllable(pos, &len, &V, VCount, 1);
1148 pos += len;
1149 find_syllable(pos, &len, &T, TCount, 2);
1150 pos += len;
1151 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1152 *code = SBase + (L*VCount+V)*TCount + T;
1153 return 1;
1154 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001155 /* Otherwise, it's an illegal syllable name. */
1156 return 0;
1157 }
1158
1159 /* Check for unified ideographs. */
1160 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1161 /* Four or five hexdigits must follow. */
1162 v = 0;
1163 name += 22;
1164 namelen -= 22;
1165 if (namelen != 4 && namelen != 5)
1166 return 0;
1167 while (namelen--) {
1168 v *= 16;
1169 if (*name >= '0' && *name <= '9')
1170 v += *name - '0';
1171 else if (*name >= 'A' && *name <= 'F')
1172 v += *name - 'A' + 10;
1173 else
1174 return 0;
1175 name++;
1176 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001177 if (!is_unified_ideograph(v))
1178 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001179 *code = v;
1180 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001181 }
1182
Fredrik Lundh06d12682001-01-24 07:59:11 +00001183 /* the following is the same as python's dictionary lookup, with
1184 only minor changes. see the makeunicodedata script for more
1185 details */
1186
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001187 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001188 i = (~h) & mask;
1189 v = code_hash[i];
1190 if (!v)
1191 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001192 if (_cmpname(self, v, name, namelen))
1193 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001194 incr = (h ^ (h >> 3)) & mask;
1195 if (!incr)
1196 incr = mask;
1197 for (;;) {
1198 i = (i + incr) & mask;
1199 v = code_hash[i];
1200 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001201 return 0;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001202 if (_cmpname(self, v, name, namelen))
1203 return _check_alias_and_seq(v, code, with_named_seq);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001204 incr = incr << 1;
1205 if (incr > mask)
1206 incr = incr ^ code_poly;
1207 }
1208}
1209
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001210static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001211{
1212 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001213 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001214 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001215};
1216
1217/* -------------------------------------------------------------------- */
1218/* Python bindings */
1219
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001220PyDoc_STRVAR(unicodedata_name__doc__,
1221"name(unichr[, default])\n\
1222Returns the name assigned to the Unicode character unichr as a\n\
1223string. If no name is defined, default is returned, or, if not\n\
1224given, ValueError is raised.");
1225
Fredrik Lundh06d12682001-01-24 07:59:11 +00001226static PyObject *
1227unicodedata_name(PyObject* self, PyObject* args)
1228{
1229 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001230 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001231
1232 PyUnicodeObject* v;
1233 PyObject* defobj = NULL;
1234 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1235 return NULL;
1236
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001237 c = getuchar(v);
1238 if (c == (Py_UCS4)-1)
1239 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001240
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001241 if (!_getucname(self, c, name, sizeof(name), 0)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001242 if (defobj == NULL) {
1243 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001244 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001245 }
1246 else {
1247 Py_INCREF(defobj);
1248 return defobj;
1249 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001250 }
1251
Walter Dörwald4254e762007-06-05 16:04:09 +00001252 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001253}
1254
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001255PyDoc_STRVAR(unicodedata_lookup__doc__,
1256"lookup(name)\n\
1257\n\
1258Look up character by name. If a character with the\n\
1259given name is found, return the corresponding Unicode\n\
1260character. If not found, KeyError is raised.");
1261
Fredrik Lundh06d12682001-01-24 07:59:11 +00001262static PyObject *
1263unicodedata_lookup(PyObject* self, PyObject* args)
1264{
1265 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001266
1267 char* name;
1268 int namelen;
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001269 unsigned int index;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001270 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1271 return NULL;
1272
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001273 if (!_getcode(self, name, namelen, &code, 1)) {
1274 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001275 return NULL;
1276 }
Stefan Kraha4b4dea2012-09-23 15:51:16 +02001277 /* check if code is in the PUA range that we use for named sequences
1278 and convert it */
Ezio Melotti931b8aa2011-10-21 21:57:36 +03001279 if (IS_NAMED_SEQ(code)) {
1280 index = code-named_sequences_start;
1281 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1282 named_sequences[index].seq,
1283 named_sequences[index].seqlen);
1284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001286}
1287
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001288/* XXX Add doc strings. */
1289
1290static PyMethodDef unicodedata_functions[] = {
Larry Hastings31826802013-10-19 00:09:25 -07001291 UNICODEDATA_DECIMAL_METHODDEF
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001292 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1293 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1294 {"category", unicodedata_category, METH_VARARGS,
1295 unicodedata_category__doc__},
1296 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1297 unicodedata_bidirectional__doc__},
1298 {"combining", unicodedata_combining, METH_VARARGS,
1299 unicodedata_combining__doc__},
1300 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1301 unicodedata_mirrored__doc__},
1302 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1303 unicodedata_east_asian_width__doc__},
1304 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1305 unicodedata_decomposition__doc__},
1306 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1307 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1308 {"normalize", unicodedata_normalize, METH_VARARGS,
1309 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001311};
1312
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001313static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001314 /* The ob_type field must be initialized in the module init function
1315 * to be portable to Windows without using C++. */
1316 PyVarObject_HEAD_INIT(NULL, 0)
1317 "unicodedata.UCD", /*tp_name*/
1318 sizeof(PreviousDBVersion), /*tp_basicsize*/
1319 0, /*tp_itemsize*/
1320 /* methods */
1321 (destructor)PyObject_Del, /*tp_dealloc*/
1322 0, /*tp_print*/
1323 0, /*tp_getattr*/
1324 0, /*tp_setattr*/
1325 0, /*tp_reserved*/
1326 0, /*tp_repr*/
1327 0, /*tp_as_number*/
1328 0, /*tp_as_sequence*/
1329 0, /*tp_as_mapping*/
1330 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001331 0, /*tp_call*/
1332 0, /*tp_str*/
1333 PyObject_GenericGetAttr,/*tp_getattro*/
1334 0, /*tp_setattro*/
1335 0, /*tp_as_buffer*/
1336 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1337 0, /*tp_doc*/
1338 0, /*tp_traverse*/
1339 0, /*tp_clear*/
1340 0, /*tp_richcompare*/
1341 0, /*tp_weaklistoffset*/
1342 0, /*tp_iter*/
1343 0, /*tp_iternext*/
1344 unicodedata_functions, /*tp_methods*/
1345 DB_members, /*tp_members*/
1346 0, /*tp_getset*/
1347 0, /*tp_base*/
1348 0, /*tp_dict*/
1349 0, /*tp_descr_get*/
1350 0, /*tp_descr_set*/
1351 0, /*tp_dictoffset*/
1352 0, /*tp_init*/
1353 0, /*tp_alloc*/
1354 0, /*tp_new*/
1355 0, /*tp_free*/
1356 0, /*tp_is_gc*/
1357};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001358
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001359PyDoc_STRVAR(unicodedata_docstring,
1360"This module provides access to the Unicode Character Database which\n\
1361defines character properties for all Unicode characters. The data in\n\
1362this database is based on the UnicodeData.txt file version\n\
Benjamin Peterson8aa7b892013-10-10 20:22:10 -04001363" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001364\n\
1365The module uses the same names and symbols as defined by the\n\
Benjamin Peterson577dd612013-10-10 20:16:25 -04001366UnicodeData File Format " UNIDATA_VERSION ".");
Martin v. Löwis1a214512008-06-11 05:26:20 +00001367
1368static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001369 PyModuleDef_HEAD_INIT,
1370 "unicodedata",
1371 unicodedata_docstring,
1372 -1,
1373 unicodedata_functions,
1374 NULL,
1375 NULL,
1376 NULL,
1377 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001378};
1379
Mark Hammond62b1ab12002-07-23 06:31:15 +00001380PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001381PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001382{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001383 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001384
Christian Heimes90aa7642007-12-19 02:45:37 +00001385 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001386
Martin v. Löwis1a214512008-06-11 05:26:20 +00001387 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001388 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001389 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001390
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001391 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001392 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001393 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001394
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001395 /* Previous versions */
1396 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1397 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001398 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001399
Fredrik Lundh06d12682001-01-24 07:59:11 +00001400 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001401 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001402 if (v != NULL)
1403 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001404 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001405}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001406
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001407/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001408Local variables:
1409c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001410indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001411End:
1412*/