blob: f636590f0b23fd2836279d47a230d3f130107813 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.2 database.

   Data was extracted from the Unicode 5.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* Property record for a code point; _getrecord_ex() returns a pointer to
   one entry of the generated _PyUnicode_Database_Records table. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* quick-check bits, see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Per-code-point differences between the current database and an older
   Unicode version.  The sequence of fields should be the same as in
   merge_old_version.

   As used in this file: category_changed == 0 means the code point was
   unassigned in the old version, and 0xFF in the unsigned char fields
   means "no change, use the current database value". */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000042/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000046_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000049 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000059/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
/* Fetch the change record of code point v from the PreviousDBVersion
   object self.  The caller must have verified UCD_Check(self). */
#define get_old_record(self, v) \
    ((((PreviousDBVersion*)self)->getrecord)(v))
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 {NULL}
72};
73
Thomas Wouters89f507f2006-12-13 04:49:30 +000074/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000075static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000076#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000077
78static PyObject*
79new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
80 Py_UCS4 (*normalization)(Py_UCS4))
81{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 PreviousDBVersion *self;
83 self = PyObject_New(PreviousDBVersion, &UCD_Type);
84 if (self == NULL)
85 return NULL;
86 self->name = name;
87 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090}
91
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000092
93static Py_UCS4 getuchar(PyUnicodeObject *obj)
94{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020095 if (PyUnicode_READY(obj))
96 return (Py_UCS4)-1;
97 if (PyUnicode_GET_LENGTH(obj) == 1) {
98 if (PyUnicode_READY(obj))
99 return (Py_UCS4)-1;
100 return PyUnicode_READ_CHAR(obj, 0);
101 }
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000102 PyErr_SetString(PyExc_TypeError,
103 "need a single Unicode character as parameter");
104 return (Py_UCS4)-1;
105}
106
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000107/* --- Module API --------------------------------------------------------- */
108
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000109PyDoc_STRVAR(unicodedata_decimal__doc__,
110"decimal(unichr[, default])\n\
111\n\
112Returns the decimal value assigned to the Unicode character unichr\n\
113as integer. If no such value is defined, default is returned, or, if\n\
114not given, ValueError is raised.");
115
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000116static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000117unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000118{
119 PyUnicodeObject *v;
120 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000121 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000123 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000124
Fredrik Lundh06d12682001-01-24 07:59:11 +0000125 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000126 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000127 c = getuchar(v);
128 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000130
Martin v. Löwis1a214512008-06-11 05:26:20 +0000131 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000132 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133 if (old->category_changed == 0) {
134 /* unassigned */
135 have_old = 1;
136 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000138 else if (old->decimal_changed != 0xFF) {
139 have_old = 1;
140 rc = old->decimal_changed;
141 }
142 }
143
144 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000145 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000146 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 if (defobj == NULL) {
148 PyErr_SetString(PyExc_ValueError,
149 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000150 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 }
152 else {
153 Py_INCREF(defobj);
154 return defobj;
155 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000156 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000157 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000158}
159
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000160PyDoc_STRVAR(unicodedata_digit__doc__,
161"digit(unichr[, default])\n\
162\n\
163Returns the digit value assigned to the Unicode character unichr as\n\
164integer. If no such value is defined, default is returned, or, if\n\
165not given, ValueError is raised.");
166
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000167static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000168unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000169{
170 PyUnicodeObject *v;
171 PyObject *defobj = NULL;
172 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000173 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000174
Fredrik Lundh06d12682001-01-24 07:59:11 +0000175 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000176 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000177 c = getuchar(v);
178 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000181 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 if (defobj == NULL) {
183 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000184 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 }
186 else {
187 Py_INCREF(defobj);
188 return defobj;
189 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000190 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000191 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000192}
193
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000194PyDoc_STRVAR(unicodedata_numeric__doc__,
195"numeric(unichr[, default])\n\
196\n\
197Returns the numeric value assigned to the Unicode character unichr\n\
198as float. If no such value is defined, default is returned, or, if\n\
199not given, ValueError is raised.");
200
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000201static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000202unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000203{
204 PyUnicodeObject *v;
205 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000206 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000208 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000209
Fredrik Lundh06d12682001-01-24 07:59:11 +0000210 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000211 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000212 c = getuchar(v);
213 if (c == (Py_UCS4)-1)
214 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000215
Martin v. Löwis1a214512008-06-11 05:26:20 +0000216 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000217 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218 if (old->category_changed == 0) {
219 /* unassigned */
220 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000223 else if (old->decimal_changed != 0xFF) {
224 have_old = 1;
225 rc = old->decimal_changed;
226 }
227 }
228
229 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000230 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000232 if (defobj == NULL) {
233 PyErr_SetString(PyExc_ValueError, "not a numeric character");
234 return NULL;
235 }
236 else {
237 Py_INCREF(defobj);
238 return defobj;
239 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000240 }
241 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000242}
243
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000244PyDoc_STRVAR(unicodedata_category__doc__,
245"category(unichr)\n\
246\n\
247Returns the general category assigned to the Unicode character\n\
248unichr as string.");
249
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000250static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000251unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000252{
253 PyUnicodeObject *v;
254 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000255 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256
257 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 &PyUnicode_Type, &v))
259 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000260 c = getuchar(v);
261 if (c == (Py_UCS4)-1)
262 return NULL;
263 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000264 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000265 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000266 if (old->category_changed != 0xFF)
267 index = old->category_changed;
268 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000270}
271
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000272PyDoc_STRVAR(unicodedata_bidirectional__doc__,
273"bidirectional(unichr)\n\
274\n\
275Returns the bidirectional category assigned to the Unicode character\n\
276unichr as string. If no such value is defined, an empty string is\n\
277returned.");
278
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000279static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000280unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000281{
282 PyUnicodeObject *v;
283 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000284 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285
286 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 &PyUnicode_Type, &v))
288 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000289 c = getuchar(v);
290 if (c == (Py_UCS4)-1)
291 return NULL;
292 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000293 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000294 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000295 if (old->category_changed == 0)
296 index = 0; /* unassigned */
297 else if (old->bidir_changed != 0xFF)
298 index = old->bidir_changed;
299 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000300 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000301}
302
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000303PyDoc_STRVAR(unicodedata_combining__doc__,
304"combining(unichr)\n\
305\n\
306Returns the canonical combining class assigned to the Unicode\n\
307character unichr as integer. Returns 0 if no combining class is\n\
308defined.");
309
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000310static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000311unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000312{
313 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000314 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000315 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316
317 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 &PyUnicode_Type, &v))
319 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000320 c = getuchar(v);
321 if (c == (Py_UCS4)-1)
322 return NULL;
323 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000324 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000325 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000326 if (old->category_changed == 0)
327 index = 0; /* unassigned */
328 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000329 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000330}
331
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000332PyDoc_STRVAR(unicodedata_mirrored__doc__,
333"mirrored(unichr)\n\
334\n\
335Returns the mirrored property assigned to the Unicode character\n\
336unichr as integer. Returns 1 if the character has been identified as\n\
337a \"mirrored\" character in bidirectional text, 0 otherwise.");
338
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000339static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000340unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000341{
342 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000343 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000344 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000345
346 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000347 &PyUnicode_Type, &v))
348 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000349 c = getuchar(v);
350 if (c == (Py_UCS4)-1)
351 return NULL;
352 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000353 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000354 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000355 if (old->category_changed == 0)
356 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000357 else if (old->mirrored_changed != 0xFF)
358 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000359 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000360 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000361}
362
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000363PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
364"east_asian_width(unichr)\n\
365\n\
366Returns the east asian width assigned to the Unicode character\n\
367unichr as string.");
368
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000369static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000370unicodedata_east_asian_width(PyObject *self, PyObject *args)
371{
372 PyUnicodeObject *v;
373 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000374 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000375
376 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 &PyUnicode_Type, &v))
378 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000379 c = getuchar(v);
380 if (c == (Py_UCS4)-1)
381 return NULL;
382 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000383 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000384 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000385 if (old->category_changed == 0)
386 index = 0; /* unassigned */
387 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000388 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000389}
390
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000391PyDoc_STRVAR(unicodedata_decomposition__doc__,
392"decomposition(unichr)\n\
393\n\
394Returns the character decomposition mapping assigned to the Unicode\n\
395character unichr as string. An empty string is returned in case no\n\
396such mapping is defined.");
397
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000398static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000399unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000400{
401 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000402 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000403 int code, index, count;
404 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000405 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000406 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000407
408 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000409 &PyUnicode_Type, &v))
410 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000411 c = getuchar(v);
412 if (c == (Py_UCS4)-1)
413 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000414
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000415 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Martin v. Löwis1a214512008-06-11 05:26:20 +0000417 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000418 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000419 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000420 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 }
422
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000423 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000424 index = 0;
425 else {
426 index = decomp_index1[(code>>DECOMP_SHIFT)];
427 index = decomp_index2[(index<<DECOMP_SHIFT)+
428 (code&((1<<DECOMP_SHIFT)-1))];
429 }
430
Tim Peters69b83b12001-11-30 07:23:05 +0000431 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000432 is prefix code (from*/
433 count = decomp_data[index] >> 8;
434
435 /* XXX: could allocate the PyString up front instead
436 (strlen(prefix) + 5 * count + 1 bytes) */
437
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000438 /* Based on how index is calculated above and decomp_data is generated
439 from Tools/unicode/makeunicodedata.py, it should not be possible
440 to overflow decomp_prefix. */
441 prefix_index = decomp_data[index] & 255;
Victor Stinner63941882011-09-29 00:42:28 +0200442 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000443
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000444 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000445 i = strlen(decomp_prefix[prefix_index]);
446 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447
448 while (count-- > 0) {
449 if (i)
450 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000451 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000452 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
453 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000454 i += strlen(decomp + i);
455 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000456 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000457}
458
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000459static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000460get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000461{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000462 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000463 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000465 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000466 /* unassigned in old version */
467 *index = 0;
468 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000469 else {
470 *index = decomp_index1[(code>>DECOMP_SHIFT)];
471 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
472 (code&((1<<DECOMP_SHIFT)-1))];
473 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474
Martin v. Löwis677bde22002-11-23 22:08:15 +0000475 /* high byte is number of hex bytes (usually one or two), low byte
476 is prefix code (from*/
477 *count = decomp_data[*index] >> 8;
478 *prefix = decomp_data[*index] & 255;
479
480 (*index)++;
481}
482
/* Constants for the Hangul decomposition/composition arithmetic used by
   nfd_nfkd() and nfc_nfkc(). */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
492
493static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000494nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000495{
496 PyObject *result;
497 Py_UNICODE *i, *end, *o;
498 /* Longest decomposition in Unicode 3.2: U+FDFA */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000499 Py_UNICODE stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000500 Py_ssize_t space, isize;
501 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000502 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503
Martin v. Löwis677bde22002-11-23 22:08:15 +0000504 stackptr = 0;
505 isize = PyUnicode_GET_SIZE(input);
506 /* Overallocate atmost 10 characters. */
507 space = (isize > 10 ? 10 : isize) + isize;
508 result = PyUnicode_FromUnicode(NULL, space);
509 if (!result)
510 return NULL;
511 i = PyUnicode_AS_UNICODE(input);
512 end = i + isize;
513 o = PyUnicode_AS_UNICODE(result);
514
515 while (i < end) {
516 stack[stackptr++] = *i++;
517 while(stackptr) {
518 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000519 /* Hangul Decomposition adds three characters in
520 a single step, so we need atleast that much room. */
521 if (space < 3) {
Martin v. Löwis5b222132007-06-10 09:51:05 +0000522 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000523 space += 10;
524 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000525 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000527 }
528 /* Hangul Decomposition. */
529 if (SBase <= code && code < (SBase+SCount)) {
530 int SIndex = code - SBase;
531 int L = LBase + SIndex / NCount;
532 int V = VBase + (SIndex % NCount) / TCount;
533 int T = TBase + SIndex % TCount;
534 *o++ = L;
535 *o++ = V;
536 space -= 2;
537 if (T != TBase) {
538 *o++ = T;
539 space --;
540 }
541 continue;
542 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000543 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000544 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000545 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
546 if (value != 0) {
547 stack[stackptr++] = value;
548 continue;
549 }
550 }
551
552 /* Other decompositions. */
553 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000554
555 /* Copy character if it is not decomposable, or has a
556 compatibility decomposition, but we do NFD. */
557 if (!count || (prefix && !k)) {
558 *o++ = code;
559 space--;
560 continue;
561 }
562 /* Copy decomposition onto the stack, in reverse
563 order. */
564 while(count) {
565 code = decomp_data[index + (--count)];
566 stack[stackptr++] = code;
567 }
568 }
569 }
570
571 /* Drop overallocation. Cannot fail. */
572 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
573
574 /* Sort canonically. */
575 i = PyUnicode_AS_UNICODE(result);
576 prev = _getrecord_ex(*i)->combining;
577 end = i + PyUnicode_GET_SIZE(result);
578 for (i++; i < end; i++) {
579 cur = _getrecord_ex(*i)->combining;
580 if (prev == 0 || cur == 0 || prev <= cur) {
581 prev = cur;
582 continue;
583 }
584 /* Non-canonical order. Need to switch *i with previous. */
585 o = i - 1;
586 while (1) {
587 Py_UNICODE tmp = o[1];
588 o[1] = o[0];
589 o[0] = tmp;
590 o--;
591 if (o < PyUnicode_AS_UNICODE(result))
592 break;
593 prev = _getrecord_ex(*o)->combining;
594 if (prev == 0 || prev <= cur)
595 break;
596 }
597 prev = _getrecord_ex(*i)->combining;
598 }
599 return result;
600}
601
602static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000603find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000604{
605 int index;
606 for (index = 0; nfc[index].start; index++) {
607 int start = nfc[index].start;
608 if (code < start)
609 return -1;
610 if (code <= start + nfc[index].count) {
611 int delta = code - start;
612 return nfc[index].index + delta;
613 }
614 }
615 return -1;
616}
617
618static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000619nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000620{
621 PyObject *result;
622 Py_UNICODE *i, *i1, *o, *end;
623 int f,l,index,index1,comb;
624 Py_UNICODE code;
625 Py_UNICODE *skipped[20];
626 int cskipped = 0;
627
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000628 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000629 if (!result)
630 return NULL;
631
632 /* We are going to modify result in-place.
633 If nfd_nfkd is changed to sometimes return the input,
634 this code needs to be reviewed. */
635 assert(result != input);
636
637 i = PyUnicode_AS_UNICODE(result);
638 end = i + PyUnicode_GET_SIZE(result);
639 o = PyUnicode_AS_UNICODE(result);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000640
Martin v. Löwis677bde22002-11-23 22:08:15 +0000641 again:
642 while (i < end) {
643 for (index = 0; index < cskipped; index++) {
644 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646 Remove from list. */
647 skipped[index] = skipped[cskipped-1];
648 cskipped--;
649 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000650 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651 }
652 }
653 /* Hangul Composition. We don't need to check for <LV,T>
654 pairs, since we always have decomposed data. */
655 if (LBase <= *i && *i < (LBase+LCount) &&
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000656 i + 1 < end &&
Martin v. Löwis677bde22002-11-23 22:08:15 +0000657 VBase <= i[1] && i[1] <= (VBase+VCount)) {
658 int LIndex, VIndex;
659 LIndex = i[0] - LBase;
660 VIndex = i[1] - VBase;
661 code = SBase + (LIndex*VCount+VIndex)*TCount;
662 i+=2;
663 if (i < end &&
664 TBase <= *i && *i <= (TBase+TCount)) {
665 code += *i-TBase;
666 i++;
667 }
668 *o++ = code;
669 continue;
670 }
671
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000672 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000673 if (f == -1) {
674 *o++ = *i++;
675 continue;
676 }
677 /* Find next unblocked character. */
678 i1 = i+1;
679 comb = 0;
680 while (i1 < end) {
681 int comb1 = _getrecord_ex(*i1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000682 if (comb) {
683 if (comb1 == 0)
684 break;
685 if (comb >= comb1) {
686 /* Character is blocked. */
687 i1++;
688 continue;
689 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000690 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000691 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000692 /* *i1 cannot be combined with *i. If *i1
693 is a starter, we don't need to look further.
694 Otherwise, record the combining class. */
695 if (l == -1) {
696 not_combinable:
697 if (comb1 == 0)
698 break;
699 comb = comb1;
700 i1++;
701 continue;
702 }
703 index = f*TOTAL_LAST + l;
704 index1 = comp_index[index >> COMP_SHIFT];
705 code = comp_data[(index1<<COMP_SHIFT)+
706 (index&((1<<COMP_SHIFT)-1))];
707 if (code == 0)
708 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709
Martin v. Löwis677bde22002-11-23 22:08:15 +0000710 /* Replace the original character. */
711 *i = code;
712 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000713 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000714 skipped[cskipped++] = i1;
715 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000716 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000717 if (f == -1)
718 break;
719 }
720 *o++ = *i++;
721 }
722 if (o != end)
723 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
724 return result;
725}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000726
727/* Return 1 if the input is certainly normalized, 0 if it might not be. */
728static int
729is_normalized(PyObject *self, PyObject *input, int nfc, int k)
730{
731 Py_UNICODE *i, *end;
732 unsigned char prev_combining = 0, quickcheck_mask;
733
734 /* An older version of the database is requested, quickchecks must be
735 disabled. */
736 if (self && UCD_Check(self))
737 return 0;
738
739 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
740 as described in http://unicode.org/reports/tr15/#Annex8. */
741 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
742
743 i = PyUnicode_AS_UNICODE(input);
744 end = i + PyUnicode_GET_SIZE(input);
745 while (i < end) {
746 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
747 unsigned char combining = record->combining;
748 unsigned char quickcheck = record->normalization_quick_check;
749
750 if (quickcheck & quickcheck_mask)
751 return 0; /* this string might need normalization */
752 if (combining && prev_combining > combining)
753 return 0; /* non-canonical sort order, not normalized */
754 prev_combining = combining;
755 }
756 return 1; /* certainly normalized */
757}
758
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000759PyDoc_STRVAR(unicodedata_normalize__doc__,
760"normalize(form, unistr)\n\
761\n\
762Return the normal form 'form' for the Unicode string unistr. Valid\n\
763values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
764
Martin v. Löwis677bde22002-11-23 22:08:15 +0000765static PyObject*
766unicodedata_normalize(PyObject *self, PyObject *args)
767{
768 char *form;
769 PyObject *input;
770
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000771 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000772 &form, &PyUnicode_Type, &input))
773 return NULL;
774
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000775 if (PyUnicode_GetSize(input) == 0) {
776 /* Special case empty input strings, since resizing
777 them later would cause internal errors. */
778 Py_INCREF(input);
779 return input;
780 }
781
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000782 if (strcmp(form, "NFC") == 0) {
783 if (is_normalized(self, input, 1, 0)) {
784 Py_INCREF(input);
785 return input;
786 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000787 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000788 }
789 if (strcmp(form, "NFKC") == 0) {
790 if (is_normalized(self, input, 1, 1)) {
791 Py_INCREF(input);
792 return input;
793 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000794 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000795 }
796 if (strcmp(form, "NFD") == 0) {
797 if (is_normalized(self, input, 0, 0)) {
798 Py_INCREF(input);
799 return input;
800 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000801 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000802 }
803 if (strcmp(form, "NFKD") == 0) {
804 if (is_normalized(self, input, 0, 1)) {
805 Py_INCREF(input);
806 return input;
807 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000808 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000809 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000810 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
811 return NULL;
812}
813
Fredrik Lundh06d12682001-01-24 07:59:11 +0000814/* -------------------------------------------------------------------- */
815/* unicode character name tables */
816
817/* data file generated by Tools/unicode/makeunicodedata.py */
818#include "unicodename_db.h"
819
820/* -------------------------------------------------------------------- */
821/* database code (cut and pasted from the unidb package) */
822
/* Hash the first `len` bytes of `s` for the character-name table.

   Bytes are folded to upper case before hashing so that lookups are
   case-insensitive.  The folding is deliberately ASCII-only: the C
   library's toupper() depends on the current locale and could produce a
   hash that disagrees with the pre-generated table.  `scale` is the
   multiplier applied at each step; the result always fits in 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        unsigned char ch = (unsigned char) s[i];
        if (ch >= 'a' && ch <= 'z')
            ch = (unsigned char) (ch - 'a' + 'A');
        h = (h * scale) + ch;
        /* Fold anything that spilled into the top byte back into the
           low 24 bits. */
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
837
/* Short names of the Hangul jamo used to build syllable names.
   Each row holds { leading consonant, vowel, trailing consonant }; the
   columns have different lengths, so rows past the end of a column hold
   NULL in that slot.  The empty string is a real entry (it matches
   nothing-at-all), not a terminator. */
static char *hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
868
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000869/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000870static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000871is_unified_ideograph(Py_UCS4 code)
872{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000873 return
874 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
875 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
876 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
877 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
878 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000879}
880
881static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000882_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000883{
884 int offset;
885 int i;
886 int word;
887 unsigned char* w;
888
Martin v. Löwisc3509122006-03-11 12:16:23 +0000889 if (code >= 0x110000)
890 return 0;
891
Martin v. Löwis1a214512008-06-11 05:26:20 +0000892 if (self && UCD_Check(self)) {
Martin v. Löwisc3509122006-03-11 12:16:23 +0000893 const change_record *old = get_old_record(self, code);
894 if (old->category_changed == 0) {
895 /* unassigned */
896 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000897 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000898 }
899
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000900 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000901 /* Hangul syllable. */
902 int SIndex = code - SBase;
903 int L = SIndex / NCount;
904 int V = (SIndex % NCount) / TCount;
905 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000906
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000907 if (buflen < 27)
908 /* Worst case: HANGUL SYLLABLE <10chars>. */
909 return 0;
910 strcpy(buffer, "HANGUL SYLLABLE ");
911 buffer += 16;
912 strcpy(buffer, hangul_syllables[L][0]);
913 buffer += strlen(hangul_syllables[L][0]);
914 strcpy(buffer, hangul_syllables[V][1]);
915 buffer += strlen(hangul_syllables[V][1]);
916 strcpy(buffer, hangul_syllables[T][2]);
917 buffer += strlen(hangul_syllables[T][2]);
918 *buffer = '\0';
919 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000920 }
921
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000922 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000923 if (buflen < 28)
924 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
925 return 0;
926 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
927 return 1;
928 }
929
Fredrik Lundh06d12682001-01-24 07:59:11 +0000930 /* get offset into phrasebook */
931 offset = phrasebook_offset1[(code>>phrasebook_shift)];
932 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
933 (code&((1<<phrasebook_shift)-1))];
934 if (!offset)
935 return 0;
936
937 i = 0;
938
939 for (;;) {
940 /* get word index */
941 word = phrasebook[offset] - phrasebook_short;
942 if (word >= 0) {
943 word = (word << 8) + phrasebook[offset+1];
944 offset += 2;
945 } else
946 word = phrasebook[offset++];
947 if (i) {
948 if (i > buflen)
949 return 0; /* buffer overflow */
950 buffer[i++] = ' ';
951 }
952 /* copy word string from lexicon. the last character in the
953 word has bit 7 set. the last word in a string ends with
954 0x80 */
955 w = lexicon + lexicon_offset[word];
956 while (*w < 128) {
957 if (i >= buflen)
958 return 0; /* buffer overflow */
959 buffer[i++] = *w++;
960 }
961 if (i >= buflen)
962 return 0; /* buffer overflow */
963 buffer[i++] = *w & 127;
964 if (*w == 128)
965 break; /* end of word */
966 }
967
968 return 1;
969}
970
971static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000972_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000973{
974 /* check if code corresponds to the given name */
975 int i;
976 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000977 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000978 return 0;
979 for (i = 0; i < namelen; i++) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000980 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000981 return 0;
982 }
983 return buffer[namelen] == '\0';
984}
985
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000986static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000987find_syllable(const char *str, int *len, int *pos, int count, int column)
988{
989 int i, len1;
990 *len = -1;
991 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000992 char *s = hangul_syllables[i][column];
993 len1 = strlen(s);
994 if (len1 <= *len)
995 continue;
996 if (strncmp(str, s, len1) == 0) {
997 *len = len1;
998 *pos = i;
999 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001000 }
1001 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001002 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001003 }
1004}
1005
Fredrik Lundh06d12682001-01-24 07:59:11 +00001006static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001007_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001008{
1009 unsigned int h, v;
1010 unsigned int mask = code_size-1;
1011 unsigned int i, incr;
1012
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001013 /* Check for hangul syllables. */
1014 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001015 int len, L = -1, V = -1, T = -1;
1016 const char *pos = name + 16;
1017 find_syllable(pos, &len, &L, LCount, 0);
1018 pos += len;
1019 find_syllable(pos, &len, &V, VCount, 1);
1020 pos += len;
1021 find_syllable(pos, &len, &T, TCount, 2);
1022 pos += len;
1023 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1024 *code = SBase + (L*VCount+V)*TCount + T;
1025 return 1;
1026 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001027 /* Otherwise, it's an illegal syllable name. */
1028 return 0;
1029 }
1030
1031 /* Check for unified ideographs. */
1032 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1033 /* Four or five hexdigits must follow. */
1034 v = 0;
1035 name += 22;
1036 namelen -= 22;
1037 if (namelen != 4 && namelen != 5)
1038 return 0;
1039 while (namelen--) {
1040 v *= 16;
1041 if (*name >= '0' && *name <= '9')
1042 v += *name - '0';
1043 else if (*name >= 'A' && *name <= 'F')
1044 v += *name - 'A' + 10;
1045 else
1046 return 0;
1047 name++;
1048 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001049 if (!is_unified_ideograph(v))
1050 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001051 *code = v;
1052 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001053 }
1054
Fredrik Lundh06d12682001-01-24 07:59:11 +00001055 /* the following is the same as python's dictionary lookup, with
1056 only minor changes. see the makeunicodedata script for more
1057 details */
1058
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001059 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001060 i = (~h) & mask;
1061 v = code_hash[i];
1062 if (!v)
1063 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001064 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001065 *code = v;
1066 return 1;
1067 }
1068 incr = (h ^ (h >> 3)) & mask;
1069 if (!incr)
1070 incr = mask;
1071 for (;;) {
1072 i = (i + incr) & mask;
1073 v = code_hash[i];
1074 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001075 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001076 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001077 *code = v;
1078 return 1;
1079 }
1080 incr = incr << 1;
1081 if (incr > mask)
1082 incr = incr ^ code_poly;
1083 }
1084}
1085
/* Name lookup C API table.  PyInit_unicodedata wraps a pointer to this
   structure in a capsule and publishes it as "ucnhash_CAPI".  The first
   member records the structure size; the other two are the name-from-code
   and code-from-name entry points. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1092
1093/* -------------------------------------------------------------------- */
1094/* Python bindings */
1095
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001096PyDoc_STRVAR(unicodedata_name__doc__,
1097"name(unichr[, default])\n\
1098Returns the name assigned to the Unicode character unichr as a\n\
1099string. If no name is defined, default is returned, or, if not\n\
1100given, ValueError is raised.");
1101
Fredrik Lundh06d12682001-01-24 07:59:11 +00001102static PyObject *
1103unicodedata_name(PyObject* self, PyObject* args)
1104{
1105 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001106 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001107
1108 PyUnicodeObject* v;
1109 PyObject* defobj = NULL;
1110 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1111 return NULL;
1112
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001113 c = getuchar(v);
1114 if (c == (Py_UCS4)-1)
1115 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001116
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001117 if (!_getucname(self, c, name, sizeof(name))) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001118 if (defobj == NULL) {
1119 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001120 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 }
1122 else {
1123 Py_INCREF(defobj);
1124 return defobj;
1125 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001126 }
1127
Walter Dörwald4254e762007-06-05 16:04:09 +00001128 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001129}
1130
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001131PyDoc_STRVAR(unicodedata_lookup__doc__,
1132"lookup(name)\n\
1133\n\
1134Look up character by name. If a character with the\n\
1135given name is found, return the corresponding Unicode\n\
1136character. If not found, KeyError is raised.");
1137
Fredrik Lundh06d12682001-01-24 07:59:11 +00001138static PyObject *
1139unicodedata_lookup(PyObject* self, PyObject* args)
1140{
1141 Py_UCS4 code;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001142
1143 char* name;
1144 int namelen;
1145 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1146 return NULL;
1147
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001148 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001149 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1150 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001151 return NULL;
1152 }
1153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154 return PyUnicode_FromOrdinal(code);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001155}
1156
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001157/* XXX Add doc strings. */
1158
/* Method table.  It is used twice: as the module-level function list in
   unicodedatamodule and as tp_methods of UCD_Type, so every entry must
   cope with `self` being either the module or a UCD instance.  Each entry
   pairs the Python-visible name with its C implementation, the calling
   convention and the docstring. */
static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
1181
/* Type object for unicodedata.UCD.  Instances have the size of
   PreviousDBVersion and expose the same methods as the module itself
   (unicodedata_functions) plus the members listed in DB_members.
   tp_new is left empty in this table.  The slots are positional, so their
   order must not be changed. */
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_reserved*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,/*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001227
/* Module docstring, referenced by the module definition below. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.2.0 (see\n\
http://www.unicode.org/reports/tr44/tr44-4.html).");


/* Module definition handed to PyModule_Create() in PyInit_unicodedata.
   The initializer is positional: module name, docstring, the value -1,
   the method table, and four unused (NULL) trailing slots. */
static struct PyModuleDef unicodedatamodule = {
        PyModuleDef_HEAD_INIT,
        "unicodedata",
        unicodedata_docstring,
        -1,
        unicodedata_functions,
        NULL,
        NULL,
        NULL,
        NULL
};
1250
Mark Hammond62b1ab12002-07-23 06:31:15 +00001251PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001252PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001253{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001254 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001255
Christian Heimes90aa7642007-12-19 02:45:37 +00001256 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001257
Martin v. Löwis1a214512008-06-11 05:26:20 +00001258 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001259 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001260 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001261
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001262 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001263 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001264 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001265
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001266 /* Previous versions */
1267 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1268 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001269 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001270
Fredrik Lundh06d12682001-01-24 07:59:11 +00001271 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001272 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001273 if (v != NULL)
1274 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001275 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001276}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001279Local variables:
1280c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001281indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001282End:
1283*/