blob: bd96e3643f92dc3b4064be2d630ebf893b753457 [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.2 data base.

   Data was extracted from the Unicode 5.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* Property record shared by all code points that have the same set of
   properties.  _getrecord_ex() maps a code point to one of these records
   in _PyUnicode_Database_Records.  Keep the field order as is: the table
   itself lives in the generated unicodedata_db.h. */
typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
32
/* Difference between the current database and an older database version
   for a single code point.  In the unsigned char fields the value 0xFF is
   treated by the accessors in this file as "unchanged"; category_changed
   == 0 is treated as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41
/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000046_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000049 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000059/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
67#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 {NULL}
72};
73
Thomas Wouters89f507f2006-12-13 04:49:30 +000074/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000075static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000076#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000077
78static PyObject*
79new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
80 Py_UCS4 (*normalization)(Py_UCS4))
81{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 PreviousDBVersion *self;
83 self = PyObject_New(PreviousDBVersion, &UCD_Type);
84 if (self == NULL)
85 return NULL;
86 self->name = name;
87 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090}
91
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000092
93static Py_UCS4 getuchar(PyUnicodeObject *obj)
94{
95 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
96
97 if (PyUnicode_GET_SIZE(obj) == 1)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000098 return *v;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000099#ifndef Py_UNICODE_WIDE
100 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
101 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
102 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000104#endif
105 PyErr_SetString(PyExc_TypeError,
106 "need a single Unicode character as parameter");
107 return (Py_UCS4)-1;
108}
109
/* --- Module API --------------------------------------------------------- */
111
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000112PyDoc_STRVAR(unicodedata_decimal__doc__,
113"decimal(unichr[, default])\n\
114\n\
115Returns the decimal value assigned to the Unicode character unichr\n\
116as integer. If no such value is defined, default is returned, or, if\n\
117not given, ValueError is raised.");
118
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000120unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000121{
122 PyUnicodeObject *v;
123 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000124 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000126 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000127
Fredrik Lundh06d12682001-01-24 07:59:11 +0000128 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000130 c = getuchar(v);
131 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000132 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133
Martin v. Löwis1a214512008-06-11 05:26:20 +0000134 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000135 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000136 if (old->category_changed == 0) {
137 /* unassigned */
138 have_old = 1;
139 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000140 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000141 else if (old->decimal_changed != 0xFF) {
142 have_old = 1;
143 rc = old->decimal_changed;
144 }
145 }
146
147 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000148 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000149 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 if (defobj == NULL) {
151 PyErr_SetString(PyExc_ValueError,
152 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000153 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 }
155 else {
156 Py_INCREF(defobj);
157 return defobj;
158 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000160 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000161}
162
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000163PyDoc_STRVAR(unicodedata_digit__doc__,
164"digit(unichr[, default])\n\
165\n\
166Returns the digit value assigned to the Unicode character unichr as\n\
167integer. If no such value is defined, default is returned, or, if\n\
168not given, ValueError is raised.");
169
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000171unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000172{
173 PyUnicodeObject *v;
174 PyObject *defobj = NULL;
175 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000176 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000177
Fredrik Lundh06d12682001-01-24 07:59:11 +0000178 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 c = getuchar(v);
181 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000182 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000183 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000184 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 if (defobj == NULL) {
186 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000187 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000188 }
189 else {
190 Py_INCREF(defobj);
191 return defobj;
192 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000194 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000195}
196
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000197PyDoc_STRVAR(unicodedata_numeric__doc__,
198"numeric(unichr[, default])\n\
199\n\
200Returns the numeric value assigned to the Unicode character unichr\n\
201as float. If no such value is defined, default is returned, or, if\n\
202not given, ValueError is raised.");
203
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000205unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000206{
207 PyUnicodeObject *v;
208 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000209 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000211 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000212
Fredrik Lundh06d12682001-01-24 07:59:11 +0000213 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000214 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000215 c = getuchar(v);
216 if (c == (Py_UCS4)-1)
217 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218
Martin v. Löwis1a214512008-06-11 05:26:20 +0000219 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000220 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000221 if (old->category_changed == 0) {
222 /* unassigned */
223 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000225 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000226 else if (old->decimal_changed != 0xFF) {
227 have_old = 1;
228 rc = old->decimal_changed;
229 }
230 }
231
232 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000233 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 if (defobj == NULL) {
236 PyErr_SetString(PyExc_ValueError, "not a numeric character");
237 return NULL;
238 }
239 else {
240 Py_INCREF(defobj);
241 return defobj;
242 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243 }
244 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000245}
246
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000247PyDoc_STRVAR(unicodedata_category__doc__,
248"category(unichr)\n\
249\n\
250Returns the general category assigned to the Unicode character\n\
251unichr as string.");
252
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000254unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000255{
256 PyUnicodeObject *v;
257 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000258 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000259
260 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 &PyUnicode_Type, &v))
262 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000263 c = getuchar(v);
264 if (c == (Py_UCS4)-1)
265 return NULL;
266 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000267 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000268 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000269 if (old->category_changed != 0xFF)
270 index = old->category_changed;
271 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000272 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000273}
274
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000275PyDoc_STRVAR(unicodedata_bidirectional__doc__,
276"bidirectional(unichr)\n\
277\n\
278Returns the bidirectional category assigned to the Unicode character\n\
279unichr as string. If no such value is defined, an empty string is\n\
280returned.");
281
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000284{
285 PyUnicodeObject *v;
286 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000287 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000288
289 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000290 &PyUnicode_Type, &v))
291 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000292 c = getuchar(v);
293 if (c == (Py_UCS4)-1)
294 return NULL;
295 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000296 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000297 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000298 if (old->category_changed == 0)
299 index = 0; /* unassigned */
300 else if (old->bidir_changed != 0xFF)
301 index = old->bidir_changed;
302 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000303 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000304}
305
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000306PyDoc_STRVAR(unicodedata_combining__doc__,
307"combining(unichr)\n\
308\n\
309Returns the canonical combining class assigned to the Unicode\n\
310character unichr as integer. Returns 0 if no combining class is\n\
311defined.");
312
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000314unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000315{
316 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000317 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000318 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000319
320 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 &PyUnicode_Type, &v))
322 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000323 c = getuchar(v);
324 if (c == (Py_UCS4)-1)
325 return NULL;
326 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000327 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000328 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000329 if (old->category_changed == 0)
330 index = 0; /* unassigned */
331 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000332 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000333}
334
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000335PyDoc_STRVAR(unicodedata_mirrored__doc__,
336"mirrored(unichr)\n\
337\n\
338Returns the mirrored property assigned to the Unicode character\n\
339unichr as integer. Returns 1 if the character has been identified as\n\
340a \"mirrored\" character in bidirectional text, 0 otherwise.");
341
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000343unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000344{
345 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000346 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000347 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000348
349 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 &PyUnicode_Type, &v))
351 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000352 c = getuchar(v);
353 if (c == (Py_UCS4)-1)
354 return NULL;
355 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000356 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000357 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000358 if (old->category_changed == 0)
359 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000360 else if (old->mirrored_changed != 0xFF)
361 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000362 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000363 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000364}
365
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000366PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
367"east_asian_width(unichr)\n\
368\n\
369Returns the east asian width assigned to the Unicode character\n\
370unichr as string.");
371
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000372static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000373unicodedata_east_asian_width(PyObject *self, PyObject *args)
374{
375 PyUnicodeObject *v;
376 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000377 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000378
379 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000380 &PyUnicode_Type, &v))
381 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000382 c = getuchar(v);
383 if (c == (Py_UCS4)-1)
384 return NULL;
385 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000386 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000387 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000388 if (old->category_changed == 0)
389 index = 0; /* unassigned */
390 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000391 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000392}
393
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000394PyDoc_STRVAR(unicodedata_decomposition__doc__,
395"decomposition(unichr)\n\
396\n\
397Returns the character decomposition mapping assigned to the Unicode\n\
398character unichr as string. An empty string is returned in case no\n\
399such mapping is defined.");
400
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000401static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000402unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000403{
404 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000405 char decomp[256];
406 int code, index, count, i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000407 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000408 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000409
410 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000411 &PyUnicode_Type, &v))
412 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000413 c = getuchar(v);
414 if (c == (Py_UCS4)-1)
415 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000416
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000417 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000418
Martin v. Löwis1a214512008-06-11 05:26:20 +0000419 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000420 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000421 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000422 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000423 }
424
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000425 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000426 index = 0;
427 else {
428 index = decomp_index1[(code>>DECOMP_SHIFT)];
429 index = decomp_index2[(index<<DECOMP_SHIFT)+
430 (code&((1<<DECOMP_SHIFT)-1))];
431 }
432
Tim Peters69b83b12001-11-30 07:23:05 +0000433 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000434 is prefix code (from*/
435 count = decomp_data[index] >> 8;
436
437 /* XXX: could allocate the PyString up front instead
438 (strlen(prefix) + 5 * count + 1 bytes) */
439
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000440 /* Based on how index is calculated above and decomp_data is generated
441 from Tools/unicode/makeunicodedata.py, it should not be possible
442 to overflow decomp_prefix. */
443 prefix_index = decomp_data[index] & 255;
444 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
445
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000446 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000447 i = strlen(decomp_prefix[prefix_index]);
448 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000449
450 while (count-- > 0) {
451 if (i)
452 decomp[i++] = ' ';
Tim Peters69b83b12001-11-30 07:23:05 +0000453 assert((size_t)i < sizeof(decomp));
454 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
455 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000456 i += strlen(decomp + i);
457 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000458
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000459 decomp[i] = '\0';
460
Walter Dörwald4254e762007-06-05 16:04:09 +0000461 return PyUnicode_FromString(decomp);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000462}
463
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000464static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000465get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000466{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000467 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000468 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000470 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000471 /* unassigned in old version */
472 *index = 0;
473 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000474 else {
475 *index = decomp_index1[(code>>DECOMP_SHIFT)];
476 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
477 (code&((1<<DECOMP_SHIFT)-1))];
478 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479
Martin v. Löwis677bde22002-11-23 22:08:15 +0000480 /* high byte is number of hex bytes (usually one or two), low byte
481 is prefix code (from*/
482 *count = decomp_data[*index] >> 8;
483 *prefix = decomp_data[*index] & 255;
484
485 (*index)++;
486}
487
/* Constants for algorithmic Hangul syllable (de)composition: first
   precomposed syllable (SBase), bases of the leading consonant, vowel and
   trailing consonant jamo ranges (LBase, VBase, TBase) and the sizes of
   those ranges.  TBase itself stands for "no trailing consonant". */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
497
498static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000499nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000500{
501 PyObject *result;
502 Py_UNICODE *i, *end, *o;
503 /* Longest decomposition in Unicode 3.2: U+FDFA */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000504 Py_UNICODE stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000505 Py_ssize_t space, isize;
506 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000508
Martin v. Löwis677bde22002-11-23 22:08:15 +0000509 stackptr = 0;
510 isize = PyUnicode_GET_SIZE(input);
511 /* Overallocate atmost 10 characters. */
512 space = (isize > 10 ? 10 : isize) + isize;
513 result = PyUnicode_FromUnicode(NULL, space);
514 if (!result)
515 return NULL;
516 i = PyUnicode_AS_UNICODE(input);
517 end = i + isize;
518 o = PyUnicode_AS_UNICODE(result);
519
520 while (i < end) {
521 stack[stackptr++] = *i++;
522 while(stackptr) {
523 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000524 /* Hangul Decomposition adds three characters in
525 a single step, so we need atleast that much room. */
526 if (space < 3) {
Martin v. Löwis5b222132007-06-10 09:51:05 +0000527 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000528 space += 10;
529 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000530 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000531 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000532 }
533 /* Hangul Decomposition. */
534 if (SBase <= code && code < (SBase+SCount)) {
535 int SIndex = code - SBase;
536 int L = LBase + SIndex / NCount;
537 int V = VBase + (SIndex % NCount) / TCount;
538 int T = TBase + SIndex % TCount;
539 *o++ = L;
540 *o++ = V;
541 space -= 2;
542 if (T != TBase) {
543 *o++ = T;
544 space --;
545 }
546 continue;
547 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000548 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000549 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000550 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
551 if (value != 0) {
552 stack[stackptr++] = value;
553 continue;
554 }
555 }
556
557 /* Other decompositions. */
558 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000559
560 /* Copy character if it is not decomposable, or has a
561 compatibility decomposition, but we do NFD. */
562 if (!count || (prefix && !k)) {
563 *o++ = code;
564 space--;
565 continue;
566 }
567 /* Copy decomposition onto the stack, in reverse
568 order. */
569 while(count) {
570 code = decomp_data[index + (--count)];
571 stack[stackptr++] = code;
572 }
573 }
574 }
575
576 /* Drop overallocation. Cannot fail. */
577 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
578
579 /* Sort canonically. */
580 i = PyUnicode_AS_UNICODE(result);
581 prev = _getrecord_ex(*i)->combining;
582 end = i + PyUnicode_GET_SIZE(result);
583 for (i++; i < end; i++) {
584 cur = _getrecord_ex(*i)->combining;
585 if (prev == 0 || cur == 0 || prev <= cur) {
586 prev = cur;
587 continue;
588 }
589 /* Non-canonical order. Need to switch *i with previous. */
590 o = i - 1;
591 while (1) {
592 Py_UNICODE tmp = o[1];
593 o[1] = o[0];
594 o[0] = tmp;
595 o--;
596 if (o < PyUnicode_AS_UNICODE(result))
597 break;
598 prev = _getrecord_ex(*o)->combining;
599 if (prev == 0 || prev <= cur)
600 break;
601 }
602 prev = _getrecord_ex(*i)->combining;
603 }
604 return result;
605}
606
607static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000608find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000609{
610 int index;
611 for (index = 0; nfc[index].start; index++) {
612 int start = nfc[index].start;
613 if (code < start)
614 return -1;
615 if (code <= start + nfc[index].count) {
616 int delta = code - start;
617 return nfc[index].index + delta;
618 }
619 }
620 return -1;
621}
622
623static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000624nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000625{
626 PyObject *result;
627 Py_UNICODE *i, *i1, *o, *end;
628 int f,l,index,index1,comb;
629 Py_UNICODE code;
630 Py_UNICODE *skipped[20];
631 int cskipped = 0;
632
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000633 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000634 if (!result)
635 return NULL;
636
637 /* We are going to modify result in-place.
638 If nfd_nfkd is changed to sometimes return the input,
639 this code needs to be reviewed. */
640 assert(result != input);
641
642 i = PyUnicode_AS_UNICODE(result);
643 end = i + PyUnicode_GET_SIZE(result);
644 o = PyUnicode_AS_UNICODE(result);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645
Martin v. Löwis677bde22002-11-23 22:08:15 +0000646 again:
647 while (i < end) {
648 for (index = 0; index < cskipped; index++) {
649 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000650 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000651 Remove from list. */
652 skipped[index] = skipped[cskipped-1];
653 cskipped--;
654 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000655 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000656 }
657 }
658 /* Hangul Composition. We don't need to check for <LV,T>
659 pairs, since we always have decomposed data. */
660 if (LBase <= *i && *i < (LBase+LCount) &&
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000661 i + 1 < end &&
Martin v. Löwis677bde22002-11-23 22:08:15 +0000662 VBase <= i[1] && i[1] <= (VBase+VCount)) {
663 int LIndex, VIndex;
664 LIndex = i[0] - LBase;
665 VIndex = i[1] - VBase;
666 code = SBase + (LIndex*VCount+VIndex)*TCount;
667 i+=2;
668 if (i < end &&
669 TBase <= *i && *i <= (TBase+TCount)) {
670 code += *i-TBase;
671 i++;
672 }
673 *o++ = code;
674 continue;
675 }
676
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000677 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000678 if (f == -1) {
679 *o++ = *i++;
680 continue;
681 }
682 /* Find next unblocked character. */
683 i1 = i+1;
684 comb = 0;
685 while (i1 < end) {
686 int comb1 = _getrecord_ex(*i1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000687 if (comb) {
688 if (comb1 == 0)
689 break;
690 if (comb >= comb1) {
691 /* Character is blocked. */
692 i1++;
693 continue;
694 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000695 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000696 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000697 /* *i1 cannot be combined with *i. If *i1
698 is a starter, we don't need to look further.
699 Otherwise, record the combining class. */
700 if (l == -1) {
701 not_combinable:
702 if (comb1 == 0)
703 break;
704 comb = comb1;
705 i1++;
706 continue;
707 }
708 index = f*TOTAL_LAST + l;
709 index1 = comp_index[index >> COMP_SHIFT];
710 code = comp_data[(index1<<COMP_SHIFT)+
711 (index&((1<<COMP_SHIFT)-1))];
712 if (code == 0)
713 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000714
Martin v. Löwis677bde22002-11-23 22:08:15 +0000715 /* Replace the original character. */
716 *i = code;
717 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000718 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000719 skipped[cskipped++] = i1;
720 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000721 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000722 if (f == -1)
723 break;
724 }
725 *o++ = *i++;
726 }
727 if (o != end)
728 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
729 return result;
730}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000731
732/* Return 1 if the input is certainly normalized, 0 if it might not be. */
733static int
734is_normalized(PyObject *self, PyObject *input, int nfc, int k)
735{
736 Py_UNICODE *i, *end;
737 unsigned char prev_combining = 0, quickcheck_mask;
738
739 /* An older version of the database is requested, quickchecks must be
740 disabled. */
741 if (self && UCD_Check(self))
742 return 0;
743
744 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
745 as described in http://unicode.org/reports/tr15/#Annex8. */
746 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
747
748 i = PyUnicode_AS_UNICODE(input);
749 end = i + PyUnicode_GET_SIZE(input);
750 while (i < end) {
751 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
752 unsigned char combining = record->combining;
753 unsigned char quickcheck = record->normalization_quick_check;
754
755 if (quickcheck & quickcheck_mask)
756 return 0; /* this string might need normalization */
757 if (combining && prev_combining > combining)
758 return 0; /* non-canonical sort order, not normalized */
759 prev_combining = combining;
760 }
761 return 1; /* certainly normalized */
762}
763
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000764PyDoc_STRVAR(unicodedata_normalize__doc__,
765"normalize(form, unistr)\n\
766\n\
767Return the normal form 'form' for the Unicode string unistr. Valid\n\
768values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
769
Martin v. Löwis677bde22002-11-23 22:08:15 +0000770static PyObject*
771unicodedata_normalize(PyObject *self, PyObject *args)
772{
773 char *form;
774 PyObject *input;
775
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000776 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000777 &form, &PyUnicode_Type, &input))
778 return NULL;
779
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000780 if (PyUnicode_GetSize(input) == 0) {
781 /* Special case empty input strings, since resizing
782 them later would cause internal errors. */
783 Py_INCREF(input);
784 return input;
785 }
786
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000787 if (strcmp(form, "NFC") == 0) {
788 if (is_normalized(self, input, 1, 0)) {
789 Py_INCREF(input);
790 return input;
791 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000792 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000793 }
794 if (strcmp(form, "NFKC") == 0) {
795 if (is_normalized(self, input, 1, 1)) {
796 Py_INCREF(input);
797 return input;
798 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000799 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000800 }
801 if (strcmp(form, "NFD") == 0) {
802 if (is_normalized(self, input, 0, 0)) {
803 Py_INCREF(input);
804 return input;
805 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000806 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000807 }
808 if (strcmp(form, "NFKD") == 0) {
809 if (is_normalized(self, input, 0, 1)) {
810 Py_INCREF(input);
811 return input;
812 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000813 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000814 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000815 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
816 return NULL;
817}
818
Fredrik Lundh06d12682001-01-24 07:59:11 +0000819/* -------------------------------------------------------------------- */
820/* unicode character name tables */
821
822/* data file generated by Tools/unicode/makeunicodedata.py */
823#include "unicodename_db.h"
824
825/* -------------------------------------------------------------------- */
826/* database code (cut and pasted from the unidb package) */
827
/* Case-insensitive hash of the first `len` bytes of `s`.

   `scale` is the multiplier applied per character; after every step the
   bits above the low 24 are folded back in, so the result always fits in
   24 bits.  _getcode() uses the result to probe the code_hash table.

   Letters are folded to upper case with an ASCII-only rule instead of
   toupper(): toupper() follows the current C locale, and in some locales
   (e.g. Turkish) it does not map 'i' to 'I', which would make the hash of
   a lower-case name disagree with the hash of its upper-case spelling. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        unsigned char ch = (unsigned char) s[i];
        if (ch >= 'a' && ch <= 'z')
            ch = (unsigned char) (ch - 'a' + 'A');
        h = (h * scale) + ch;
        /* fold any overflow beyond 24 bits back into the low byte */
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
842
/* Name fragments used to build and parse Hangul syllable names.

   Column 0 is indexed by the L part, column 1 by the V part and column 2
   by the T part of a syllable (see _getucname() and _getcode()).  The
   columns have different lengths; the shorter ones are padded with NULL
   and those padding entries are never looked at. */
static char *hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
873
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000874/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000875static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000876is_unified_ideograph(Py_UCS4 code)
877{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000878 return
879 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
880 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
881 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
882 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
883 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000884}
885
886static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000887_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000888{
889 int offset;
890 int i;
891 int word;
892 unsigned char* w;
893
Martin v. Löwisc3509122006-03-11 12:16:23 +0000894 if (code >= 0x110000)
895 return 0;
896
Martin v. Löwis1a214512008-06-11 05:26:20 +0000897 if (self && UCD_Check(self)) {
Martin v. Löwisc3509122006-03-11 12:16:23 +0000898 const change_record *old = get_old_record(self, code);
899 if (old->category_changed == 0) {
900 /* unassigned */
901 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000902 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000903 }
904
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000905 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000906 /* Hangul syllable. */
907 int SIndex = code - SBase;
908 int L = SIndex / NCount;
909 int V = (SIndex % NCount) / TCount;
910 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000911
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000912 if (buflen < 27)
913 /* Worst case: HANGUL SYLLABLE <10chars>. */
914 return 0;
915 strcpy(buffer, "HANGUL SYLLABLE ");
916 buffer += 16;
917 strcpy(buffer, hangul_syllables[L][0]);
918 buffer += strlen(hangul_syllables[L][0]);
919 strcpy(buffer, hangul_syllables[V][1]);
920 buffer += strlen(hangul_syllables[V][1]);
921 strcpy(buffer, hangul_syllables[T][2]);
922 buffer += strlen(hangul_syllables[T][2]);
923 *buffer = '\0';
924 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000925 }
926
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000927 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000928 if (buflen < 28)
929 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
930 return 0;
931 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
932 return 1;
933 }
934
Fredrik Lundh06d12682001-01-24 07:59:11 +0000935 /* get offset into phrasebook */
936 offset = phrasebook_offset1[(code>>phrasebook_shift)];
937 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
938 (code&((1<<phrasebook_shift)-1))];
939 if (!offset)
940 return 0;
941
942 i = 0;
943
944 for (;;) {
945 /* get word index */
946 word = phrasebook[offset] - phrasebook_short;
947 if (word >= 0) {
948 word = (word << 8) + phrasebook[offset+1];
949 offset += 2;
950 } else
951 word = phrasebook[offset++];
952 if (i) {
953 if (i > buflen)
954 return 0; /* buffer overflow */
955 buffer[i++] = ' ';
956 }
957 /* copy word string from lexicon. the last character in the
958 word has bit 7 set. the last word in a string ends with
959 0x80 */
960 w = lexicon + lexicon_offset[word];
961 while (*w < 128) {
962 if (i >= buflen)
963 return 0; /* buffer overflow */
964 buffer[i++] = *w++;
965 }
966 if (i >= buflen)
967 return 0; /* buffer overflow */
968 buffer[i++] = *w & 127;
969 if (*w == 128)
970 break; /* end of word */
971 }
972
973 return 1;
974}
975
976static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000977_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000978{
979 /* check if code corresponds to the given name */
980 int i;
981 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000982 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000983 return 0;
984 for (i = 0; i < namelen; i++) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000985 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000986 return 0;
987 }
988 return buffer[namelen] == '\0';
989}
990
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000991static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000992find_syllable(const char *str, int *len, int *pos, int count, int column)
993{
994 int i, len1;
995 *len = -1;
996 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000997 char *s = hangul_syllables[i][column];
998 len1 = strlen(s);
999 if (len1 <= *len)
1000 continue;
1001 if (strncmp(str, s, len1) == 0) {
1002 *len = len1;
1003 *pos = i;
1004 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001005 }
1006 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001007 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001008 }
1009}
1010
Fredrik Lundh06d12682001-01-24 07:59:11 +00001011static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001012_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001013{
1014 unsigned int h, v;
1015 unsigned int mask = code_size-1;
1016 unsigned int i, incr;
1017
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001018 /* Check for hangul syllables. */
1019 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001020 int len, L = -1, V = -1, T = -1;
1021 const char *pos = name + 16;
1022 find_syllable(pos, &len, &L, LCount, 0);
1023 pos += len;
1024 find_syllable(pos, &len, &V, VCount, 1);
1025 pos += len;
1026 find_syllable(pos, &len, &T, TCount, 2);
1027 pos += len;
1028 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1029 *code = SBase + (L*VCount+V)*TCount + T;
1030 return 1;
1031 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001032 /* Otherwise, it's an illegal syllable name. */
1033 return 0;
1034 }
1035
1036 /* Check for unified ideographs. */
1037 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1038 /* Four or five hexdigits must follow. */
1039 v = 0;
1040 name += 22;
1041 namelen -= 22;
1042 if (namelen != 4 && namelen != 5)
1043 return 0;
1044 while (namelen--) {
1045 v *= 16;
1046 if (*name >= '0' && *name <= '9')
1047 v += *name - '0';
1048 else if (*name >= 'A' && *name <= 'F')
1049 v += *name - 'A' + 10;
1050 else
1051 return 0;
1052 name++;
1053 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001054 if (!is_unified_ideograph(v))
1055 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001056 *code = v;
1057 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001058 }
1059
Fredrik Lundh06d12682001-01-24 07:59:11 +00001060 /* the following is the same as python's dictionary lookup, with
1061 only minor changes. see the makeunicodedata script for more
1062 details */
1063
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001064 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001065 i = (~h) & mask;
1066 v = code_hash[i];
1067 if (!v)
1068 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001069 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001070 *code = v;
1071 return 1;
1072 }
1073 incr = (h ^ (h >> 3)) & mask;
1074 if (!incr)
1075 incr = mask;
1076 for (;;) {
1077 i = (i + incr) & mask;
1078 v = code_hash[i];
1079 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001080 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001081 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001082 *code = v;
1083 return 1;
1084 }
1085 incr = incr << 1;
1086 if (incr > mask)
1087 incr = incr ^ code_poly;
1088 }
1089}
1090
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001092{
1093 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001094 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001095 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001096};
1097
1098/* -------------------------------------------------------------------- */
1099/* Python bindings */
1100
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001101PyDoc_STRVAR(unicodedata_name__doc__,
1102"name(unichr[, default])\n\
1103Returns the name assigned to the Unicode character unichr as a\n\
1104string. If no name is defined, default is returned, or, if not\n\
1105given, ValueError is raised.");
1106
Fredrik Lundh06d12682001-01-24 07:59:11 +00001107static PyObject *
1108unicodedata_name(PyObject* self, PyObject* args)
1109{
1110 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001111 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001112
1113 PyUnicodeObject* v;
1114 PyObject* defobj = NULL;
1115 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1116 return NULL;
1117
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001118 c = getuchar(v);
1119 if (c == (Py_UCS4)-1)
1120 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001121
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001122 if (!_getucname(self, c, name, sizeof(name))) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001123 if (defobj == NULL) {
1124 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001125 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 }
1127 else {
1128 Py_INCREF(defobj);
1129 return defobj;
1130 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001131 }
1132
Walter Dörwald4254e762007-06-05 16:04:09 +00001133 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001134}
1135
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001136PyDoc_STRVAR(unicodedata_lookup__doc__,
1137"lookup(name)\n\
1138\n\
1139Look up character by name. If a character with the\n\
1140given name is found, return the corresponding Unicode\n\
1141character. If not found, KeyError is raised.");
1142
Fredrik Lundh06d12682001-01-24 07:59:11 +00001143static PyObject *
1144unicodedata_lookup(PyObject* self, PyObject* args)
1145{
1146 Py_UCS4 code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001147 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001148
1149 char* name;
1150 int namelen;
1151 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1152 return NULL;
1153
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001154 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001155 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1156 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001157 return NULL;
1158 }
1159
Guido van Rossum806c2462007-08-06 23:33:07 +00001160#ifndef Py_UNICODE_WIDE
1161 if (code >= 0x10000) {
1162 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1163 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1164 return PyUnicode_FromUnicode(str, 2);
1165 }
1166#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001167 str[0] = (Py_UNICODE) code;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001169}
1170
/* Method table, used both for the module and for the UCD type. */
1172
1173static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001174 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1175 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1176 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1177 {"category", unicodedata_category, METH_VARARGS,
1178 unicodedata_category__doc__},
1179 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1180 unicodedata_bidirectional__doc__},
1181 {"combining", unicodedata_combining, METH_VARARGS,
1182 unicodedata_combining__doc__},
1183 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1184 unicodedata_mirrored__doc__},
1185 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1186 unicodedata_east_asian_width__doc__},
1187 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1188 unicodedata_decomposition__doc__},
1189 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1190 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1191 {"normalize", unicodedata_normalize, METH_VARARGS,
1192 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001193 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001194};
1195
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001196static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 /* The ob_type field must be initialized in the module init function
1198 * to be portable to Windows without using C++. */
1199 PyVarObject_HEAD_INIT(NULL, 0)
1200 "unicodedata.UCD", /*tp_name*/
1201 sizeof(PreviousDBVersion), /*tp_basicsize*/
1202 0, /*tp_itemsize*/
1203 /* methods */
1204 (destructor)PyObject_Del, /*tp_dealloc*/
1205 0, /*tp_print*/
1206 0, /*tp_getattr*/
1207 0, /*tp_setattr*/
1208 0, /*tp_reserved*/
1209 0, /*tp_repr*/
1210 0, /*tp_as_number*/
1211 0, /*tp_as_sequence*/
1212 0, /*tp_as_mapping*/
1213 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001214 0, /*tp_call*/
1215 0, /*tp_str*/
1216 PyObject_GenericGetAttr,/*tp_getattro*/
1217 0, /*tp_setattro*/
1218 0, /*tp_as_buffer*/
1219 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1220 0, /*tp_doc*/
1221 0, /*tp_traverse*/
1222 0, /*tp_clear*/
1223 0, /*tp_richcompare*/
1224 0, /*tp_weaklistoffset*/
1225 0, /*tp_iter*/
1226 0, /*tp_iternext*/
1227 unicodedata_functions, /*tp_methods*/
1228 DB_members, /*tp_members*/
1229 0, /*tp_getset*/
1230 0, /*tp_base*/
1231 0, /*tp_dict*/
1232 0, /*tp_descr_get*/
1233 0, /*tp_descr_set*/
1234 0, /*tp_dictoffset*/
1235 0, /*tp_init*/
1236 0, /*tp_alloc*/
1237 0, /*tp_new*/
1238 0, /*tp_free*/
1239 0, /*tp_is_gc*/
1240};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001241
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001242PyDoc_STRVAR(unicodedata_docstring,
1243"This module provides access to the Unicode Character Database which\n\
1244defines character properties for all Unicode characters. The data in\n\
1245this database is based on the UnicodeData.txt file version\n\
Ezio Melotti4c5475d2010-03-22 23:16:42 +000012465.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001247\n\
1248The module uses the same names and symbols as defined by the\n\
Ezio Melottid96b2f22010-03-23 00:39:22 +00001249UnicodeData File Format 5.2.0 (see\n\
1250http://www.unicode.org/reports/tr44/tr44-4.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001251
Martin v. Löwis1a214512008-06-11 05:26:20 +00001252
1253static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 PyModuleDef_HEAD_INIT,
1255 "unicodedata",
1256 unicodedata_docstring,
1257 -1,
1258 unicodedata_functions,
1259 NULL,
1260 NULL,
1261 NULL,
1262 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001263};
1264
Mark Hammond62b1ab12002-07-23 06:31:15 +00001265PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001266PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001267{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001268 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001269
Christian Heimes90aa7642007-12-19 02:45:37 +00001270 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001271
Martin v. Löwis1a214512008-06-11 05:26:20 +00001272 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001273 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001274 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001275
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001276 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001277 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001278 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001279
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001280 /* Previous versions */
1281 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1282 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001283 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001284
Fredrik Lundh06d12682001-01-24 07:59:11 +00001285 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001286 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001287 if (v != NULL)
1288 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001289 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001290}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001291
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001293Local variables:
1294c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001295indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001296End:
1297*/