blob: 887056ce5dc1b40b2b281f3f0d2347031cbdccdc [file] [log] [blame]
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001/* ------------------------------------------------------------------------
2
Ezio Melotti4c5475d2010-03-22 23:16:42 +00003 unicodedata -- Provides access to the Unicode 5.2 data base.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00004
Ezio Melotti4c5475d2010-03-22 23:16:42 +00005 Data was extracted from the Unicode 5.2 UnicodeData.txt file.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00006
Fredrik Lundhcfcea492000-09-25 08:07:06 +00007 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Florent Xiclunac934f322010-09-03 23:47:32 +00009 Modified by Martin v. Löwis (martin@v.loewis.de)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000010
Fredrik Lundhcfcea492000-09-25 08:07:06 +000011 Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000012
13 ------------------------------------------------------------------------ */
14
15#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000016#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000017#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000018
19/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000020
/* One row of the per-character property table.  Each field is either a
   small value or an index into one of the name tables. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* quick-check bits; see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Per-character difference between the current database and an older
   Unicode version.  The accessors in this file treat 0xFF in a byte field
   as "no change", and category_changed == 0 as "unassigned in the old
   version". */
typedef struct change_record {
    /* NOTE: the field order must stay the same as in merge_old_version
       (the generator that emits the tables of these records). */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000042/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000046_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000047{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000049 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000050 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000059/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
67#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
68
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000069static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000070 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000071 {NULL}
72};
73
Thomas Wouters89f507f2006-12-13 04:49:30 +000074/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000075static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000076#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000077
78static PyObject*
79new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
80 Py_UCS4 (*normalization)(Py_UCS4))
81{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000082 PreviousDBVersion *self;
83 self = PyObject_New(PreviousDBVersion, &UCD_Type);
84 if (self == NULL)
85 return NULL;
86 self->name = name;
87 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000088 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000090}
91
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000092
93static Py_UCS4 getuchar(PyUnicodeObject *obj)
94{
95 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
96
97 if (PyUnicode_GET_SIZE(obj) == 1)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000098 return *v;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000099#ifndef Py_UNICODE_WIDE
100 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
101 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
102 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000104#endif
105 PyErr_SetString(PyExc_TypeError,
106 "need a single Unicode character as parameter");
107 return (Py_UCS4)-1;
108}
109
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000110/* --- Module API --------------------------------------------------------- */
111
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000112PyDoc_STRVAR(unicodedata_decimal__doc__,
113"decimal(unichr[, default])\n\
114\n\
115Returns the decimal value assigned to the Unicode character unichr\n\
116as integer. If no such value is defined, default is returned, or, if\n\
117not given, ValueError is raised.");
118
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000119static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000120unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000121{
122 PyUnicodeObject *v;
123 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000124 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000125 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000126 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000127
Fredrik Lundh06d12682001-01-24 07:59:11 +0000128 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000129 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000130 c = getuchar(v);
131 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000132 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000133
Martin v. Löwis1a214512008-06-11 05:26:20 +0000134 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000135 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000136 if (old->category_changed == 0) {
137 /* unassigned */
138 have_old = 1;
139 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000140 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000141 else if (old->decimal_changed != 0xFF) {
142 have_old = 1;
143 rc = old->decimal_changed;
144 }
145 }
146
147 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000148 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000149 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 if (defobj == NULL) {
151 PyErr_SetString(PyExc_ValueError,
152 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000153 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000154 }
155 else {
156 Py_INCREF(defobj);
157 return defobj;
158 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000159 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000160 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000161}
162
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000163PyDoc_STRVAR(unicodedata_digit__doc__,
164"digit(unichr[, default])\n\
165\n\
166Returns the digit value assigned to the Unicode character unichr as\n\
167integer. If no such value is defined, default is returned, or, if\n\
168not given, ValueError is raised.");
169
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000170static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000171unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000172{
173 PyUnicodeObject *v;
174 PyObject *defobj = NULL;
175 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000176 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000177
Fredrik Lundh06d12682001-01-24 07:59:11 +0000178 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000179 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000180 c = getuchar(v);
181 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000182 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000183 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000184 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000185 if (defobj == NULL) {
186 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000187 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000188 }
189 else {
190 Py_INCREF(defobj);
191 return defobj;
192 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000193 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000194 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000195}
196
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000197PyDoc_STRVAR(unicodedata_numeric__doc__,
198"numeric(unichr[, default])\n\
199\n\
200Returns the numeric value assigned to the Unicode character unichr\n\
201as float. If no such value is defined, default is returned, or, if\n\
202not given, ValueError is raised.");
203
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000204static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000205unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000206{
207 PyUnicodeObject *v;
208 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000209 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000210 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000211 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000212
Fredrik Lundh06d12682001-01-24 07:59:11 +0000213 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000214 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000215 c = getuchar(v);
216 if (c == (Py_UCS4)-1)
217 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000218
Martin v. Löwis1a214512008-06-11 05:26:20 +0000219 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000220 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000221 if (old->category_changed == 0) {
222 /* unassigned */
223 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000225 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000226 else if (old->decimal_changed != 0xFF) {
227 have_old = 1;
228 rc = old->decimal_changed;
229 }
230 }
231
232 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000233 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 if (defobj == NULL) {
236 PyErr_SetString(PyExc_ValueError, "not a numeric character");
237 return NULL;
238 }
239 else {
240 Py_INCREF(defobj);
241 return defobj;
242 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000243 }
244 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000245}
246
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000247PyDoc_STRVAR(unicodedata_category__doc__,
248"category(unichr)\n\
249\n\
250Returns the general category assigned to the Unicode character\n\
251unichr as string.");
252
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000253static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000254unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000255{
256 PyUnicodeObject *v;
257 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000258 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000259
260 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 &PyUnicode_Type, &v))
262 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000263 c = getuchar(v);
264 if (c == (Py_UCS4)-1)
265 return NULL;
266 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000267 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000268 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000269 if (old->category_changed != 0xFF)
270 index = old->category_changed;
271 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000272 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000273}
274
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000275PyDoc_STRVAR(unicodedata_bidirectional__doc__,
276"bidirectional(unichr)\n\
277\n\
278Returns the bidirectional category assigned to the Unicode character\n\
279unichr as string. If no such value is defined, an empty string is\n\
280returned.");
281
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000282static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000283unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000284{
285 PyUnicodeObject *v;
286 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000287 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000288
289 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000290 &PyUnicode_Type, &v))
291 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000292 c = getuchar(v);
293 if (c == (Py_UCS4)-1)
294 return NULL;
295 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000296 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000297 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000298 if (old->category_changed == 0)
299 index = 0; /* unassigned */
300 else if (old->bidir_changed != 0xFF)
301 index = old->bidir_changed;
302 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000303 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000304}
305
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000306PyDoc_STRVAR(unicodedata_combining__doc__,
307"combining(unichr)\n\
308\n\
309Returns the canonical combining class assigned to the Unicode\n\
310character unichr as integer. Returns 0 if no combining class is\n\
311defined.");
312
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000313static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000314unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000315{
316 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000317 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000318 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000319
320 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 &PyUnicode_Type, &v))
322 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000323 c = getuchar(v);
324 if (c == (Py_UCS4)-1)
325 return NULL;
326 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000327 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000328 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000329 if (old->category_changed == 0)
330 index = 0; /* unassigned */
331 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000332 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000333}
334
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000335PyDoc_STRVAR(unicodedata_mirrored__doc__,
336"mirrored(unichr)\n\
337\n\
338Returns the mirrored property assigned to the Unicode character\n\
339unichr as integer. Returns 1 if the character has been identified as\n\
340a \"mirrored\" character in bidirectional text, 0 otherwise.");
341
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000342static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000343unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000344{
345 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000346 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000347 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000348
349 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 &PyUnicode_Type, &v))
351 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000352 c = getuchar(v);
353 if (c == (Py_UCS4)-1)
354 return NULL;
355 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000356 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000357 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000358 if (old->category_changed == 0)
359 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000360 else if (old->mirrored_changed != 0xFF)
361 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000362 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000363 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000364}
365
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000366PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
367"east_asian_width(unichr)\n\
368\n\
369Returns the east asian width assigned to the Unicode character\n\
370unichr as string.");
371
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000372static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000373unicodedata_east_asian_width(PyObject *self, PyObject *args)
374{
375 PyUnicodeObject *v;
376 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000377 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000378
379 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000380 &PyUnicode_Type, &v))
381 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000382 c = getuchar(v);
383 if (c == (Py_UCS4)-1)
384 return NULL;
385 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000386 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000387 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000388 if (old->category_changed == 0)
389 index = 0; /* unassigned */
390 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000391 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000392}
393
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000394PyDoc_STRVAR(unicodedata_decomposition__doc__,
395"decomposition(unichr)\n\
396\n\
397Returns the character decomposition mapping assigned to the Unicode\n\
398character unichr as string. An empty string is returned in case no\n\
399such mapping is defined.");
400
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000401static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000402unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000403{
404 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000405 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000406 int code, index, count;
407 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000408 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000409 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000410
411 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000412 &PyUnicode_Type, &v))
413 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000414 c = getuchar(v);
415 if (c == (Py_UCS4)-1)
416 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000417
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000418 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000419
Martin v. Löwis1a214512008-06-11 05:26:20 +0000420 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000421 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000422 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000423 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000424 }
425
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000426 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000427 index = 0;
428 else {
429 index = decomp_index1[(code>>DECOMP_SHIFT)];
430 index = decomp_index2[(index<<DECOMP_SHIFT)+
431 (code&((1<<DECOMP_SHIFT)-1))];
432 }
433
Tim Peters69b83b12001-11-30 07:23:05 +0000434 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000435 is prefix code (from*/
436 count = decomp_data[index] >> 8;
437
438 /* XXX: could allocate the PyString up front instead
439 (strlen(prefix) + 5 * count + 1 bytes) */
440
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000441 /* Based on how index is calculated above and decomp_data is generated
442 from Tools/unicode/makeunicodedata.py, it should not be possible
443 to overflow decomp_prefix. */
444 prefix_index = decomp_data[index] & 255;
445 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
446
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000447 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000448 i = strlen(decomp_prefix[prefix_index]);
449 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000450
451 while (count-- > 0) {
452 if (i)
453 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000454 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000455 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
456 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000457 i += strlen(decomp + i);
458 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000459 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000460}
461
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000462static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000463get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000464{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000465 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000466 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000468 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000469 /* unassigned in old version */
470 *index = 0;
471 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000472 else {
473 *index = decomp_index1[(code>>DECOMP_SHIFT)];
474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475 (code&((1<<DECOMP_SHIFT)-1))];
476 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000477
Martin v. Löwis677bde22002-11-23 22:08:15 +0000478 /* high byte is number of hex bytes (usually one or two), low byte
479 is prefix code (from*/
480 *count = decomp_data[*index] >> 8;
481 *prefix = decomp_data[*index] & 255;
482
483 (*index)++;
484}
485
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
495
496static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000497nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000498{
499 PyObject *result;
500 Py_UNICODE *i, *end, *o;
501 /* Longest decomposition in Unicode 3.2: U+FDFA */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000502 Py_UNICODE stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000505 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506
Martin v. Löwis677bde22002-11-23 22:08:15 +0000507 stackptr = 0;
508 isize = PyUnicode_GET_SIZE(input);
509 /* Overallocate atmost 10 characters. */
510 space = (isize > 10 ? 10 : isize) + isize;
511 result = PyUnicode_FromUnicode(NULL, space);
512 if (!result)
513 return NULL;
514 i = PyUnicode_AS_UNICODE(input);
515 end = i + isize;
516 o = PyUnicode_AS_UNICODE(result);
517
518 while (i < end) {
519 stack[stackptr++] = *i++;
520 while(stackptr) {
521 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000522 /* Hangul Decomposition adds three characters in
523 a single step, so we need atleast that much room. */
524 if (space < 3) {
Martin v. Löwis5b222132007-06-10 09:51:05 +0000525 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000526 space += 10;
527 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000528 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000529 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000530 }
531 /* Hangul Decomposition. */
532 if (SBase <= code && code < (SBase+SCount)) {
533 int SIndex = code - SBase;
534 int L = LBase + SIndex / NCount;
535 int V = VBase + (SIndex % NCount) / TCount;
536 int T = TBase + SIndex % TCount;
537 *o++ = L;
538 *o++ = V;
539 space -= 2;
540 if (T != TBase) {
541 *o++ = T;
542 space --;
543 }
544 continue;
545 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000546 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000547 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000548 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
549 if (value != 0) {
550 stack[stackptr++] = value;
551 continue;
552 }
553 }
554
555 /* Other decompositions. */
556 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000557
558 /* Copy character if it is not decomposable, or has a
559 compatibility decomposition, but we do NFD. */
560 if (!count || (prefix && !k)) {
561 *o++ = code;
562 space--;
563 continue;
564 }
565 /* Copy decomposition onto the stack, in reverse
566 order. */
567 while(count) {
568 code = decomp_data[index + (--count)];
569 stack[stackptr++] = code;
570 }
571 }
572 }
573
574 /* Drop overallocation. Cannot fail. */
575 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
576
577 /* Sort canonically. */
578 i = PyUnicode_AS_UNICODE(result);
579 prev = _getrecord_ex(*i)->combining;
580 end = i + PyUnicode_GET_SIZE(result);
581 for (i++; i < end; i++) {
582 cur = _getrecord_ex(*i)->combining;
583 if (prev == 0 || cur == 0 || prev <= cur) {
584 prev = cur;
585 continue;
586 }
587 /* Non-canonical order. Need to switch *i with previous. */
588 o = i - 1;
589 while (1) {
590 Py_UNICODE tmp = o[1];
591 o[1] = o[0];
592 o[0] = tmp;
593 o--;
594 if (o < PyUnicode_AS_UNICODE(result))
595 break;
596 prev = _getrecord_ex(*o)->combining;
597 if (prev == 0 || prev <= cur)
598 break;
599 }
600 prev = _getrecord_ex(*i)->combining;
601 }
602 return result;
603}
604
605static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000606find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000607{
608 int index;
609 for (index = 0; nfc[index].start; index++) {
610 int start = nfc[index].start;
611 if (code < start)
612 return -1;
613 if (code <= start + nfc[index].count) {
614 int delta = code - start;
615 return nfc[index].index + delta;
616 }
617 }
618 return -1;
619}
620
621static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000622nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000623{
624 PyObject *result;
625 Py_UNICODE *i, *i1, *o, *end;
626 int f,l,index,index1,comb;
627 Py_UNICODE code;
628 Py_UNICODE *skipped[20];
629 int cskipped = 0;
630
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000631 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000632 if (!result)
633 return NULL;
634
635 /* We are going to modify result in-place.
636 If nfd_nfkd is changed to sometimes return the input,
637 this code needs to be reviewed. */
638 assert(result != input);
639
640 i = PyUnicode_AS_UNICODE(result);
641 end = i + PyUnicode_GET_SIZE(result);
642 o = PyUnicode_AS_UNICODE(result);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000643
Martin v. Löwis677bde22002-11-23 22:08:15 +0000644 again:
645 while (i < end) {
646 for (index = 0; index < cskipped; index++) {
647 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000648 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000649 Remove from list. */
650 skipped[index] = skipped[cskipped-1];
651 cskipped--;
652 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000653 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000654 }
655 }
656 /* Hangul Composition. We don't need to check for <LV,T>
657 pairs, since we always have decomposed data. */
658 if (LBase <= *i && *i < (LBase+LCount) &&
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000659 i + 1 < end &&
Martin v. Löwis677bde22002-11-23 22:08:15 +0000660 VBase <= i[1] && i[1] <= (VBase+VCount)) {
661 int LIndex, VIndex;
662 LIndex = i[0] - LBase;
663 VIndex = i[1] - VBase;
664 code = SBase + (LIndex*VCount+VIndex)*TCount;
665 i+=2;
666 if (i < end &&
667 TBase <= *i && *i <= (TBase+TCount)) {
668 code += *i-TBase;
669 i++;
670 }
671 *o++ = code;
672 continue;
673 }
674
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000675 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000676 if (f == -1) {
677 *o++ = *i++;
678 continue;
679 }
680 /* Find next unblocked character. */
681 i1 = i+1;
682 comb = 0;
683 while (i1 < end) {
684 int comb1 = _getrecord_ex(*i1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000685 if (comb) {
686 if (comb1 == 0)
687 break;
688 if (comb >= comb1) {
689 /* Character is blocked. */
690 i1++;
691 continue;
692 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000693 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000694 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000695 /* *i1 cannot be combined with *i. If *i1
696 is a starter, we don't need to look further.
697 Otherwise, record the combining class. */
698 if (l == -1) {
699 not_combinable:
700 if (comb1 == 0)
701 break;
702 comb = comb1;
703 i1++;
704 continue;
705 }
706 index = f*TOTAL_LAST + l;
707 index1 = comp_index[index >> COMP_SHIFT];
708 code = comp_data[(index1<<COMP_SHIFT)+
709 (index&((1<<COMP_SHIFT)-1))];
710 if (code == 0)
711 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000712
Martin v. Löwis677bde22002-11-23 22:08:15 +0000713 /* Replace the original character. */
714 *i = code;
715 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000716 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000717 skipped[cskipped++] = i1;
718 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000719 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000720 if (f == -1)
721 break;
722 }
723 *o++ = *i++;
724 }
725 if (o != end)
726 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
727 return result;
728}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000729
730/* Return 1 if the input is certainly normalized, 0 if it might not be. */
731static int
732is_normalized(PyObject *self, PyObject *input, int nfc, int k)
733{
734 Py_UNICODE *i, *end;
735 unsigned char prev_combining = 0, quickcheck_mask;
736
737 /* An older version of the database is requested, quickchecks must be
738 disabled. */
739 if (self && UCD_Check(self))
740 return 0;
741
742 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
743 as described in http://unicode.org/reports/tr15/#Annex8. */
744 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
745
746 i = PyUnicode_AS_UNICODE(input);
747 end = i + PyUnicode_GET_SIZE(input);
748 while (i < end) {
749 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
750 unsigned char combining = record->combining;
751 unsigned char quickcheck = record->normalization_quick_check;
752
753 if (quickcheck & quickcheck_mask)
754 return 0; /* this string might need normalization */
755 if (combining && prev_combining > combining)
756 return 0; /* non-canonical sort order, not normalized */
757 prev_combining = combining;
758 }
759 return 1; /* certainly normalized */
760}
761
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000762PyDoc_STRVAR(unicodedata_normalize__doc__,
763"normalize(form, unistr)\n\
764\n\
765Return the normal form 'form' for the Unicode string unistr. Valid\n\
766values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
767
Martin v. Löwis677bde22002-11-23 22:08:15 +0000768static PyObject*
769unicodedata_normalize(PyObject *self, PyObject *args)
770{
771 char *form;
772 PyObject *input;
773
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000774 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000775 &form, &PyUnicode_Type, &input))
776 return NULL;
777
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000778 if (PyUnicode_GetSize(input) == 0) {
779 /* Special case empty input strings, since resizing
780 them later would cause internal errors. */
781 Py_INCREF(input);
782 return input;
783 }
784
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000785 if (strcmp(form, "NFC") == 0) {
786 if (is_normalized(self, input, 1, 0)) {
787 Py_INCREF(input);
788 return input;
789 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000790 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000791 }
792 if (strcmp(form, "NFKC") == 0) {
793 if (is_normalized(self, input, 1, 1)) {
794 Py_INCREF(input);
795 return input;
796 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000797 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000798 }
799 if (strcmp(form, "NFD") == 0) {
800 if (is_normalized(self, input, 0, 0)) {
801 Py_INCREF(input);
802 return input;
803 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000804 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000805 }
806 if (strcmp(form, "NFKD") == 0) {
807 if (is_normalized(self, input, 0, 1)) {
808 Py_INCREF(input);
809 return input;
810 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000811 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000812 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000813 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
814 return NULL;
815}
816
Fredrik Lundh06d12682001-01-24 07:59:11 +0000817/* -------------------------------------------------------------------- */
818/* unicode character name tables */
819
820/* data file generated by Tools/unicode/makeunicodedata.py */
821#include "unicodename_db.h"
822
823/* -------------------------------------------------------------------- */
824/* database code (cut and pasted from the unidb package) */
825
/* Case-insensitive hash of the first `len` bytes of `s`, using `scale` as
   the multiplier.  The running value is folded back into 24 bits whenever
   any of bits 24..31 become set. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
840
/* Short jamo names used to build and parse "HANGUL SYLLABLE ..." names.
   A row is indexed by jamo index; column 0 holds the leading consonant (L),
   column 1 the vowel (V) and column 2 the trailing consonant (T).  The
   columns have different lengths, so unused slots are NULL. */
static char *hangul_syllables[][3] = {
    /*  L       V       T   */
    { "G",    "A",    ""   },
    { "GG",   "AE",   "G"  },
    { "N",    "YA",   "GG" },
    { "D",    "YAE",  "GS" },
    { "DD",   "EO",   "N"  },
    { "R",    "E",    "NJ" },
    { "M",    "YEO",  "NH" },
    { "B",    "YE",   "D"  },
    { "BB",   "O",    "L"  },
    { "S",    "WA",   "LG" },
    { "SS",   "WAE",  "LM" },
    { "",     "OE",   "LB" },
    { "J",    "YO",   "LS" },
    { "JJ",   "U",    "LT" },
    { "C",    "WEO",  "LP" },
    { "K",    "WE",   "LH" },
    { "T",    "WI",   "M"  },
    { "P",    "YU",   "B"  },
    { "H",    "EU",   "BS" },
    { NULL,   "YI",   "S"  },
    { NULL,   "I",    "SS" },
    { NULL,   NULL,   "NG" },
    { NULL,   NULL,   "J"  },
    { NULL,   NULL,   "C"  },
    { NULL,   NULL,   "K"  },
    { NULL,   NULL,   "T"  },
    { NULL,   NULL,   "P"  },
    { NULL,   NULL,   "H"  }
};
871
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000872/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000873static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000874is_unified_ideograph(Py_UCS4 code)
875{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000876 return
877 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
878 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
879 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
880 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
881 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000882}
883
884static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000885_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000886{
887 int offset;
888 int i;
889 int word;
890 unsigned char* w;
891
Martin v. Löwisc3509122006-03-11 12:16:23 +0000892 if (code >= 0x110000)
893 return 0;
894
Martin v. Löwis1a214512008-06-11 05:26:20 +0000895 if (self && UCD_Check(self)) {
Martin v. Löwisc3509122006-03-11 12:16:23 +0000896 const change_record *old = get_old_record(self, code);
897 if (old->category_changed == 0) {
898 /* unassigned */
899 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000901 }
902
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000903 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000904 /* Hangul syllable. */
905 int SIndex = code - SBase;
906 int L = SIndex / NCount;
907 int V = (SIndex % NCount) / TCount;
908 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000909
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000910 if (buflen < 27)
911 /* Worst case: HANGUL SYLLABLE <10chars>. */
912 return 0;
913 strcpy(buffer, "HANGUL SYLLABLE ");
914 buffer += 16;
915 strcpy(buffer, hangul_syllables[L][0]);
916 buffer += strlen(hangul_syllables[L][0]);
917 strcpy(buffer, hangul_syllables[V][1]);
918 buffer += strlen(hangul_syllables[V][1]);
919 strcpy(buffer, hangul_syllables[T][2]);
920 buffer += strlen(hangul_syllables[T][2]);
921 *buffer = '\0';
922 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000923 }
924
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000925 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000926 if (buflen < 28)
927 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
928 return 0;
929 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
930 return 1;
931 }
932
Fredrik Lundh06d12682001-01-24 07:59:11 +0000933 /* get offset into phrasebook */
934 offset = phrasebook_offset1[(code>>phrasebook_shift)];
935 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
936 (code&((1<<phrasebook_shift)-1))];
937 if (!offset)
938 return 0;
939
940 i = 0;
941
942 for (;;) {
943 /* get word index */
944 word = phrasebook[offset] - phrasebook_short;
945 if (word >= 0) {
946 word = (word << 8) + phrasebook[offset+1];
947 offset += 2;
948 } else
949 word = phrasebook[offset++];
950 if (i) {
951 if (i > buflen)
952 return 0; /* buffer overflow */
953 buffer[i++] = ' ';
954 }
955 /* copy word string from lexicon. the last character in the
956 word has bit 7 set. the last word in a string ends with
957 0x80 */
958 w = lexicon + lexicon_offset[word];
959 while (*w < 128) {
960 if (i >= buflen)
961 return 0; /* buffer overflow */
962 buffer[i++] = *w++;
963 }
964 if (i >= buflen)
965 return 0; /* buffer overflow */
966 buffer[i++] = *w & 127;
967 if (*w == 128)
968 break; /* end of word */
969 }
970
971 return 1;
972}
973
974static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000975_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000976{
977 /* check if code corresponds to the given name */
978 int i;
979 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000980 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000981 return 0;
982 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +0200983 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000984 return 0;
985 }
986 return buffer[namelen] == '\0';
987}
988
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000989static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000990find_syllable(const char *str, int *len, int *pos, int count, int column)
991{
992 int i, len1;
993 *len = -1;
994 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000995 char *s = hangul_syllables[i][column];
996 len1 = strlen(s);
997 if (len1 <= *len)
998 continue;
999 if (strncmp(str, s, len1) == 0) {
1000 *len = len1;
1001 *pos = i;
1002 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001003 }
1004 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001005 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001006 }
1007}
1008
Fredrik Lundh06d12682001-01-24 07:59:11 +00001009static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001010_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001011{
1012 unsigned int h, v;
1013 unsigned int mask = code_size-1;
1014 unsigned int i, incr;
1015
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001016 /* Check for hangul syllables. */
1017 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001018 int len, L = -1, V = -1, T = -1;
1019 const char *pos = name + 16;
1020 find_syllable(pos, &len, &L, LCount, 0);
1021 pos += len;
1022 find_syllable(pos, &len, &V, VCount, 1);
1023 pos += len;
1024 find_syllable(pos, &len, &T, TCount, 2);
1025 pos += len;
1026 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1027 *code = SBase + (L*VCount+V)*TCount + T;
1028 return 1;
1029 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001030 /* Otherwise, it's an illegal syllable name. */
1031 return 0;
1032 }
1033
1034 /* Check for unified ideographs. */
1035 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1036 /* Four or five hexdigits must follow. */
1037 v = 0;
1038 name += 22;
1039 namelen -= 22;
1040 if (namelen != 4 && namelen != 5)
1041 return 0;
1042 while (namelen--) {
1043 v *= 16;
1044 if (*name >= '0' && *name <= '9')
1045 v += *name - '0';
1046 else if (*name >= 'A' && *name <= 'F')
1047 v += *name - 'A' + 10;
1048 else
1049 return 0;
1050 name++;
1051 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001052 if (!is_unified_ideograph(v))
1053 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001054 *code = v;
1055 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001056 }
1057
Fredrik Lundh06d12682001-01-24 07:59:11 +00001058 /* the following is the same as python's dictionary lookup, with
1059 only minor changes. see the makeunicodedata script for more
1060 details */
1061
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001062 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001063 i = (~h) & mask;
1064 v = code_hash[i];
1065 if (!v)
1066 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001067 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001068 *code = v;
1069 return 1;
1070 }
1071 incr = (h ^ (h >> 3)) & mask;
1072 if (!incr)
1073 incr = mask;
1074 for (;;) {
1075 i = (i + incr) & mask;
1076 v = code_hash[i];
1077 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001078 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001079 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001080 *code = v;
1081 return 1;
1082 }
1083 incr = incr << 1;
1084 if (incr > mask)
1085 incr = incr ^ code_poly;
1086 }
1087}
1088
/* Name <-> code point lookup functions, published to other extension
   code through the "ucnhash_CAPI" capsule created in the module init
   function. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),   /* size of this struct */
    _getucname,                     /* code point -> name */
    _getcode                        /* name -> code point */
};
1095
1096/* -------------------------------------------------------------------- */
1097/* Python bindings */
1098
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001099PyDoc_STRVAR(unicodedata_name__doc__,
1100"name(unichr[, default])\n\
1101Returns the name assigned to the Unicode character unichr as a\n\
1102string. If no name is defined, default is returned, or, if not\n\
1103given, ValueError is raised.");
1104
Fredrik Lundh06d12682001-01-24 07:59:11 +00001105static PyObject *
1106unicodedata_name(PyObject* self, PyObject* args)
1107{
1108 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001109 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001110
1111 PyUnicodeObject* v;
1112 PyObject* defobj = NULL;
1113 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1114 return NULL;
1115
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001116 c = getuchar(v);
1117 if (c == (Py_UCS4)-1)
1118 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001119
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001120 if (!_getucname(self, c, name, sizeof(name))) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001121 if (defobj == NULL) {
1122 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001123 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001124 }
1125 else {
1126 Py_INCREF(defobj);
1127 return defobj;
1128 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001129 }
1130
Walter Dörwald4254e762007-06-05 16:04:09 +00001131 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001132}
1133
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001134PyDoc_STRVAR(unicodedata_lookup__doc__,
1135"lookup(name)\n\
1136\n\
1137Look up character by name. If a character with the\n\
1138given name is found, return the corresponding Unicode\n\
1139character. If not found, KeyError is raised.");
1140
Fredrik Lundh06d12682001-01-24 07:59:11 +00001141static PyObject *
1142unicodedata_lookup(PyObject* self, PyObject* args)
1143{
1144 Py_UCS4 code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001145 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001146
1147 char* name;
1148 int namelen;
1149 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1150 return NULL;
1151
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001152 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001153 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1154 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001155 return NULL;
1156 }
1157
Guido van Rossum806c2462007-08-06 23:33:07 +00001158#ifndef Py_UNICODE_WIDE
1159 if (code >= 0x10000) {
1160 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1161 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1162 return PyUnicode_FromUnicode(str, 2);
1163 }
1164#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001165 str[0] = (Py_UNICODE) code;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001166 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001167}
1168
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001169/* XXX Add doc strings. */
1170
1171static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001172 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1173 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1174 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1175 {"category", unicodedata_category, METH_VARARGS,
1176 unicodedata_category__doc__},
1177 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1178 unicodedata_bidirectional__doc__},
1179 {"combining", unicodedata_combining, METH_VARARGS,
1180 unicodedata_combining__doc__},
1181 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1182 unicodedata_mirrored__doc__},
1183 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1184 unicodedata_east_asian_width__doc__},
1185 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1186 unicodedata_decomposition__doc__},
1187 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1188 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1189 {"normalize", unicodedata_normalize, METH_VARARGS,
1190 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001191 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001192};
1193
/* Type object exposed as unicodedata.UCD.  Instances are PreviousDBVersion
   structs and offer the same methods as the module itself
   (unicodedata_functions) plus the members listed in DB_members.
   No tp_new is provided, so instances are not created through the type
   from Python code. */
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_reserved*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001239
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001240PyDoc_STRVAR(unicodedata_docstring,
1241"This module provides access to the Unicode Character Database which\n\
1242defines character properties for all Unicode characters. The data in\n\
1243this database is based on the UnicodeData.txt file version\n\
Ezio Melotti4c5475d2010-03-22 23:16:42 +000012445.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001245\n\
1246The module uses the same names and symbols as defined by the\n\
Ezio Melottid96b2f22010-03-23 00:39:22 +00001247UnicodeData File Format 5.2.0 (see\n\
1248http://www.unicode.org/reports/tr44/tr44-4.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001249
Martin v. Löwis1a214512008-06-11 05:26:20 +00001250
/* Module definition handed to PyModule_Create() in PyInit_unicodedata(). */
static struct PyModuleDef unicodedatamodule = {
    PyModuleDef_HEAD_INIT,
    "unicodedata",              /* module name */
    unicodedata_docstring,      /* module docstring */
    -1,                         /* size of per-module state; -1 here */
    unicodedata_functions,      /* module-level functions */
    NULL,                       /* remaining optional slots are unused */
    NULL,
    NULL,
    NULL
};
1262
Mark Hammond62b1ab12002-07-23 06:31:15 +00001263PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001264PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001265{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001266 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001267
Christian Heimes90aa7642007-12-19 02:45:37 +00001268 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001269
Martin v. Löwis1a214512008-06-11 05:26:20 +00001270 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001271 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001272 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001273
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001274 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001275 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001276 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001277
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001278 /* Previous versions */
1279 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1280 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001281 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001282
Fredrik Lundh06d12682001-01-24 07:59:11 +00001283 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001284 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001285 if (v != NULL)
1286 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001287 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001288}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001289
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001291Local variables:
1292c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001293indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001294End:
1295*/