blob: 53e48dfa36ec770ae259e2e1cce63cff1feb186a [file] [log] [blame]
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15
16#include "Python.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000017#include "ucnhash.h"
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000018#include "structmember.h"
Fredrik Lundh06d12682001-01-24 07:59:11 +000019
20/* character properties */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +000021
/* One row of per-character properties; rows live in
   _PyUnicode_Database_Records and are selected by _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidth */
    const unsigned char east_asian_width;
    /* quick-check bits, see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
33
/* Per-character differences between the current database and a previous
   Unicode version.  In the unsigned char fields, the accessors in this
   file treat 0xFF as "unchanged"; category_changed == 0 is treated as
   "unassigned in the old version".

   NOTE: the sequence of fields should be the same as in
   merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
42
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000043/* data file generated by Tools/unicode/makeunicodedata.py */
44#include "unicodedata_db.h"
45
46static const _PyUnicode_DatabaseRecord*
Martin v. Löwis677bde22002-11-23 22:08:15 +000047_getrecord_ex(Py_UCS4 code)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000048{
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000049 int index;
Neal Norwitze9c571f2003-02-28 03:14:37 +000050 if (code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +000051 index = 0;
52 else {
53 index = index1[(code>>SHIFT)];
54 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 }
56
57 return &_PyUnicode_Database_Records[index];
58}
59
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000060/* ------------- Previous-version API ------------------------------------- */
61typedef struct previous_version {
62 PyObject_HEAD
63 const char *name;
64 const change_record* (*getrecord)(Py_UCS4);
65 Py_UCS4 (*normalization)(Py_UCS4);
66} PreviousDBVersion;
67
68#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000070static PyMemberDef DB_members[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000071 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000072 {NULL}
73};
74
Thomas Wouters89f507f2006-12-13 04:49:30 +000075/* forward declaration */
Martin v. Löwis5bd7c022006-03-10 11:20:04 +000076static PyTypeObject UCD_Type;
Martin v. Löwis1a214512008-06-11 05:26:20 +000077#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000078
79static PyObject*
80new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
81 Py_UCS4 (*normalization)(Py_UCS4))
82{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000083 PreviousDBVersion *self;
84 self = PyObject_New(PreviousDBVersion, &UCD_Type);
85 if (self == NULL)
86 return NULL;
87 self->name = name;
88 self->getrecord = getrecord;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000089 self->normalization = normalization;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000090 return (PyObject*)self;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +000091}
92
Walter Dörwaldf342bfc2008-06-03 11:45:02 +000093
94static Py_UCS4 getuchar(PyUnicodeObject *obj)
95{
96 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
97
98 if (PyUnicode_GET_SIZE(obj) == 1)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000099 return *v;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000100#ifndef Py_UNICODE_WIDE
101 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
102 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
103 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000104 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000105#endif
106 PyErr_SetString(PyExc_TypeError,
107 "need a single Unicode character as parameter");
108 return (Py_UCS4)-1;
109}
110
/* --- Module API --------------------------------------------------------- */
112
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000113PyDoc_STRVAR(unicodedata_decimal__doc__,
114"decimal(unichr[, default])\n\
115\n\
116Returns the decimal value assigned to the Unicode character unichr\n\
117as integer. If no such value is defined, default is returned, or, if\n\
118not given, ValueError is raised.");
119
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000120static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000121unicodedata_decimal(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000122{
123 PyUnicodeObject *v;
124 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000125 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000126 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000127 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000128
Fredrik Lundh06d12682001-01-24 07:59:11 +0000129 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000130 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000131 c = getuchar(v);
132 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000133 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000134
Martin v. Löwis1a214512008-06-11 05:26:20 +0000135 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000136 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000137 if (old->category_changed == 0) {
138 /* unassigned */
139 have_old = 1;
140 rc = -1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000142 else if (old->decimal_changed != 0xFF) {
143 have_old = 1;
144 rc = old->decimal_changed;
145 }
146 }
147
148 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000149 rc = Py_UNICODE_TODECIMAL(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000150 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 if (defobj == NULL) {
152 PyErr_SetString(PyExc_ValueError,
153 "not a decimal");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000154 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 }
156 else {
157 Py_INCREF(defobj);
158 return defobj;
159 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000160 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000161 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000162}
163
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000164PyDoc_STRVAR(unicodedata_digit__doc__,
165"digit(unichr[, default])\n\
166\n\
167Returns the digit value assigned to the Unicode character unichr as\n\
168integer. If no such value is defined, default is returned, or, if\n\
169not given, ValueError is raised.");
170
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000171static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000172unicodedata_digit(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000173{
174 PyUnicodeObject *v;
175 PyObject *defobj = NULL;
176 long rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000177 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000178
Fredrik Lundh06d12682001-01-24 07:59:11 +0000179 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000180 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000181 c = getuchar(v);
182 if (c == (Py_UCS4)-1)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000183 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000184 rc = Py_UNICODE_TODIGIT(c);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000185 if (rc < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000186 if (defobj == NULL) {
187 PyErr_SetString(PyExc_ValueError, "not a digit");
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000188 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000189 }
190 else {
191 Py_INCREF(defobj);
192 return defobj;
193 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000194 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000195 return PyLong_FromLong(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000196}
197
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000198PyDoc_STRVAR(unicodedata_numeric__doc__,
199"numeric(unichr[, default])\n\
200\n\
201Returns the numeric value assigned to the Unicode character unichr\n\
202as float. If no such value is defined, default is returned, or, if\n\
203not given, ValueError is raised.");
204
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000205static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000206unicodedata_numeric(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000207{
208 PyUnicodeObject *v;
209 PyObject *defobj = NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000210 int have_old = 0;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000211 double rc;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000212 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000213
Fredrik Lundh06d12682001-01-24 07:59:11 +0000214 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000215 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000216 c = getuchar(v);
217 if (c == (Py_UCS4)-1)
218 return NULL;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000219
Martin v. Löwis1a214512008-06-11 05:26:20 +0000220 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000221 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000222 if (old->category_changed == 0) {
223 /* unassigned */
224 have_old = 1;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225 rc = -1.0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000226 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000227 else if (old->decimal_changed != 0xFF) {
228 have_old = 1;
229 rc = old->decimal_changed;
230 }
231 }
232
233 if (!have_old)
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000234 rc = Py_UNICODE_TONUMERIC(c);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235 if (rc == -1.0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000236 if (defobj == NULL) {
237 PyErr_SetString(PyExc_ValueError, "not a numeric character");
238 return NULL;
239 }
240 else {
241 Py_INCREF(defobj);
242 return defobj;
243 }
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000244 }
245 return PyFloat_FromDouble(rc);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000246}
247
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000248PyDoc_STRVAR(unicodedata_category__doc__,
249"category(unichr)\n\
250\n\
251Returns the general category assigned to the Unicode character\n\
252unichr as string.");
253
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000254static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000255unicodedata_category(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000256{
257 PyUnicodeObject *v;
258 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000259 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000260
261 if (!PyArg_ParseTuple(args, "O!:category",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000262 &PyUnicode_Type, &v))
263 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000264 c = getuchar(v);
265 if (c == (Py_UCS4)-1)
266 return NULL;
267 index = (int) _getrecord_ex(c)->category;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000268 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000269 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000270 if (old->category_changed != 0xFF)
271 index = old->category_changed;
272 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000273 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000274}
275
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000276PyDoc_STRVAR(unicodedata_bidirectional__doc__,
277"bidirectional(unichr)\n\
278\n\
279Returns the bidirectional category assigned to the Unicode character\n\
280unichr as string. If no such value is defined, an empty string is\n\
281returned.");
282
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000283static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000284unicodedata_bidirectional(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000285{
286 PyUnicodeObject *v;
287 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000288 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000289
290 if (!PyArg_ParseTuple(args, "O!:bidirectional",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000291 &PyUnicode_Type, &v))
292 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000293 c = getuchar(v);
294 if (c == (Py_UCS4)-1)
295 return NULL;
296 index = (int) _getrecord_ex(c)->bidirectional;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000297 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000298 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000299 if (old->category_changed == 0)
300 index = 0; /* unassigned */
301 else if (old->bidir_changed != 0xFF)
302 index = old->bidir_changed;
303 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000304 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000305}
306
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000307PyDoc_STRVAR(unicodedata_combining__doc__,
308"combining(unichr)\n\
309\n\
310Returns the canonical combining class assigned to the Unicode\n\
311character unichr as integer. Returns 0 if no combining class is\n\
312defined.");
313
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000314static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000315unicodedata_combining(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000316{
317 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000318 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000319 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000320
321 if (!PyArg_ParseTuple(args, "O!:combining",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 &PyUnicode_Type, &v))
323 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000324 c = getuchar(v);
325 if (c == (Py_UCS4)-1)
326 return NULL;
327 index = (int) _getrecord_ex(c)->combining;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000328 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000329 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000330 if (old->category_changed == 0)
331 index = 0; /* unassigned */
332 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000333 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000334}
335
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000336PyDoc_STRVAR(unicodedata_mirrored__doc__,
337"mirrored(unichr)\n\
338\n\
339Returns the mirrored property assigned to the Unicode character\n\
340unichr as integer. Returns 1 if the character has been identified as\n\
341a \"mirrored\" character in bidirectional text, 0 otherwise.");
342
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000343static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000344unicodedata_mirrored(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000345{
346 PyUnicodeObject *v;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000347 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000348 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000349
350 if (!PyArg_ParseTuple(args, "O!:mirrored",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000351 &PyUnicode_Type, &v))
352 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000353 c = getuchar(v);
354 if (c == (Py_UCS4)-1)
355 return NULL;
356 index = (int) _getrecord_ex(c)->mirrored;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000357 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000358 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000359 if (old->category_changed == 0)
360 index = 0; /* unassigned */
Martin v. Löwis93cbca32008-09-10 14:08:48 +0000361 else if (old->mirrored_changed != 0xFF)
362 index = old->mirrored_changed;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000363 }
Christian Heimes217cfd12007-12-02 14:31:20 +0000364 return PyLong_FromLong(index);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000365}
366
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000367PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
368"east_asian_width(unichr)\n\
369\n\
370Returns the east asian width assigned to the Unicode character\n\
371unichr as string.");
372
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000373static PyObject *
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000374unicodedata_east_asian_width(PyObject *self, PyObject *args)
375{
376 PyUnicodeObject *v;
377 int index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000378 Py_UCS4 c;
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000379
380 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 &PyUnicode_Type, &v))
382 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000383 c = getuchar(v);
384 if (c == (Py_UCS4)-1)
385 return NULL;
386 index = (int) _getrecord_ex(c)->east_asian_width;
Martin v. Löwis1a214512008-06-11 05:26:20 +0000387 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000388 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000389 if (old->category_changed == 0)
390 index = 0; /* unassigned */
391 }
Walter Dörwald4254e762007-06-05 16:04:09 +0000392 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000393}
394
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000395PyDoc_STRVAR(unicodedata_decomposition__doc__,
396"decomposition(unichr)\n\
397\n\
398Returns the character decomposition mapping assigned to the Unicode\n\
399character unichr as string. An empty string is returned in case no\n\
400such mapping is defined.");
401
Hye-Shik Change9ddfbb2004-08-04 07:38:35 +0000402static PyObject *
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000403unicodedata_decomposition(PyObject *self, PyObject *args)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000404{
405 PyUnicodeObject *v;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000406 char decomp[256];
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000407 int code, index, count;
408 size_t i;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000409 unsigned int prefix_index;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000410 Py_UCS4 c;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000411
412 if (!PyArg_ParseTuple(args, "O!:decomposition",
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000413 &PyUnicode_Type, &v))
414 return NULL;
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000415 c = getuchar(v);
416 if (c == (Py_UCS4)-1)
417 return NULL;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000418
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000419 code = (int)c;
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000420
Martin v. Löwis1a214512008-06-11 05:26:20 +0000421 if (self && UCD_Check(self)) {
Walter Dörwaldf342bfc2008-06-03 11:45:02 +0000422 const change_record *old = get_old_record(self, c);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000423 if (old->category_changed == 0)
Walter Dörwald4254e762007-06-05 16:04:09 +0000424 return PyUnicode_FromString(""); /* unassigned */
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000425 }
426
Martin v. Löwis9def6a32002-10-18 16:11:54 +0000427 if (code < 0 || code >= 0x110000)
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000428 index = 0;
429 else {
430 index = decomp_index1[(code>>DECOMP_SHIFT)];
431 index = decomp_index2[(index<<DECOMP_SHIFT)+
432 (code&((1<<DECOMP_SHIFT)-1))];
433 }
434
Tim Peters69b83b12001-11-30 07:23:05 +0000435 /* high byte is number of hex bytes (usually one or two), low byte
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000436 is prefix code (from*/
437 count = decomp_data[index] >> 8;
438
439 /* XXX: could allocate the PyString up front instead
440 (strlen(prefix) + 5 * count + 1 bytes) */
441
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000442 /* Based on how index is calculated above and decomp_data is generated
443 from Tools/unicode/makeunicodedata.py, it should not be possible
444 to overflow decomp_prefix. */
445 prefix_index = decomp_data[index] & 255;
446 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
447
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000448 /* copy prefix */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000449 i = strlen(decomp_prefix[prefix_index]);
450 memcpy(decomp, decomp_prefix[prefix_index], i);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000451
452 while (count-- > 0) {
453 if (i)
454 decomp[i++] = ' ';
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000455 assert(i < sizeof(decomp));
Tim Peters69b83b12001-11-30 07:23:05 +0000456 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
457 decomp_data[++index]);
Fredrik Lundh7b7dd102001-01-21 22:41:08 +0000458 i += strlen(decomp + i);
459 }
Victor Stinner0fcab4a2011-01-04 12:59:15 +0000460 return PyUnicode_FromStringAndSize(decomp, i);
Guido van Rossum2a70a3a2000-03-10 23:10:21 +0000461}
462
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000463static void
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000464get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000465{
Neal Norwitze9c571f2003-02-28 03:14:37 +0000466 if (code >= 0x110000) {
Martin v. Löwis677bde22002-11-23 22:08:15 +0000467 *index = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 } else if (self && UCD_Check(self) &&
Martin v. Löwis1a214512008-06-11 05:26:20 +0000469 get_old_record(self, code)->category_changed==0) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000470 /* unassigned in old version */
471 *index = 0;
472 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000473 else {
474 *index = decomp_index1[(code>>DECOMP_SHIFT)];
475 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
476 (code&((1<<DECOMP_SHIFT)-1))];
477 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000478
Martin v. Löwis677bde22002-11-23 22:08:15 +0000479 /* high byte is number of hex bytes (usually one or two), low byte
480 is prefix code (from*/
481 *count = decomp_data[*index] >> 8;
482 *prefix = decomp_data[*index] & 255;
483
484 (*index)++;
485}
486
/* Constants for arithmetic Hangul syllable (de)composition, as used by
   nfd_nfkd() and nfc_nfkc(). */
#define SBase 0xAC00    /* first precomposed syllable */
#define LBase 0x1100    /* first leading consonant */
#define VBase 0x1161    /* first vowel */
#define TBase 0x11A7    /* trailing consonant base; TBase itself means "no T" */
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)
496
497static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000498nfd_nfkd(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000499{
500 PyObject *result;
501 Py_UNICODE *i, *end, *o;
502 /* Longest decomposition in Unicode 3.2: U+FDFA */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000503 Py_UNICODE stack[20];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000504 Py_ssize_t space, isize;
505 int index, prefix, count, stackptr;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000506 unsigned char prev, cur;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000507
Martin v. Löwis677bde22002-11-23 22:08:15 +0000508 stackptr = 0;
509 isize = PyUnicode_GET_SIZE(input);
510 /* Overallocate atmost 10 characters. */
511 space = (isize > 10 ? 10 : isize) + isize;
512 result = PyUnicode_FromUnicode(NULL, space);
513 if (!result)
514 return NULL;
515 i = PyUnicode_AS_UNICODE(input);
516 end = i + isize;
517 o = PyUnicode_AS_UNICODE(result);
518
519 while (i < end) {
520 stack[stackptr++] = *i++;
521 while(stackptr) {
522 Py_UNICODE code = stack[--stackptr];
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000523 /* Hangul Decomposition adds three characters in
524 a single step, so we need atleast that much room. */
525 if (space < 3) {
Martin v. Löwis5b222132007-06-10 09:51:05 +0000526 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000527 space += 10;
528 if (PyUnicode_Resize(&result, newsize) == -1)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000529 return NULL;
Martin v. Löwisd2171d22003-11-06 20:47:57 +0000530 o = PyUnicode_AS_UNICODE(result) + newsize - space;
Martin v. Löwis677bde22002-11-23 22:08:15 +0000531 }
532 /* Hangul Decomposition. */
533 if (SBase <= code && code < (SBase+SCount)) {
534 int SIndex = code - SBase;
535 int L = LBase + SIndex / NCount;
536 int V = VBase + (SIndex % NCount) / TCount;
537 int T = TBase + SIndex % TCount;
538 *o++ = L;
539 *o++ = V;
540 space -= 2;
541 if (T != TBase) {
542 *o++ = T;
543 space --;
544 }
545 continue;
546 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000547 /* normalization changes */
Martin v. Löwis1a214512008-06-11 05:26:20 +0000548 if (self && UCD_Check(self)) {
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000549 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
550 if (value != 0) {
551 stack[stackptr++] = value;
552 continue;
553 }
554 }
555
556 /* Other decompositions. */
557 get_decomp_record(self, code, &index, &prefix, &count);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000558
559 /* Copy character if it is not decomposable, or has a
560 compatibility decomposition, but we do NFD. */
561 if (!count || (prefix && !k)) {
562 *o++ = code;
563 space--;
564 continue;
565 }
566 /* Copy decomposition onto the stack, in reverse
567 order. */
568 while(count) {
569 code = decomp_data[index + (--count)];
570 stack[stackptr++] = code;
571 }
572 }
573 }
574
575 /* Drop overallocation. Cannot fail. */
576 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
577
578 /* Sort canonically. */
579 i = PyUnicode_AS_UNICODE(result);
580 prev = _getrecord_ex(*i)->combining;
581 end = i + PyUnicode_GET_SIZE(result);
582 for (i++; i < end; i++) {
583 cur = _getrecord_ex(*i)->combining;
584 if (prev == 0 || cur == 0 || prev <= cur) {
585 prev = cur;
586 continue;
587 }
588 /* Non-canonical order. Need to switch *i with previous. */
589 o = i - 1;
590 while (1) {
591 Py_UNICODE tmp = o[1];
592 o[1] = o[0];
593 o[0] = tmp;
594 o--;
595 if (o < PyUnicode_AS_UNICODE(result))
596 break;
597 prev = _getrecord_ex(*o)->combining;
598 if (prev == 0 || prev <= cur)
599 break;
600 }
601 prev = _getrecord_ex(*i)->combining;
602 }
603 return result;
604}
605
606static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000607find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000608{
609 int index;
610 for (index = 0; nfc[index].start; index++) {
611 int start = nfc[index].start;
612 if (code < start)
613 return -1;
614 if (code <= start + nfc[index].count) {
615 int delta = code - start;
616 return nfc[index].index + delta;
617 }
618 }
619 return -1;
620}
621
622static PyObject*
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000623nfc_nfkc(PyObject *self, PyObject *input, int k)
Martin v. Löwis677bde22002-11-23 22:08:15 +0000624{
625 PyObject *result;
626 Py_UNICODE *i, *i1, *o, *end;
627 int f,l,index,index1,comb;
628 Py_UNICODE code;
629 Py_UNICODE *skipped[20];
630 int cskipped = 0;
631
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000632 result = nfd_nfkd(self, input, k);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000633 if (!result)
634 return NULL;
635
636 /* We are going to modify result in-place.
637 If nfd_nfkd is changed to sometimes return the input,
638 this code needs to be reviewed. */
639 assert(result != input);
640
641 i = PyUnicode_AS_UNICODE(result);
642 end = i + PyUnicode_GET_SIZE(result);
643 o = PyUnicode_AS_UNICODE(result);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644
Martin v. Löwis677bde22002-11-23 22:08:15 +0000645 again:
646 while (i < end) {
647 for (index = 0; index < cskipped; index++) {
648 if (skipped[index] == i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 /* *i character is skipped.
Martin v. Löwis677bde22002-11-23 22:08:15 +0000650 Remove from list. */
651 skipped[index] = skipped[cskipped-1];
652 cskipped--;
653 i++;
Martin v. Löwis2fb661f2002-12-07 14:56:36 +0000654 goto again; /* continue while */
Martin v. Löwis677bde22002-11-23 22:08:15 +0000655 }
656 }
657 /* Hangul Composition. We don't need to check for <LV,T>
658 pairs, since we always have decomposed data. */
659 if (LBase <= *i && *i < (LBase+LCount) &&
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000660 i + 1 < end &&
Martin v. Löwis677bde22002-11-23 22:08:15 +0000661 VBase <= i[1] && i[1] <= (VBase+VCount)) {
662 int LIndex, VIndex;
663 LIndex = i[0] - LBase;
664 VIndex = i[1] - VBase;
665 code = SBase + (LIndex*VCount+VIndex)*TCount;
666 i+=2;
667 if (i < end &&
668 TBase <= *i && *i <= (TBase+TCount)) {
669 code += *i-TBase;
670 i++;
671 }
672 *o++ = code;
673 continue;
674 }
675
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000676 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000677 if (f == -1) {
678 *o++ = *i++;
679 continue;
680 }
681 /* Find next unblocked character. */
682 i1 = i+1;
683 comb = 0;
684 while (i1 < end) {
685 int comb1 = _getrecord_ex(*i1)->combining;
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000686 if (comb) {
687 if (comb1 == 0)
688 break;
689 if (comb >= comb1) {
690 /* Character is blocked. */
691 i1++;
692 continue;
693 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000694 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000695 l = find_nfc_index(self, nfc_last, *i1);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000696 /* *i1 cannot be combined with *i. If *i1
697 is a starter, we don't need to look further.
698 Otherwise, record the combining class. */
699 if (l == -1) {
700 not_combinable:
701 if (comb1 == 0)
702 break;
703 comb = comb1;
704 i1++;
705 continue;
706 }
707 index = f*TOTAL_LAST + l;
708 index1 = comp_index[index >> COMP_SHIFT];
709 code = comp_data[(index1<<COMP_SHIFT)+
710 (index&((1<<COMP_SHIFT)-1))];
711 if (code == 0)
712 goto not_combinable;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713
Martin v. Löwis677bde22002-11-23 22:08:15 +0000714 /* Replace the original character. */
715 *i = code;
716 /* Mark the second character unused. */
Alexander Belopolsky86f65d52010-12-23 02:27:37 +0000717 assert(cskipped < 20);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000718 skipped[cskipped++] = i1;
719 i1++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000720 f = find_nfc_index(self, nfc_first, *i);
Martin v. Löwis677bde22002-11-23 22:08:15 +0000721 if (f == -1)
722 break;
723 }
724 *o++ = *i++;
725 }
726 if (o != end)
727 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
728 return result;
729}
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000730
731/* Return 1 if the input is certainly normalized, 0 if it might not be. */
732static int
733is_normalized(PyObject *self, PyObject *input, int nfc, int k)
734{
735 Py_UNICODE *i, *end;
736 unsigned char prev_combining = 0, quickcheck_mask;
737
738 /* An older version of the database is requested, quickchecks must be
739 disabled. */
740 if (self && UCD_Check(self))
741 return 0;
742
743 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
744 as described in http://unicode.org/reports/tr15/#Annex8. */
745 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
746
747 i = PyUnicode_AS_UNICODE(input);
748 end = i + PyUnicode_GET_SIZE(input);
749 while (i < end) {
750 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
751 unsigned char combining = record->combining;
752 unsigned char quickcheck = record->normalization_quick_check;
753
754 if (quickcheck & quickcheck_mask)
755 return 0; /* this string might need normalization */
756 if (combining && prev_combining > combining)
757 return 0; /* non-canonical sort order, not normalized */
758 prev_combining = combining;
759 }
760 return 1; /* certainly normalized */
761}
762
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +0000763PyDoc_STRVAR(unicodedata_normalize__doc__,
764"normalize(form, unistr)\n\
765\n\
766Return the normal form 'form' for the Unicode string unistr. Valid\n\
767values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
768
Martin v. Löwis677bde22002-11-23 22:08:15 +0000769static PyObject*
770unicodedata_normalize(PyObject *self, PyObject *args)
771{
772 char *form;
773 PyObject *input;
774
Hye-Shik Chang69dc1c82004-07-15 04:30:25 +0000775 if(!PyArg_ParseTuple(args, "sO!:normalize",
Martin v. Löwis677bde22002-11-23 22:08:15 +0000776 &form, &PyUnicode_Type, &input))
777 return NULL;
778
Martin v. Löwis61e40bd2004-04-17 19:36:48 +0000779 if (PyUnicode_GetSize(input) == 0) {
780 /* Special case empty input strings, since resizing
781 them later would cause internal errors. */
782 Py_INCREF(input);
783 return input;
784 }
785
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000786 if (strcmp(form, "NFC") == 0) {
787 if (is_normalized(self, input, 1, 0)) {
788 Py_INCREF(input);
789 return input;
790 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000791 return nfc_nfkc(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000792 }
793 if (strcmp(form, "NFKC") == 0) {
794 if (is_normalized(self, input, 1, 1)) {
795 Py_INCREF(input);
796 return input;
797 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000798 return nfc_nfkc(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000799 }
800 if (strcmp(form, "NFD") == 0) {
801 if (is_normalized(self, input, 0, 0)) {
802 Py_INCREF(input);
803 return input;
804 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000805 return nfd_nfkd(self, input, 0);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000806 }
807 if (strcmp(form, "NFKD") == 0) {
808 if (is_normalized(self, input, 0, 1)) {
809 Py_INCREF(input);
810 return input;
811 }
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000812 return nfd_nfkd(self, input, 1);
Antoine Pitrou7a0fedf2009-04-27 22:31:40 +0000813 }
Martin v. Löwis677bde22002-11-23 22:08:15 +0000814 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
815 return NULL;
816}
817
Fredrik Lundh06d12682001-01-24 07:59:11 +0000818/* -------------------------------------------------------------------- */
819/* unicode character name tables */
820
821/* data file generated by Tools/unicode/makeunicodedata.py */
822#include "unicodename_db.h"
823
824/* -------------------------------------------------------------------- */
825/* database code (cut and pasted from the unidb package) */
826
/* Case-insensitive hash of the first `len` bytes of `s`, using `scale` as
   the multiplier.  Whenever the running value spills into the top byte
   (bits 24-31), that byte is folded back into the low bits and the value is
   truncated to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long spill;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        spill = hash & 0xff000000;
        if (spill)
            hash = (hash ^ ((spill >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
841
/* Jamo short names used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the L value, column 1 by the V value and column 2
   by the T value (see _getucname).  NULL marks cells below the end of a
   column. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
872
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000873/* These ranges need to match makeunicodedata.py:cjk_ranges. */
Fredrik Lundh06d12682001-01-24 07:59:11 +0000874static int
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000875is_unified_ideograph(Py_UCS4 code)
876{
Martin v. Löwis5cbc71e2010-11-22 09:00:02 +0000877 return
878 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
879 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
880 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
881 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
882 (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000883}
884
885static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000886_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000887{
888 int offset;
889 int i;
890 int word;
891 unsigned char* w;
892
Martin v. Löwisc3509122006-03-11 12:16:23 +0000893 if (code >= 0x110000)
894 return 0;
895
Martin v. Löwis1a214512008-06-11 05:26:20 +0000896 if (self && UCD_Check(self)) {
Martin v. Löwisc3509122006-03-11 12:16:23 +0000897 const change_record *old = get_old_record(self, code);
898 if (old->category_changed == 0) {
899 /* unassigned */
900 return 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000901 }
Martin v. Löwisc3509122006-03-11 12:16:23 +0000902 }
903
Martin v. Löwis2f4be4e2002-11-23 17:11:06 +0000904 if (SBase <= code && code < SBase+SCount) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000905 /* Hangul syllable. */
906 int SIndex = code - SBase;
907 int L = SIndex / NCount;
908 int V = (SIndex % NCount) / TCount;
909 int T = SIndex % TCount;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000910
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911 if (buflen < 27)
912 /* Worst case: HANGUL SYLLABLE <10chars>. */
913 return 0;
914 strcpy(buffer, "HANGUL SYLLABLE ");
915 buffer += 16;
916 strcpy(buffer, hangul_syllables[L][0]);
917 buffer += strlen(hangul_syllables[L][0]);
918 strcpy(buffer, hangul_syllables[V][1]);
919 buffer += strlen(hangul_syllables[V][1]);
920 strcpy(buffer, hangul_syllables[T][2]);
921 buffer += strlen(hangul_syllables[T][2]);
922 *buffer = '\0';
923 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000924 }
925
Martin v. Löwis8d93ca12002-11-23 22:10:29 +0000926 if (is_unified_ideograph(code)) {
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +0000927 if (buflen < 28)
928 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
929 return 0;
930 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
931 return 1;
932 }
933
Fredrik Lundh06d12682001-01-24 07:59:11 +0000934 /* get offset into phrasebook */
935 offset = phrasebook_offset1[(code>>phrasebook_shift)];
936 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
937 (code&((1<<phrasebook_shift)-1))];
938 if (!offset)
939 return 0;
940
941 i = 0;
942
943 for (;;) {
944 /* get word index */
945 word = phrasebook[offset] - phrasebook_short;
946 if (word >= 0) {
947 word = (word << 8) + phrasebook[offset+1];
948 offset += 2;
949 } else
950 word = phrasebook[offset++];
951 if (i) {
952 if (i > buflen)
953 return 0; /* buffer overflow */
954 buffer[i++] = ' ';
955 }
956 /* copy word string from lexicon. the last character in the
957 word has bit 7 set. the last word in a string ends with
958 0x80 */
959 w = lexicon + lexicon_offset[word];
960 while (*w < 128) {
961 if (i >= buflen)
962 return 0; /* buffer overflow */
963 buffer[i++] = *w++;
964 }
965 if (i >= buflen)
966 return 0; /* buffer overflow */
967 buffer[i++] = *w & 127;
968 if (*w == 128)
969 break; /* end of word */
970 }
971
972 return 1;
973}
974
975static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000976_cmpname(PyObject *self, int code, const char* name, int namelen)
Fredrik Lundh06d12682001-01-24 07:59:11 +0000977{
978 /* check if code corresponds to the given name */
979 int i;
980 char buffer[NAME_MAXLEN];
Martin v. Löwis480f1bb2006-03-09 23:38:20 +0000981 if (!_getucname(self, code, buffer, sizeof(buffer)))
Fredrik Lundh06d12682001-01-24 07:59:11 +0000982 return 0;
983 for (i = 0; i < namelen; i++) {
Antoine Pitroued8ba142011-10-04 13:50:21 +0200984 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
Fredrik Lundh06d12682001-01-24 07:59:11 +0000985 return 0;
986 }
987 return buffer[namelen] == '\0';
988}
989
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000990static void
Martin v. Löwis7d41e292002-11-23 12:22:32 +0000991find_syllable(const char *str, int *len, int *pos, int count, int column)
992{
993 int i, len1;
994 *len = -1;
995 for (i = 0; i < count; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000996 char *s = hangul_syllables[i][column];
997 len1 = strlen(s);
998 if (len1 <= *len)
999 continue;
1000 if (strncmp(str, s, len1) == 0) {
1001 *len = len1;
1002 *pos = i;
1003 }
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001004 }
1005 if (*len == -1) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001006 *len = 0;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001007 }
1008}
1009
Fredrik Lundh06d12682001-01-24 07:59:11 +00001010static int
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001011_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
Fredrik Lundh06d12682001-01-24 07:59:11 +00001012{
1013 unsigned int h, v;
1014 unsigned int mask = code_size-1;
1015 unsigned int i, incr;
1016
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001017 /* Check for hangul syllables. */
1018 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001019 int len, L = -1, V = -1, T = -1;
1020 const char *pos = name + 16;
1021 find_syllable(pos, &len, &L, LCount, 0);
1022 pos += len;
1023 find_syllable(pos, &len, &V, VCount, 1);
1024 pos += len;
1025 find_syllable(pos, &len, &T, TCount, 2);
1026 pos += len;
1027 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1028 *code = SBase + (L*VCount+V)*TCount + T;
1029 return 1;
1030 }
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001031 /* Otherwise, it's an illegal syllable name. */
1032 return 0;
1033 }
1034
1035 /* Check for unified ideographs. */
1036 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1037 /* Four or five hexdigits must follow. */
1038 v = 0;
1039 name += 22;
1040 namelen -= 22;
1041 if (namelen != 4 && namelen != 5)
1042 return 0;
1043 while (namelen--) {
1044 v *= 16;
1045 if (*name >= '0' && *name <= '9')
1046 v += *name - '0';
1047 else if (*name >= 'A' && *name <= 'F')
1048 v += *name - 'A' + 10;
1049 else
1050 return 0;
1051 name++;
1052 }
Martin v. Löwis8d93ca12002-11-23 22:10:29 +00001053 if (!is_unified_ideograph(v))
1054 return 0;
Martin v. Löwisef7fe2e2002-11-23 18:01:32 +00001055 *code = v;
1056 return 1;
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001057 }
1058
Fredrik Lundh06d12682001-01-24 07:59:11 +00001059 /* the following is the same as python's dictionary lookup, with
1060 only minor changes. see the makeunicodedata script for more
1061 details */
1062
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001063 h = (unsigned int) _gethash(name, namelen, code_magic);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001064 i = (~h) & mask;
1065 v = code_hash[i];
1066 if (!v)
1067 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001068 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001069 *code = v;
1070 return 1;
1071 }
1072 incr = (h ^ (h >> 3)) & mask;
1073 if (!incr)
1074 incr = mask;
1075 for (;;) {
1076 i = (i + incr) & mask;
1077 v = code_hash[i];
1078 if (!v)
Fredrik Lundhae763672001-02-18 11:41:49 +00001079 return 0;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001080 if (_cmpname(self, v, name, namelen)) {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001081 *code = v;
1082 return 1;
1083 }
1084 incr = incr << 1;
1085 if (incr > mask)
1086 incr = incr ^ code_poly;
1087 }
1088}
1089
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001090static const _PyUnicode_Name_CAPI hashAPI =
Fredrik Lundh06d12682001-01-24 07:59:11 +00001091{
1092 sizeof(_PyUnicode_Name_CAPI),
Andrew MacIntyre74a3bec2002-06-13 11:55:14 +00001093 _getucname,
Fredrik Lundhb95896b2001-02-18 22:06:17 +00001094 _getcode
Fredrik Lundh06d12682001-01-24 07:59:11 +00001095};
1096
1097/* -------------------------------------------------------------------- */
1098/* Python bindings */
1099
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001100PyDoc_STRVAR(unicodedata_name__doc__,
1101"name(unichr[, default])\n\
1102Returns the name assigned to the Unicode character unichr as a\n\
1103string. If no name is defined, default is returned, or, if not\n\
1104given, ValueError is raised.");
1105
Fredrik Lundh06d12682001-01-24 07:59:11 +00001106static PyObject *
1107unicodedata_name(PyObject* self, PyObject* args)
1108{
1109 char name[NAME_MAXLEN];
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001110 Py_UCS4 c;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001111
1112 PyUnicodeObject* v;
1113 PyObject* defobj = NULL;
1114 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1115 return NULL;
1116
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001117 c = getuchar(v);
1118 if (c == (Py_UCS4)-1)
1119 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001120
Walter Dörwaldf342bfc2008-06-03 11:45:02 +00001121 if (!_getucname(self, c, name, sizeof(name))) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 if (defobj == NULL) {
1123 PyErr_SetString(PyExc_ValueError, "no such name");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001124 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001125 }
1126 else {
1127 Py_INCREF(defobj);
1128 return defobj;
1129 }
Fredrik Lundh06d12682001-01-24 07:59:11 +00001130 }
1131
Walter Dörwald4254e762007-06-05 16:04:09 +00001132 return PyUnicode_FromString(name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001133}
1134
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001135PyDoc_STRVAR(unicodedata_lookup__doc__,
1136"lookup(name)\n\
1137\n\
1138Look up character by name. If a character with the\n\
1139given name is found, return the corresponding Unicode\n\
1140character. If not found, KeyError is raised.");
1141
Fredrik Lundh06d12682001-01-24 07:59:11 +00001142static PyObject *
1143unicodedata_lookup(PyObject* self, PyObject* args)
1144{
1145 Py_UCS4 code;
Guido van Rossum806c2462007-08-06 23:33:07 +00001146 Py_UNICODE str[2];
Fredrik Lundh06d12682001-01-24 07:59:11 +00001147
1148 char* name;
1149 int namelen;
1150 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1151 return NULL;
1152
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001153 if (!_getcode(self, name, namelen, &code)) {
Guido van Rossum806c2462007-08-06 23:33:07 +00001154 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1155 name);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001156 return NULL;
1157 }
1158
Guido van Rossum806c2462007-08-06 23:33:07 +00001159#ifndef Py_UNICODE_WIDE
1160 if (code >= 0x10000) {
1161 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1162 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1163 return PyUnicode_FromUnicode(str, 2);
1164 }
1165#endif
Fredrik Lundh06d12682001-01-24 07:59:11 +00001166 str[0] = (Py_UNICODE) code;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001167 return PyUnicode_FromUnicode(str, 1);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001168}
1169
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001170/* XXX Add doc strings. */
1171
1172static PyMethodDef unicodedata_functions[] = {
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001173 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1174 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1175 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1176 {"category", unicodedata_category, METH_VARARGS,
1177 unicodedata_category__doc__},
1178 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1179 unicodedata_bidirectional__doc__},
1180 {"combining", unicodedata_combining, METH_VARARGS,
1181 unicodedata_combining__doc__},
1182 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1183 unicodedata_mirrored__doc__},
1184 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1185 unicodedata_east_asian_width__doc__},
1186 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1187 unicodedata_decomposition__doc__},
1188 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1189 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1190 {"normalize", unicodedata_normalize, METH_VARARGS,
1191 unicodedata_normalize__doc__},
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001192 {NULL, NULL} /* sentinel */
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001193};
1194
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001195static PyTypeObject UCD_Type = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001196 /* The ob_type field must be initialized in the module init function
1197 * to be portable to Windows without using C++. */
1198 PyVarObject_HEAD_INIT(NULL, 0)
1199 "unicodedata.UCD", /*tp_name*/
1200 sizeof(PreviousDBVersion), /*tp_basicsize*/
1201 0, /*tp_itemsize*/
1202 /* methods */
1203 (destructor)PyObject_Del, /*tp_dealloc*/
1204 0, /*tp_print*/
1205 0, /*tp_getattr*/
1206 0, /*tp_setattr*/
1207 0, /*tp_reserved*/
1208 0, /*tp_repr*/
1209 0, /*tp_as_number*/
1210 0, /*tp_as_sequence*/
1211 0, /*tp_as_mapping*/
1212 0, /*tp_hash*/
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001213 0, /*tp_call*/
1214 0, /*tp_str*/
1215 PyObject_GenericGetAttr,/*tp_getattro*/
1216 0, /*tp_setattro*/
1217 0, /*tp_as_buffer*/
1218 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1219 0, /*tp_doc*/
1220 0, /*tp_traverse*/
1221 0, /*tp_clear*/
1222 0, /*tp_richcompare*/
1223 0, /*tp_weaklistoffset*/
1224 0, /*tp_iter*/
1225 0, /*tp_iternext*/
1226 unicodedata_functions, /*tp_methods*/
1227 DB_members, /*tp_members*/
1228 0, /*tp_getset*/
1229 0, /*tp_base*/
1230 0, /*tp_dict*/
1231 0, /*tp_descr_get*/
1232 0, /*tp_descr_set*/
1233 0, /*tp_dictoffset*/
1234 0, /*tp_init*/
1235 0, /*tp_alloc*/
1236 0, /*tp_new*/
1237 0, /*tp_free*/
1238 0, /*tp_is_gc*/
1239};
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001240
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001241PyDoc_STRVAR(unicodedata_docstring,
1242"This module provides access to the Unicode Character Database which\n\
1243defines character properties for all Unicode characters. The data in\n\
1244this database is based on the UnicodeData.txt file version\n\
Ezio Melotti98d2c0a2011-11-10 09:36:34 +020012456.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
Hye-Shik Changcf18a5d2005-04-04 16:32:07 +00001246\n\
1247The module uses the same names and symbols as defined by the\n\
Ezio Melotti98d2c0a2011-11-10 09:36:34 +02001248UnicodeData File Format 6.0.0 (see\n\
1249http://www.unicode.org/reports/tr44/tr44-6.html).");
Fredrik Lundh06d12682001-01-24 07:59:11 +00001250
Martin v. Löwis1a214512008-06-11 05:26:20 +00001251
1252static struct PyModuleDef unicodedatamodule = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001253 PyModuleDef_HEAD_INIT,
1254 "unicodedata",
1255 unicodedata_docstring,
1256 -1,
1257 unicodedata_functions,
1258 NULL,
1259 NULL,
1260 NULL,
1261 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00001262};
1263
Mark Hammond62b1ab12002-07-23 06:31:15 +00001264PyMODINIT_FUNC
Martin v. Löwis1a214512008-06-11 05:26:20 +00001265PyInit_unicodedata(void)
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001266{
Fred Drakea2bd8d32002-04-03 21:39:26 +00001267 PyObject *m, *v;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001268
Christian Heimes90aa7642007-12-19 02:45:37 +00001269 Py_TYPE(&UCD_Type) = &PyType_Type;
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001270
Martin v. Löwis1a214512008-06-11 05:26:20 +00001271 m = PyModule_Create(&unicodedatamodule);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001272 if (!m)
Martin v. Löwis1a214512008-06-11 05:26:20 +00001273 return NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001274
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001275 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Martin v. Löwis0e2f9b22006-03-10 11:29:32 +00001276 Py_INCREF(&UCD_Type);
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001277 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
Martin v. Löwisb5c980b2002-11-25 09:13:37 +00001278
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001279 /* Previous versions */
1280 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1281 if (v != NULL)
Martin v. Löwis5bd7c022006-03-10 11:20:04 +00001282 PyModule_AddObject(m, "ucd_3_2_0", v);
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001283
Fredrik Lundh06d12682001-01-24 07:59:11 +00001284 /* Export C API */
Benjamin Petersonb173f782009-05-05 22:31:58 +00001285 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
Fred Drakea2bd8d32002-04-03 21:39:26 +00001286 if (v != NULL)
1287 PyModule_AddObject(m, "ucnhash_CAPI", v);
Martin v. Löwis1a214512008-06-11 05:26:20 +00001288 return m;
Guido van Rossum2a70a3a2000-03-10 23:10:21 +00001289}
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001290
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001291/*
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001292Local variables:
1293c-basic-offset: 4
Martin v. Löwis677bde22002-11-23 22:08:15 +00001294indent-tabs-mode: nil
Martin v. Löwis7d41e292002-11-23 12:22:32 +00001295End:
1296*/